In [None]:
!pip install requests beautifulsoup4 scikit-learn nltk pandas

In [None]:
import re
import time
from collections import deque
from urllib.parse import urldefrag, urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def clean_text(text):
    """Normalise text to lowercase ASCII letters, digits and single spaces.

    Args:
        text: String to normalise.

    Returns:
        ``text`` lowercased, with every character other than ``a-z``,
        ``0-9`` and whitespace deleted, each run of whitespace collapsed to
        one space, and leading/trailing spaces stripped.
    """
    text = text.lower()
    # Delete symbols first, keeping whitespace so word boundaries survive ...
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # ... then collapse whitespace, so a deleted token such as the dash in
    # "a - b" cannot leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def is_same_domain(base_url, link):
    """Return True when ``link`` has the same network location as ``base_url``.

    Args:
        base_url: Reference URL.
        link: Absolute URL to test.

    Returns:
        bool: True if both URLs share the same ``netloc`` (host plus any
        port), compared case-insensitively because host names are not
        case-sensitive. URLs without a network location (``mailto:``,
        ``javascript:`` ...) never match an ``http(s)`` base.
    """
    return urlparse(base_url).netloc.lower() == urlparse(link).netloc.lower()

In [None]:
def crawl_site(base_url, keywords, max_pages=50, delay=1):
    """Breadth-first crawl of one site, keeping pages that mention a keyword.

    Args:
        base_url: URL the crawl starts from. Only links for which
            ``is_same_domain(base_url, link)`` is true are followed.
        keywords: Iterable of search terms. Each term is normalised with
            ``clean_text`` (the same normalisation applied to page text, so
            punctuation in a term cannot prevent a match) and blank terms are
            dropped. If no usable term remains, nothing is fetched.
        max_pages: Stop once this many *matching* pages have been collected.
            Non-matching pages do not count, so more than ``max_pages`` URLs
            may be requested.
        delay: Seconds to pause after every request.

    Returns:
        pandas.DataFrame with columns ``url`` and ``content`` (cleaned page
        text truncated to 5000 characters), one row per matching page. Both
        columns exist even when no page matched.

    Pages that fail to download or answer with an HTTP error status are
    reported with ``print`` and skipped; they are not retried.
    """
    columns = ["url", "content"]

    terms = [term for term in (clean_text(k) for k in keywords) if term]
    if not terms:
        return pd.DataFrame(columns=columns)

    queue = deque([base_url])
    seen = {base_url}  # every URL ever queued, so each is requested at most once
    results = []

    while queue and len(results) < max_pages:
        url = queue.popleft()

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Skipping {url}: {exc}")
            continue
        finally:
            # Be polite to the server after every request, failed or not.
            time.sleep(delay)

        # Do not try to parse images, PDFs, etc. A missing header is treated
        # as HTML so that sloppy servers are still crawled.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        text = clean_text(soup.get_text())

        if any(term in text for term in terms):
            results.append({
                "url": url,
                "content": text[:5000]  # limit size
            })

        for anchor in soup.find_all("a", href=True):
            try:
                # Relative links are relative to the page they appear on
                # (after redirects), not to the crawl's starting URL.
                # Fragments are removed because "#section" variants all
                # point at the same document.
                target = urldefrag(urljoin(response.url, anchor["href"])).url
                same_site = is_same_domain(base_url, target)
            except ValueError:
                # Malformed href (e.g. an invalid IPv6 literal); ignore it.
                continue

            if same_site and target not in seen:
                seen.add(target)
                queue.append(target)

    return pd.DataFrame(results, columns=columns)

In [None]:
def categorize_content(df, n_categories=5):
    """Cluster pages by TF-IDF similarity and label each with a category id.

    Args:
        df: DataFrame with a ``content`` column of page text. It is modified
            in place: a ``category`` column is added (or overwritten).
        n_categories: Desired number of clusters. Capped at the number of
            rows, because KMeans cannot form more clusters than samples.

    Returns:
        The same ``df`` object, now carrying the integer ``category`` column.
        A frame without rows is returned with an empty ``category`` column.
    """
    if len(df) == 0:
        # Nothing to vectorise; fitting on an empty corpus would raise.
        df["category"] = pd.Series(index=df.index, dtype="int64")
        return df

    vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
    X = vectorizer.fit_transform(df["content"])

    n_clusters = min(n_categories, len(df))
    # n_init is pinned because its default differs between scikit-learn
    # releases; an explicit value keeps results stable and warning-free.
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df["category"] = model.fit_predict(X)

    return df

In [None]:
# USER INPUT
base_url = input("Enter website URL (e.g. https://example.com): ").strip()
raw_keywords = input("Enter keywords (comma separated): ").split(",")

# Drop blank entries (empty input, trailing commas): an empty string is a
# substring of every page, so it would mark every page as relevant.
keywords = [k.strip() for k in raw_keywords if k.strip()]
if not keywords:
    raise ValueError("Please enter at least one non-empty keyword.")

# CRAWL
df = crawl_site(base_url, keywords)

print(f"Collected {len(df)} relevant pages")

# CATEGORIZE
N_CATEGORIES = 5
if df.empty:
    print("No relevant pages found; nothing to categorize.")
    # Keep the expected columns so the following cells still run (and simply
    # show nothing) instead of failing on a missing column.
    df = df.reindex(columns=["url", "content", "category"])
else:
    # KMeans cannot build more clusters than there are pages.
    df = categorize_content(df, n_categories=min(N_CATEGORIES, len(df)))

df[["url", "category"]].head()

In [None]:
# Show the URLs assigned to each category, one table per category.
for cat in sorted(df["category"].unique()):
    # The marker was stored as mojibake (UTF-8 bytes read as cp1252).
    print(f"\n🔹 Category {cat}")
    display(df.loc[df["category"] == cat, ["url"]])