In [1]:
!pip install -q pandas numpy scikit-learn joblib requests beautifulsoup4 matplotlib
# optional (richer article text): newspaper3k
!pip install -q newspaper3k

In [2]:
import os, re, json, joblib, math
from pathlib import Path
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score

# Create the working directories up front (a no-op when they already exist):
#   data/      - the Kaggle dataset
#   models/    - saved vectorizers and models
#   unlabeled/ - optional local .txt articles
for _folder in ("data", "models", "unlabeled"):
    os.makedirs(_folder, exist_ok=True)

print("Setup done")

Setup done


In [3]:
# Load Kaggle dataset
# Path to the JSON you have already: data/News_Category_Dataset_v3.json
DATA_PATH = "data/News_Category_Dataset_v3.json"
assert os.path.exists(DATA_PATH), f"Dataset not found: {DATA_PATH}"

# lines=True: the file is read as one JSON record per line
df_raw = pd.read_json(DATA_PATH, lines=True)
print("raw shape:", df_raw.shape)

# Compose text from headline + short description
headline = df_raw["headline"].fillna("")
description = df_raw["short_description"].fillna("")
df_raw["text"] = (headline + ". " + description).str.strip()

# The composed text is never NaN or empty: a row with no headline and no description
# becomes ".". Track real content explicitly so those rows can actually be removed.
has_content = (headline.str.strip() != "") | (description.str.strip() != "")

# Mapping from HuffPost categories to the five target classes: expand as you like.
# Categories that are not listed here map to NaN and are dropped below.
mapping = {
    # Politics-like
    "POLITICS": "Politics", "WORLD NEWS": "Politics", "U.S. NEWS": "Politics",
    "THE WORLDPOST": "Politics", "WORLDPOST": "Politics", "ENVIRONMENT": "Politics",
    # Sports
    "SPORTS": "Sports",
    # Business
    "BUSINESS": "Business", "MONEY": "Business",
    # Technology / Science -> Technology
    "TECH": "Technology", "TECHNOLOGY": "Technology", "SCIENCE": "Technology",
    # Crime
    "CRIME": "Crime",
    # (Add more HuffPost categories that you want to fold into these five)
}

# Uppercase categories then map
df_raw["super_category"] = df_raw["category"].str.upper().map(mapping)

# Keep only rows mapped to our 5 classes that carry some actual text
df = df_raw[has_content].dropna(subset=["super_category", "text"]).copy()
df = df[["text", "super_category"]].reset_index(drop=True)
print("Filtered to 5 classes:", df.shape)
print(df["super_category"].value_counts())

raw shape: (209527, 6)
Filtered to 5 classes: (68662, 2)
super_category
Politics      47965
Business       7748
Sports         5077
Technology     4310
Crime          3562
Name: count, dtype: int64


In [4]:
# Clean text (simple, fast), split and vectorize for supervised model
def clean_text_simple(s):
    """Normalize text for vectorization.

    Non-string input is converted with ``str()``. The text is lowercased, every
    character other than ``a-z``, ``0-9`` and whitespace is replaced by a space,
    and runs of whitespace are collapsed to single spaces with the ends trimmed.

    Returns the cleaned string (possibly empty).
    """
    text = s if isinstance(s, str) else str(s)
    # Replace anything that is not a lowercase letter, digit or whitespace
    text = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    return re.sub(r"\s+", " ", text).strip()

# Normalized copy of the text column; this is what the classifier is trained on
df["text_clean"] = df["text"].map(clean_text_simple)

# Hold out 20% for evaluation; stratify so both splits keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    df["text_clean"],
    df["super_category"],
    test_size=0.20,
    random_state=42,
    stratify=df["super_category"],
)

print("train/test sizes:", len(X_train), len(X_test))

# Supervised TF-IDF (this is the vectorizer the classifier expects).
# It is fitted on the training split only and then applied to the test split.
supervised_tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=30000,
    ngram_range=(1, 2),
    min_df=3,
)
X_train_tfidf = supervised_tfidf.fit_transform(X_train)
X_test_tfidf = supervised_tfidf.transform(X_test)

print("TF-IDF shapes:", X_train_tfidf.shape, X_test_tfidf.shape)

train/test sizes: 54929 13733
TF-IDF shapes: (54929, 30000) (13733, 30000)


In [5]:
# Fit a LinearSVC on the TF-IDF features, evaluate it on the held-out split,
# then persist the model together with the vectorizer it was trained with.
clf = LinearSVC(random_state=42, max_iter=5000).fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print(classification_report(y_test, y_pred))

# Save supervised artifacts (model first, then its vectorizer)
for _artifact, _path in (
    (clf, "models/supervised_linear_svc.joblib"),
    (supervised_tfidf, "models/supervised_tfidf.joblib"),
):
    joblib.dump(_artifact, _path)
print("Saved supervised model + vectorizer.")

Accuracy: 0.8709
              precision    recall  f1-score   support

    Business       0.76      0.69      0.72      1550
       Crime       0.74      0.62      0.67       712
    Politics       0.90      0.95      0.92      9593
      Sports       0.86      0.83      0.84      1016
  Technology       0.77      0.60      0.67       862

    accuracy                           0.87     13733
   macro avg       0.81      0.74      0.77     13733
weighted avg       0.87      0.87      0.87     13733

Saved supervised model + vectorizer.


In [6]:
# Scrape BBC and Reuters for unlabeled recent news (full article text when possible)
# NOTE: scraping structure may change on websites. This is a robust, conservative approach:
#  - collect candidate article links from the index pages
#  - fetch each article and join <p> texts

def fetch_article_paragraphs(url):
    """Download ``url`` and return the text of its ``<p>`` elements joined by spaces.

    Returns ``None`` when the response status is not 200, when the joined text
    has 40 words or fewer, or when any exception occurs while fetching/parsing
    (best effort: a failing page is skipped rather than aborting the scrape).
    """
    try:
        response = requests.get(url, timeout=10, headers={"User-Agent":"Mozilla/5.0"})
        if response.status_code != 200:
            return None
        page = BeautifulSoup(response.text, "html.parser")
        # Stripped text of every paragraph; empty paragraphs are dropped when joining
        chunks = (tag.get_text().strip() for tag in page.find_all("p"))
        body = " ".join(chunk for chunk in chunks if chunk)
        # Very short pages are treated as "no article"
        if len(body.split()) > 40:
            return body
        return None
    except Exception:
        return None

def _scrape_index(index_url, base_url, path_prefixes, n):
    """Collect up to ``n`` article texts linked from a single index page.

    Parameters
    ----------
    index_url : str
        Page whose ``<a href>`` links are scanned for candidate articles.
    base_url : str
        Scheme + host of the site (no trailing slash); used to build absolute URLs.
    path_prefixes : tuple of str
        Site-relative path prefixes (e.g. ``("/news", "/sport")``) that mark a
        link as a candidate.
    n : int
        Maximum number of article texts to return.

    Returns
    -------
    list of str
        Article texts in the order their links appear on the index page. Empty
        when ``n <= 0`` or the index page does not answer with status 200.

    Notes
    -----
    Exceptions raised while requesting the index page propagate to the caller;
    failures on individual articles are absorbed by ``fetch_article_paragraphs``.
    """
    if n <= 0:
        return []

    r = requests.get(index_url, headers={"User-Agent":"Mozilla/5.0"}, timeout=10)
    if r.status_code != 200:
        # Same policy as fetch_article_paragraphs: do not parse error pages
        return []
    soup = BeautifulSoup(r.text, "html.parser")

    # Gather twice as many candidates as needed, since some pages yield no usable text.
    max_candidates = n * 2
    # A dict de-duplicates while keeping page order, so the same index page always
    # produces the same fetch order (a set would vary between interpreter runs).
    links = {}
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Treat absolute links to the same site like their site-relative form
        if href.startswith(base_url):
            href = href[len(base_url):]
        if href.startswith(path_prefixes):
            url = base_url + href
            # Skip links that point back to the index page itself
            if url.rstrip("/") != index_url.rstrip("/"):
                links[url] = None
        if len(links) >= max_candidates:
            break

    texts = []
    for url in links:
        t = fetch_article_paragraphs(url)
        if t:
            texts.append(t)
        if len(texts) >= n:
            break
    return texts


def scrape_bbc(n=30):
    """Return up to ``n`` article texts linked from the BBC News index page."""
    return _scrape_index(
        index_url="https://www.bbc.com/news",
        base_url="https://www.bbc.com",
        path_prefixes=("/news", "/sport"),
        n=n,
    )


def scrape_reuters(n=30):
    """Return up to ``n`` article texts linked from the Reuters World index page."""
    return _scrape_index(
        index_url="https://www.reuters.com/world/",
        base_url="https://www.reuters.com",
        # Reuters uses <a href="/world/..."> in many places
        path_prefixes=("/world", "/business", "/politics", "/technology"),
        n=n,
    )

# Try scraping — falls back to reading ./unlabeled/*.txt if no internet or site blocks requests.
# Each site gets its own try/except so that one failing site does not skip the other.
unlabeled_texts = []
for site_name, scraper in (("BBC", scrape_bbc), ("Reuters", scrape_reuters)):
    print(f"Scraping {site_name}...")
    try:
        unlabeled_texts += scraper(n=20)
    except Exception as e:
        print("Scrape exception:", e)

# If scraping yields nothing, load local .txt files.
# sorted() gives a stable document order; an explicit encoding avoids depending
# on the platform default (undecodable bytes are still ignored).
if not unlabeled_texts:
    for p in sorted(Path("unlabeled").glob("*.txt")):
        t = p.read_text(encoding="utf-8", errors="ignore")
        if len(t.split()) > 30:
            unlabeled_texts.append(t)

# Fallback (very small) demo if still empty (so pipeline runs)
if not unlabeled_texts:
    unlabeled_texts = [
        "Government launches election campaign focusing on healthcare and education reform.",
        "Local football team clinches championship after a dramatic comeback in extra time.",
        "Major tech firms invest heavily in artificial intelligence and cloud infrastructure.",
        "Police investigate a corruption scandal around procurement contracts.",
        "Global markets rise as central banks signal potential rate pauses."
    ]

print("Unlabeled articles count:", len(unlabeled_texts))

Scraping BBC...
Scraping Reuters...
Unlabeled articles count: 20


In [7]:
# Vectorize the unlabeled corpus with its own (unsupervised) vectorizer.
# A smaller TF-IDF vocabulary than the supervised one keeps clustering resources modest.
unsup_tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    ngram_range=(1, 2),
)
# Same cleaning as for the supervised data, applied to every unlabeled article
X_unsup = unsup_tfidf.fit_transform(list(map(clean_text_simple, unlabeled_texts)))

print("Unsupervised TF-IDF shape:", X_unsup.shape)

# Save unsupervised vectorizer (the returned file list is shown as the cell output)
joblib.dump(unsup_tfidf, "models/unsup_tfidf.joblib")

Unsupervised TF-IDF shape: (20, 5000)


['models/unsup_tfidf.joblib']

In [8]:
# KMeans clustering
# Try a small set of k values and keep the model with the best silhouette score.
n_docs = X_unsup.shape[0]
candidate_k = [3,4,5,6,7,8] if n_docs >= 30 else [2,3,4,5]
best_k, best_score, best_km = None, -1, None
for k in candidate_k:
    # Skip k values that are not smaller than the number of documents
    if k >= n_docs:
        continue
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_unsup)
    # A silhouette score is only computed when more than one cluster was actually used
    if len(set(labels)) > 1:
        try:
            s = silhouette_score(X_unsup, labels)
        except Exception as err:
            # Best effort: report the failure and treat this k as the worst possible
            print(f"silhouette_score failed for k={k}: {err}")
            s = -1
        if s > best_score:
            best_k, best_score, best_km = k, s, km

# Fallback when no candidate could be scored (e.g. a very small corpus).
# Capped at n_docs so that a single-document corpus does not request more
# clusters than there are samples; for n_docs >= 2 the value is unchanged.
if best_k is None:
    best_k = min(n_docs, 5, max(2, n_docs // 5))
    best_km = KMeans(n_clusters=best_k, random_state=42, n_init=10).fit(X_unsup)

kmeans = best_km
# NOTE: when the fallback was used, the printed silhouette is the initial -1 placeholder
print("Chosen K:", best_k, "silhouette:", best_score)

# Extract top terms per cluster
def top_terms_kmeans(km_model, vectorizer, topn=10):
    """Return the highest-weighted vocabulary terms of every cluster centre.

    Parameters
    ----------
    km_model : object exposing ``cluster_centers_`` (one row per cluster)
    vectorizer : object exposing ``get_feature_names_out()``
    topn : int
        Number of terms to keep per cluster.

    Returns
    -------
    dict
        Maps the cluster index to a list of terms, ordered from the largest
        centre weight downwards.
    """
    centroids = km_model.cluster_centers_
    vocab = np.array(vectorizer.get_feature_names_out())
    # argsort is ascending, so reverse it before taking the first `topn` positions
    return {
        cluster_id: vocab[np.argsort(centroid)[::-1][:topn]].tolist()
        for cluster_id, centroid in enumerate(centroids)
    }

# Top 10 terms per cluster, as a dict of cluster id -> list of terms
kmeans_topics = top_terms_kmeans(kmeans, unsup_tfidf, topn=10)
for cid, terms in kmeans_topics.items():
    print(f"\nCluster {cid} top terms: {', '.join(terms)}")

# Save kmeans + cluster terms so the prediction cell can reload them.
# The last joblib.dump is the final expression of the cell, so its return value
# (the list of written file names) is shown as the cell output.
joblib.dump(kmeans, "models/kmeans.joblib")
joblib.dump(kmeans_topics, "models/kmeans_topics.joblib")

Chosen K: 5 silhouette: 0.02922705910687092

Cluster 0 top terms: trump, said, plaza, like, afghans, president, ceasefire, film, peace, ukraine

Cluster 1 top terms: minister, hostages, gaza, prime minister, prime, israeli, israel, hamas, senedd, pengelly

Cluster 2 top terms: australia, korea, north korea, north, floods, wen, flash, missing, china, people

Cluster 3 top terms: homes, says, councils, housing, long term, farage, controlled councils, says reform, close hotels, nigel farage

Cluster 4 top terms: m23, phillips, zealand, new zealand, children, mr phillips, group, said, conflict, mr


['models/kmeans_topics.joblib']

In [9]:
# LDA topic modeling on unlabeled texts (CountVectorizer + LDA)
n_unlabeled = len(unlabeled_texts)

# min_df=2 drops terms seen in only one document, which is useful on a real corpus.
# On a tiny corpus (e.g. the 5-sentence demo fallback, where no term is shared by
# two sentences, or a single local file) it would prune every term and make
# CountVectorizer raise a ValueError, so keep all terms in that case.
lda_min_df = 2 if n_unlabeled >= 10 else 1
count_vec = CountVectorizer(stop_words="english", max_features=5000, min_df=lda_min_df)
X_counts = count_vec.fit_transform([clean_text_simple(t) for t in unlabeled_texts])

# Heuristic: roughly one topic per 5 documents, at least 2 and at most 8
n_topics = min(8, max(2, n_unlabeled//5))
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42, learning_method="batch")
lda.fit(X_counts)

def lda_top_words(model, feature_names, n_top=8):
    """Return the highest-weighted words of every topic in ``model.components_``.

    Parameters
    ----------
    model : object exposing ``components_`` (one row of word weights per topic)
    feature_names : sequence indexable by position, aligned with the columns
        of ``components_``
    n_top : int
        Number of words to keep per topic.

    Returns
    -------
    dict
        Maps the topic index to a list of words, ordered from the largest
        weight downwards.
    """
    # argsort is ascending, so reverse it before taking the first `n_top` positions
    return {
        topic_id: [feature_names[pos] for pos in np.argsort(weights)[::-1][:n_top]]
        for topic_id, weights in enumerate(model.components_)
    }

# Top 10 words per topic, as a dict of topic id -> list of words
lda_topics = lda_top_words(lda, count_vec.get_feature_names_out(), n_top=10)
for tid, words in lda_topics.items():
    print(f"\nLDA Topic {tid}: {', '.join(words)}")

# Save LDA artifacts: the model, the CountVectorizer it was fitted with, and the
# topic -> words dict. The last joblib.dump is the final expression of the cell,
# so its return value (the list of written file names) is shown as the cell output.
joblib.dump(lda, "models/lda_model.joblib")
joblib.dump(count_vec, "models/lda_countvec.joblib")
joblib.dump(lda_topics, "models/lda_topics.joblib")


LDA Topic 0: trump, said, north, president, ceasefire, say, peace, bbc, taliban, country

LDA Topic 1: minister, bbc, palestinian, israeli, day, prime, gaza, says, israel, 60

LDA Topic 2: says, bbc, external, new, people, uk, australia, children, police, rights

LDA Topic 3: homes, said, councils, bbc, m23, says, housing, children, government, people


['models/lda_topics.joblib']

In [10]:
# Unified prediction function — supervised + unsupervised labels (human readable)
# Load artifacts (if needed). These are the files written by the earlier cells, so
# this cell rebinds the same names to the saved versions.
# NOTE(security): joblib files are pickle-based; only load artifacts you created
# yourself or otherwise trust, because unpickling can execute arbitrary code.
supervised_tfidf = joblib.load("models/supervised_tfidf.joblib")
clf = joblib.load("models/supervised_linear_svc.joblib")
unsup_tfidf = joblib.load("models/unsup_tfidf.joblib")
kmeans = joblib.load("models/kmeans.joblib")
kmeans_topics = joblib.load("models/kmeans_topics.joblib")
lda = joblib.load("models/lda_model.joblib")
count_vec = joblib.load("models/lda_countvec.joblib")
lda_topics = joblib.load("models/lda_topics.joblib")

def predict_article(text):
    """Label one article with the supervised classifier, KMeans and LDA.

    The text is cleaned with ``clean_text_simple`` and then passed through each
    model's own vectorizer (``supervised_tfidf``, ``unsup_tfidf``, ``count_vec``).

    Returns a dict with the keys:
      - ``supervised_category``: class predicted by ``clf``
      - ``kmeans_cluster_id``: cluster index predicted by ``kmeans``
      - ``kmeans_topic_label``: first 6 terms of that cluster from ``kmeans_topics``,
        or ``"Cluster <id>"`` when no terms are stored for it
      - ``lda_top_topic_id``: index of the highest-weighted LDA topic
      - ``lda_topic_label``: first 6 words of that topic from ``lda_topics``,
        or ``"Topic <id>"`` when no words are stored for it
    """
    cleaned = clean_text_simple(text)

    # Supervised category (one of the five)
    category = clf.predict(supervised_tfidf.transform([cleaned]))[0]

    # Unsupervised cluster (KMeans), described by its stored top terms
    cluster_id = int(kmeans.predict(unsup_tfidf.transform([cleaned]))[0])
    cluster_terms = kmeans_topics.get(cluster_id, [])
    if cluster_terms:
        cluster_label = ", ".join(cluster_terms[:6])
    else:
        cluster_label = f"Cluster {cluster_id}"

    # LDA: take the topic with the largest weight in the document's distribution
    topic_weights = lda.transform(count_vec.transform([cleaned]))[0]
    topic_id = int(np.argmax(topic_weights))
    topic_words = lda_topics.get(topic_id)
    if topic_words:
        topic_label = ", ".join(topic_words[:6])
    else:
        topic_label = f"Topic {topic_id}"

    return {
        "supervised_category": category,
        "kmeans_cluster_id": cluster_id,
        "kmeans_topic_label": cluster_label,
        "lda_top_topic_id": topic_id,
        "lda_topic_label": topic_label
    }

In [11]:
# Try it — examples and interactive input
# Try a scraped sample: the first unlabeled article (only its first 1000
# characters are printed; the prediction uses the full text)
sample = unlabeled_texts[0]
print("SAMPLE TEXT:\n", sample[:1000], "\n")
print("PREDICTION:", predict_article(sample))

# Try arbitrary user text:
user_text = "Authorities opened a corruption probe into government contracts ahead of the election campaign."
print("USER PREDICTION:", predict_article(user_text))

# Optional: interactive loop (comment/uncomment as needed).
# It blocks on input(), so keep it commented out for unattended "Run All".
# while True:
#     u = input("Paste article (or 'exit'): ")
#     if u.strip().lower() == "exit":
#         break
#     print(predict_article(u))


SAMPLE TEXT:
 In 1980, when Corina Poore, 36 years old and pregnant, first opened the door to a derelict house in New Cross Gate, south-east London, the estate agent refused to step in with her. Inside were dead cats, dog excrement and filthy mattresses. Pigeons flew in through holes in the roof and there was no indoor toilet. The intense rotting smell was overwhelming. Still, Corina decided this was her dream home. It was spacious, the £24,000 price was affordable and she was sure that everything was fixable. After taking out a mortgage, she received a grant of £3,500 from Lewisham council, her local authority, which paid for fixing the ceiling. "At that point, £3,500 was quite a healthy amount, which I desperately needed," recalls Corina. Some 45 years on, her Victorian four-storey house is worth roughly £1m - something Corina, a semi-retired film and TV critic, could never have afforded otherwise. However, times have changed. Lewisham Council has continued to offer grants to the own

In [14]:
# Map cluster IDs → human labels (semi-automatic)
# Naive readable label per KMeans cluster: its top 5 keywords joined with " / "
cluster_label_map = {
    cid: " / ".join(terms[:5])
    for cid, terms in kmeans_topics.items()
}
cluster_label_map
# You can manually rename cluster_label_map[cid] = "Ukraine War" etc. after inspection.

{0: 'trump / said / plaza / like / afghans',
 1: 'minister / hostages / gaza / prime minister / prime',
 2: 'australia / korea / north korea / north / floods',
 3: 'homes / says / councils / housing / long term',
 4: 'm23 / phillips / zealand / new zealand / children'}

In [None]:
# Summary of the steps the code above performs for the given task:
#  1. Load & map Kaggle dataset → into 5 categories (Politics, Sports, Business, Technology, Crime).
#  2. Preprocess & clean text → train/test split, TF-IDF vectorization.
#  3. Train classifier (LinearSVC) → evaluate, save model + vectorizer.
#  4. Scrape BBC & Reuters (or fall back to local .txt unlabeled articles).
#  5. Unsupervised text processing → separate vectorizer for clustering (the supervised one is not overwritten).
#  6. KMeans clustering → auto-select k (via silhouette score), extract top terms, save topics.
#  7. LDA topic modeling → discover latent topics with keywords.
#  8. Unified prediction function (predict_article) → does the two things listed in items 9 and 10:
#  9. Gives the supervised category (one of the 5 classes).
# 10. Returns the unsupervised cluster/topic labels (KMeans + LDA).
# 11. Testing & interactive use → sample prediction on scraped or user input.
# 12. Optional cluster label mapping → convert numeric clusters into human-readable names.