# Sport vs Politics Text Classification


## 1. Imports and setup

In [91]:
import re
import math
import time
import unicodedata
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

SEED = 50


Politics News 

In [92]:
import os
import csv
import requests
from bs4 import BeautifulSoup

output_folder = "toi_politics"
output_file = os.path.join(output_folder, "headlines_politics.csv")

def scrape_politics_headlines(page):
    base = "https://timesofindia.indiatimes.com/politics/news"
    url = base if page == 1 else f"{base}/{page}"

    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    headlines = []
    for a in soup.find_all("a"):
        href = a.get("href", "")
        txt  = a.get_text(strip=True)

        if txt and "/politics/news" in href:
            headlines.append(txt)

    return headlines

# Scrape multiple pages
all_headlines = []
os.makedirs(output_folder, exist_ok=True)

if not os.path.exists(output_file):
    for page_num in range(1, 30):
        try:
            headlines = scrape_politics_headlines(page_num)
            print(f"Scraped Page {page_num} — {len(headlines)} items")
            for h in headlines:
                all_headlines.append([h, "politics"])
        except Exception as e:
            print("Error on page", page_num, e)

    # Save to CSV
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["headline", "category"])
        writer.writerows(all_headlines)
        
else:
    print("Already scraped, skipping...")

print(f"\nSaved {len(all_headlines)} headlines to:\n{output_file}")


Already scraped, skipping...

Saved 0 headlines to:
toi_politics\headlines_politics.csv


Sports News

In [93]:
output_folder = "indianexpress_sports"
output_file = os.path.join(output_folder, "sports_headlines.csv")

def scrape_sports_headlines(page):
    if page == 1:
        url = "https://indianexpress.com/section/sports/"
    else:
        url = f"https://indianexpress.com/section/sports/page/{page}/"

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, "html.parser")

    headlines = []
    for tag in soup.find_all(["h2","h3"]):
        a = tag.find("a")
        if a and a.get_text(strip=True):
            headlines.append(a.get_text(strip=True))
    return headlines

all_headlines = []
os.makedirs(output_folder, exist_ok=True)

if not os.path.exists(output_file):
    os.makedirs(output_folder, exist_ok=True)
    for i in range(1, 60):
        try:
            items = scrape_sports_headlines(i)
            print(f"Page {i}: {len(items)} headlines")
            for h in items:
                all_headlines.append([h, "sports"])
        except Exception as e:
            print("Error on page", i, e)

    # Save to CSV
    with open(output_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["headline", "category"])
        writer.writerows(all_headlines)
else:            
    print("Already scraped, skipping...")

print(f"\nSaved {len(all_headlines)} headlines to:\n{output_file}")


Already scraped, skipping...

Saved 0 headlines to:
indianexpress_sports\sports_headlines.csv


## 2. Load the scraped dataset

In [94]:
sports = pd.read_csv('indianexpress_sports/sports_headlines.csv')
politics = pd.read_csv('toi_politics/headlines_politics.csv')

sports = sports[['headline', 'category']].copy()
politics = politics[['headline', 'category']].copy()

sports['category'] = 'sports'
politics['category'] = 'politics'

df = pd.concat([sports, politics], ignore_index=True)
df['headline'] = df['headline'].fillna('').astype(str)

print('Raw rows:', len(df))
print(df['category'].value_counts())
df.head()


Raw rows: 3135
category
politics    1660
sports      1475
Name: count, dtype: int64


Unnamed: 0,headline,category
0,India Women vs Australia Women 1st T20I ​Live Cricket Score: IND to take AUS in Sydney,sports
1,When Shoaib Malik made MS Dhoni wait for a photoshoot; and why Sehwag and Uthappa were chosen for a tie-breaking bowl-out,sports
2,"Colombo weather update today, India vs Pakistan T20 World Cup 2026: Will rain affect IND-PAK match today?",sports
3,"Sindhu, Srikanth, and the All England question: Gopichand on its relevance and India’s hopes",sports
4,"India vs Pakistan Today Match Playing 11, T20 World Cup 2026: Will Kuldeep Yadav get nod for IND vs PAK in Colombo?",sports


## 3. Normalization, tokenization, and n-gram generation

In [95]:
TOKEN_PATTERN = re.compile(r"[a-z0-9]+(?:'[a-z0-9]+)?")

STOPWORDS = {
    'a', 'an', 'the', 'and', 'or', 'for', 'of', 'to', 'in', 'on', 'at', 'by', 'from',
    'with', 'as', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'it', 'its',
    'that', 'this', 'these', 'those', 'into', 'about', 'over', 'under', 'after',
    'before', 'than', 'then', 'their', 'his', 'her', 'they', 'them', 'you', 'your',
    'i', 'we', 'our'
}


def normalize_text(text: str) -> str:
    replacements = {
        '???': "'",
        '???': "'",
        '???': '"',
        '??': '"',
        '???': '-',
        '???': '-',
        '?': ' ',
        '???': '...',
        ' ': ' ',
    }
    out = unicodedata.normalize('NFKC', text or '')
    for bad, good in replacements.items():
        out = out.replace(bad, good)
    return out.lower().strip()


def tokenize(text: str):
    tokens = TOKEN_PATTERN.findall(normalize_text(text))
    return [tok for tok in tokens if tok not in STOPWORDS and len(tok) > 1]


def build_ngrams(tokens, ngram_range=(1, 1)):
    lo, hi = ngram_range
    feats = []
    for n in range(lo, hi + 1):
        if len(tokens) < n:
            continue
        if n == 1:
            feats.extend(tokens)
            continue
        for i in range(len(tokens) - n + 1):
            feats.append('_'.join(tokens[i:i + n]))
    return feats


## 4. Custom feature extractor

In [96]:
class ScratchVectorizer:
    def __init__(self, ngram_range=(1, 1), min_df=2, max_features=12000, use_idf=False, sublinear_tf=False):
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_features = max_features
        self.use_idf = use_idf
        self.sublinear_tf = sublinear_tf
        self.vocabulary_ = {}
        self.feature_names_ = []
        self.idf_ = None

    def _doc_terms(self, text):
        tokens = tokenize(text)
        return build_ngrams(tokens, self.ngram_range)

    def fit(self, texts):
        doc_freq = Counter()
        for text in texts:
            terms = self._doc_terms(text)
            if not terms:
                continue
            doc_freq.update(set(terms))

        kept = [(term, df) for term, df in doc_freq.items() if df >= self.min_df]
        kept.sort(key=lambda x: (-x[1], x[0]))
        if self.max_features:
            kept = kept[:self.max_features]

        self.feature_names_ = [term for term, _ in kept]
        self.vocabulary_ = {term: i for i, term in enumerate(self.feature_names_)}

        if self.use_idf:
            n_docs = max(1, len(texts))
            idf_vals = np.ones(len(self.feature_names_), dtype=np.float64)
            for term, i in self.vocabulary_.items():
                df = doc_freq[term]
                idf_vals[i] = math.log((1 + n_docs) / (1 + df)) + 1.0
            self.idf_ = idf_vals
        else:
            self.idf_ = None

        return self

    def transform(self, texts):
        row_ind, col_ind, data = [], [], []

        for row, text in enumerate(texts):
            term_counts = Counter(self._doc_terms(text))
            if not term_counts:
                continue

            for term, count in term_counts.items():
                col = self.vocabulary_.get(term)
                if col is None:
                    continue

                val = float(count)
                if self.sublinear_tf and count > 0:
                    val = 1.0 + math.log(count)
                if self.idf_ is not None:
                    val *= float(self.idf_[col])

                row_ind.append(row)
                col_ind.append(col)
                data.append(val)

        return csr_matrix(
            (np.array(data, dtype=np.float64), (np.array(row_ind), np.array(col_ind))),
            shape=(len(texts), len(self.vocabulary_)),
            dtype=np.float64,
        )

    def fit_transform(self, texts):
        return self.fit(texts).transform(texts)


## 5. Clean dataset and make train-test split

In [97]:
df['clean_text'] = df['headline'].map(normalize_text)

rows_before = len(df)
df = df[df['clean_text'].str.len() > 0].drop_duplicates(subset=['clean_text', 'category']).reset_index(drop=True)
rows_after = len(df)

print('Rows after cleaning:', rows_after)
print('Rows removed:', rows_before - rows_after)
print(df['category'].value_counts())

df['token_len'] = df['clean_text'].map(lambda x: len(tokenize(x))).astype(int)
summary = df.groupby('category')['token_len'].describe()
cols = [c for c in ['count', 'mean', '50%', 'max'] if c in summary.columns]
print('Token length summary by class:\n', summary[cols])

X = df['clean_text'].tolist()
y = df['category'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

print('\nTrain size:', len(X_train), 'Test size:', len(X_test))



Rows after cleaning: 2893
Rows removed: 242
category
politics    1462
sports      1431
Name: count, dtype: int64
Token length summary by class:
            count       mean   50%   max
category                               
politics  1462.0  10.515048  10.0  21.0
sports    1431.0  14.007687  14.0  26.0

Train size: 2314 Test size: 579


## 6. Define 3 feature methods and 3 classifiers

In [98]:
@dataclass
class FeatureSpec:
    name: str
    vectorizer: ScratchVectorizer


@dataclass
class ModelSpec:
    name: str
    estimator: object


feature_specs = [
    FeatureSpec('bow_unigram', ScratchVectorizer(ngram_range=(1, 1), min_df=2, max_features=10000, use_idf=False)),
    FeatureSpec('tfidf_unigram', ScratchVectorizer(ngram_range=(1, 1), min_df=2, max_features=10000, use_idf=True, sublinear_tf=True)),
    FeatureSpec('ngram_count_1_2', ScratchVectorizer(ngram_range=(1, 2), min_df=2, max_features=15000, use_idf=False)),
]

model_specs = [
    ModelSpec('decision_tree', DecisionTreeClassifier(max_depth=30, min_samples_split=4, random_state=SEED)),
    ModelSpec('random_forest', RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=4, random_state=SEED, n_jobs=-1)),
    ModelSpec('knn', KNeighborsClassifier(n_neighbors=7, weights='distance', metric='cosine')),
]

print('Feature methods:', [f.name for f in feature_specs])
print('Classifiers:', [m.name for m in model_specs])
print('Total combinations:', len(feature_specs) * len(model_specs))


Feature methods: ['bow_unigram', 'tfidf_unigram', 'ngram_count_1_2']
Classifiers: ['decision_tree', 'random_forest', 'knn']
Total combinations: 9


## 7. Train and evaluate all 9 combinations

In [99]:
def top_terms(model, feature_names, top_k=15):
    if len(feature_names) == 0:
        return []

    if hasattr(model, 'coef_'):
        coef = np.asarray(model.coef_)[0]
        pos_idx = np.argsort(coef)[-top_k:][::-1]
        neg_idx = np.argsort(coef)[:top_k]
        rows = []
        rows.extend([('sports', feature_names[i], float(coef[i])) for i in pos_idx])
        rows.extend([('politics', feature_names[i], float(coef[i])) for i in neg_idx])
        return rows

    if hasattr(model, 'feature_log_prob_') and hasattr(model, 'classes_'):
        classes = list(model.classes_)
        if 'sports' in classes and 'politics' in classes:
            i_s = classes.index('sports')
            i_p = classes.index('politics')
            score = model.feature_log_prob_[i_s] - model.feature_log_prob_[i_p]
            pos_idx = np.argsort(score)[-top_k:][::-1]
            neg_idx = np.argsort(score)[:top_k]
            rows = []
            rows.extend([('sports', feature_names[i], float(score[i])) for i in pos_idx])
            rows.extend([('politics', feature_names[i], float(score[i])) for i in neg_idx])
            return rows


    if hasattr(model, 'feature_importances_'):
        imp = np.asarray(model.feature_importances_)
        idx = np.argsort(imp)[-top_k:][::-1]
        return [('global', feature_names[i], float(imp[i])) for i in idx if imp[i] > 0]

    return []


results = []
confusions = {}
trained = {}
all_top_terms = []
misclassified = []

for feat in feature_specs:
    vec = feat.vectorizer
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)

    for mdl in model_specs:
        clf = clone(mdl.estimator)

        t0 = time.perf_counter()
        clf.fit(X_train_vec, y_train)
        train_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        pred = clf.predict(X_test_vec)
        infer_time = time.perf_counter() - t1

        acc = accuracy_score(y_test, pred)
        p, r, f1, _ = precision_recall_fscore_support(
            y_test, pred, average='binary', pos_label='sports', zero_division=0
        )

        cm = confusion_matrix(y_test, pred, labels=['politics', 'sports'])

        row = {
            'feature_method': feat.name,
            'classifier': mdl.name,
            'accuracy': float(acc),
            'precision_sports': float(p),
            'recall_sports': float(r),
            'f1_sports': float(f1),
            'train_time_sec': float(train_time),
            'inference_time_sec': float(infer_time),
            'vocab_size': int(len(vec.vocabulary_)),
        }
        results.append(row)

        key = (feat.name, mdl.name)
        confusions[key] = cm
        trained[key] = (vec, clf)

        for klass, term, score in top_terms(clf, vec.feature_names_, top_k=15):
            all_top_terms.append({
                'feature_method': feat.name,
                'classifier': mdl.name,
                'class': klass,
                'term': term,
                'score': score,
            })

        for text, gold, guess in zip(X_test, y_test, pred):
            if gold != guess:
                misclassified.append({
                    'feature_method': feat.name,
                    'classifier': mdl.name,
                    'actual': gold,
                    'predicted': guess,
                    'headline': text,
                })

results_df = pd.DataFrame(results).sort_values(['f1_sports', 'accuracy'], ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,feature_method,classifier,accuracy,precision_sports,recall_sports,f1_sports,train_time_sec,inference_time_sec,vocab_size
0,ngram_count_1_2,knn,0.967185,0.965157,0.968531,0.966841,0.001332,0.037515,4848
1,bow_unigram,knn,0.967185,0.968421,0.965035,0.966725,0.001447,0.038846,2975
2,tfidf_unigram,random_forest,0.967185,0.989011,0.944056,0.966011,0.268546,0.038501,2975
3,ngram_count_1_2,random_forest,0.967185,0.99262,0.940559,0.965889,0.295764,0.037662,4848
4,tfidf_unigram,knn,0.965458,0.96831,0.961538,0.964912,0.001138,0.031375,2975
5,bow_unigram,random_forest,0.965458,0.988971,0.940559,0.964158,0.33605,0.037177,2975
6,bow_unigram,decision_tree,0.8981,0.838806,0.982517,0.904992,0.037542,0.000544,2975
7,tfidf_unigram,decision_tree,0.8981,0.838806,0.982517,0.904992,0.028635,0.000332,2975
8,ngram_count_1_2,decision_tree,0.894646,0.833828,0.982517,0.902087,0.039369,0.000534,4848


## 8. Best model details and confusion matrix

In [100]:
best = results_df.iloc[0]
print('Best combination:', best['feature_method'], '+', best['classifier'])
print('Accuracy:', round(best['accuracy'], 4))
print('Precision (sports):', round(best['precision_sports'], 4))
print('Recall (sports):', round(best['recall_sports'], 4))
print('F1 (sports):', round(best['f1_sports'], 4))

best_key = (best['feature_method'], best['classifier'])
best_cm = confusions[best_key]

cm_df = pd.DataFrame(
    best_cm,
    index=['actual_politics', 'actual_sports'],
    columns=['pred_politics', 'pred_sports']
)
cm_df


Best combination: ngram_count_1_2 + knn
Accuracy: 0.9672
Precision (sports): 0.9652
Recall (sports): 0.9685
F1 (sports): 0.9668


Unnamed: 0,pred_politics,pred_sports
actual_politics,283,10
actual_sports,9,277


## 9. Save Output

In [101]:
out = Path('outputs')
out.mkdir(parents=True, exist_ok=True)

results_df.to_csv(out / 'model_comparison_9combos.csv', index=False)

stats = {
    'rows_before_cleaning': int(rows_before),
    'rows_after_cleaning': int(rows_after),
    'rows_removed': int(rows_before - rows_after),
    'politics_count_after_cleaning': int((df['category'] == 'politics').sum()),
    'sports_count_after_cleaning': int((df['category'] == 'sports').sum()),
    'avg_tokens_per_headline': float(df['token_len'].mean()),
    'median_tokens_per_headline': float(df['token_len'].median()),
}
pd.DataFrame(list(stats.items()), columns=['metric', 'value']).to_csv(out / 'dataset_stats_custom.csv', index=False)

for (feat_name, model_name), cm in confusions.items():
    cm_file = out / f"cm_{feat_name}__{model_name}.csv"
    pd.DataFrame(cm, index=['politics', 'sports'], columns=['politics', 'sports']).to_csv(cm_file)

top_terms_df = pd.DataFrame.from_records(all_top_terms) if all_top_terms else pd.DataFrame(
    columns=['feature_method', 'classifier', 'class', 'term', 'score']
)
top_terms_df.to_csv(out / 'top_terms_all_models.csv', index=False)

misclassified_df = pd.DataFrame.from_records(misclassified) if misclassified else pd.DataFrame(
    columns=['feature_method', 'classifier', 'actual', 'predicted', 'headline']
)
misclassified_df.to_csv(out / 'all_misclassifications.csv', index=False)

best_summary = pd.DataFrame([
    {
        'best_feature_method': best['feature_method'],
        'best_classifier': best['classifier'],
        'accuracy': best['accuracy'],
        'precision_sports': best['precision_sports'],
        'recall_sports': best['recall_sports'],
        'f1_sports': best['f1_sports'],
    }           
])
best_summary.to_csv(out / 'best_model_summary.csv', index=False)

best_model_errors = misclassified_df[
    (misclassified_df['feature_method'] == best['feature_method']) &
    (misclassified_df['classifier'] == best['classifier'])
]
best_model_errors.to_csv(out / 'best_model_errors.csv', index=False)

print('Saved output files in:', out.resolve())


Saved output files in: D:\NLU\Assn 1\prob 4\outputs


## 10. Optional prediction helper

In [102]:
def predict_headline(text: str):
    vec, clf = trained[best_key]
    x = vec.transform([normalize_text(text)])
    return clf.predict(x)[0]

# sample = 'Parliament debate intensifies over new election funding bill'
sample = 'bjp government faces criticism over handling of recent sports events'
print('Sample:', sample)
print('Predicted class:', predict_headline(sample))


Sample: bjp government faces criticism over handling of recent sports events
Predicted class: politics
