<a href="https://colab.research.google.com/github/stillrahim/jupyter-exploration/blob/main/ITAI2373_NewsBot_Midterm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages (run once)
!pip install -q kaggle pandas numpy sklearn nltk tqdm

# Optional for reading jsonlines faster
!pip install -q jsonlines

# NLTK downloads (for later preprocessing if you want)
import nltk
nltk.download('punkt')
nltk.download('stopwords')


In [None]:
# Upload kaggle.json (only if using Kaggle API)
from google.colab import files, drive
import os, sys

print("If you plan to use Kaggle API, upload your kaggle.json now. Otherwise, skip this cell.")
uploaded = files.upload()  # choose kaggle.json

# Copy the uploaded credentials into /root/.kaggle, or report that none were given.
if 'kaggle.json' not in uploaded:
    print("No kaggle.json uploaded — proceed with manual option or upload later.")
else:
    _kaggle_dir = '/root/.kaggle'
    _kaggle_cred = _kaggle_dir + '/kaggle.json'
    os.makedirs(_kaggle_dir, exist_ok=True)
    with open(_kaggle_cred, 'wb') as _cred_file:
        _cred_file.write(uploaded['kaggle.json'])
    # 0o600: readable and writable by the owner only.
    os.chmod(_kaggle_cred, 0o600)
    print("✅ kaggle.json installed.")


In [None]:
# Dataset selector read by the download cell and the manual-upload cell below.
# Accepted values: 'bbc', 'news_category', 'all_the_news', 'manual'.
#   - the first three trigger a Kaggle CLI download into /content
#   - 'manual' skips the download; you upload CSV/JSON file(s) in the manual-upload cell
# Any other value makes the download cell raise ValueError.
DATA_CHOICE = 'bbc'   # <-- change this as needed: 'bbc', 'news_category', 'all_the_news', or 'manual'


In [None]:
import os, sys, zipfile, glob, subprocess

def run_cmd(cmd):
    """Run ``cmd`` through the shell, echo its output, and return its exit status.

    stdout and stderr are merged and printed after the command finishes.
    A non-zero exit status does not raise (best-effort behaviour is kept),
    but it is now reported and returned so a failed download is visible
    instead of silently producing no files.

    Args:
        cmd: Command line as a single string. It is executed with
            ``shell=True``, so it must never be built from untrusted input;
            the callers visible in this notebook pass fixed literals.

    Returns:
        int: The command's exit status (0 on success).
    """
    print(">>>", cmd)
    r = subprocess.run(cmd, shell=True, check=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    print(r.stdout.decode('utf-8', errors='ignore'))
    if r.returncode != 0:
        print(f"⚠️ Command exited with status {r.returncode}: {cmd}")
    return r.returncode

# One row per Kaggle-backed choice:
# (choice, announcement, download command, archive expected in /content).
_KAGGLE_JOBS = [
    (
        'bbc',
        "Downloading BBC News (learn-ai-bbc competition). This requires kaggle.json uploaded.",
        'kaggle competitions download -c learn-ai-bbc -p /content --force',
        '/content/learn-ai-bbc.zip',
    ),
    (
        'news_category',
        "Downloading News Category Dataset (rmisra/news-category-dataset).",
        'kaggle datasets download -d rmisra/news-category-dataset -p /content --force',
        '/content/news-category-dataset.zip',
    ),
    (
        'all_the_news',
        "Downloading All the News dataset (snapcrack/all-the-news).",
        'kaggle datasets download -d snapcrack/all-the-news -p /content --force',
        '/content/all-the-news.zip',
    ),
]

_selected_job = next((_job for _job in _KAGGLE_JOBS if _job[0] == DATA_CHOICE), None)

if _selected_job is not None:
    _, _announcement, _download_cmd, _archive = _selected_job
    print(_announcement)
    run_cmd(_download_cmd)
    # Extract only when the download actually left the archive behind.
    if os.path.exists(_archive):
        run_cmd(f'unzip -o {_archive} -d /content')
elif DATA_CHOICE == 'manual':
    print("Manual mode selected. Please upload CSV or JSON files in the next cell.")
else:
    raise ValueError("Unknown DATA_CHOICE. Set to 'bbc', 'news_category', 'all_the_news', or 'manual'.")


In [None]:
# Only run if you chose DATA_CHOICE = 'manual'
from google.colab import files
# Guard first: every non-manual choice gets a notice and nothing else happens.
if DATA_CHOICE != 'manual':
    print("Skipping manual upload (DATA_CHOICE != 'manual').")
else:
    print("Upload your dataset file(s) now (CSV or JSON/JSONL).")
    uploaded = files.upload()
    # The upload result is keyed by file name.
    print("Uploaded:", list(uploaded))


In [None]:
import pandas as pd, json, os, jsonlines, glob
from tqdm import tqdm

# search for likely datasets in current dir
candidates = glob.glob('/content/**/*.csv', recursive=True) + glob.glob('/content/**/*.json', recursive=True) + glob.glob('/content/**/*.jsonl', recursive=True)
candidates = list(set(candidates))
print("Candidate files found:")
for f in candidates:
    print(" -", f)

# Heuristic loader for common known files
def _read_json_records(path):
    """Read ``path`` into a DataFrame as JSON Lines or as a single JSON document.

    The file is treated as JSON Lines when its first line is, on its own, a
    complete JSON object; blank lines are skipped. Otherwise the whole file
    is parsed as one JSON document and handed to ``pd.DataFrame``.
    """
    with open(path, 'r', encoding='utf-8') as fh:
        first_line = fh.readline()
        fh.seek(0)
        try:
            line_delimited = isinstance(json.loads(first_line), dict)
        except ValueError:
            # Empty first line or a value spanning several lines
            # (e.g. a pretty-printed document).
            line_delimited = False
        if line_delimited:
            return pd.DataFrame([json.loads(line) for line in fh if line.strip()])
        return pd.DataFrame(json.load(fh))


def _bbc_rank(path):
    """Sort key for BBC-style CSVs: training files first, test/sample files last."""
    name = os.path.basename(path).lower()
    if 'train' in name:
        return 0
    if 'test' in name or 'sample' in name:
        return 2
    return 1


def try_load_any(paths=None):
    """Load the most plausible dataset file into a DataFrame.

    Tried in order:
      1. BBC-style CSV: a ``.csv`` whose basename starts with ``bbc`` or
         ``train``; training files are preferred over test/sample files.
      2. A ``.json`` file with ``news`` in its basename (JSON Lines or plain JSON).
      3. Any ``.csv`` (malformed rows are skipped).
      4. Any ``.json`` / ``.jsonl`` file.
    A JSON read failure is printed and the search continues with the next step.

    Args:
        paths: Optional iterable of file paths to consider. Defaults to the
            notebook-level ``candidates`` list.

    Returns:
        pandas.DataFrame: The loaded data.

    Raises:
        FileNotFoundError: If no file could be loaded.
    """
    if paths is None:
        paths = candidates
    # Sorted so "first match" does not depend on the order the paths arrive in.
    paths = sorted(paths)

    def basename(p):
        return os.path.basename(p).lower()

    # Try BBC style: bbc-text.csv or train.csv
    bbc_files = sorted(
        (p for p in paths
         if basename(p).endswith('.csv') and basename(p).startswith(('bbc', 'train'))),
        key=_bbc_rank,
    )
    if bbc_files:
        print("Trying BBC-style CSV:", bbc_files[0])
        return pd.read_csv(bbc_files[0])

    # Try News Category JSON / JSON Lines
    news_json = [p for p in paths if 'news' in basename(p) and p.lower().endswith('.json')]
    if news_json:
        try:
            return _read_json_records(news_json[0])
        except Exception as e:
            # Best effort: report and fall through to the generic loaders.
            print("Failed json load:", e)

    # pick any CSV
    csvs = [p for p in paths if p.lower().endswith('.csv')]
    if csvs:
        print("Trying generic CSV:", csvs[0])
        # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'`
        # is its replacement (available since pandas 1.3).
        return pd.read_csv(csvs[0], encoding='utf-8', on_bad_lines='skip')

    # pick any JSON / JSON Lines file
    jsonls = [p for p in paths if p.lower().endswith(('.jsonl', '.json'))]
    if jsonls:
        print("Trying generic JSON/JSONL:", jsonls[0])
        try:
            return _read_json_records(jsonls[0])
        except Exception as e:
            print("Generic JSON read failed:", e)

    raise FileNotFoundError("No suitable dataset file found. Please upload CSV/JSON as instructed.")

# Load the dataset chosen by the heuristics above into `df`; later cells read
# it from this name. try_load_any() raises FileNotFoundError when nothing
# suitable was found, which stops the notebook here.
df = try_load_any()
print("Loaded dataframe with shape:", df.shape)
# Show the first three rows as a quick sanity check of the parsed columns.
display(df.head(3))


In [None]:
# Heuristics for column names, listed in priority order.
possible_text_cols = ['text','content','article','headline','description','body','clean_text']
possible_cat_cols  = ['category','label','class','topic','news_desk','section']

cols = df.columns.tolist()


def _match_column(columns, preferred):
    """Return the first column named in ``preferred``, or None.

    Exact names win (scanned in priority order); only if none matches is the
    comparison repeated ignoring case and surrounding whitespace, so that
    headers such as ``Text`` / ``Category`` are recognised too.
    """
    for want in preferred:
        if want in columns:
            return want
    normalized = {}
    for col in columns:
        normalized.setdefault(str(col).strip().lower(), col)
    for want in preferred:
        if want in normalized:
            return normalized[want]
    return None


def _is_text_like(series):
    """True for object- or string-typed columns (either may hold text)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


text_col = _match_column(cols, possible_text_cols)
cat_col = _match_column(cols, possible_cat_cols)

# If still None, try to guess: the text-like column with the longest average value.
if text_col is None:
    text_candidates = [c for c in cols if _is_text_like(df[c])]
    if text_candidates:
        avg_len = {}
        for c in text_candidates:
            lengths = df[c].dropna().astype(str).str.len()
            # An all-missing column scores 0 instead of NaN, which would break max().
            avg_len[c] = lengths.mean() if len(lengths) else 0.0
        text_col = max(avg_len, key=avg_len.get)
        print("Guessed text column as:", text_col)
    else:
        raise ValueError("No text-like columns found. Please provide dataset with text content.")

if cat_col is None:
    # Guess: the text-like column (other than the text column) with the fewest
    # distinct values. Columns with fewer than two distinct values are ignored:
    # they cannot serve as labels and, if empty, would wipe out every row below.
    candidate_scores = {}
    for c in cols:
        if c == text_col or not _is_text_like(df[c]):
            continue
        nunique = df[c].nunique(dropna=True)
        if nunique > 1:
            candidate_scores[c] = nunique
    if candidate_scores:
        cat_col = min(candidate_scores, key=candidate_scores.get)
        print("Guessed category column as:", cat_col)
    else:
        print("No obvious category column found — you will need to add category labels before sampling.")
        cat_col = None

print("Using text column:", text_col, "category column:", cat_col)

# Drop missing values BEFORE converting to str: astype(str) turns NaN into the
# literal string 'nan', after which dropna() can no longer see it.
# `df` itself is left untouched so this cell can be re-run safely.
required_cols = [text_col] + ([cat_col] if cat_col else [])
df_clean = df.dropna(subset=required_cols).copy()

# Basic cleaning
for _col in required_cols:
    df_clean[_col] = df_clean[_col].astype(str).str.strip()

# Whitespace-only cells are as unusable as missing ones.
df_clean = df_clean[(df_clean[required_cols] != '').all(axis=1)].copy()

print("After dropna shape:", df_clean.shape)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Size limits for the prepared dataset; MIN_ROWS / MIN_CATEGORIES are checked
# by the validation cell further down.
MAX_ROWS = 2000
MIN_ROWS = 500
MIN_CATEGORIES = 4

if cat_col:
    # Count categories
    cat_counts = df_clean[cat_col].value_counts()
    print("Category counts (top 10):")
    print(cat_counts.head(10))

    num_cats = cat_counts.shape[0]
    total_rows = len(df_clean)
    print(f"Total rows: {total_rows}, Categories: {num_cats}")

    # If too many rows, sample stratified by category proportionally
    if total_rows > MAX_ROWS:
        print(f"Sampling down to {MAX_ROWS} rows (stratified by category).")
        proportions = cat_counts / cat_counts.sum()
        # Desired rows per class. clip(lower=1) enforces "at least 1": without
        # it a rare class rounds to 0 and silently disappears from the sample.
        desired = (proportions * MAX_ROWS).round().astype(int).clip(lower=1)

        # Rounding/clipping can leave the total off MAX_ROWS; move one row at a
        # time, largest classes first, never below 1 nor above the class size.
        diff = MAX_ROWS - int(desired.sum())
        step = 1 if diff > 0 else -1
        order = desired.sort_values(ascending=False).index.tolist()
        while diff != 0:
            moved = False
            for cls in order:
                if diff == 0:
                    break
                if step > 0:
                    adjustable = desired.loc[cls] < cat_counts.loc[cls]
                else:
                    adjustable = desired.loc[cls] > 1
                if adjustable:
                    desired.loc[cls] += step
                    diff -= step
                    moved = True
            if not moved:
                # Nothing left to adjust (e.g. more categories than MAX_ROWS);
                # the safety cap below still bounds the final size.
                break

        # perform sampling per class
        sampled_frames = []
        for cls, n in desired.items():
            subset = df_clean[df_clean[cat_col] == cls]
            if len(subset) <= n:
                sampled_frames.append(subset)
            else:
                sampled_frames.append(subset.sample(n=int(n), random_state=42))
        # Shuffle so rows are not grouped by class.
        df_sampled = pd.concat(sampled_frames).sample(frac=1, random_state=42).reset_index(drop=True)
    else:
        df_sampled = df_clean.copy()

    # Post-sampling checks
    final_rows = len(df_sampled)
    final_cats = df_sampled[cat_col].nunique()
    print(f"Final dataset rows: {final_rows}, categories: {final_cats}")

else:
    # No categories identified — just sample up to MAX_ROWS
    print("No category column detected; sampling up to MAX_ROWS rows.")
    if len(df_clean) > MAX_ROWS:
        df_sampled = df_clean.sample(n=MAX_ROWS, random_state=42)
    else:
        df_sampled = df_clean.copy()

# If we still have more than MAX_ROWS (safety)
if len(df_sampled) > MAX_ROWS:
    df_sampled = df_sampled.sample(n=MAX_ROWS, random_state=42)

# Final sanity
print("Sampled dataset shape:", df_sampled.shape)
display(df_sampled.head(2))


In [None]:
# Validate required constraints (MIN_ROWS rows, MIN_CATEGORIES categories)
ok = True
msgs = []

if len(df_sampled) < MIN_ROWS:
    ok = False
    msgs.append(f"Dataset has only {len(df_sampled)} rows (< {MIN_ROWS}). Consider adding more articles.")

if cat_col:
    unique_cats = df_sampled[cat_col].nunique()
    if unique_cats < MIN_CATEGORIES:
        ok = False
        msgs.append(f"Only {unique_cats} categories found (< {MIN_CATEGORIES}). Consider using a dataset with more categories or remapping labels.")
else:
    # Without a category column the category requirement cannot be met.
    ok = False
    msgs.append(f"No category column detected (need at least {MIN_CATEGORIES} categories). Add category labels to the dataset.")

print("Validation results:")
if ok:
    print("✅ Dataset meets minimum requirements.")
else:
    print("⚠️ Dataset does NOT meet requirements:")
    for m in msgs:
        print(" -", m)

# rename columns to standard names expected downstream
if cat_col:
    df_final = df_sampled.rename(columns={text_col: 'content', cat_col: 'category'})[['content','category']]
else:
    df_final = df_sampled.rename(columns={text_col: 'content'})[['content']]

# Save locally and to Drive (optional)
outname = 'newsbot_dataset.csv'
df_final.to_csv(outname, index=False)
# Report where the file really landed: the relative path is resolved against
# the current working directory, which is not necessarily /content.
print(f"Saved prepared dataset to {os.path.abspath(outname)}")

# Optionally mount Drive and save there too
save_to_drive = False  # change to True if you want to save to Drive
if save_to_drive:
    # Imported here because the only other import of `drive` is in the
    # kaggle.json upload cell, which users are told they may skip.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    drive_path = '/content/drive/MyDrive/' + outname
    df_final.to_csv(drive_path, index=False)
    print("Also saved to Google Drive:", drive_path)


In [None]:
# NOTE(review): this cell repeats the preceding validate-and-save cell
# statement for statement; consider deleting one of the two.

# Validate required constraints (MIN_ROWS rows, MIN_CATEGORIES categories)
ok = True
msgs = []

if len(df_sampled) < MIN_ROWS:
    ok = False
    msgs.append(f"Dataset has only {len(df_sampled)} rows (< {MIN_ROWS}). Consider adding more articles.")

if cat_col:
    unique_cats = df_sampled[cat_col].nunique()
    if unique_cats < MIN_CATEGORIES:
        ok = False
        msgs.append(f"Only {unique_cats} categories found (< {MIN_CATEGORIES}). Consider using a dataset with more categories or remapping labels.")
else:
    # Without a category column the category requirement cannot be met.
    ok = False
    msgs.append(f"No category column detected (need at least {MIN_CATEGORIES} categories). Add category labels to the dataset.")

print("Validation results:")
if ok:
    print("✅ Dataset meets minimum requirements.")
else:
    print("⚠️ Dataset does NOT meet requirements:")
    for m in msgs:
        print(" -", m)

# rename columns to standard names expected downstream
if cat_col:
    df_final = df_sampled.rename(columns={text_col: 'content', cat_col: 'category'})[['content','category']]
else:
    df_final = df_sampled.rename(columns={text_col: 'content'})[['content']]

# Save locally and to Drive (optional)
outname = 'newsbot_dataset.csv'
df_final.to_csv(outname, index=False)
# The relative path is resolved against the current working directory, so
# print the resolved location instead of assuming /content.
print(f"Saved prepared dataset to {os.path.abspath(outname)}")

# Optionally mount Drive and save there too
save_to_drive = False  # change to True if you want to save to Drive
if save_to_drive:
    # Local import: the kaggle.json upload cell that otherwise provides
    # `drive` may have been skipped.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    drive_path = '/content/drive/MyDrive/' + outname
    df_final.to_csv(drive_path, index=False)
    print("Also saved to Google Drive:", drive_path)


In [None]:
print("Final dataset preview:")
# Show at most five rows, or fewer when the dataset is smaller than that.
_preview_rows = min(5, len(df_final))
display(df_final.sample(n=_preview_rows, random_state=42))

# The label breakdown is only available when a category column was kept.
_has_category = 'category' in df_final.columns
if _has_category:
    print("\nCategory distribution:")
    _category_breakdown = df_final['category'].value_counts()
    print(_category_breakdown)
