In [None]:
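# Task 3 — NLP (Sentiment Classification) — Full Colab Notebook
# NOTE: the demo data loaded below is 20 Newsgroups, whose labels are topics, not sentiments;
# it is used only as a stand-in to exercise the text-classification pipeline.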

## 1. Setup & Installation
!pip install pandas numpy matplotlib scikit-learn seaborn transformers datasets --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import os

# Reaching this line means the package install and every import above ran without raising.
print("Setup complete.")

# -------------------------------------------------------------
## 2. Load Dataset — 20 Newsgroups (topic labels) used as a stand-in for a sentiment dataset
# -------------------------------------------------------------
print("Loading a sample sentiment dataset (20 Newsgroups subset).")

from sklearn.datasets import fetch_20newsgroups

# Three newsgroups are enough for a demonstration.
categories = ['rec.sport.baseball', 'sci.space', 'talk.politics.guns']

# Both subsets are fetched with identical options: the same categories, with
# headers, footers and quoted replies stripped from every post.
fetch_options = dict(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

# Translate the integer targets into their category names.
train_labels = np.take(newsgroups_train.target_names, newsgroups_train.target).tolist()
test_labels = np.take(newsgroups_test.target_names, newsgroups_test.target).tolist()

# One DataFrame holding both subsets (train rows first, then test rows);
# the data is re-split later in the notebook.
df = pd.DataFrame({
    'text': [*newsgroups_train.data, *newsgroups_test.data],
    'label': [*train_labels, *test_labels],
})

print("Sample sentiment dataset loaded. Shape:", df.shape)
print(df.head())

# -------------------------------------------------------------
## 3. Preprocessing — clean labels, group top categories
# -------------------------------------------------------------
# Keep only the modelling columns and drop rows with missing values.
# The explicit .copy() makes this an independent frame, so the column
# assignments below cannot trigger pandas' SettingWithCopyWarning.
df = df[['text', 'label']].dropna().copy()

# Posts loaded with headers/footers/quotes removed can end up with no text at
# all. dropna() does not catch empty strings, so filter them out explicitly:
# a blank document carries a label but no features.
has_text = df['text'].astype(str).str.strip().ne('')
print("Dropping", int((~has_text).sum()), "documents with empty text.")
df = df[has_text].reset_index(drop=True)

# Normalise label spelling so equal categories compare equal.
df['label'] = df['label'].astype(str).str.strip().str.lower()

# Ensure we don't group beyond available labels
unique_labels = df['label'].nunique()
top_n = min(5, unique_labels)

# Group top-n labels + 'other' (vectorised: keep the label where it is one of
# the most frequent ones, otherwise replace it with 'other').
top_labels = df['label'].value_counts().nlargest(top_n).index.tolist()
df['label_grouped'] = df['label'].where(df['label'].isin(top_labels), 'other')

print("Label distribution:")
print(df['label_grouped'].value_counts())

# -------------------------------------------------------------
## 4. Train-Test Split & TF-IDF
# -------------------------------------------------------------
# Features are the raw documents (forced to str); targets are the grouped labels.
X, y = df['text'].astype(str), df['label_grouped']

# 80/20 split, stratified on the target and seeded for reproducibility.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# The vectorizer is fitted on the training documents only and then applied
# unchanged to the held-out documents.
vec = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_t = vec.fit_transform(X_train)
X_test_t = vec.transform(X_test)

print("Vectorization complete. Train shape:", X_train_t.shape)

# -------------------------------------------------------------
## 5. Classical Models
# -------------------------------------------------------------
# Multinomial Naive Bayes on the TF-IDF features.
# fit() returns the estimator itself, so construction and training chain.
mnb = MultinomialNB().fit(X_train_t, y_train)
pred_mnb = mnb.predict(X_test_t)

# Overall accuracy first, then the per-class precision/recall/F1 table.
print("MultinomialNB Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_mnb))
print(classification_report(y_true=y_test, y_pred=pred_mnb))

# Logistic Regression (balanced)
# The default solver is used instead of 'liblinear': liblinear handles
# multiclass targets only through one-vs-rest, and recent scikit-learn releases
# deprecate it for problems with more than two classes (this task has three).
# class_weight='balanced' reweights classes inversely to their frequency, and
# max_iter is raised so the solver has room to converge on sparse TF-IDF input.
lr = LogisticRegression(max_iter=2000, class_weight='balanced')
lr.fit(X_train_t, y_train)
pred_lr = lr.predict(X_test_t)
print("LogReg Accuracy:", accuracy_score(y_test, pred_lr))
print(classification_report(y_test, pred_lr))

# -------------------------------------------------------------
## 6. Confusion Matrix for Logistic Regression
# -------------------------------------------------------------
# Use the classifier's own class order for both the matrix and the tick
# labels so rows, columns and captions line up.
class_names = lr.classes_
cm = confusion_matrix(y_test, pred_lr, labels=class_names)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside each cell
    fmt="d",      # counts are integers
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# -------------------------------------------------------------
## 7. Optional: Fine-tune DistilBERT (requires GPU)
# -------------------------------------------------------------
import torch

if torch.cuda.is_available():
    print("GPU is available. Proceeding with DistilBERT fine-tuning.")

    import inspect

    from datasets import Dataset
    from transformers import (
        DistilBertForSequenceClassification,
        DistilBertTokenizerFast,
        Trainer,
        TrainingArguments,
    )

    # Sample for faster demo
    sample_size = min(5000, len(df))
    df_small = df.sample(sample_size, random_state=42)

    dataset = Dataset.from_pandas(df_small[['text', 'label_grouped']])

    # Encode labels as integer ids.
    labels = list(df_small['label_grouped'].unique())
    label2id = {name: idx for idx, name in enumerate(labels)}
    id2label = {idx: name for name, idx in label2id.items()}

    # The id column must be called `labels`: Trainer drops dataset columns
    # that the model's forward() does not accept, and the model only computes
    # a loss when it receives `labels`. Any other column name leaves training
    # without a loss.
    dataset = dataset.map(
        lambda batch: {'labels': [label2id[name] for name in batch['label_grouped']]},
        batched=True,
    )

    # Tokenization (fixed-length padding so every example has the same shape)
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    def tokenize(batch):
        """Tokenize a batch of texts, padded/truncated to 256 tokens."""
        return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=256)

    dataset = dataset.map(tokenize, batched=True)

    # Split
    dataset = dataset.train_test_split(test_size=0.2, seed=42)
    train_ds, test_ds = dataset['train'], dataset['test']
    model_columns = ['input_ids', 'attention_mask', 'labels']
    train_ds.set_format(type='torch', columns=model_columns)
    test_ds.set_format(type='torch', columns=model_columns)

    # Model
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )

    # Training args
    training_kwargs = dict(
        output_dir='./results',
        num_train_epochs=2,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_strategy="epoch",
        logging_dir='./logs',
        logging_steps=50,
        load_best_model_at_end=True,
    )
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; use whichever name the installed version accepts.
    # It has to match save_strategy because load_best_model_at_end is set.
    training_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_key = 'eval_strategy' if 'eval_strategy' in training_params else 'evaluation_strategy'
    training_kwargs[eval_key] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
    )
    # Likewise, Trainer's `tokenizer` argument was renamed `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = 'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    trainer_kwargs[tokenizer_key] = tokenizer
    trainer = Trainer(**trainer_kwargs)

    trainer.train()

    # A bare expression inside an `if` block is not echoed by the notebook,
    # so print the metrics explicitly.
    eval_metrics = trainer.evaluate()
    print("DistilBERT evaluation metrics:", eval_metrics)
else:
    print("GPU not available. Skipping DistilBERT fine-tuning.")

# -------------------------------------------------------------
## 8. Save Models & Vectorizer
# -------------------------------------------------------------
import joblib

# Persist the fitted vectorizer together with both classifiers; predicting on
# new text later needs the vectorizer as well as a model.
artifacts = {
    "tfidf_vectorizer.pkl": vec,
    "model_mnb.pkl": mnb,
    "model_logreg.pkl": lr,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Models saved locally.")
print("Process completed successfully!")
