In [None]:
# Environment setup cell: shell commands that install the tooling and packages for this notebook.
# The apt-get step is marked Colab-only by the original author.

# 1) upgrade pip tooling
!pip install --upgrade pip setuptools wheel

# 2) small system deps to reduce build errors (Colab only)
#    Refreshes the apt package index quietly, then installs build-essential.
!apt-get update -y -qq && apt-get install -y -qq build-essential

# 3) minimal python packages — note: no `evaluate`
#    NOTE(review): the classical TF-IDF pipeline in the next cell imports only numpy, pandas,
#    scikit-learn, scipy and joblib; transformers, datasets, accelerate and pyarrow are not
#    imported there — confirm whether another cell needs them before trimming this list.
#    NOTE(review): no versions are pinned here, so installed versions can differ between runs.
!pip install -q transformers datasets scikit-learn accelerate pyarrow


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [None]:
# === Minimal, reliable pipeline (no transformers) ===

import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV

# --- Reproducibility: seed the stdlib and NumPy global RNGs ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# --- Load dataset ---
df = pd.read_csv("IMDB Dataset.csv")
# expected columns: 'review', 'sentiment'
print("Columns:", df.columns.tolist())
# Normalise the sentiment text (trim + lowercase) and map it to 1 = positive, 0 = negative.
# Any value that is not 'positive'/'negative' after normalisation maps to NaN.
df['label'] = df['sentiment'].str.strip().str.lower().map({'positive': 1, 'negative': 0})
# Keep only the two columns used downstream and drop rows with a missing review or label.
df = df[['review', 'label']].dropna().reset_index(drop=True)
# A single unmapped sentiment turns the mapped column into float64 and dropna() does not
# restore the integer dtype; cast back so labels are always integer 0/1 downstream.
df['label'] = df['label'].astype('int64')
print("Total examples:", len(df))

# --- Train/val/test split (80/10/10) ---
# First hold out 20%, then cut that hold-out in half; both splits are stratified on the label.
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=SEED)
print(len(train_df), len(val_df), len(test_df), "-> train/val/test sizes")

# --- Create a reduced subset for model selection (stratified) ---
# Row counts drawn from train_df / val_df for the quick model comparison.
subset_train_size = 8000
subset_val_size = 2000

def stratified_subset(df, n, seed=SEED):
    """Draw a shuffled sample of ``n`` rows that keeps the label ratio of ``df``.

    Rows are taken separately from the ``label == 1`` and ``label == 0`` groups.
    The positive quota is ``n`` times the positive share of ``df`` (rounded); the
    negative group supplies the remainder. ``seed`` drives both draws and the
    final shuffle.

    Parameters
    ----------
    df : DataFrame with a ``label`` column.
    n : total number of rows to return.
    seed : random state passed to every ``sample`` call.

    Returns
    -------
    DataFrame holding the sampled rows in shuffled order with a reset index.
    """
    positives = df[df["label"] == 1]
    negatives = df[df["label"] == 0]

    # Positive quota follows the positive share of the whole frame.
    positive_quota = int(round(n * len(positives) / len(df)))
    negative_quota = n - positive_quota

    drawn = [
        positives.sample(n=positive_quota, random_state=seed),
        negatives.sample(n=negative_quota, random_state=seed),
    ]
    shuffled = pd.concat(drawn).sample(frac=1, random_state=seed)
    return shuffled.reset_index(drop=True)

# Reduced, stratified subsets used for the model comparison.
subset_train = stratified_subset(train_df, n=subset_train_size)
subset_val = stratified_subset(val_df, n=subset_val_size)

print("Subset sizes:", len(subset_train), len(subset_val))

# --- Models to compare (5 classical, fast options) ---
# Built from (name, estimator) pairs; the dict keeps this order, which is the order
# in which the candidates are trained and reported.
models = dict([
    ("LogisticRegression", LogisticRegression(random_state=SEED, max_iter=400)),
    ("LinearSVC", LinearSVC(random_state=SEED, max_iter=4000)),
    ("RandomForest", RandomForestClassifier(random_state=SEED, n_estimators=200, n_jobs=-1)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=SEED, n_estimators=200)),
    ("MultinomialNB", MultinomialNB()),
])

# TF-IDF features: unigrams + bigrams, English stop words removed,
# vocabulary capped with max_features for speed.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
)

# --- Helper: custom metric (binary F1) ---
def compute_metrics(y_true, y_pred):
    """Compute binary F1, precision, recall and accuracy for a set of predictions.

    Parameters
    ----------
    y_true : array-like of ground-truth labels.
    y_pred : array-like of predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict with the keys ``"f1"``, ``"precision"``, ``"recall"`` and ``"accuracy"``.
    """
    # Give all three precision/recall-based scores the same options. Previously only
    # F1 named average='binary' and only precision/recall set zero_division=0, so an
    # undefined F1 still emitted a warning while the other two stayed silent.
    # The returned values are unchanged: an undefined score is reported as 0.
    shared = {"average": "binary", "zero_division": 0}
    f1 = f1_score(y_true, y_pred, **shared)
    precision = precision_score(y_true, y_pred, **shared)
    recall = recall_score(y_true, y_pred, **shared)
    acc = accuracy_score(y_true, y_pred)
    return {"f1": f1, "precision": precision, "recall": recall, "accuracy": acc}

# --- Vectorise the selection subset ---
# The vectorizer is fitted on the subset training text only; the subset validation
# text is transformed with that fitted vocabulary.
y_sub_train = subset_train['label'].values
y_sub_val   = subset_val['label'].values
X_sub_train = tfidf.fit_transform(subset_train['review'].values)
X_sub_val = tfidf.transform(subset_val['review'].values)

# --- Fit every candidate on the subset and score it on the subset validation split ---
# Each entry of `results` is (name, f1, fitted estimator). A candidate whose fit raised
# is recorded as (name, None, None) so that one failure does not abort the comparison.
results = []
for name, model in models.items():
    print("Training (subset) ->", name)
    # LinearSVC has no predict_proba; wrap it in CalibratedClassifierCV if probabilities are needed.
    clf = model
    try:
        clf.fit(X_sub_train, y_sub_train)
    except Exception as e:
        print(f"  ERROR training {name}: {e}")
        results.append((name, None, None))
    else:
        preds = clf.predict(X_sub_val)
        metrics = compute_metrics(y_sub_val, preds)
        print(f"  {name} metrics: f1={metrics['f1']:.4f} prec={metrics['precision']:.4f} rec={metrics['recall']:.4f} acc={metrics['accuracy']:.4f}")
        results.append((name, metrics['f1'], clf))

# --- Rank the successful candidates by F1, highest first ---
# The sort is stable, so candidates with equal F1 keep their order from `models`.
results_sorted = [entry for entry in results if entry[1] is not None]
results_sorted.sort(key=lambda entry: entry[1], reverse=True)
if len(results_sorted) == 0:
    raise RuntimeError("All models failed during subset training.")
best_name, best_f1, best_clf_on_subset = results_sorted[0]
print("\nBest on subset:", best_name, "with F1 =", best_f1)

# --- Now retrain best model on full train+val (the 'full training' stage) ---
# Fit TF-IDF on full training text (train + validation combined) for best final model.
# This refits the shared `tfidf` object in place, so from here on its vocabulary comes
# from train+val rather than from the selection subset.
full_train = pd.concat([train_df, val_df]).sample(frac=1, random_state=SEED).reset_index(drop=True)
X_full_train = tfidf.fit_transform(full_train['review'].values)
y_full_train = full_train['label'].values

# Build a fresh, unfitted copy of the winning estimator with the same hyper-parameters.
# `models[best_name]` is the very object that was fitted during the subset comparison;
# refitting it directly would overwrite the estimator that `best_clf_on_subset` and
# `results` still reference.
from sklearn.base import clone
best_model_cls = models[best_name]
print("\nRetraining best model on full training set:", best_name)
best_model_final = clone(best_model_cls)
best_model_final.fit(X_full_train, y_full_train)

# Evaluate on test set: the test text is only transformed, never used for fitting.
X_test = tfidf.transform(test_df['review'].values)
y_test = test_df['label'].values
y_pred_test = best_model_final.predict(X_test)
final_metrics = compute_metrics(y_test, y_pred_test)
print("\nFinal test metrics for", best_name, ":", final_metrics)

# If model supports probabilities, get them; else use decision function and map to pseudo-probability
def get_probs(clf, X):
    """Return an ``(n_samples, n_classes)`` array of class scores for ``X``.

    The first strategy the classifier supports is used:

    1. ``clf.predict_proba(X)`` -- returned unchanged.
    2. ``clf.decision_function(X)`` -- mapped to pseudo-probabilities (not calibrated):
       1-D scores go through the logistic sigmoid and are returned as columns
       ``[1 - p, p]``; 2-D scores go through a row-wise softmax.
    3. ``clf.predict(X)`` -- one-hot rows, using each predicted integer label as the
       column index. The result has at least two columns and is widened when a
       larger label is predicted.

    Parameters
    ----------
    clf : fitted classifier exposing at least one of the three methods above.
    X : feature matrix accepted by ``clf``.

    Returns
    -------
    numpy.ndarray of shape ``(n_samples, n_classes)`` for strategies 2 and 3;
    whatever ``predict_proba`` returns for strategy 1.
    """
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)  # shape (n, classes)

    if hasattr(clf, "decision_function"):
        from scipy.special import expit, softmax
        scores = np.asarray(clf.decision_function(X))
        if scores.ndim == 1:
            # Binary case: squash the signed margin into (0, 1).
            p_pos = expit(scores)
            return np.vstack([1 - p_pos, p_pos]).T
        # Multiclass case: normalise each row of scores with softmax.
        return softmax(scores, axis=1)

    # Last resort: deterministic one-hot rows from the hard predictions.
    preds = np.asarray(clf.predict(X))
    n_rows = len(preds)
    # Keep the historical two-column layout for binary labels, but grow the array when
    # a label >= 2 is predicted instead of failing with an IndexError.
    n_classes = max(2, int(preds.max()) + 1) if n_rows else 2
    probs = np.zeros((n_rows, n_classes))
    if n_rows:
        probs[np.arange(n_rows), preds] = 1.0
    return probs

# --- Inference on 10 random test reviews ---
# Draw a reproducible sample of test rows and score them with the final model.
sampled = test_df.sample(10, random_state=SEED).reset_index(drop=True)
X_sample = tfidf.transform(sampled['review'].values)
probs = get_probs(best_model_final, X_sample)
# Predicted class = column with the highest score.
preds_sample = np.argmax(probs, axis=1)

# The index was reset above, so `i` runs 0..9 and lines up with `probs` / `preds_sample`.
for i, row in sampled.iterrows():
    txt = row['review']
    true_label = "negative" if row['label'] != 1 else "positive"
    pred_label = "negative" if preds_sample[i] != 1 else "positive"
    p_pos = float(probs[i, 1])
    # Show only the head of the review, flattened onto one line.
    snippet = txt[:400].replace("\n", " ")
    print("----- Example", i+1, "-----")
    print("Review (first 400 chars):", snippet)
    print(f"True: {true_label}  Pred: {pred_label}  Prob_pos: {p_pos:.3f}")
    print()

# --- Save best model and vectorizer for later use ---
# Both artifacts are needed at inference time: the vectorizer turns text into the
# feature space the classifier was fitted on.
import joblib
for _artifact, _filename in (
    (best_model_final, "best_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(_artifact, _filename)
print("Saved best_model.pkl and tfidf_vectorizer.pkl")


Columns: ['review', 'sentiment']
Total examples: 50000
40000 5000 5000 -> train/val/test sizes
Subset sizes: 8000 2000
Training (subset) -> LogisticRegression
  LogisticRegression metrics: f1=0.8735 prec=0.8567 rec=0.8910 acc=0.8710
Training (subset) -> LinearSVC
  LinearSVC metrics: f1=0.8742 prec=0.8627 rec=0.8860 acc=0.8725
Training (subset) -> RandomForest
  RandomForest metrics: f1=0.8537 prec=0.8646 rec=0.8430 acc=0.8555
Training (subset) -> GradientBoosting
  GradientBoosting metrics: f1=0.8410 prec=0.8004 rec=0.8860 acc=0.8325
Training (subset) -> MultinomialNB
  MultinomialNB metrics: f1=0.8543 prec=0.8648 rec=0.8440 acc=0.8560

Best on subset: LinearSVC with F1 = 0.874198322644302

Retraining best model on full training set: LinearSVC

Final test metrics for LinearSVC : {'f1': 0.910038068523342, 'precision': 0.911682055399438, 'recall': 0.9084, 'accuracy': 0.9102}
----- Example 1 -----
Review (first 400 chars): 'The Adventures Of Barry McKenzie' started life as a satirical co