# 03_Feature_Engineering_and_Modeling

PhD-level notebook: TF-IDF baselines + DistilBERT fine-tuning for IMDB sentiment classification.

Saves outputs (models, vectorizers, metrics) to the `results/` folder at project root.


In [None]:
%%bash
set -e
# Report which interpreter the shell resolves `python` to.
# NOTE(review): inside %%bash this is whatever PATH provides, which may differ
# from the notebook kernel's interpreter — confirm they match.
python -c "import sys; print('Python', sys.version)"
# Install datasets if missing (transformers assumed installed).
# `python -m pip` installs into the same interpreter that just failed the import
# check; a bare `pip` found on PATH may belong to a different Python.
python -c "import importlib; importlib.import_module('datasets')" 2>/dev/null || python -m pip install --quiet datasets


In [None]:
import inspect
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Paths. The project root defaults to the original author's location; set the
# TEXT_CLF_PROJECT_ROOT environment variable to run the notebook elsewhere
# without editing this cell.
PROJECT_ROOT = Path(
    os.environ.get(
        'TEXT_CLF_PROJECT_ROOT',
        '/Users/ravina/Desktop/Text_Classification_Project',
    )
).expanduser()
PREPROCESSED_PATH = PROJECT_ROOT / 'data' / 'processed' / 'imdb_preprocessed.csv'
RESULTS_DIR = PROJECT_ROOT / 'results'
print('Preprocessed path:', PREPROCESSED_PATH)
print('Results dir:', RESULTS_DIR)

# Validate the input BEFORE creating any output directory: with a wrong root,
# mkdir would otherwise fail with a permissions error (or create stray
# directories) before the reader ever sees the helpful message below.
if not PREPROCESSED_PATH.is_file():
    raise FileNotFoundError(
        f'Preprocessed file not found at {PREPROCESSED_PATH}. '
        'Set TEXT_CLF_PROJECT_ROOT to your project root.'
    )
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(PREPROCESSED_PATH)
print('Loaded rows:', len(df))
df.head()


## Quick checks and train/test split
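Ensure your dataframe contains a processed text column. The notebook tries `processed_text`, `final_review`, `clean_text`, then `review`, and uses the first one it finds. It also expects a `sentiment` column holding the labels (`neg`/`pos` strings or already-numeric 0/1).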

In [None]:
# Choose text column: the first candidate present in the frame wins.
TEXT_COL_CANDIDATES = ['processed_text', 'final_review', 'clean_text', 'review']
for col in TEXT_COL_CANDIDATES:
    if col in df.columns:
        TEXT_COL = col
        break
else:
    # Without this branch a missing column surfaces later as a bare NameError.
    raise KeyError(
        f'None of the expected text columns {TEXT_COL_CANDIDATES} found; '
        f'available columns: {list(df.columns)}'
    )
print('Using text column:', TEXT_COL)

if 'sentiment' not in df.columns:
    raise KeyError(f"Expected a 'sentiment' label column; available columns: {list(df.columns)}")

# Encode textual labels as 0/1. Both the short and the long spellings are accepted.
LABEL_MAP = {'neg': 0, 'negative': 0, 'pos': 1, 'positive': 1}
# Test for "not numeric" rather than dtype == 'object' so that non-object string
# dtypes (e.g. pandas' dedicated string dtype or categoricals) are encoded too.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    normalized = df['sentiment'].astype(str).str.strip().str.lower()
    encoded = normalized.map(LABEL_MAP)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly instead of letting NaN labels reach astype(int) below.
        bad_values = sorted(normalized[unmapped].unique())
        raise ValueError(
            f'Unrecognised sentiment labels {bad_values}; expected one of {sorted(LABEL_MAP)}'
        )
    # Only overwrite the column once every label is known to be valid, so a
    # failed run never leaves df['sentiment'] half-converted.
    df['sentiment'] = encoded.astype(int)

# NOTE(review): astype(str) turns missing text into the literal string 'nan';
# confirm the preprocessing step leaves no empty reviews.
X = df[TEXT_COL].astype(str)
y = df['sentiment'].astype(int)

# Stratified 80/20 split with a fixed seed so every model sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train samples:', len(X_train), 'Test samples:', len(X_test))


## TF-IDF vectorization and Classical ML Baselines
We vectorize the text with TF-IDF (unigrams and bigrams, 20,000 features) and train four baselines: Logistic Regression, MultinomialNB, LinearSVC, and RandomForest.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
import joblib

def _positive_class_scores(fitted_model, features):
    """Return continuous scores usable for ROC AUC, or None if the model has none.

    Probabilities for the second class are preferred; models without
    ``predict_proba`` (LinearSVC here) fall back to ``decision_function``.
    """
    if hasattr(fitted_model, 'predict_proba'):
        return fitted_model.predict_proba(features)[:, 1]
    if hasattr(fitted_model, 'decision_function'):
        return fitted_model.decision_function(features)
    return None


# Fit the vocabulary on the training split only, then reuse it for the test split.
vec = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_train_vec = vec.fit_transform(X_train)
X_test_vec = vec.transform(X_test)
print('TF-IDF shape:', X_train_vec.shape)

models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'MultinomialNB': MultinomialNB(),
    'LinearSVC': LinearSVC(max_iter=10000),
    'RandomForest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
}

# results[name] -> {'model': fitted estimator, 'accuracy': float, 'auc': float or None}
results = {}
for name, model in models.items():
    print('\nTraining', name)
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    # Explicit capability check instead of a blanket `except Exception`, which
    # hid the AUC for LinearSVC and would also have masked genuine errors.
    scores = _positive_class_scores(model, X_test_vec)
    auc = roc_auc_score(y_test, scores) if scores is not None else None
    print(name, 'Accuracy:', acc, 'AUC:', auc)
    print(classification_report(y_test, preds))
    results[name] = {'model': model, 'accuracy': acc, 'auc': auc}
    joblib.dump(model, RESULTS_DIR / f'{name}.joblib')

# save vectorizer
joblib.dump(vec, RESULTS_DIR / 'tfidf_vectorizer.joblib')
print('\nSaved models and vectorizer to', RESULTS_DIR)


### Feature importance from Logistic Regression (top features)
For interpretability, we print the 30 features with the largest positive and the 30 with the most negative Logistic Regression coefficients.

In [None]:
import numpy as np
# Inspect the fitted Logistic Regression stored by the training loop.
clf = results['LogisticRegression']['model']
if not hasattr(clf, 'coef_'):
    print('Model has no coef_')
else:
    # First row of the coefficient matrix: one weight per TF-IDF feature.
    coefs = clf.coef_[0]
    # Ascending order: the head holds the most negative weights, the tail the
    # most positive ones (reversed so the strongest comes first).
    ranked_idx = np.argsort(coefs)
    top_neg_idx = ranked_idx[:30]
    top_pos_idx = ranked_idx[-30:][::-1]
    feature_names = np.array(vec.get_feature_names_out())
    print('Top positive features:')
    print(feature_names[top_pos_idx])
    print('\nTop negative features:')
    print(feature_names[top_neg_idx])


In [None]:
import json
# Keep only the scalar metrics; the fitted estimators in `results` are not
# JSON-serialisable and are already persisted as .joblib files.
summary = {
    model_label: {metric: scores[metric] for metric in ('accuracy', 'auc')}
    for model_label, scores in results.items()
}
summary_path = RESULTS_DIR / 'classical_results_summary.json'
with open(summary_path, 'w') as out_file:
    json.dump(summary, out_file, indent=2)
print('Saved classical results summary')


## DistilBERT Fine-tuning (Hugging Face)
This section fine-tunes DistilBERT. **Warning:** fine-tuning on a CPU is slow; for research-grade runs, use a GPU. We run a single epoch here as a quick experiment and evaluate on the same held-out test split used for the classical baselines.

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Build HF Datasets from the train/test split created earlier.
# `.values` hands over plain arrays, so each frame gets a fresh default index.
train_df = pd.DataFrame(dict(text=X_train.values, label=y_train.values))
test_df = pd.DataFrame(dict(text=X_test.values, label=y_test.values))
train_ds, test_ds = Dataset.from_pandas(train_df), Dataset.from_pandas(test_df)

# Tokenizer matching the checkpoint that is fine-tuned below.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenization function (applied batch-wise through Dataset.map)
def tokenize_function(examples):
    """Tokenize the ``'text'`` entries of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 256 tokens.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    return encoded

def _tokenize_for_torch(dataset):
    """Tokenize a dataset, drop its raw text and expose torch-formatted model inputs."""
    dataset = dataset.map(tokenize_function, batched=True)
    dataset = dataset.remove_columns(['text'])
    dataset.set_format(type='torch', columns=['input_ids','attention_mask','label'])
    return dataset


# Same preparation for both splits, train first and then test.
train_ds = _tokenize_for_torch(train_ds)
test_ds = _tokenize_for_torch(test_ds)

# Load model: pretrained encoder with a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Training args
# transformers renamed `evaluation_strategy` to `eval_strategy`: newer releases
# reject the old keyword, older ones do not know the new one. Pick whichever
# name the installed version accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=str(RESULTS_DIR / 'hf_outputs'),
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    save_strategy='no',  # no intermediate checkpoints are written during training
    logging_steps=100,
    fp16=False,
    **{_eval_strategy_key: 'epoch'},  # evaluate once at the end of each epoch
)

# compute metrics (callback handed to the Trainer)
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy, precision, recall and F1.

    Predictions are the argmax over the last logits axis; precision, recall
    and F1 use scikit-learn's ``average='binary'`` setting.
    """
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    prf = precision_recall_fscore_support(labels, predicted, average='binary')
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': prf[0],
        'recall': prf[1],
        'f1': prf[2],
    }

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; older ones only know `tokenizer`.
# Use whichever name the installed Trainer exposes.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Train (may be slow on CPU)
trainer.train()

# Evaluate and save
metrics = trainer.evaluate()
print('DistilBERT eval metrics:', metrics)
distilbert_dir = RESULTS_DIR / 'hf_outputs' / 'distilbert_imdb'
# Pass a plain string, consistent with how output_dir is given to TrainingArguments.
trainer.save_model(str(distilbert_dir))
print('Saved DistilBERT to', distilbert_dir)


## Comparison & Next Steps
- Compare classical results (saved in `results/classical_results_summary.json`) with DistilBERT metrics printed above.
- For full transformer training, use a GPU (Colab or cloud). Consider distillation for production.

Files saved to `results/`:
- TF-IDF vectorizer: `results/tfidf_vectorizer.joblib`
- Classical models: `results/{model}.joblib`
- Classical summary: `results/classical_results_summary.json`
- DistilBERT outputs: `results/hf_outputs/distilbert_imdb`
