In [6]:
# ----------------------------
# Sentiment Analysis Baseline
# Using TF-IDF + Logistic Regression
# Dataset: IMDB reviews (positive / negative)
# ----------------------------

import os
import joblib   # for saving the trained model
import numpy as np
from datasets import load_dataset   # Hugging Face datasets (loads IMDB automatically)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
import argparse

# ----------------------------
# MAIN FUNCTION
# ----------------------------
def main(args):
    """Train and evaluate a TF-IDF + Logistic Regression baseline on IMDB.

    Loads the Hugging Face ``imdb`` dataset, optionally trains on a random
    subset of the train split, fits the pipeline, prints accuracy, F1 and a
    classification report for the test split, and saves the fitted pipeline
    with joblib.

    Args:
        args: Namespace-like object providing
            ``max_train_samples`` (int; values <= 0 mean "use all data"),
            ``max_features`` (forwarded to ``TfidfVectorizer``) and
            ``out_dir`` (directory for the saved model; created if missing).
    """
    # Fixed seed so a subsampled quick run is reproducible between runs.
    subsample_seed = 42

    # 1. Load dataset (IMDB has 25k train + 25k test)
    ds = load_dataset("imdb")
    train_split = ds["train"]
    test_split = ds["test"]

    # 2. Optionally shrink training size (for quick tests in Colab)
    #    Shuffle BEFORE truncating: a plain prefix slice is only representative
    #    if the split is stored in random order. The IMDB train split is
    #    reportedly grouped by label, in which case a small prefix contains a
    #    single class and LogisticRegression cannot be fitted on it.
    if args.max_train_samples > 0:
        n_keep = min(args.max_train_samples, len(train_split))
        train_split = train_split.shuffle(seed=subsample_seed).select(range(n_keep))

    # list(...) normalises whatever column container the installed `datasets`
    # version returns into plain Python lists for scikit-learn.
    train_texts = list(train_split["text"])     # list of reviews
    train_labels = list(train_split["label"])   # list of labels (0=negative, 1=positive)
    test_texts = list(test_split["text"])
    test_labels = list(test_split["label"])

    # 3. Define a pipeline
    #    (a) Convert raw text → numeric features (TF-IDF)
    #    (b) Train a Logistic Regression classifier
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=args.max_features,   # keep only the top-N terms by corpus frequency
            ngram_range=(1, 2),               # use unigrams + bigrams
            lowercase=True,
            stop_words="english"              # ignore common words like "the", "and", etc.
        )),
        # n_jobs is deliberately not passed: scikit-learn documents it as
        # parallelising one-vs-rest fits across classes, so it does nothing
        # for this two-class problem.
        ("clf", LogisticRegression(
            C=2.0,          # inverse regularization strength (higher = weaker regularization)
            max_iter=200    # max number of solver iterations
        ))
    ])

    # 4. Train the pipeline
    print("Training Logistic Regression model...")
    pipe.fit(train_texts, train_labels)

    # 5. Evaluate on test set
    preds = pipe.predict(test_texts)
    acc = accuracy_score(test_labels, preds)
    f1 = f1_score(test_labels, preds)   # binary F1 for the positive class (label 1)

    print(f"\nBaseline Accuracy: {acc:.4f}")
    print(f"Baseline F1 Score: {f1:.4f}")
    print("\nDetailed Report:\n")
    print(classification_report(test_labels, preds, target_names=["negative","positive"]))

    # 6. Save the model (path built once so the file and the message agree)
    os.makedirs(args.out_dir, exist_ok=True)
    model_path = os.path.join(args.out_dir, "baseline_tfidf_logreg.joblib")
    joblib.dump(pipe, model_path)
    print(f"\nModel saved to {model_path}")


# ----------------------------
# ENTRY POINT
# ----------------------------
if __name__ == "__main__":
    # Command-line options as a table: flag -> keyword arguments for
    # add_argument. Dict order is the order in which flags are registered.
    cli_options = {
        "--out_dir": dict(
            type=str,
            default="artifacts_baseline",
            help="Folder to save trained model",
        ),
        "--max_features": dict(
            type=int,
            default=50000,
            help="Number of words/ngrams to keep for TF-IDF",
        ),
        "--max_train_samples": dict(
            type=int,
            default=0,
            help="Use fewer samples for quick tests (0 = use all data)",
        ),
    }

    parser = argparse.ArgumentParser()
    for option_flag, option_kwargs in cli_options.items():
        parser.add_argument(option_flag, **option_kwargs)

    # parse_known_args() returns unrecognised arguments instead of exiting on
    # them, so the extra -f argument passed under Jupyter/Colab is tolerated.
    args, _ = parser.parse_known_args()

    main(args)


Training Logistic Regression model...

Baseline Accuracy: 0.8846
Baseline F1 Score: 0.8844

Detailed Report:

              precision    recall  f1-score   support

    negative       0.88      0.89      0.88     12500
    positive       0.89      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000


Model saved to artifacts_baseline/baseline_tfidf_logreg.joblib
