# In [None]:
import os
import joblib
import numpy as np
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
import argparse

def _subsample_training_set(texts, labels, limit, seed=42):
    """Return a reproducible, shuffled subset of ``limit`` training examples.

    A plain prefix slice is unsafe when the data is stored grouped by label:
    the first ``limit`` rows may all belong to one class. This helper draws a
    seeded random subset instead, stratified by label whenever both the kept
    and the discarded partitions are large enough to hold every class.

    Args:
        texts: Sequence of training documents.
        labels: Sequence of labels aligned with ``texts``.
        limit: Number of examples to keep. Values ``<= 0`` or ``>= len(texts)``
            return the full data unchanged (as lists).
        seed: Random seed passed to ``train_test_split`` for reproducibility.

    Returns:
        A ``(texts, labels)`` tuple of lists.
    """
    texts = list(texts)
    labels = list(labels)
    total = len(labels)
    if limit <= 0 or limit >= total:
        return texts, labels

    # Stratified splitting needs at least one example per class on each side
    # of the split; otherwise fall back to a plain shuffled split.
    n_classes = len(set(labels))
    can_stratify = n_classes <= limit <= total - n_classes
    kept_texts, _, kept_labels, _ = train_test_split(
        texts,
        labels,
        train_size=limit,
        stratify=labels if can_stratify else None,
        random_state=seed,
    )
    return kept_texts, kept_labels


def main(args):
    """Train, evaluate and save a TF-IDF + logistic regression IMDB baseline.

    Loads the ``imdb`` dataset, optionally subsamples the training split,
    fits the pipeline, prints accuracy / F1 / a classification report on the
    test split, and writes the fitted pipeline to
    ``<args.out_dir>/baseline_tfidf_logreg.joblib``.

    Args:
        args: Namespace-like object providing ``max_train_samples`` (int,
            ``0`` or negative means use the full training split),
            ``max_features`` (int, TF-IDF vocabulary cap) and ``out_dir``
            (str, directory for the saved model; created if missing).
    """
    # 1) Load dataset (IMDB: 'text', 'label' where 0=neg,1=pos)
    ds = load_dataset("imdb")
    train_texts = ds["train"]["text"]
    train_labels = ds["train"]["label"]
    test_texts = ds["test"]["text"]
    test_labels = ds["test"]["label"]

    # Optional: shrink training size for quick runs.
    # NOTE(review): the IMDB train split appears to be stored grouped by
    # label, so taking the first N rows can yield a single-class training set
    # (which LogisticRegression cannot fit). Draw a shuffled, label-balanced
    # subset instead.
    if args.max_train_samples > 0:
        train_texts, train_labels = _subsample_training_set(
            train_texts, train_labels, args.max_train_samples
        )

    # 2) Build pipeline (TF-IDF → Logistic Regression)
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_features=args.max_features,
            ngram_range=(1, 2),
            lowercase=True,
            stop_words="english",
        )),
        ("clf", LogisticRegression(
            C=2.0,
            max_iter=200,
            n_jobs=-1,
        )),
    ])

    # 3) Train
    pipe.fit(train_texts, train_labels)

    # 4) Evaluate
    preds = pipe.predict(test_texts)
    acc = accuracy_score(test_labels, preds)
    # f1_score defaults to average="binary", i.e. F1 of the positive class
    # (label 1), not a macro average; the printed label is kept as-is so
    # existing log consumers are unaffected.
    f1 = f1_score(test_labels, preds)

    print(f"Baseline Accuracy: {acc:.4f}")
    print(f"Baseline F1 (macro-ish for binary): {f1:.4f}")
    print(classification_report(test_labels, preds, target_names=["neg","pos"]))

    # 5) Save
    os.makedirs(args.out_dir, exist_ok=True)
    model_path = os.path.join(args.out_dir, "baseline_tfidf_logreg.joblib")
    joblib.dump(pipe, model_path)
    print(f"Saved model to {model_path}")

if __name__ == "__main__":
    # Script entry point: declare the command-line options in one table,
    # register them in order, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser()
    option_table = (
        ("--out_dir", {"type": str, "default": "artifacts_baseline"}),
        ("--max_features", {"type": int, "default": 50000}),
        ("--max_train_samples", {"type": int, "default": 0, "help": "0 = full dataset"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    main(cli.parse_args())
