In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
03_train_logreg.py
------------------
Purpose:
    Train a Logistic Regression spam classifier:
        TF-IDF (uni+bi-grams) -> RandomOverSampler -> LogisticRegression
    Grid-search a small set of hyperparameters (C, min_df) with 5-fold CV.
    Evaluate on the held-out VAL split.
    Save the trained model + a JSON training report.

Assumptions:
    - Splits exist (created by 02_split.py):
        ../DATA/splits/train.csv
        ../DATA/splits/val.csv
    - Columns: "Label", "SMS_Message" (Label ∈ {"ham","spam"})
    - No schema checks: upstream cleaning guarantees this (01_prepare_data.py).

Outputs (MI3: in OUTPUT/):
    - ../OUTPUT/logreg.joblib
    - ../OUTPUT/train_report.json
"""
import json
from pathlib import Path

import joblib
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV

# ---- Locations (MI3 rubric: inputs under DATA/, artifacts under OUTPUT/) ----
# All paths are relative, so they resolve against the current working directory.
splits_dir = Path("..") / "DATA" / "splits"
out_dir = Path("..") / "OUTPUT"

# Artifacts written at the end of the run.
model_path = out_dir.joinpath("logreg.joblib")
report_path = out_dir.joinpath("train_report.json")

# Make sure the output folder exists before anything is saved into it;
# exist_ok=True makes this a no-op when the folder is already there.
out_dir.mkdir(parents=True, exist_ok=True)

# ---- Load splits ----
def _read_split(csv_path):
    """Read one split CSV and return ``(frame, texts, labels)``.

    Args:
        csv_path: Path to a CSV file with "Label" and "SMS_Message" columns.

    Returns:
        frame:  the DataFrame as read from disk.
        texts:  list of message strings (one per row).
        labels: integer array, 1 where Label is "spam" (case-insensitive),
                0 otherwise.
    """
    # keep_default_na=False: by default pandas parses cells such as "NA",
    # "null", "nan" or an empty field as float NaN. Those are plausible SMS
    # texts, and a NaN in the text list makes the TF-IDF step fail when it
    # tries to lowercase a float. Keep every cell as the literal text instead.
    frame = pd.read_csv(csv_path, keep_default_na=False)

    # astype(str) guards against a column that pandas inferred as numeric
    # (e.g. messages consisting only of digits).
    texts = frame["SMS_Message"].astype(str).tolist()
    labels = (frame["Label"].str.lower() == "spam").astype(int).values
    return frame, texts, labels


train, X_train, y_train = _read_split(splits_dir / "train.csv")
val, X_val, y_val = _read_split(splits_dir / "val.csv")

# ---- Model: TF-IDF features -> random oversampling -> logistic regression ----
# Pipeline here is imblearn's Pipeline (see imports), which allows a sampler
# step such as RandomOverSampler between the vectorizer and the classifier.
tfidf_step = ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=2))
ros_step = ("ros", RandomOverSampler(random_state=42))
clf_step = ("clf", LogisticRegression(max_iter=1000, random_state=42))
pipe = Pipeline(steps=[tfidf_step, ros_step, clf_step])

# ---- Search space (small on purpose: 3 x 4 = 12 candidates) ----
# Keys use the "<step name>__<parameter>" convention to address pipeline steps.
# The min_df=2 set on the vectorizer above is overridden by the grid values.
param_grid = {
    "tfidf__min_df": [1, 2, 3],
    "clf__C": [0.5, 1.0, 2.0, 4.0],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs"],
}

# Select the candidate with the best F1 for label 1 (spam), using 5-fold CV
# on the training split and all available cores.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gs.fit(X_train, y_train)

# ---- Evaluate the selected pipeline on the held-out VAL split ----
best_model = gs.best_estimator_
y_val_pred = best_model.predict(X_val)
# Column 1 of predict_proba is the probability of label 1 (spam).
y_val_prob = best_model.predict_proba(X_val)[:, 1]

# average="binary" reports the scores for the positive class only; the fourth
# return value (support) is not needed. zero_division=0 yields 0 instead of a
# warning if no positives are predicted.
prec, rec, f1, _ = precision_recall_fscore_support(
    y_val,
    y_val_pred,
    average="binary",
    zero_division=0,
)
auc = roc_auc_score(y_val, y_val_prob)

# Cast to plain Python floats so the report is JSON-serializable.
metric_names = ("precision", "recall", "f1", "roc_auc")
metric_scores = (prec, rec, f1, auc)
report = {
    "best_params": gs.best_params_,
    "val_metrics": {
        name: float(score) for name, score in zip(metric_names, metric_scores)
    },
}

# ---- Persist the fitted pipeline and the JSON report ----
joblib.dump(best_model, model_path)
report_path.write_text(json.dumps(report, indent=2))

print("Saved model:", model_path)
print("Train report:", report_path)
print("Best params:", gs.best_params_)
print("VAL metrics:", report["val_metrics"])

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Saved model: ../OUTPUT/logreg.joblib
Train report: ../OUTPUT/train_report.json
Best params: {'clf__C': 4.0, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs', 'tfidf__min_df': 2}
VAL metrics: {'precision': 0.9904761904761905, 'recall': 0.9285714285714286, 'f1': 0.9585253456221198, 'roc_auc': 0.9934145619573795}
