In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    RocCurveDisplay, PrecisionRecallDisplay
)

import matplotlib.pyplot as plt

# ----------------------------------------------------
# Load final cleaned user-level data
# ----------------------------------------------------
df = pd.read_csv("../data/user_level_data.csv")

# Target
y = df["y_active"].astype(int)

# Drop target + identifiers (consistent with other models)
drop_cols = [
    "y_active", "userId"  # only these exist in user-level dataset
]

X = df.drop(columns=drop_cols, errors="ignore")

# ----------------------------------------------------
# Train-test split (random, no leakage, same as MLP)
# ----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ----------------------------------------------------
# Pipeline: scaling + logistic regression
# ----------------------------------------------------
logit = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    class_weight="balanced"
)

pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", logit)
])

# ----------------------------------------------------
# Cross-validation (same as RF/XGB/MLP)
# ----------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_validate(
    pipe,
    X_train,
    y_train,
    cv=cv,
    scoring={
        "roc_auc": "roc_auc",
        "pr_auc": "average_precision",
        "acc": "accuracy"
    },
    n_jobs=-1,
    return_train_score=False
)

print("CV ROC AUC:  ", np.mean(cv_scores["test_roc_auc"]).round(4))
print("CV PR  AUC:  ", np.mean(cv_scores["test_pr_auc"]).round(4))
print("CV Accuracy: ", np.mean(cv_scores["test_acc"]).round(4))

# ----------------------------------------------------
# Fit on train and evaluate on test
# ----------------------------------------------------
pipe.fit(X_train, y_train)

y_prob = pipe.predict_proba(X_test)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("\nHold-out ROC AUC:", round(roc_auc_score(y_test, y_prob), 4))
print("Hold-out PR  AUC:", round(average_precision_score(y_test, y_prob), 4))

print("\nAccuracy:", round(accuracy_score(y_test, y_pred), 4))
print("Precision:", round(precision_score(y_test, y_pred), 4))
print("Recall:", round(recall_score(y_test, y_pred), 4))
print("F1 score:", round(f1_score(y_test, y_pred), 4))

# ----------------------------------------------------
# Curves
# ----------------------------------------------------
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.title("Logistic Regression - ROC Curve")
plt.show()

PrecisionRecallDisplay.from_predictions(y_test, y_prob)
plt.title("Logistic Regression - Precision Recall Curve")
plt.show()

# ----------------------------------------------------
# Coefficients
# ----------------------------------------------------
coefs = pipe.named_steps["model"].coef_.ravel()
feature_names = X_train.columns

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs
}).sort_values("coef", ascending=False)

print("\nTop positive features:\n", coef_df.head(10).to_string(index=False))
print("\nTop negative features:\n", coef_df.tail(10).to_string(index=False))


FileNotFoundError: [Errno 2] No such file or directory: '../data/user_level_data.csv'