In [None]:
# --- Step 1: Install dependencies ---
!pip install mlflow scikit-learn xgboost lightgbm

# --- Step 2: Imports ---
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import mlflow
import mlflow.sklearn

# --- Step 3: Load Task 4 dataset ---
df = pd.read_csv("/content/processed_task4.csv")

# Columns kept out of the feature matrix: the target label itself plus the
# customer ID and first/last-transaction passthrough columns (presumably
# non-numeric IDs/datetimes -- confirm against the Task 4 output schema).
drop_cols = ["is_high_risk", "CustomerId", "remainder__FirstTxn", "remainder__LastTxn"]

# errors="ignore" skips any listed column that is absent from the CSV, so
# only the columns actually present are removed.
X = df.drop(columns=drop_cols, errors="ignore")
y = df["is_high_risk"]

# Hold out 20% of the rows for testing. The split is stratified on the label
# so both partitions keep the same class ratio, and seeded so it is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Step 4: Define models and hyperparameter grids ---
# Candidate classifiers keyed by display name. Every estimator is seeded with
# random_state=42 so repeated runs are comparable.
models = {
    # max_iter is raised to 1000 to give the solver more iterations to converge.
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is intentionally not passed: XGBoost deprecated the
    # flag in 1.6 and removed it in 2.x, where supplying it only produces a
    # "parameters are not used" warning on every fit.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}

# Hyperparameter search space per model. The keys must match the keys of
# `models`, because the grid for each estimator is looked up by model name.
param_grids = {
    "LogisticRegression": {"C": [0.1, 1, 10]},
    "RandomForest": {"n_estimators": [50, 100], "max_depth": [None, 5, 10]},
    "XGBoost": {
        "n_estimators": [50, 100],
        "max_depth": [3, 5],
        "learning_rate": [0.1, 0.01],
    },
}

# --- Step 5: Train, tune, and log with MLflow ---
# For each candidate model: grid-search its hyperparameters with 3-fold CV
# (optimising F1), evaluate the refit best estimator on the held-out test
# split, and record parameters, metrics and the model in its own MLflow run.
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Exhaustive search over this model's grid, using all available cores.
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        scoring="f1",
        cv=3,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Hard labels for the threshold metrics; second predict_proba column
    # as the score for ROC AUC.
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1]

    # Test-set metrics. zero_division=0 makes precision/recall/F1 return 0
    # instead of warning when a denominator is empty.
    metrics = {"accuracy": accuracy_score(y_test, y_pred)}
    for label, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        metrics[label] = scorer(y_test, y_pred, zero_division=0)
    metrics["roc_auc"] = roc_auc_score(y_test, y_prob)

    # One MLflow run per model, named after the model; the fitted estimator
    # is stored under an artifact path equal to the model name.
    with mlflow.start_run(run_name=name):
        mlflow.log_params(grid.best_params_)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(best_model, name)

    print(f"{name} metrics:", metrics)
