In [None]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [None]:
from google.colab import drive
from pathlib import Path

# Mount Google Drive so the project folder is reachable from this Colab runtime.
drive.mount("/content/drive")

# Project root on Drive, kept both as a plain string and as a Path.
base_folder = "/content/drive/MyDrive/housing_app_fall25"
BASE = Path(base_folder)
# Make the project root the working directory: later cells use relative paths
# such as "telco.db" and "models/".
%cd "{BASE}"

# List the project contents and confirm the SQLite database file is present.
!ls
!ls -lh telco.db

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/housing_app_fall25
api		     mlruns	  streamlit
data		     models	  telco.db
docker-compose.yml   notebooks	  test_inference.py
housing_pipeline.py  __pycache__  updatesForClassification.md
-rw------- 1 root root 700K Dec 17 03:55 telco.db


In [None]:
import sqlite3
import pandas as pd

# Load the modelling table from the SQLite database: per-customer numeric stats
# from customer_stats, joined through customer to the contract name in
# contract_dim, plus the target column.
conn = sqlite3.connect("telco.db")
try:
    df = pd.read_sql_query("""
SELECT s.customer_id,
       s.tenure, s.MonthlyCharges, s.TotalCharges,
       d.name AS Contract,
       s.target
FROM customer_stats s
JOIN customer c ON c.customer_id = s.customer_id
JOIN contract_dim d ON d.contract_id = c.contract_id;
""", conn)
finally:
    # Always release the database handle, including when the query raises
    # (e.g. a missing table); otherwise the connection would stay open.
    conn.close()

# Force TotalCharges to a numeric dtype; values that cannot be parsed become NaN.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
print(df.shape)
print(df["target"].value_counts())
df.head()


(7043, 6)
target
0    5174
1    1869
Name: count, dtype: int64


Unnamed: 0,customer_id,tenure,MonthlyCharges,TotalCharges,Contract,target
0,7590-VHVEG,1.0,29.85,29.85,Month-to-month,0
1,5575-GNVDE,34.0,56.95,1889.5,One year,0
2,3668-QPYBK,2.0,53.85,108.15,Month-to-month,1
3,7795-CFOCW,45.0,42.3,1840.75,One year,0
4,9237-HQITU,2.0,70.7,151.65,Month-to-month,1


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Features are every loaded column except the label and the row identifier.
X = df.drop(["target", "customer_id"], axis=1)
y = df["target"].astype(int)

# Hold out 20% of the rows for testing; stratifying keeps the class ratio of
# `y` the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
cat_cols = ["Contract"]

# Dense one-hot output so the optional PCA step never receives a sparse matrix
# (Telco has few categories). The first spelling of the flag is tried, and the
# alternative is used when the installed scikit-learn rejects it with TypeError.
_ohe_options = {"handle_unknown": "ignore"}
try:
    ohe = OneHotEncoder(sparse_output=False, **_ohe_options)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **_ohe_options)

# Numeric columns: median-impute missing values, then standardise.
numeric_steps = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("scale", StandardScaler()),
    ]
)

# Column-wise preprocessing shared by every candidate pipeline.
preprocessing = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", ohe, cat_cols),
    ]
)


In [None]:
import os, time, joblib
import optuna
from optuna.samplers import TPESampler

import mlflow
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.decomposition import PCA

# Select the MLflow experiment that all runs started below are recorded under.
# NOTE(review): no tracking URI is set in this notebook, so runs presumably land
# in the local ./mlruns folder of the working directory — confirm.
mlflow.set_experiment("telco_churn_optuna")

def eval_and_log(run_name, pipe):
    """Fit a pipeline, score it on the held-out split, and record it in MLflow.

    The pipeline is fitted in place on the module-level ``X_train`` / ``y_train``
    and evaluated on ``X_test`` / ``y_test``.

    Parameters
    ----------
    run_name : str
        Name given to the MLflow run that stores the metrics and the model.
    pipe : estimator
        Unfitted pipeline exposing ``fit`` and ``predict``.

    Returns
    -------
    tuple
        ``(test_f1, pipe)`` — the F1 score on the test split as a float and the
        same pipeline object, now fitted.
    """
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    score = float(f1_score(y_test, y_pred))
    # Unpacking the flattened confusion matrix into four cells.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    # Insertion order is the order in which the metrics are logged.
    metrics = {
        "test_f1": score,
        "tn": int(tn),
        "fp": int(fp),
        "fn": int(fn),
        "tp": int(tp),
    }

    with mlflow.start_run(run_name=run_name):
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        mlflow.sklearn.log_model(pipe, artifact_path="model")

    return score, pipe

def _suggest_model(trial, model_name):
    """Sample hyperparameters from ``trial`` and build the matching classifier.

    Parameters
    ----------
    trial : optuna trial
        Object providing ``suggest_float`` / ``suggest_int``.
    model_name : str
        One of ``"logreg"``, ``"rf"``, ``"hgb"``, ``"xgb"``.

    Returns
    -------
    estimator
        An unfitted classifier configured with the sampled values.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously any
        unrecognised name silently produced an XGBoost model.
    """
    # The order of the suggest_* calls within each branch is kept stable so a
    # seeded sampler draws the same sequence of values.
    if model_name == "logreg":
        C = trial.suggest_float("C", 0.01, 10.0, log=True)
        return LogisticRegression(max_iter=800, C=C)

    if model_name == "rf":
        n_estimators = trial.suggest_int("n_estimators", 200, 600, step=100)
        max_depth = trial.suggest_int("max_depth", 3, 12)
        return RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42
        )

    if model_name == "hgb":
        lr = trial.suggest_float("learning_rate", 0.02, 0.2)
        max_depth = trial.suggest_int("max_depth", 3, 10)
        return HistGradientBoostingClassifier(
            learning_rate=lr, max_depth=max_depth, random_state=42
        )

    if model_name == "xgb":
        lr = trial.suggest_float("learning_rate", 0.02, 0.2)
        max_depth = trial.suggest_int("max_depth", 3, 8)
        n_estimators = trial.suggest_int("n_estimators", 200, 600, step=100)
        return XGBClassifier(
            learning_rate=lr, max_depth=max_depth, n_estimators=n_estimators,
            subsample=0.9, colsample_bytree=0.9,
            eval_metric="logloss", random_state=42, n_jobs=-1
        )

    raise ValueError(
        f"Unknown model_name {model_name!r}; expected one of 'logreg', 'rf', 'hgb', 'xgb'"
    )


def _cv_f1(pipe):
    """Return the mean 3-fold cross-validated F1 of ``pipe`` on the training split."""
    scores = cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1")
    return float(scores.mean())


def objective_no_pca(trial, model_name):
    """Optuna objective: CV F1 of preprocessing + classifier (no PCA step).

    Parameters
    ----------
    trial : optuna trial
        Source of the sampled hyperparameters.
    model_name : str
        One of ``"logreg"``, ``"rf"``, ``"hgb"``, ``"xgb"``.

    Returns
    -------
    float
        Mean 3-fold F1 on ``X_train`` / ``y_train`` (to be maximised).

    Raises
    ------
    ValueError
        If ``model_name`` is not supported.
    """
    model = _suggest_model(trial, model_name)
    pipe = Pipeline([("preprocess", preprocessing), ("model", model)])
    return _cv_f1(pipe)


def objective_with_pca(trial, model_name):
    """Optuna objective: CV F1 of preprocessing + PCA + classifier.

    In addition to the model hyperparameters, the trial tunes ``pca_var``, the
    explained-variance target passed to PCA as ``n_components`` (0.90–0.99).

    Parameters
    ----------
    trial : optuna trial
        Source of the sampled hyperparameters.
    model_name : str
        One of ``"logreg"``, ``"rf"``, ``"hgb"``, ``"xgb"``.

    Returns
    -------
    float
        Mean 3-fold F1 on ``X_train`` / ``y_train`` (to be maximised).

    Raises
    ------
    ValueError
        If ``model_name`` is not supported.
    """
    # pca_var is suggested before the model parameters, as before, so the
    # sampling sequence for a given seed does not change.
    pca_var = trial.suggest_float("pca_var", 0.90, 0.99)
    model = _suggest_model(trial, model_name)
    pca = PCA(n_components=pca_var, random_state=42)

    pipe = Pipeline([("preprocess", preprocessing), ("pca", pca), ("model", model)])
    return _cv_f1(pipe)


2025/12/17 04:23:30 INFO mlflow.tracking.fluent: Experiment with name 'telco_churn_optuna' does not exist. Creating a new experiment.


In [None]:
model_names = ["logreg", "rf", "hgb", "xgb"]
all_candidates = []


def _build_final_model(name, params):
    """Build the unfitted classifier for ``name`` from tuned ``params``.

    Mirrors the model construction used inside the Optuna objectives, but reads
    the values from the best-parameter dict instead of sampling them.

    Raises
    ------
    ValueError
        If ``name`` is not one of ``"logreg"``, ``"rf"``, ``"hgb"``, ``"xgb"``.
    """
    if name == "logreg":
        return LogisticRegression(max_iter=800, C=params["C"])
    if name == "rf":
        return RandomForestClassifier(
            n_estimators=params["n_estimators"],
            max_depth=params["max_depth"],
            random_state=42
        )
    if name == "hgb":
        return HistGradientBoostingClassifier(
            learning_rate=params["learning_rate"],
            max_depth=params["max_depth"],
            random_state=42
        )
    if name == "xgb":
        return XGBClassifier(
            learning_rate=params["learning_rate"],
            max_depth=params["max_depth"],
            n_estimators=params["n_estimators"],
            subsample=0.9, colsample_bytree=0.9,
            eval_metric="logloss", random_state=42, n_jobs=-1
        )
    raise ValueError(
        f"Unknown model name {name!r}; expected one of 'logreg', 'rf', 'hgb', 'xgb'"
    )


for name in model_names:
    # Each model is tuned and evaluated twice, in this order: without PCA, then with PCA.
    for uses_pca in (False, True):
        tag = "PCA" if uses_pca else "NO_PCA"
        objective = objective_with_pca if uses_pca else objective_no_pca

        # A fresh, identically seeded study per (model, variant) keeps the search reproducible.
        study = optuna.create_study(direction="maximize", sampler=TPESampler(seed=42))
        study.optimize(lambda t: objective(t, name), n_trials=10, show_progress_bar=True)

        # Record the winning hyperparameters and their CV score in their own run.
        best_params = study.best_params
        with mlflow.start_run(run_name=f"{name}_{tag}_optuna_params"):
            mlflow.log_param("model", name)
            mlflow.log_param("uses_pca", uses_pca)
            mlflow.log_params(best_params)
            mlflow.log_metric("cv_f1_mean", float(study.best_value))

        # Rebuild the pipeline directly from best_params. (The earlier FixedTrial
        # "replay" only re-ran a full 3-fold CV and threw the result away.)
        steps = [("preprocess", preprocessing)]
        if uses_pca:
            steps.append(("pca", PCA(n_components=best_params["pca_var"], random_state=42)))
        final_model = _build_final_model(name, best_params)
        steps.append(("model", final_model))

        # Fit on the training split, score on the test split, log metrics + model.
        test_f1, fitted = eval_and_log(f"{name}_{tag}_best", Pipeline(steps))
        all_candidates.append((f"{name}_{tag}", test_f1, fitted))

# NOTE(review): the winner is picked by test-set F1, so the printed score is an
# optimistic estimate for the selected model; selecting on cv_f1_mean would
# avoid that. Left as-is so the saved model does not change.
best_name, best_f1, best_pipe = max(all_candidates, key=lambda x: x[1])
print("GLOBAL BEST:", best_name, "Test F1:", best_f1)

# Persist the winning fitted pipeline under the project's models/ folder.
os.makedirs("models", exist_ok=True)
joblib.dump(best_pipe, "models/model.joblib")
print("Saved -> models/model.joblib")


  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]



GLOBAL BEST: xgb_NO_PCA Test F1: 0.556732223903177
Saved -> models/model.joblib


In [13]:
# List the files (at most two directory levels deep) in the api/ and streamlit/ folders.
!find api -maxdepth 2 -type f
!find streamlit -maxdepth 2 -type f


api/Dockerfile
api/app.py
api/housing_pipeline.py
api/requirements.txt
streamlit/Dockerfile
streamlit/app.py
streamlit/requirements.txt
