In [7]:
!pip install xgboost lightgbm "mlflow<3"



In [8]:
base_folder = "R:\\Downloads\\housing_app_fall25-main\\housing_app_fall25-main"
%cd "{base_folder}"

R:\Downloads\housing_app_fall25-main\housing_app_fall25-main


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [9]:
import sqlite3
from contextlib import closing
from pathlib import Path

import pandas as pd

# One row per passenger: joins `passenger` with `passenger_survival` and the
# `sex` lookup table, ordered by passenger_id.
TITANIC_QUERY = """
    SELECT
        p.passenger_id,
        p.Pclass,
        p.Age,
        p.Fare,
        ps.SibSp,
        ps.Parch,
        ps.Survived,
        s.name AS sex
    FROM passenger AS p
    JOIN passenger_survival AS ps
        ON ps.passenger_id = p.passenger_id
    JOIN sex AS s
        ON s.sex_id = p.sex_id
    ORDER BY p.passenger_id
    """

db_path = Path(base_folder) / "data" / "titanic.db"

# sqlite3.connect() creates an empty database file when the path does not
# exist, which would only surface later as a "no such table" error.
if not db_path.is_file():
    raise FileNotFoundError(f"Titanic database not found: {db_path}")

# closing() releases the connection even if the query raises. A bare
# `with conn:` would only manage the transaction and leave the handle open.
with closing(sqlite3.connect(db_path)) as conn:
    titanic = pd.read_sql_query(TITANIC_QUERY, conn)

titanic.head()


Unnamed: 0,passenger_id,Pclass,Age,Fare,SibSp,Parch,Survived,sex
0,0,3,22.0,7.25,1,0,0,male
1,1,1,38.0,71.2833,1,0,1,female
2,2,3,26.0,7.925,0,0,1,female
3,3,1,35.0,53.1,1,0,1,female
4,4,3,35.0,8.05,0,0,0,male


In [10]:
from pathlib import Path
import os
import mlflow
from dotenv import load_dotenv

# Load MLflow credentials from a .env file in the current working directory.
# The variables may also come from the process environment, so a missing
# .env file is reported but is not treated as an error here.
env_path = Path.cwd() / ".env"
print("Exists:", env_path.exists())

# override=True: values from .env replace variables already set in the process.
load_dotenv(env_path, override=True)

tracking_uri = os.getenv("MLFLOW_TRACKING_URI")

print("URI:", tracking_uri)
print("USER:", os.getenv("MLFLOW_TRACKING_USERNAME"))
# Never print the password itself, only whether it is configured.
print("PASS:", "SET" if os.getenv("MLFLOW_TRACKING_PASSWORD") else "MISSING")

# Fail fast with a clear message rather than handing None to MLflow, so a
# missing setting cannot go unnoticed.
if not tracking_uri:
    raise RuntimeError(
        "MLFLOW_TRACKING_URI is not set; define it in "
        f"{env_path} or in the process environment before running this cell."
    )

mlflow.set_tracking_uri(tracking_uri)
print("Tracking URI:", mlflow.get_tracking_uri())


Exists: False
URI: https://dagshub.com/rahulmugada/housing_fall2025_1.mlflow
USER: rahulmugada
PASS: SET
Tracking URI: https://dagshub.com/rahulmugada/housing_fall2025_1.mlflow


In [12]:
# ============================================================
# TRAIN / TEST WITHOUT OPTUNA (BASELINE + PCA)
# ============================================================
#
# For every model family, train an untuned pipeline with and without a PCA
# step, score it with cross-validated F1 on the training split and F1 on the
# test split, and log params, metrics and the fitted pipeline to MLflow.
#
# Relies on `preprocessing`, `X_train`, `y_train`, `X_test` and `y_test`
# being defined by an earlier cell (not created here).

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
import mlflow
import mlflow.sklearn

# Single source of truth for values that are both used and logged.
CV_FOLDS = 3
# A float n_components keeps enough components to explain this variance share.
PCA_VARIANCE = 0.95

models = {
    "logistic": LogisticRegression(max_iter=1000),
    "ridge": RidgeClassifier(),
    "histgradientboosting": HistGradientBoostingClassifier(random_state=42),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1
    ),
    "lightgbm": LGBMClassifier(
        random_state=42,
        verbose=-1
    ),
}

for model_name, model in models.items():
    for use_pca in [False, True]:

        # The logistic-regression + PCA combination is optional and is
        # intentionally skipped here.
        if model_name == "logistic" and use_pca:
            continue

        run_name = f"{model_name}_{'with_pca' if use_pca else 'baseline'}"
        print(f"\nRunning {run_name}")

        steps = [("preprocessing", preprocessing)]
        if use_pca:
            steps.append(("pca", PCA(n_components=PCA_VARIANCE)))
        steps.append(("model", model))

        # Build the Pipeline from the named steps directly. make_pipeline()
        # would discard these names and auto-generate them from class names.
        pipeline = Pipeline(steps)

        # cross_val_score fits clones of the pipeline, so this does not leave
        # `pipeline` itself fitted.
        cv_f1 = cross_val_score(
            pipeline,
            X_train,
            y_train,
            cv=CV_FOLDS,
            scoring="f1"
        ).mean()

        # Final fit on the full training split, then score on the test split.
        # NOTE(review): test_f1 is computed for every candidate; choosing a
        # model by this number would leak the test set into model selection.
        pipeline.fit(X_train, y_train)
        test_f1 = f1_score(y_test, pipeline.predict(X_test))

        with mlflow.start_run(run_name=run_name):
            mlflow.log_param("model_family", model_name)
            mlflow.log_param("uses_pca", use_pca)
            mlflow.log_param("is_tuned", False)
            mlflow.log_param("cv_folds", CV_FOLDS)

            mlflow.log_metric("cv_f1", cv_f1)
            mlflow.log_metric("test_f1", test_f1)

            mlflow.sklearn.log_model(pipeline, "model")

        print(f"{run_name} | CV F1={cv_f1:.4f}, Test F1={test_f1:.4f}")



Running logistic_baseline




üèÉ View run logistic_baseline at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/eb36ea67b23549c08a6e9c7e66d9b0b5
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
logistic_baseline | CV F1=0.7139, Test F1=0.7368

Running ridge_baseline




üèÉ View run ridge_baseline at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/b72c788518d64aecb61b4c015dab74d4
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
ridge_baseline | CV F1=0.7256, Test F1=0.7287

Running ridge_with_pca




üèÉ View run ridge_with_pca at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/9f14ce7fc2de42b2bc7cb115272669e4
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
ridge_with_pca | CV F1=0.7256, Test F1=0.7287

Running histgradientboosting_baseline




üèÉ View run histgradientboosting_baseline at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/cc26f688bca947b9ada610cf9d618444
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
histgradientboosting_baseline | CV F1=0.7404, Test F1=0.7669

Running histgradientboosting_with_pca




üèÉ View run histgradientboosting_with_pca at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/98258097199c4985809013d5d2b58c09
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
histgradientboosting_with_pca | CV F1=0.7270, Test F1=0.7519

Running xgboost_baseline




üèÉ View run xgboost_baseline at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/ddbd0f791c624c8fa6a0da6bb2f1846d
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
xgboost_baseline | CV F1=0.7429, Test F1=0.7353

Running xgboost_with_pca




üèÉ View run xgboost_with_pca at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/791ce09cbf884642b36274a5753b783b
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
xgboost_with_pca | CV F1=0.7254, Test F1=0.7259

Running lightgbm_baseline




üèÉ View run lightgbm_baseline at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/f9be26dbdb8b498dbdb9b379392ad66a
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
lightgbm_baseline | CV F1=0.7387, Test F1=0.7368

Running lightgbm_with_pca




üèÉ View run lightgbm_with_pca at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0/runs/abe8e423a8cd4936b6f3433f3ab5f66e
üß™ View experiment at: https://dagshub.com/rahulmugada/titanic.mlflow/#/experiments/0
lightgbm_with_pca | CV F1=0.7230, Test F1=0.7338
