In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install xgboost lightgbm "mlflow<3"



In [None]:
base_folder = "R:\\Downloads\\housing_app_fall25-main\\housing_app_fall25-main"
%cd "{base_folder}"

R:\Downloads\housing_app_fall25-main\housing_app_fall25-main


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [None]:
import sqlite3
import pandas as pd

conn = sqlite3.connect(f"{base_folder}/data/titanic.db")

titanic = pd.read_sql_query(
    """
    SELECT
        p.passenger_id,
        p.Pclass,
        p.Age,
        p.Fare,
        ps.SibSp,
        ps.Parch,
        ps.Survived,
        s.name AS sex
    FROM passenger AS p
    JOIN passenger_survival AS ps
        ON ps.passenger_id = p.passenger_id
    JOIN sex AS s
        ON s.sex_id = p.sex_id
    ORDER BY p.passenger_id
    """,
    conn,
)

conn.close()

titanic.head()


Unnamed: 0,passenger_id,Pclass,Age,Fare,SibSp,Parch,Survived,sex
0,0,3,22.0,7.25,1,0,0,male
1,1,1,38.0,71.2833,1,0,1,female
2,2,3,26.0,7.925,0,0,1,female
3,3,1,35.0,53.1,1,0,1,female
4,4,3,35.0,8.05,0,0,0,male


In [None]:
from pathlib import Path
import os
import mlflow
from dotenv import load_dotenv

env_path = Path.cwd() / ".env"
print("Exists:", env_path.exists())

load_dotenv(env_path, override=True)

print("URI:", os.getenv("MLFLOW_TRACKING_URI"))
print("USER:", os.getenv("MLFLOW_TRACKING_USERNAME"))
print("PASS:", "SET" if os.getenv("MLFLOW_TRACKING_PASSWORD") else "MISSING")

mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
print("Tracking URI:", mlflow.get_tracking_uri())


Exists: False
URI: None
USER: None
PASS: MISSING
Tracking URI: file:///R:/Downloads/housing_app_fall25-main/housing_app_fall25-main/mlruns


In [None]:
# =============================================================================
# FULL PIPELINE (TITANIC CLASSIFICATION – FINAL WORKING VERSION)
# =============================================================================

from pathlib import Path
import os
import time
import joblib
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score

import mlflow
from mlflow.models import infer_signature

# ---------------------------------------------------------------------
# Base folder (safe for VS Code / Jupyter)
# ---------------------------------------------------------------------
base_folder = (
    Path.cwd().parent
    if Path.cwd().name == "notebooks"
    else Path.cwd()
)

start_time = time.monotonic()

# =============================================================================
# STEP 1: TITANIC PREPROCESSING PIPELINE (FIXES YOUR ERROR)
# =============================================================================

num_features = ["Pclass", "Age", "Fare", "SibSp", "Parch"]
cat_features = ["sex"]

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessing = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features),
    ]
)

print("✓ STEP 1: Titanic preprocessing pipeline created.")

# =============================================================================
# STEP 2: TRAIN / TEST SPLIT (STRATIFIED)
# =============================================================================

X = titanic.drop(["passenger_id", "Survived"], axis=1)
y = titanic["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print(f"✓ STEP 2: Train={len(X_train)}, Test={len(X_test)}")

# =============================================================================
# STEP 3: DEFINE CLASSIFICATION MODELS
# =============================================================================

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

models = {
    "ridge": RidgeClassifier(),
    "histgradientboosting": HistGradientBoostingClassifier(),
    "xgboost": XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False
    ),
    "lightgbm": LGBMClassifier(),
}

pipelines = {
    name: make_pipeline(preprocessing, model)
    for name, model in models.items()
}

print("✓ STEP 3: Classification pipelines defined.")

# =============================================================================
# STEP 4: CONFIGURE MLFLOW
# =============================================================================

load_dotenv(f"{base_folder}\\notebooks\\.env", override=True)

mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("titanic_survival_classification")

print("✓ STEP 4: MLflow configured.")

# =============================================================================
# STEP 5: TRAIN & LOG MODELS (NO PCA)
# =============================================================================

results = {}

for name, pipeline in pipelines.items():
    print("\n" + "=" * 80)
    print(f"Training model: {name}")
    print("=" * 80)

    cv_scores = cross_val_score(
        pipeline,
        X_train,
        y_train,
        cv=3,
        scoring="f1",
        n_jobs=-1
    )
    cv_f1 = cv_scores.mean()

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    test_f1 = f1_score(y_test, y_pred)
    test_acc = accuracy_score(y_test, y_pred)

    results[name] = {
        "pipeline": pipeline,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
    }

    print(f"{name} CV F1:   {cv_f1:.4f}")
    print(f"{name} Test F1: {test_f1:.4f}")
    print(f"{name} Accuracy:{test_acc:.4f}")

    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.log_metric("test_accuracy", test_acc)

        signature = infer_signature(X_train, pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            pipeline,
            artifact_path="titanic_model",
            signature=signature,
            input_example=X_train,
        )

print("\n✓ STEP 5: Baseline models trained.")

# =============================================================================
# STEP 6: PCA MODELS
# =============================================================================

pca_results = {}

for name, model in models.items():
    print("\n" + "=" * 80)
    print(f"Training PCA model: {name}")
    print("=" * 80)

    pca_pipeline = make_pipeline(
        preprocessing,
        PCA(n_components=0.95),
        model,
    )

    cv_scores = cross_val_score(
        pca_pipeline,
        X_train,
        y_train,
        cv=3,
        scoring="f1",
        n_jobs=-1
    )
    cv_f1 = cv_scores.mean()

    pca_pipeline.fit(X_train, y_train)
    y_pred = pca_pipeline.predict(X_test)

    test_f1 = f1_score(y_test, y_pred)

    key = f"{name}_with_pca"
    pca_results[key] = {
        "pipeline": pca_pipeline,
        "cv_f1": cv_f1,
        "test_f1": test_f1,
    }

    print(f"{key} Test F1: {test_f1:.4f}")

    with mlflow.start_run(run_name=key):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)

print("\n✓ STEP 6: PCA models trained.")

# =============================================================================
# STEP 7: SELECT GLOBAL BEST MODEL
# =============================================================================

all_results = {**results, **pca_results}
best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
best_pipeline = all_results[best_name]["pipeline"]

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL")
print("=" * 80)
print(f"Model: {best_name}")
print(f"Test F1: {all_results[best_name]['test_f1']:.4f}")

# =============================================================================
# STEP 8: SAVE BEST MODEL
# =============================================================================

os.makedirs(f"{base_folder}/models", exist_ok=True)
joblib.dump(best_pipeline, f"{base_folder}/models/global_best_model.pkl")

print("✓ Global best model saved.")

elapsed = time.monotonic() - start_time
print(f"Elapsed time: {elapsed:.2f} seconds")


Traceback (most recent call last):
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 329, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 427, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 1373, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 1366, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\utils\file_utils.py", line 310, in read_yaml
    r

✓ STEP 1: Titanic preprocessing pipeline created.
✓ STEP 2: Train=712, Test=179
✓ STEP 3: Classification pipelines defined.
✓ STEP 4: MLflow configured.

Training model: ridge
ridge CV F1:   0.7256
ridge Test F1: 0.7287
ridge Accuracy:0.8045


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 317.04it/s]



Training model: histgradientboosting
histgradientboosting CV F1:   0.7404
histgradientboosting Test F1: 0.7669
histgradientboosting Accuracy:0.8268


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 458.90it/s] 



Training model: xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost CV F1:   0.7429
xgboost Test F1: 0.7353
xgboost Accuracy:0.7989


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 554.22it/s]



Training model: lightgbm
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 208
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
lightgbm CV F1:   0.7387
lightgbm Test F1: 0.7368
lightgbm Accuracy:0.8045


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 356.56it/s] 



✓ STEP 5: Baseline models trained.

Training PCA model: ridge
ridge_with_pca Test F1: 0.7287

Training PCA model: histgradientboosting
histgradientboosting_with_pca Test F1: 0.7519

Training PCA model: xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_with_pca Test F1: 0.7259

Training PCA model: lightgbm
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1424
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
lightgbm_with_pca Test F1: 0.7338





✓ STEP 6: PCA models trained.

GLOBAL BEST MODEL
Model: histgradientboosting
Test F1: 0.7669
✓ Global best model saved.
Elapsed time: 49.61 seconds
