In [15]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [None]:
base_folder = "R:\\Downloads\\housing_app_fall25-main\\housing_app_fall25-main"
%cd "{base_folder}"

R:\Downloads\housing_app_fall25-main\housing_app_fall25-main


In [None]:
import sqlite3
import pandas as pd

conn = sqlite3.connect(f"{base_folder}/data/titanic.db")

titanic = pd.read_sql_query(
    """
    SELECT
        p.passenger_id,
        p.Pclass,
        p.Age,
        p.Fare,
        ps.SibSp,
        ps.Parch,
        ps.Survived,
        s.name AS sex
    FROM passenger AS p
    JOIN passenger_survival AS ps
        ON ps.passenger_id = p.passenger_id
    JOIN sex AS s
        ON s.sex_id = p.sex_id
    ORDER BY p.passenger_id
    """,
    conn,
)

conn.close()

titanic.head()


Unnamed: 0,passenger_id,Pclass,Age,Fare,SibSp,Parch,Survived,sex
0,0,3,22.0,7.25,1,0,0,male
1,1,1,38.0,71.2833,1,0,1,female
2,2,3,26.0,7.925,0,0,1,female
3,3,1,35.0,53.1,1,0,1,female
4,4,3,35.0,8.05,0,0,0,male


In [None]:
# =============================================================================
# FULL PIPELINE with OPTUNA (TITANIC CLASSIFICATION – FINAL)
# =============================================================================

from pathlib import Path
import time
import os
import joblib
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna
from optuna.samplers import TPESampler
from sklearn.base import clone

import mlflow

# ---------------------------------------------------------------------
# Base folder
# ---------------------------------------------------------------------
base_folder = (
    Path.cwd().parent
    if Path.cwd().name == "notebooks"
    else Path.cwd()
)

start_time = time.monotonic()
optuna.logging.set_verbosity(optuna.logging.WARNING)

# =============================================================================
# STEP 1: TITANIC PREPROCESSING (FIXED)
# =============================================================================

num_features = ["Pclass", "Age", "Fare", "SibSp", "Parch"]
cat_features = ["sex"]

preprocessing = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]),
            num_features,
        ),
        (
            "cat",
            Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]),
            cat_features,
        ),
    ]
)

print("✓ STEP 1: Titanic preprocessing created.")

# =============================================================================
# STEP 2: STRATIFIED SPLIT
# =============================================================================

X = titanic.drop(["passenger_id", "Survived"], axis=1)
y = titanic["Survived"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print(f"✓ STEP 2: Train={len(X_train)}, Test={len(X_test)}")

# =============================================================================
# STEP 3: MLFLOW CONFIG
# =============================================================================

load_dotenv(f"{base_folder}\\notebooks\\.env", override=True)
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("titanic_classification_optuna")

print("✓ STEP 3: MLflow configured.")

# =============================================================================
# STEP 4: OPTUNA OBJECTIVES (NO PCA)
# =============================================================================

def objective_ridge(trial):
    alpha = trial.suggest_float("alpha", 0.1, 100.0, log=True)
    pipe = make_pipeline(
        clone(preprocessing),
        RidgeClassifier(alpha=alpha),
    )
    return cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1").mean()


def objective_hgb(trial):
    lr = trial.suggest_float("learning_rate", 0.05, 0.2)
    depth = trial.suggest_int("max_depth", 3, 8)
    pipe = make_pipeline(
        clone(preprocessing),
        HistGradientBoostingClassifier(
            learning_rate=lr,
            max_depth=depth,
            random_state=42,
        ),
    )
    return cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1").mean()


def objective_xgb(trial):
    pipe = make_pipeline(
        clone(preprocessing),
        XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300, step=50),
            learning_rate=trial.suggest_float("learning_rate", 0.05, 0.2),
            max_depth=trial.suggest_int("max_depth", 3, 8),
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
        ),
    )
    return cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1").mean()


def objective_lgbm(trial):
    pipe = make_pipeline(
        clone(preprocessing),
        LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 300, step=50),
            learning_rate=trial.suggest_float("learning_rate", 0.05, 0.2),
            num_leaves=trial.suggest_int("num_leaves", 20, 80),
            random_state=42,
            verbose=-1,
        ),
    )
    return cross_val_score(pipe, X_train, y_train, cv=3, scoring="f1").mean()

objectives = {
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb,
    "xgboost": objective_xgb,
    "lightgbm": objective_lgbm,
}

results = {}

# =============================================================================
# STEP 5: RUN OPTUNA (NO PCA)
# =============================================================================

for name, obj in objectives.items():
    print(f"\nOptimizing {name.upper()} (NO PCA)")
    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
    )
    study.optimize(obj, n_trials=10)

    best_params = study.best_params
    print("Best params:", best_params)

    # rebuild final model
    if name == "ridge":
        model = RidgeClassifier(alpha=best_params["alpha"])
    elif name == "histgradientboosting":
        model = HistGradientBoostingClassifier(
            learning_rate=best_params["learning_rate"],
            max_depth=best_params["max_depth"],
            random_state=42,
        )
    elif name == "xgboost":
        model = XGBClassifier(
            **best_params,
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1,
        )
    else:
        model = LGBMClassifier(
            **best_params,
            random_state=42,
            verbose=-1,
        )

    final_pipe = make_pipeline(clone(preprocessing), model)
    final_pipe.fit(X_train, y_train)

    test_f1 = f1_score(y_test, final_pipe.predict(X_test))
    results[name] = {"pipeline": final_pipe, "test_f1": test_f1}

    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_params(best_params)
        mlflow.log_metric("test_f1", test_f1)
        mlflow.sklearn.log_model(final_pipe, artifact_path="model")

    print(f"{name} Test F1: {test_f1:.4f}")

# =============================================================================
# STEP 6: SELECT & SAVE BEST MODEL
# =============================================================================

best_name = max(results, key=lambda k: results[k]["test_f1"])
best_model = results[best_name]["pipeline"]

os.makedirs(f"{base_folder}/models", exist_ok=True)
joblib.dump(best_model, f"{base_folder}/models/global_best_model_optuna.pkl")

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (OPTUNA)")
print("=" * 80)
print(f"Model: {best_name}")
print(f"Test F1: {results[best_name]['test_f1']:.4f}")

elapsed = time.monotonic() - start_time
print(f"Elapsed time: {elapsed:.2f} seconds")


Traceback (most recent call last):
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 329, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 427, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 1373, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\store\tracking\file_store.py", line 1366, in _read_helper
    result = read_yaml(root, file_name)
  File "c:\Users\91889\AppData\Local\Programs\Python\Python310\lib\site-packages\mlflow\utils\file_utils.py", line 310, in read_yaml
    r

✓ STEP 1: Titanic preprocessing created.
✓ STEP 2: Train=712, Test=179
✓ STEP 3: MLflow configured.

Optimizing RIDGE (NO PCA)
Best params: {'alpha': 1.3292918943162166}




ridge Test F1: 0.7287

Optimizing HISTGRADIENTBOOSTING (NO PCA)
Best params: {'learning_rate': 0.05871254182522992, 'max_depth': 8}




histgradientboosting Test F1: 0.7231

Optimizing XGBOOST (NO PCA)
Best params: {'n_estimators': 200, 'learning_rate': 0.07340279606636549, 'max_depth': 3}




xgboost Test F1: 0.6935

Optimizing LIGHTGBM (NO PCA)




Best params: {'n_estimators': 100, 'learning_rate': 0.09563633644393066, 'num_leaves': 52}




lightgbm Test F1: 0.7368

GLOBAL BEST MODEL (OPTUNA)
Model: lightgbm
Test F1: 0.7368
Elapsed time: 42.19 seconds
