In [3]:
# ================================================
# 1. Imports
# ================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow
import mlflow.xgboost   # integration for XGBoost models

In [4]:
# ================================================
# 2. Load processed dataset
# ================================================
df = pd.read_csv("/Users/riadanas/Desktop/housing regression MLE/data/processed/train_leakage_safe.csv")

target = "price"
X = df.drop(columns=[target])
y = df[target]

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)


Train shape: (587734, 39)
Validation shape: (146934, 39)


In [5]:
# ================================================
# 3. Define Optuna objective function with MLflow
# ================================================
def objective(trial):
    # Define hyperparameter search space
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    # Start an MLflow run for each Optuna trial
    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        # Evaluate on validation set
        y_pred = model.predict(X_val)
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mae = float(mean_absolute_error(y_val, y_pred))
        r2 = float(r2_score(y_val, y_pred))

        # Log hyperparameters and metrics to MLflow
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    # Optuna will minimize RMSE
    return rmse


In [6]:
import xgboost as xgb
print(xgb.__version__)

3.0.4


In [7]:
import sys, xgboost as xgb
print(sys.executable)        # should point to .../.venv/bin/python
print(xgb.__version__)       # should print 3.0.4
print(xgb.__file__)          # should live under .../.venv/...

/Users/riadanas/Desktop/housing regression MLE/.venv/bin/python
3.0.4
/Users/riadanas/Desktop/housing regression MLE/.venv/lib/python3.11/site-packages/xgboost/__init__.py


In [8]:
import mlflow

# Force MLflow to use the project root `mlruns/`
mlflow.set_tracking_uri("/Users/riadanas/Desktop/housing regression MLE/mlruns")

In [9]:
# ================================================
# 4. Run Optuna study with MLflow experiment
# ================================================
# Set experiment name (creates if not exists)
mlflow.set_experiment("xgboost_optuna_housing")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)


[I 2025-08-28 19:14:26,251] A new study created in memory with name: no-name-05769c6b-2cab-4a43-97b1-e081e30490de
[I 2025-08-28 19:14:29,240] Trial 0 finished with value: 41287.29514443926 and parameters: {'n_estimators': 219, 'max_depth': 6, 'learning_rate': 0.07201748936124948, 'subsample': 0.8459895336019783, 'colsample_bytree': 0.5219774833462373, 'min_child_weight': 4, 'gamma': 3.1265709289183485, 'reg_alpha': 2.391601381318461e-07, 'reg_lambda': 0.035920530172863584}. Best is trial 0 with value: 41287.29514443926.
[I 2025-08-28 19:14:35,559] Trial 1 finished with value: 46533.73796382394 and parameters: {'n_estimators': 870, 'max_depth': 3, 'learning_rate': 0.11820428340009075, 'subsample': 0.5571236882200832, 'colsample_bytree': 0.6867944231339096, 'min_child_weight': 8, 'gamma': 1.7598340336652174, 'reg_alpha': 3.1767384249582707e-07, 'reg_lambda': 8.410310189175342e-07}. Best is trial 0 with value: 41287.29514443926.
[I 2025-08-28 19:14:45,701] Trial 2 finished with value: 388

Best params: {'n_estimators': 695, 'max_depth': 10, 'learning_rate': 0.054627170855435926, 'subsample': 0.9011230951792111, 'colsample_bytree': 0.8449319775427244, 'min_child_weight': 3, 'gamma': 3.9531762500780463, 'reg_alpha': 0.005906605469235324, 'reg_lambda': 1.2831534884852641e-08}


In [10]:
# ================================================
# 5. Train final model with best params and log to MLflow
# ================================================
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

# Log best model and metrics to MLflow
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    # Save model artifact to MLflow
    mlflow.xgboost.log_model(best_model, name="model")


Final tuned model performance:
MAE: 9517.640828780237
RMSE: 18997.705524589262
R²: 0.9965070007130197


  self.get_booster().save_model(fname)
