# versione con cross validation base


In [None]:
import optuna
import xgboost as xgb
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Data Preparation (Regression Dataset) ---
# Load the scikit-learn diabetes dataset and separate features from the target.
data = load_diabetes()
X = data.data
y = data.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- 2. Objective Function for Optuna ---
def objective(trial):
    """Score one Optuna trial by the mean 5-fold cross-validated RMSE.

    Samples an XGBoost hyperparameter configuration from ``trial``, evaluates
    it with shuffled 5-fold cross-validation on the module-level
    ``X_train`` / ``y_train``, and returns the mean RMSE as a positive number
    so that it can be minimized.
    """
    # Hyperparameters proposed by the trial (the sampling order is kept fixed).
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }

    # Fixed settings are passed explicitly; only `tuned` varies between trials.
    regressor = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=42,
        **tuned,
    )

    # Same shuffled 5-fold partition for every trial (fixed seed).
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    # scikit-learn reports RMSE negated ("higher is better" convention).
    neg_rmse_per_fold = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=folds,
        scoring="neg_root_mean_squared_error",
    )

    # Flip the sign back so the returned value is a positive RMSE.
    return -neg_rmse_per_fold.mean()

# --- 3. Optuna Study ---
print("--- Starting Bayesian Optimization (Regression) ---")

# The objective returns a positive RMSE, so the study minimizes it.
study = optuna.create_study(study_name="XGBoost_Diabetes_Regression", direction="minimize")

# Run 50 trials with a progress bar.
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

# --- 4. Optimization Results ---
# Report the best trial, its score and the hyperparameters it used.
print("\n--- Optimization Results ---")
print(f"Best Trial #: {study.best_trial.number}")
print(f"Best CV RMSE: {study.best_value:.4f}")
print("Best Hyperparameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

# --- 5. Final Evaluation on Test Set ---
# Rebuild the model from the best hyperparameters found by the study, fit it on
# the training split and score it once on the held-out test split.
best_params = study.best_params

final_model = xgb.XGBRegressor(
    **best_params,
    n_jobs=-1,
    random_state=42
)

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the value really
# is the RMSE that is printed below (and is on the same scale as the CV RMSE).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n--- Final Test Set Performance ---")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


[I 2026-01-06 20:13:25,172] A new study created in memory with name: XGBoost_Diabetes_Regression


--- Starting Bayesian Optimization (Regression) ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-01-06 20:13:27,425] Trial 0 finished with value: -59.160620618899145 and parameters: {'n_estimators': 782, 'learning_rate': 0.029126856851343173, 'max_depth': 8, 'subsample': 0.6801050869575207, 'colsample_bytree': 0.6363810159280652, 'reg_alpha': 1.6847946474062267e-08, 'reg_lambda': 0.00037438557249132743}. Best is trial 0 with value: -59.160620618899145.
[I 2026-01-06 20:13:27,625] Trial 1 finished with value: -69.76291112989513 and parameters: {'n_estimators': 246, 'learning_rate': 0.0013721165989043123, 'max_depth': 3, 'subsample': 0.712400050957545, 'colsample_bytree': 0.7161593836062156, 'reg_alpha': 1.70188682515387, 'reg_lambda': 4.932328787820536e-07}. Best is trial 1 with value: -69.76291112989513.
[I 2026-01-06 20:13:28,939] Trial 2 finished with value: -61.02442406767277 and parameters: {'n_estimators': 737, 'learning_rate': 0.0015483122333664328, 'max_depth': 6, 'subsample': 0.5315954690998272, 'colsample_bytree': 0.5034811688808406, 'reg_alpha': 4.146116842911017

# versione con validation split

In [7]:
import optuna
import xgboost as xgb
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# --- 1. Data Preparation (Regression Dataset) ---
# Load the scikit-learn diabetes dataset and separate features from the target.
data = load_diabetes()
X = data.data
y = data.target

# First split: hold out 20% of the samples as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Second split: carve 20% of the remaining training data off as a validation
# set. X_train / y_train are rebound to the smaller remaining portion.
X_train, X_vali, y_train, y_vali = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# --- 2. Objective Function for Optuna ---
def objective(trial):
    """Score one Optuna trial by the MSE on the hold-out validation split.

    Samples an XGBoost hyperparameter configuration from ``trial``, fits the
    model on the module-level ``X_train`` / ``y_train`` and returns the mean
    squared error of its predictions on ``X_vali`` / ``y_vali``.
    """
    # Hyperparameters proposed by the trial (the sampling order is kept fixed).
    tuned = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }

    # Fixed settings are passed explicitly; only `tuned` varies between trials.
    regressor = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_jobs=-1,
        random_state=42,
        **tuned,
    )
    regressor.fit(X_train, y_train)

    # The study minimizes this value: the validation mean squared error.
    return mean_squared_error(y_vali, regressor.predict(X_vali))

# --- 3. Optuna Study ---
print("--- Starting Bayesian Optimization (Regression) ---")

# The objective returns an error measure, so the study minimizes it.
study = optuna.create_study(
    direction="minimize",
    study_name="XGBoost_Diabetes_Regression"
)

study.optimize(objective, n_trials=50, show_progress_bar=True)

# --- 4. Optimization Results ---
print("\n--- Optimization Results ---")
print(f"Best Trial #: {study.best_trial.number}")
# In this cell the objective returns the MSE on the hold-out validation split
# (no cross-validation, no square root), so label the value accordingly.
print(f"Best Validation MSE: {study.best_value:.4f}")
print("Best Hyperparameters:")
for k, v in study.best_params.items():
    print(f"  {k}: {v}")

# --- 5. Final Evaluation on Test Set ---
# Rebuild the model from the best hyperparameters found by the study, fit it on
# the training split and score it once on the held-out test split.
best_params = study.best_params

final_model = xgb.XGBRegressor(
    **best_params,
    n_jobs=-1,
    random_state=42
)

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

# mean_squared_error returns the MSE; take the square root so the value really
# is the RMSE that is printed below.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n--- Final Test Set Performance ---")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")


[I 2026-01-06 20:18:44,376] A new study created in memory with name: XGBoost_Diabetes_Regression


--- Starting Bayesian Optimization (Regression) ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-01-06 20:18:44,427] Trial 0 finished with value: 3827.1893673612594 and parameters: {'n_estimators': 159, 'learning_rate': 0.15946715218037105, 'max_depth': 4, 'subsample': 0.5446560652128124, 'colsample_bytree': 0.5764047373922628, 'reg_alpha': 2.6362158240371255e-06, 'reg_lambda': 1.0552092663442332e-06}. Best is trial 0 with value: 3827.1893673612594.
[I 2026-01-06 20:18:44,506] Trial 1 finished with value: 3461.6668649938156 and parameters: {'n_estimators': 159, 'learning_rate': 0.07481106146872643, 'max_depth': 7, 'subsample': 0.8433361192284605, 'colsample_bytree': 0.5544431941841845, 'reg_alpha': 6.451152162498467e-08, 'reg_lambda': 1.6348838660058825e-07}. Best is trial 1 with value: 3461.6668649938156.
[I 2026-01-06 20:18:44,600] Trial 2 finished with value: 3026.5042808904504 and parameters: {'n_estimators': 431, 'learning_rate': 0.008102362058994388, 'max_depth': 4, 'subsample': 0.9224457804192894, 'colsample_bytree': 0.6942947415983585, 'reg_alpha': 2.74401871125334

# versione migliorata con parametri ottimizzati + pruning

In [None]:
import optuna
import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from optuna.integration import XGBoostPruningCallback

# =========================
# 1. DATA PREPARATION
# =========================
# Load the scikit-learn diabetes dataset and separate features from the target.
data = load_diabetes()
X = data.data
y = data.target

# Train / Test: hold out 20% of the samples as the final test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train / Validation: 20% of the training data is set aside as the validation
# set used for early stopping.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

# =========================
# 2. OPTUNA OBJECTIVE with pruning
# =========================
def objective(trial):
    """Score one Optuna trial by the RMSE on the validation split.

    Samples an XGBoost hyperparameter configuration from ``trial`` and fits
    the model on the module-level ``X_train`` / ``y_train`` with early
    stopping monitored on ``X_val`` / ``y_val``. An
    ``XGBoostPruningCallback`` reports the validation RMSE to Optuna during
    training so that the trial can be pruned.

    Returns:
        The RMSE of the fitted model's predictions on the validation split.
    """

    # --- Hyperparameters to optimize ---
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 4000,                     # upper bound; early stopping picks the actual count
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "n_jobs": -1,
        "random_state": 42,
        "tree_method": "hist",
        "eval_metric": "rmse",
        "early_stopping_rounds": 50,
        # Callbacks belong in the constructor, next to early_stopping_rounds
        # and eval_metric: passing `callbacks=` to fit() is deprecated and is
        # rejected by recent XGBoost releases.
        # The callback monitors "rmse" on the first (only) eval_set entry.
        "callbacks": [XGBoostPruningCallback(trial, "validation_0-rmse")],
    }

    # --- Model creation ---
    model = xgb.XGBRegressor(**params)

    # --- Fit with the validation set used for early stopping and pruning ---
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=False,
    )

    # --- Validation metric ---
    y_pred = model.predict(X_val)
    # `squared=False` is deprecated/removed in recent scikit-learn releases;
    # taking the square root of the MSE works on every version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    return rmse  # the study minimizes the RMSE

# =========================
# 3. STUDY CREATION
# =========================
# The objective returns a validation RMSE, so the study minimizes it.
study = optuna.create_study(
    study_name="XGBoost_Diabetes_Regression",
    direction="minimize",
)
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

# =========================
# 4. RESULTS
# =========================
# Report the best trial, its score and the hyperparameters it used.
print("\n--- Best Trial ---")
print(f"Trial #{study.best_trial.number}")
print(f"Best Validation RMSE: {study.best_value:.4f}")
print("Best Hyperparameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name}: {param_value}")

# =========================
# 5. FINAL MODEL REFIT
# =========================
best_params = study.best_params

# Step 1: determine the number of boosting rounds with early stopping on the
# VALIDATION split. The test set must not drive early stopping: doing so
# leaks test information into the model and makes the reported test metrics
# optimistically biased.
stopping_model = xgb.XGBRegressor(
    **best_params,
    objective="reg:squarederror",
    n_estimators=4000,
    early_stopping_rounds=50,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
    eval_metric="rmse"
)

stopping_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# XGBoost exposes the early-stopping result as `best_iteration` (0-based, no
# trailing underscore), so the number of trees to keep is best_iteration + 1.
best_iteration = stopping_model.best_iteration
best_n_estimators = best_iteration + 1

# Step 2: refit on the full training data (train + validation) with that
# fixed number of rounds; no early stopping, so no eval_set is needed.
final_model = xgb.XGBRegressor(
    **best_params,
    objective="reg:squarederror",
    n_estimators=best_n_estimators,
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

final_model.fit(X_train_full, y_train_full)

# Predictions on the test set, which is used only here.
y_pred_test = final_model.predict(X_test)
# `squared=False` is deprecated/removed in recent scikit-learn releases;
# taking the square root of the MSE works on every version.
rmse_test = mean_squared_error(y_test, y_pred_test) ** 0.5
r2_test = r2_score(y_test, y_pred_test)

print("\n--- Final Test Set Performance ---")
print(f"RMSE: {rmse_test:.4f}")
print(f"R² Score: {r2_test:.4f}")
print(f"Best iteration used: {best_iteration}")
