# LAB | Hyperparameter Tuning

**Load the data**

Finally step in order to maximize the performance on your Spaceship Titanic model.

The data can be found here:

https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv

Metadata

https://github.com/data-bootcamp-v4/data/blob/main/spaceship_titanic.md

So far we've been training and evaluating models with default values for hyperparameters.

Today we will perform the same feature engineering as before, and then compare the best working models you got so far, but now fine tuning it's hyperparameters.

In [1]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
spaceship = pd.read_csv("https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv")
spaceship.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Now perform the same as before:
- Feature Scaling
- Feature Selection


In [3]:
#your code here
# --- Feature Scaling + Feature Selection ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd

# 1) Minimal prep (skip if you already have X, y)
df = spaceship.dropna().reset_index(drop=True).copy()
df["Deck"] = df["Cabin"].str.split("/").str[0]
df = df.drop(columns=["PassengerId", "Name", "Cabin"], errors="ignore")

df["Transported"] = df["Transported"].astype(int)
bool_cols = df.select_dtypes(include="bool").columns
df[bool_cols] = df[bool_cols].astype(int)

X = pd.get_dummies(df.drop(columns="Transported"), drop_first=True)
y = df["Transported"]

# 2) Train/test split (if not already split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# 3) Feature Scaling (fit on train, apply to test)
scaler = StandardScaler(with_mean=False)  # safe for sparse one-hots
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# 4) Feature Selection (top-k by ANOVA F)
k = min(10, X_train.shape[1])            # choose k <= #features
selector = SelectKBest(score_func=f_classif, k=k)
X_train_sel = selector.fit_transform(X_train_s, y_train)
X_test_sel  = selector.transform(X_test_s)

selected_features = X.columns[selector.get_support()]

print("Scaled shapes :", X_train_s.shape, X_test_s.shape)
print(f"Selected top-{k} features:", selected_features.tolist())
print("Final shapes  :", X_train_sel.shape, X_test_sel.shape)


Scaled shapes : (5284, 19) (1322, 19)
Selected top-10 features: ['RoomService', 'Spa', 'VRDeck', 'HomePlanet_Europa', 'CryoSleep_True', 'Destination_TRAPPIST-1e', 'Deck_B', 'Deck_C', 'Deck_E', 'Deck_F']
Final shapes  : (5284, 10) (1322, 10)


- Now let's use the best model we got so far in order to see how it can improve when we fine tune it's hyperparameters.

In [4]:
#your code here
# === Fine-tune the current best model: HistGradientBoosting ===
import os
import numpy as np
import pandas as pd

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, roc_curve

# Use your selected features if available; else fall back to full split
try:
    X_tr, X_te = X_train_sel, X_test_sel
except NameError:
    X_tr, X_te = X_train, X_test
y_tr, y_te = y_train, y_test

rng = 42
n_jobs = min(8, os.cpu_count() or 1)

# Baseline from before (good starting point)
hgb_base = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_leaf_nodes=63,
    min_samples_leaf=20,
    max_iter=300,
    early_stopping=True,
    l2_regularization=0.1,
    random_state=rng
)

# Randomized hyperparameter search (wider space than before)
try:
    from scipy.stats import loguniform, randint, uniform
    param_dist = {
        "learning_rate": loguniform(1e-3, 3e-1),
        "max_iter": randint(150, 500),
        "max_leaf_nodes": randint(16, 256),
        "max_depth": randint(3, 12),        # None not supported in randint; we’ll allow deep trees via larger leaf nodes
        "min_samples_leaf": randint(5, 200),
        "l2_regularization": loguniform(1e-4, 10),
    }
    search = RandomizedSearchCV(
        estimator=hgb_base,
        param_distributions=param_dist,
        n_iter=60,
        scoring="roc_auc",
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=rng),
        n_jobs=n_jobs,
        random_state=rng,
        refit=True,
        verbose=0,
    )
except Exception:
    # Fallback to a compact grid if scipy isn't available
    param_grid = {
        "learning_rate": [0.03, 0.05, 0.08, 0.1],
        "max_iter": [200, 300, 400],
        "max_leaf_nodes": [31, 63, 127, 255],
        "max_depth": [4, 6, 8, 10],
        "min_samples_leaf": [10, 20, 50, 100],
        "l2_regularization": [0.0, 0.1, 1.0],
    }
    from sklearn.model_selection import GridSearchCV
    search = GridSearchCV(
        estimator=hgb_base,
        param_grid=param_grid,
        scoring="roc_auc",
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=rng),
        n_jobs=n_jobs,
        refit=True,
        verbose=0,
    )

# Fit search
search.fit(X_tr, y_tr)
best_hgb = search.best_estimator_

# ---- Evaluation helpers ----
def evaluate(name, clf, X_te, y_te):
    proba = clf.predict_proba(X_te)[:, 1]
    pred  = clf.predict(X_te)
    acc = accuracy_score(y_te, pred)
    auc = roc_auc_score(y_te, proba)
    print(f"\n=== {name} ===")
    print("Best params:", search.best_params_)
    print(f"CV best AUC: {search.best_score_:.4f}")
    print(f"Test AUC   : {auc:.4f}")
    print(f"Test Acc   : {acc:.4f}")
    print("Confusion matrix:\n", confusion_matrix(y_te, pred))
    print(pd.DataFrame(classification_report(y_te, pred, digits=4, output_dict=True)).T)
    return proba, pred

proba, pred = evaluate("HistGradientBoosting (fine-tuned)", best_hgb, X_te, y_te)

# ---- Optional: threshold tuning (maximize Youden’s J) ----
fpr, tpr, thr = roc_curve(y_te, proba)
j_idx = (tpr - fpr).argmax()
best_thr = thr[j_idx]
pred_thr = (proba >= best_thr).astype(int)

print(f"\nOptimal threshold by Youden J: {best_thr:.3f}")
print("Confusion matrix @ optimal thr:\n", confusion_matrix(y_te, pred_thr))
print(pd.DataFrame(classification_report(y_te, pred_thr, digits=4, output_dict=True)).T)


[WinError 2] The system cannot find the file specified
  File "c:\Users\ruchi\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\ruchi\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ruchi\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\ruchi\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



=== HistGradientBoosting (fine-tuned) ===
Best params: {'l2_regularization': 0.00032163086173926544, 'learning_rate': 0.04401232738598694, 'max_depth': 4, 'max_iter': 487, 'max_leaf_nodes': 209, 'min_samples_leaf': 58}
CV best AUC: 0.8758
Test AUC   : 0.8705
Test Acc   : 0.7821
Confusion matrix:
 [[457 199]
 [ 89 577]]
              precision    recall  f1-score      support
0              0.836996  0.696646  0.760399   656.000000
1              0.743557  0.866366  0.800277   666.000000
accuracy       0.782148  0.782148  0.782148     0.782148
macro avg      0.790277  0.781506  0.780338  1322.000000
weighted avg   0.789923  0.782148  0.780489  1322.000000

Optimal threshold by Youden J: 0.560
Confusion matrix @ optimal thr:
 [[491 165]
 [113 553]]
              precision    recall  f1-score      support
0              0.812914  0.748476  0.779365   656.000000
1              0.770195  0.830330  0.799133   666.000000
accuracy       0.789713  0.789713  0.789713     0.789713
macro avg     

- Evaluate your model

In [5]:
#your code here
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import numpy as np
import pandas as pd

# pick the fitted model (adjust if your variable name differs)
clf = best_hgb  # or: search.best_estimator_

# predictions
y_pred  = clf.predict(X_te)
y_proba = clf.predict_proba(X_te)[:, 1]

# core metrics
acc = accuracy_score(y_te, y_pred)
auc = roc_auc_score(y_te, y_proba)
baseline = max(np.mean(y_te == 0), np.mean(y_te == 1))  # majority-class baseline

print(f"Accuracy : {acc:.4f} (baseline: {baseline:.4f})")
print(f"ROC AUC  : {auc:.4f}")
print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))
print(pd.DataFrame(classification_report(y_te, y_pred, digits=4, output_dict=True)).T)


Accuracy : 0.7821 (baseline: 0.5038)
ROC AUC  : 0.8705
Confusion matrix:
 [[457 199]
 [ 89 577]]
              precision    recall  f1-score      support
0              0.836996  0.696646  0.760399   656.000000
1              0.743557  0.866366  0.800277   666.000000
accuracy       0.782148  0.782148  0.782148     0.782148
macro avg      0.790277  0.781506  0.780338  1322.000000
weighted avg   0.789923  0.782148  0.780489  1322.000000


**Grid/Random Search**

For this lab we will use Grid Search.

- Define hyperparameters to fine tune.

In [6]:
#your code here
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform, randint
import os

hgb = HistGradientBoostingClassifier(
    early_stopping=True, random_state=42
)

param_dist = {
    "learning_rate":     loguniform(1e-3, 3e-1),
    "max_iter":          randint(200, 600),
    "max_leaf_nodes":    randint(16, 256),
    "max_depth":         randint(3, 12),      # optional alongside max_leaf_nodes
    "min_samples_leaf":  randint(5, 200),
    "l2_regularization": loguniform(1e-4, 10),
    "max_bins":          randint(128, 255),
    "validation_fraction": [0.1, 0.15, 0.2],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rs = RandomizedSearchCV(
    hgb, param_distributions=param_dist, n_iter=60,
    scoring="roc_auc", cv=cv, n_jobs=min(8, os.cpu_count() or 1),
    random_state=42, refit=True, verbose=0
)
rs.fit(X_tr, y_tr)
best_hgb = rs.best_estimator_
print("Best params:", rs.best_params_)
print("CV best AUC:", rs.best_score_)
print("Test AUC   :", roc_auc_score(y_te, best_hgb.predict_proba(X_te)[:,1]))


Best params: {'l2_regularization': 0.0007444441903453075, 'learning_rate': 0.05146791283102272, 'max_bins': 166, 'max_depth': 4, 'max_iter': 559, 'max_leaf_nodes': 144, 'min_samples_leaf': 15, 'validation_fraction': 0.1}
CV best AUC: 0.8753605700976628
Test AUC   : 0.8702288416465246


- Run Grid Search

In [7]:
# Run a GRID SEARCH for the best HistGradientBoostingClassifier

import os
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Use selected features if available; else fall back to full split
try:
    X_tr, X_te = X_train_sel, X_test_sel
except NameError:
    X_tr, X_te = X_train, X_test
y_tr, y_te = y_train, y_test

rng = 42
n_jobs = min(8, os.cpu_count() or 1)

hgb = HistGradientBoostingClassifier(
    early_stopping=True,
    random_state=rng
)

param_grid = {
    "learning_rate":     [0.03, 0.05, 0.08, 0.1],
    "max_iter":          [200, 300, 400],
    "max_leaf_nodes":    [31, 63, 127],
    "min_samples_leaf":  [10, 20, 50, 100],
    "l2_regularization": [0.0, 0.1, 1.0],
    "max_depth":         [None, 6, 10],
    "validation_fraction":[0.1, 0.2],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)

gs = GridSearchCV(
    estimator=hgb,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=n_jobs,
    refit=True,
    verbose=0
)

gs.fit(X_tr, y_tr)
best_hgb = gs.best_estimator_

print("Best params:", gs.best_params_)
print("CV best AUC:", f"{gs.best_score_:.4f}")

# ---- Evaluate on test set ----
y_pred  = best_hgb.predict(X_te)
y_proba = best_hgb.predict_proba(X_te)[:, 1]

print(f"Test Accuracy: {accuracy_score(y_te, y_pred):.4f}")
print(f"Test ROC AUC : {roc_auc_score(y_te, y_proba):.4f}")
print("Confusion matrix:\n", confusion_matrix(y_te, y_pred))
print(pd.DataFrame(classification_report(y_te, y_pred, digits=4, output_dict=True)).T)


Best params: {'l2_regularization': 1.0, 'learning_rate': 0.05, 'max_depth': None, 'max_iter': 200, 'max_leaf_nodes': 31, 'min_samples_leaf': 20, 'validation_fraction': 0.1}
CV best AUC: 0.8753
Test Accuracy: 0.7784
Test ROC AUC : 0.8654
Confusion matrix:
 [[461 195]
 [ 98 568]]
              precision    recall  f1-score      support
0              0.824687  0.702744  0.758848   656.000000
1              0.744430  0.852853  0.794962   666.000000
accuracy       0.778366  0.778366  0.778366     0.778366
macro avg      0.784558  0.777798  0.776905  1322.000000
weighted avg   0.784255  0.778366  0.777041  1322.000000


- Evaluate your model

In [8]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix,
    classification_report, precision_recall_fscore_support
)
import numpy as np
import pandas as pd

# Use the fitted best model from your GridSearch
clf = best_hgb  # if you kept the name; else: clf = gs.best_estimator_

# Predict
y_pred  = clf.predict(X_te)
y_proba = clf.predict_proba(X_te)[:, 1]

# Core metrics
acc = accuracy_score(y_te, y_pred)
auc = roc_auc_score(y_te, y_proba)
prec, rec, f1, _ = precision_recall_fscore_support(y_te, y_pred, average="binary")
cm = confusion_matrix(y_te, y_pred)
baseline = max(np.mean(y_te == 0), np.mean(y_te == 1))  # majority-class baseline

print(f"Accuracy : {acc:.4f}  (baseline: {baseline:.4f})")
print(f"ROC AUC  : {auc:.4f}")
print(f"Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
print("\nConfusion matrix:\n", cm)
print("\nClassification report:\n",
      classification_report(y_te, y_pred, digits=4))

# Optional: threshold tuning (maximize Youden’s J for better recall/precision trade-off)
from sklearn.metrics import roc_curve
fpr, tpr, thr = roc_curve(y_te, y_proba)
j_idx = (tpr - fpr).argmax()
best_thr = thr[j_idx]
y_pred_thr = (y_proba >= best_thr).astype(int)
print(f"\nOptimal threshold by Youden J: {best_thr:.3f}")
print("Confusion @ optimal thr:\n", confusion_matrix(y_te, y_pred_thr))
print(pd.DataFrame(classification_report(y_te, y_pred_thr, digits=4, output_dict=True)).T)


Accuracy : 0.7784  (baseline: 0.5038)
ROC AUC  : 0.8654
Precision: 0.7444 | Recall: 0.8529 | F1: 0.7950

Confusion matrix:
 [[461 195]
 [ 98 568]]

Classification report:
               precision    recall  f1-score   support

           0     0.8247    0.7027    0.7588       656
           1     0.7444    0.8529    0.7950       666

    accuracy                         0.7784      1322
   macro avg     0.7846    0.7778    0.7769      1322
weighted avg     0.7843    0.7784    0.7770      1322


Optimal threshold by Youden J: 0.577
Confusion @ optimal thr:
 [[516 140]
 [138 528]]
              precision    recall  f1-score      support
0              0.788991  0.786585  0.787786   656.000000
1              0.790419  0.792793  0.791604   666.000000
accuracy       0.789713  0.789713  0.789713     0.789713
macro avg      0.789705  0.789689  0.789695  1322.000000
weighted avg   0.789710  0.789713  0.789710  1322.000000
