In [20]:
!pip install --upgrade scikit-learn xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.6/223.6 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 2.1.3
    Uninstalling xgboost-2.1.3:
      Successfully uninstalled xgboost-2.1.3
Successfully installed xgboost-2.1.4


In [5]:
# prompt: mount google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install hyperopt



In [9]:
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [15]:
# fmin (Function Minimize):


# This is the main optimization function
# Purpose: Finds the set of parameters that minimize your objective function

# tpe (Tree of Parzen Estimators):


# An optimization algorithm that guides the search for best parameters
# Works by:

# Building probability models of good and bad parameter values
# Using these models to choose new parameters to evaluate


# hp (Hyperparameter):


# Provides methods to define the search space for parameters


# STATUS_OK:


# A constant used to indicate successful evaluation
# Used in objective function return value


# Trials:


# Object that stores the history of all evaluations


# You define the search space (hp)
# Specify how to evaluate parameters (objective function)
# Let TPE guide the search (tpe)
# Track all attempts (Trials)
# Find the best parameters (fmin)

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Indigo Training - Feb 2025/Indigo Training - 2025/To Be Shared with Indigo team/Machine Learning/Exercises for Indigo/Arrival Delay Prediction/Flights_Delay_Reg.csv')

In [7]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,DISTANCE,Day,DEPARTURE_DELAY,ARRIVAL_DELAY,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,TAXI_IN,TAXI_OUT,DIVERTED
0,0,0,17,275,1448,4,-11.0,-22.0,205.0,194.0,169.0,4.0,21.0,0
1,1,1,175,233,2330,4,-8.0,-9.0,280.0,279.0,263.0,4.0,12.0,0
2,2,11,276,66,2296,4,-2.0,5.0,286.0,293.0,266.0,11.0,16.0,0


In [12]:
df = df.sample(frac=.30)

In [13]:
X = df.drop(columns = 'ARRIVAL_DELAY')
y = df.ARRIVAL_DELAY

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:

def objective(params):
    # Convert data to DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train)

    # Set up XGBoost parameters
    xgb_params = {
        'max_depth': int(params['max_depth']),
        'eta': float(params['learning_rate']),  # eta is learning_rate in XGBoost params
        'min_child_weight': int(params['min_child_weight']),
        'subsample': float(params['subsample']),
        'colsample_bytree': float(params['colsample_bytree']),
        'gamma': float(params['gamma']),
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse'
    }

    # Perform cross-validation
    cv_results = xgb.cv(
        xgb_params,
        dtrain,
        num_boost_round=int(params['n_estimators']),
        nfold=5,
        early_stopping_rounds=50,
        verbose_eval=False
    )

    # Get the best score
    best_score = cv_results['test-rmse-mean'].min()

    return {'loss': best_score, 'status': STATUS_OK}

# Define the search space
space = {
    'max_depth': hp.choice('max_depth', range(9, 11)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'n_estimators': hp.choice('n_estimators', range(10, 15, 1)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 3)),
    'subsample': hp.uniform('subsample', 0.85, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.85, 1.0),
    'gamma': hp.uniform('gamma', 0, 0.5)
}

# Run optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

# Convert best parameters
best_params = {
    'max_depth': int(best['max_depth']),
    'learning_rate': float(best['learning_rate']),
    'n_estimators': int(best['n_estimators']),
    'min_child_weight': int(best['min_child_weight']),
    'subsample': float(best['subsample']),
    'colsample_bytree': float(best['colsample_bytree']),
    'gamma': float(best['gamma'])
}

# Train final model
final_model = xgb.XGBRegressor(**best_params, random_state=42)
final_model.fit(X_train, y_train)

# Evaluate
y_pred = final_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Best parameters: {best_params}")
print(f"Test RMSE: {test_rmse}")

100%|██████████| 100/100 [12:16<00:00,  7.37s/trial, best loss: 11.340236172866323]
Best parameters: {'max_depth': 0, 'learning_rate': 0.29987777148947986, 'n_estimators': 4, 'min_child_weight': 1, 'subsample': 0.9994498008505205, 'colsample_bytree': 0.8784206704721849, 'gamma': 0.38554113069417534}
Test RMSE: 21.078669978269552


<a name="section-4"></a>
# 2 Hyperparameter tuning on Big Datasets
    
Tuning hyperparameters can be computationally exhaustive when the data is large.
    
    
Since one of the most critical aspects in computation is the size of dataset, among others, we will perform an experiment to study the effect of using different fractions of a dataset in the hyperparameter optimization phase and understand the trade off between efficiency and accuracy.

An ideal scenario would be to reduce the training data size without losing too much information.

In [20]:
# Import essential libraries
# Importing essential libraries
from sklearn.model_selection import (
    train_test_split,
    RandomizedSearchCV,
    StratifiedKFold,
)
from sklearn.metrics import (
    make_scorer,
    recall_score,
    accuracy_score,
    precision_score,
    confusion_matrix,
    classification_report,
    f1_score,
    log_loss,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import time
import numpy as np
from scipy.stats import randint as sp_randint
import plotly.express as px
from hyperopt import hp,Trials,tpe, fmin
from hyperopt.pyll.stochastic import sample as hp_sample
from hyperopt.pyll.base import scope

np.random.seed(12345)

## Creating a classification dataset

In [21]:
data = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=2,
    class_sep=0.7,
    weights=[0.7, 0.3],
    random_state=12345,
)

print(f'Number of records in the data:{data[0].shape[0]} and number of columns: {data[0].shape[1]}')


Number of records in the data:1000 and number of columns: 10


We will now break the dataset into train & test and will use test to track the final model performance.
Let's keep 30% of the dataset as test

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    data[0],
    data[1],
    test_size=0.3,
    random_state=12345,
    stratify = data[1]
)

print(f'''
X_train:{X_train.shape[0]} records
X_test:{X_test.shape[0]} records
''')


X_train:700 records
X_test:300 records



Let's wrap what we learned about Hyperopt in a function as we will be running parameter optimization multiple times
The function should perform the following:

* Take training dataset and parameter space as input
* Perform Hyperparameter tuning on the training dataset using K Fold validation
* Retrain a model with learned hyperparameters on complete data and calculate the test loss
* Return loss on the test dataset, time taken for search

In [23]:
np.random.seed(12345)
# Using the same rf_objective function as defined in the Hyperopt example
# Stratify folds object
stratify_folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=12345)
stratify_folds.get_n_splits(X_train, y_train)

## Function would take a training data, fit a model using 3 folds and return loss
def rf_objective(params):

    # Params type and value checks
    if type(params['n_estimators']==float):
        params['n_estimators'] = int(params['n_estimators'])
    if type(params['max_depth']==float):
        params['max_depth'] = int(params['n_estimators'])
    if params['min_samples_split']==0:
        params['min_samples_split'] = 2
    if params['max_depth']==0:
        params['max_depth'] = None
    if params['max_samples']==1.0:
        params['max_samples'] = None

    # Create RF Classifier
    rf_model = RandomForestClassifier(**params)
    loss = []

    # Fit the model on defined folds
    for train_index, test_index in stratify_folds.split(X_frac,y_frac):
        rf_model.fit(X_frac[train_index], y_frac[train_index])
        val_pred = rf_model.predict_proba(X_frac[test_index])
        fold_loss = log_loss(y_true=y_frac[test_index], y_pred=val_pred)
        loss.append(fold_loss)

    # Return the mean loss
    return np.mean(loss)


def hyperopt_run(training_data, test_data, param_space, n_evals):
    # Perform hyperopt search
    start = time.time()
    trials = Trials()

    best = fmin(
        rf_objective,
        hyperopt_param_dist,
        algo=tpe.suggest,
        max_evals=n_evals,
        trials=trials,
        )

    end = time.time()

    if type(best['n_estimators']==float):
        best['n_estimators'] = int(best['n_estimators'])
    if type(best['max_depth']==float):
        best['max_depth'] = int(best['n_estimators'])
    if best['min_samples_split']==0:
        best['min_samples_split'] = 2
    if best['max_depth']==0:
        best['max_depth'] = None
    if best['max_samples']==1.0:
        best['max_samples'] = None

    # Refit the model on complete training data
    rf_best = RandomForestClassifier(**best)
    rf_best.fit(X_train, y_train)

    # Calculate loss on test dataset
    test_pred = rf_best.predict_proba(test_data[0])
    test_loss = log_loss(test_data[1], test_pred)
    print(f'Best params : {best} and test loss:{test_loss}')

    return test_loss, end - start

In [24]:
hyperopt_param_dist = {
    "n_estimators": scope.int(hp.quniform("n_estimators", 8, 512, 16)),  # Ensures integer
    "max_depth": scope.int(hp.quniform("max_depth", 2, 9, 1)),  # Ensures integer, consider handling `None` separately
    "min_samples_split": hp.quniform("min_samples_split", 0,0.1,0.01),  # Ensures integer
    "max_samples": hp.uniform("max_samples", 0.1, 0.9)
}

In the coming section we will do the following:

* Vary the training data fraction from 20% to 100%
* Run 100 evals and optimize parameters using Hyperopt
* Record and compare test loss & tuning time for each instance

In [25]:
np.random.seed(12345)
loss_time_tracker = {"loss": [], "time": []}

# Set the number of evals
max_evals = 10  # update this value to 100 (will take some time to complete execution)

for frac in [0.2, 0.4, 0.6, 0.8, 1]:
    # Take a fraction of dataset
    index = np.random.randint(len(X_train), size=int(frac * len(X_train)))
    X_frac, y_frac = X_train[index], y_train[index]
    loss, elapsed_time = hyperopt_run(
        (X_frac, y_frac), (X_test, y_test), hyperopt_param_dist, max_evals
    )
    loss_time_tracker["loss"].append(loss)
    loss_time_tracker["time"].append(elapsed_time)

100%|██████████| 10/10 [00:10<00:00,  1.06s/trial, best loss: 0.40745719112121]
Best params : {'max_depth': 128, 'max_samples': 0.8550751216462612, 'min_samples_split': 0.04, 'n_estimators': 128} and test loss:0.4598287778984768
100%|██████████| 10/10 [00:12<00:00,  1.22s/trial, best loss: 0.438849336368575]
Best params : {'max_depth': 224, 'max_samples': 0.5731337684506291, 'min_samples_split': 0.02, 'n_estimators': 224} and test loss:0.44044082998431233
100%|██████████| 10/10 [00:16<00:00,  1.62s/trial, best loss: 0.3405728891475615]
Best params : {'max_depth': 464, 'max_samples': 0.8806628818906026, 'min_samples_split': 0.03, 'n_estimators': 464} and test loss:0.44068068604795657
100%|██████████| 10/10 [00:09<00:00,  1.05trial/s, best loss: 0.34956639137822365]
Best params : {'max_depth': 144, 'max_samples': 0.7628515971384834, 'min_samples_split': 0.03, 'n_estimators': 144} and test loss:0.44391382764381754
100%|██████████| 10/10 [00:10<00:00,  1.10s/trial, best loss: 0.24910903912

In [16]:
!pip install plotly
import plotly.graph_objects as go
import plotly.subplots as py



In [26]:
# Visualize different configuration spaces (q)
fig = py.make_subplots(
    rows=1, cols=2, subplot_titles=["Test Loss", "Computation Time"],
)

trace_1 = go.Bar(
    name=f"Test Loss",
    y=loss_time_tracker["loss"],
    x=[0.2,0.4,0.6,0.8,1],
    orientation="v",
)
trace_2 = go.Bar(
    name=f"Computation Time",
    y=loss_time_tracker["time"],
    x=[0.2,0.4,0.6,0.8,1],
    orientation="v",
)

fig.append_trace(trace_1, 1, 1)
fig.append_trace(trace_2, 1, 2)

fig["layout"].update(
    title="Test loss and Computation time for different fractions of training dataset",
    legend_orientation="h",
    height=480,
    width=900,
)
fig