<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/dphi/juniper_networks_global_challenge/notebooks/02_xgboost_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [49]:
import os
import gc
import time
import warnings
import subprocess

# Make sure automatic garbage collection is on and silence all Python warnings
gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
# Use the fully qualified option name: the bare pattern 'precision' is ambiguous
# on pandas releases that also define 'styler.format.precision', where
# pd.set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import xgboost
import optuna
# Restrict Optuna's logger to ERROR-level messages
optuna.logging.set_verbosity(optuna.logging.ERROR)

from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Single seed reused below for NumPy, the KFold splitters, the Optuna sampler
# and the XGBoost models
SEED = 23
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes, not this kernel —
# confirm if hash determinism matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [5]:
#remove cell to run future versions
# Fail fast when the installed library versions differ from the ones this
# notebook expects. Each message names the same version the assert checks.
assert optuna.__version__ == '3.0.2', f'Change in Optuna version from 3.0.2 to {optuna.__version__}'
assert xgboost.__version__ == '1.6.2', f'Change in XGBoost version from 1.6.2 to {xgboost.__version__}'

In [6]:
# Detect a usable NVIDIA GPU: if `nvidia-smi` runs without raising, assume one
# is present; any failure (missing binary, non-zero exit, ...) means CPU only.
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    GPU = False
else:
    GPU = True

print(f'GPU available: {GPU}')

GPU available: True


# Data

In [7]:
# Base URL of the competition data folder in the GitHub repository
data_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/dphi/juniper_networks_global_challenge/data/'

# Read the raw train and test CSVs from that folder
train, test = (
    pd.read_csv(f'{data_url}raw/{split}.csv') for split in ('train', 'test')
)

In [9]:
# Parse the timestamp column and derive integer calendar features from it,
# applying identical steps to the train and test frames (modified in place).
for frame in (train, test):
    frame['observation_timestamp'] = pd.to_datetime(frame['observation_timestamp'])
    stamp = frame['observation_timestamp'].dt
    frame['month'] = stamp.month.astype('int')
    frame['day'] = stamp.day.astype('int')
    frame['day_of_week'] = stamp.day_of_week.astype('int')

In [10]:
# Cast the categorical columns using ONE category list per column, built from
# the values of train and test together. Casting each frame on its own derives
# the categories (and therefore the integer codes) per frame, so the same label
# could end up with different codes in train and test.
for col in ('register__payment_types_accepted', 'store__type_code'):
    shared_dtype = pd.CategoricalDtype(
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [11]:
# Columns removed from both frames before modelling
dropped_columns = [
    'observation_id',
    'observation_timestamp',
    'region__peak_sales_dollar_amt_per_hour_v2',
    'region__peak_returns_dollar_amt_per_hour_v2',
]

# 'Unnamed: 0' is dropped from the training frame only
train = train.drop(columns=['Unnamed: 0'] + dropped_columns)
test = test.drop(columns=dropped_columns)

In [21]:
# Name of the column to predict
TARGET = 'register__sales_dollar_amt_this_hour'
# Model inputs: every column that remains in the test frame
features = test.columns.tolist()

# Custom metric

In [22]:
def regression_accuracy(y_meas, y_pred, max_error=20, error_type='relative'):
    '''Negated share of predictions that lie within an error limit.

    Measured and predicted values are compared element by element (by
    position), and the fraction of predictions whose error is strictly below
    `max_error` is computed. The fraction is returned with a minus sign so the
    function can be used as an XGBoost eval metric where lower is better.

    Parameters
    ----------
    y_meas : array-like
        Measured (true) values.
    y_pred : array-like
        Predicted values, same length as `y_meas`.
    max_error : float, default 20
        Error limit: a percentage for 'relative', a raw difference for 'absolute'.
    error_type : {'relative', 'absolute'}, default 'relative'
        'relative' uses 100 * |y_pred - y_meas| / |y_meas|,
        'absolute' uses |y_pred - y_meas|.

    Returns
    -------
    float
        Minus the fraction of predictions within the limit (between -1.0 and 0.0).

    Raises
    ------
    ValueError
        If `error_type` is not one of the two supported values, or the inputs
        are empty.
    '''
    y_meas = np.asarray(y_meas, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_meas.size == 0:
        raise ValueError('regression_accuracy needs at least one observation')

    deviation = np.abs(y_pred - y_meas)

    # OPTION 1: Relative percentage
    if error_type == 'relative':
        # A measured value of 0 gives inf/nan here; both compare as "not within
        # the limit", so such rows simply count as misses.
        with np.errstate(divide='ignore', invalid='ignore'):
            mask = 100.0 * (deviation / np.abs(y_meas)) < max_error
    # OPTION 2: Absolute value
    elif error_type == 'absolute':
        mask = deviation < max_error
    else:
        raise ValueError(
            f"error_type must be 'relative' or 'absolute', got {error_type!r}"
        )

    return -float(np.mean(mask))  # '-' for xgboost eval-metric usage

# Hyperparameter tuning

In [50]:
def objective(trial, data, base_params):
    '''Optuna objective: mean 5-fold CV score of an XGBRegressor for one trial.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report to the pruner.
    data : tuple
        `(X, y)` - feature DataFrame and target Series.
    base_params : dict
        Fixed XGBRegressor keyword arguments, merged with the sampled ones.

    Returns
    -------
    float
        Mean of `regression_accuracy` over the validation folds. That metric is
        a negated accuracy, so lower values are better.
    '''
    X, y = data

    # Search space sampled for this trial
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.005),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.005),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.005),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0, step=0.005),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0, step=0.005),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 15),
        'gamma': trial.suggest_float('gamma', 0, 20, step=0.005),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e3, log=True),
        'lambda': trial.suggest_float('lambda', 1e-5, 1e3, log=True)
    }

    scores = []
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # KFold yields positional indices, so rows are selected with .iloc for
        # both X and y (label-based .loc is only right for a default RangeIndex).
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        # A new pruning callback is attached for every fold; all of them report
        # to the same trial.
        # NOTE(review): steps restart at 0 in each fold, so reports from later
        # folds presumably collide with fold 0's steps — confirm how Optuna
        # treats repeated step reports.
        model = XGBRegressor(
            **base_params,
            **param_grid,
            callbacks=[XGBoostPruningCallback(
                trial=trial,
                observation_key='validation_0-regression_accuracy')]
        )
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        preds = model.predict(X_val)
        scores.append(regression_accuracy(y_val, preds))

    return np.mean(scores)

In [51]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    '''Run an Optuna study over `objective` and return the finished study.

    The study uses a TPE sampler seeded with SEED and a Hyperband pruner.

    Parameters
    ----------
    data : tuple
        `(X, y)` forwarded unchanged to `objective`.
    base_params : dict
        Fixed model parameters forwarded unchanged to `objective`.
    n_trials : int, default 10
        Number of trials to run.
    direction : str, default 'maximize'
        Optimisation direction handed to Optuna. NOTE: `objective` returns a
        negated accuracy, so callers that want the best accuracy have to pass
        'minimize'.

    Returns
    -------
    optuna.study.Study
        The study after optimisation.
    '''
    def _run_trial(trial):
        # Bind the dataset and fixed parameters to the objective
        return objective(trial, data, base_params)

    tpe_sampler = TPESampler(seed=SEED)
    hyperband = HyperbandPruner()
    study = optuna.create_study(
        sampler=tpe_sampler,
        pruner=hyperband,
        direction=direction
    )
    study.optimize(func=_run_trial, n_trials=n_trials, gc_after_trial=True)

    return study

# Cross-validation and experiment setup

In [52]:
def evaluate_model(data, model_params):
    '''Cross-validate an XGBRegressor and collect per-fold test predictions.

    A 5-fold shuffled KFold (seeded with SEED) is run. For every fold a model
    is fitted on the training part with the validation part as eval set, the
    fold's R-Acc and MAE are printed, and predictions for `X_test` are stored.

    Parameters
    ----------
    data : tuple
        `(X, X_test, y)` - training features, test features and training target.
    model_params : dict
        Keyword arguments for `XGBRegressor`.

    Returns
    -------
    list
        One array of `X_test` predictions per fold, in fold order.
    '''
    scores_racc = []
    scores_mae = []
    test_preds = []

    X, X_test, y = data

    cv_start = time.time()
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # KFold yields positional indices, so rows are selected with .iloc for
        # both X and y (label-based .loc is only right for a default RangeIndex).
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        fold_start = time.time()
        model = XGBRegressor(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )

        preds_val = model.predict(X_val)
        # regression_accuracy returns a negated accuracy; it is negated again
        # when printed below.
        racc = regression_accuracy(y_val, preds_val)
        scores_racc.append(racc)
        mae = mean_absolute_error(y_val, preds_val)
        scores_mae.append(mae)
        # Predict on the test features passed in by the caller rather than on
        # the notebook-level `test` frame.
        test_preds.append(model.predict(X_test))

        fold_end = time.time()
        print(f'Fold #{fold}: {model.best_iteration} rounds, '
              f'R-Acc = {-racc:.5f}, MAE = {mae:.5f} [Time: {fold_end - fold_start:.2f}s]')
        _ = gc.collect()

    cv_end = time.time()
    print(f'Average R-Acc = {-np.mean(scores_racc):.5f} +/- {np.std(scores_racc):.5f})')
    print(f'Average MAE = {np.mean(scores_mae):.5f} +/- {np.std(scores_mae):.5f})')
    print(f'[Total time: {cv_end - cv_start:.2f}s]\n')

    return test_preds

In [53]:
def run_experiment(data, n_trials=5):
    '''Tune XGBoost hyperparameters, then cross-validate the best setting.

    Parameters
    ----------
    data : tuple
        `(X, X_test, y)` - training features, test features and training target.
    n_trials : int, default 5
        Number of Optuna trials for the tuning stage.

    Returns
    -------
    list
        Per-fold test predictions as returned by `evaluate_model`.
    '''
    X, X_test, y = data

    # Fixed settings shared by the tuning stage and the final cross-validation.
    # NOTE(review): categorical support is switched on only when a GPU was
    # detected; with category-dtype columns a CPU run presumably fails — confirm.
    base_params = dict(
        objective='reg:squarederror',
        n_estimators=5000,
        booster='gbtree',
        eval_metric=regression_accuracy,
        early_stopping_rounds=100,
        tree_method='gpu_hist' if GPU else 'hist',
        single_precision_histogram=True,
        enable_categorical=GPU,
        predictor='gpu_predictor' if GPU else 'cpu_predictor',
        max_cat_to_onehot=6,
        verbosity=1,
        seed=SEED,
    )

    print('---------------Hyperparameter tuning---------------')
    # The tuned metric is a negated accuracy, hence 'minimize'
    study = tune_params(
        data=(X, y),
        base_params=base_params,
        n_trials=n_trials,
        direction='minimize'
    )
    print(f'Best trial: {study.best_trial.number} -> Best value(R-Acc): {-study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:20} - {value}')

    # Tuned values take precedence over the fixed settings on key clashes
    tuned_params = dict(base_params)
    tuned_params.update(study.best_params)

    print('-----------------Cross-validation------------------')
    return evaluate_model(
        data=(X, X_test, y),
        model_params=tuned_params
    )

In [56]:
%%time
# Tune over 100 Optuna trials, then cross-validate with the best parameters;
# `test_preds` receives one array of test-set predictions per CV fold.
test_preds = run_experiment(
    data=(train[features], test[features], train[TARGET]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 32 -> Best value(R-Acc): 0.86933
Best hyperparameters:
learning_rate        - 0.060000000000000005
max_depth            - 13
subsample            - 0.675
colsample_bytree     - 0.895
colsample_bylevel    - 0.825
colsample_bynode     - 0.94
min_child_weight     - 7
gamma                - 3.68
alpha                - 5.564504561147535e-05
lambda               - 0.28765583798549577
-----------------Cross-validation------------------
Fold #0: 53 rounds, R-Acc = -0.86957, MAE = 113.20236 [Time: 13.50s]
Fold #1: 57 rounds, R-Acc = -0.87578, MAE = 107.85046 [Time: 13.59s]
Fold #2: 55 rounds, R-Acc = -0.87007, MAE = 107.91826 [Time: 12.45s]
Fold #3: 60 rounds, R-Acc = -0.86386, MAE = 100.09152 [Time: 16.50s]
Fold #4: 55 rounds, R-Acc = -0.86737, MAE = 113.52410 [Time: 13.85s]
Average R-Acc = -0.86933 +/- 0.00390)
Average MAE = 108.51734 +/- 4.87473)
[Total time: 70.32s]

CPU times: user 21min 48s, sys: 2.91 s, total: 21min 51s
Wall

In [57]:
# One column of test predictions per CV fold. The column labels are derived
# from the number of folds actually returned rather than a hard-coded 5, so the
# frame still builds if the number of splits changes.
preds_df = pd.DataFrame(
    np.column_stack(test_preds),
    columns=[f'Fold_{i}' for i in range(len(test_preds))]
)
preds_df

Unnamed: 0,Fold_0,Fold_1,Fold_2,Fold_3,Fold_4
0,1736.8906,1676.6162,1774.8201,1900.4858,1901.8218
1,352.7858,352.6450,354.3098,359.2663,350.5804
2,1363.9362,1379.5603,1372.1263,1387.1006,1374.2811
3,264.2657,252.8107,265.6532,322.8358,248.8179
4,351.0677,348.2773,349.5119,357.1149,352.2020
...,...,...,...,...,...
5174,1128.5756,1107.4653,1108.8182,1197.4265,1237.0811
5175,1156.4762,1218.5205,1300.7046,1254.5220,1207.7023
5176,791.9690,799.4627,849.9474,992.3860,846.3983
5177,935.3799,977.1608,942.6738,1077.2567,1089.2401


In [58]:
# Submission built from a single fold's model (the 'Fold_1' column)
fold_1_preds = preds_df['Fold_1']
sub_1 = pd.DataFrame({'prediction': fold_1_preds})
sub_1.to_csv('02_sub_1.csv', index=False)

In [59]:
# Submission that averages the predictions of all fold models row by row
stacked_preds = np.column_stack(test_preds)
avg_preds = stacked_preds.mean(axis=1)
sub_avg = pd.DataFrame({'prediction': avg_preds})
sub_avg.to_csv('02_sub_avg.csv', index=False)