<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/analytics_vidhya/jobathon_aug22/notebooks/02_xgboost_optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [27]:
%%capture
!pip install --upgrade xgboost
!pip install --upgrade optuna

In [28]:
import os
import gc
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

import xgboost
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from optuna.integration import XGBoostPruningCallback
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [29]:
#remove cell to run future versions
#guard: stop early if the installed libraries differ from the ones this notebook was written against
assert optuna.__version__ == '2.10.1', (
    'Change in Optuna version. Original notebook version: 2.10.1')
assert xgboost.__version__ == '1.6.1', (
    'Change in XGBoost version. Original notebook version: 1.6.1')

In [30]:
#single seed reused below by KFold, the Optuna sampler and XGBoost
SEED = 23
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = f'{SEED}'

In [31]:
#Check GPU availability
#any failure to run `nvidia-smi` (missing binary, non-zero exit, ...) is treated as "no GPU"
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    GPU = False
else:
    GPU = True

print(f'GPU available: {GPU}')

GPU available: True


**Data**

In [32]:
data_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/jobathon_aug22/data/'

#processed feature tables
train = pd.read_csv(f'{data_url}processed/train_proc.csv')
test = pd.read_csv(f'{data_url}processed/test_proc.csv')

#campaign ids of the test rows, read from the raw file (used for the submission files)
test_index_url = f'{data_url}raw/test.csv'
test_index = pd.read_csv(test_index_url, usecols=['campaign_id'])['campaign_id']

#target column, read from the raw training file
#NOTE(review): assumes raw/train.csv rows are in the same order as train_proc.csv - confirm
target_url = f'{data_url}raw/train.csv'
target = pd.read_csv(target_url, usecols=['click_rate'])['click_rate']

In [33]:
#every column of the processed test table is a model feature
features = test.columns.tolist()

#features kept as numeric
num_features = [
    'subject_len', 'body_len', 'mean_paragraph_len',
    'no_of_CTA', 'mean_CTA_len', 'is_image', 'is_quote',
    'is_emoticons',
]

#everything else is handled as categorical
cat_features = [col for col in features if col not in num_features]

In [34]:
#XGBoost's native categorical support works on the integer category codes, so
#train and test must share one category list per column; casting each frame on
#its own yields different codes whenever the two frames hold different value sets
shared_cat_dtypes = {
    col: pd.CategoricalDtype(
        pd.concat([train[col], test[col]], ignore_index=True)
        .astype('category').cat.categories)
    for col in cat_features
}
train = train.astype(shared_cat_dtypes)
test = test.astype(shared_cat_dtypes)

The feature selection below is taken from the [EDA notebook](https://github.com/stiwari-ds/data-science-competitions/blob/main/analytics_vidhya/jobathon_aug22/notebooks/01_eda.ipynb).

In [35]:
#drop 're_category' from the full feature list; built with a comprehension
#(not list.remove) so that re-running this cell does not raise ValueError
features = [f for f in features if f != 're_category']

#smaller feature subset evaluated as a second experiment
reduced_features = ['sender', 'subject_len', 'body_len', 
                    'is_weekend', 'times_of_day', 're_category',
                    'product', 'no_of_CTA', 'is_personalised',
                    'is_urgency', 'target_audience']

# Hyperparameter tuning

In [36]:
def objective(trial, data, base_params):
    """Optuna objective: mean 5-fold validation RMSE of one sampled configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and, through the pruning
        callback, to receive the per-round 'validation_0-rmse' values.
    data : tuple
        ``(X, y)``: feature DataFrame and target Series with matching row order.
    base_params : dict
        Fixed XGBRegressor keyword arguments; must not share keys with the
        sampled search space because both are unpacked into the constructor.

    Returns
    -------
    float
        Mean RMSE over the 5 validation folds (lower is better).
    """
    X, y = data

    #search space - sampled once per trial and reused for all folds
    param_grid = {
        'objective': trial.suggest_categorical(
            'objective', ['reg:squarederror', 'reg:squaredlogerror']),
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.01, 0.4, step=0.01),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5, 1.0, step=0.05),
        'colsample_bylevel': trial.suggest_float(
            'colsample_bylevel', 0.5, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float(
            'colsample_bynode', 0.5, 1.0, step=0.05),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 10),
        'gamma': trial.suggest_float('gamma', 0, 20),
        'alpha': trial.suggest_float('alpha', 1e-5, 1e2, log=True),
        'lambda': trial.suggest_float('lambda', 1e-5, 1e2, log=True)
    }

    fold_rmse = []
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        #KFold yields positional indices -> select with .iloc for X as well as y
        #(.loc would only be correct for a default 0..n-1 index)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        model = XGBRegressor(
            **base_params, 
            **param_grid,
            callbacks=[XGBoostPruningCallback(
                trial=trial, 
                observation_key='validation_0-rmse')]
        )
        #the validation fold is the only eval_set, hence the 'validation_0' key above
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        preds = model.predict(X_val)
        #sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, preds)))

    return np.mean(fold_rmse)

In [37]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    """Run an Optuna study over `objective` and return the finished study.

    Parameters
    ----------
    data : tuple
        ``(X, y)`` forwarded unchanged to `objective`.
    base_params : dict
        Fixed model arguments forwarded unchanged to `objective`.
    n_trials : int, default 10
        Number of trials to run.
    direction : str, default 'maximize'
        Optimisation direction handed to `optuna.create_study`.
        NOTE(review): `objective` returns an RMSE, so callers should pass
        'minimize'; the default would select the worst configuration.

    Returns
    -------
    optuna.study.Study
        Study after `n_trials` trials (TPE sampler seeded with SEED,
        Hyperband pruner).
    """
    def _score(trial):
        #bind the dataset and fixed parameters so Optuna only supplies the trial
        return objective(trial, data, base_params)

    study = optuna.create_study(
        direction=direction,
        sampler=TPESampler(seed=SEED),
        pruner=HyperbandPruner(),
    )
    study.optimize(func=_score, n_trials=n_trials, gc_after_trial=True)

    return study

# Cross-validation

In [38]:
def evaluate_model(data, model_params, verbose=True):
    """5-fold cross-validate an XGBRegressor and average its test predictions.

    Parameters
    ----------
    data : tuple
        ``(X, X_test, y)``: training features, test features and training
        target; X and y must have matching row order.
    model_params : dict
        Keyword arguments for XGBRegressor.
    verbose : bool, default True
        If True, display the per-fold RMSE / R2 table. The average scores
        are printed regardless of this flag.

    Returns
    -------
    numpy.ndarray
        Test-set predictions averaged over the 5 fold models.
    """
    preds_test = []
    scores_r2 = [] #validation set r2-scores
    scores_rmse = [] #validation set RMSE loss
    
    X, X_test, y = data
    
    cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        #KFold yields positional indices -> select with .iloc for X as well as y
        #(.loc would only be correct for a default 0..n-1 index)
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = XGBRegressor(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=0
        )
        
        preds_val = model.predict(X_val)
        preds_test.append(model.predict(X_test))
        
        #sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no longer accept
        scores_rmse.append(np.sqrt(mean_squared_error(y_val, preds_val)))
        scores_r2.append(r2_score(y_val, preds_val))
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'RMSE': scores_rmse,
            'R2': scores_r2
        })
        scores_df.index.name = 'Fold'
        display(scores_df)
    
    print(f'Average RMSE = {np.mean(scores_rmse):.4f} (with std = {np.std(scores_rmse):.4f})')
    print(f'Average R2-score = {np.mean(scores_r2):.4f} (with std = {np.std(scores_r2):.4f})\n')
    
    #one column per fold model -> row-wise mean is the ensemble prediction
    preds_test = np.mean(np.column_stack(preds_test), axis=1)
    return preds_test

In [39]:
def run_experiment(data, n_trials=5):
    """Tune XGBoost hyperparameters with Optuna, then cross-validate the best set.

    Parameters
    ----------
    data : tuple
        ``(X, X_test, y)``: training features, test features, training target.
    n_trials : int, default 5
        Number of Optuna trials for the tuning stage.

    Returns
    -------
    numpy.ndarray
        Fold-averaged test predictions returned by `evaluate_model` for the
        best hyperparameters found.
    """
    X, X_test, y = data
    
    base_params = {
        'n_estimators': 2000,
        'eval_metric': 'rmse',
        'early_stopping_rounds': 25,
        'tree_method': 'gpu_hist' if GPU else 'hist', 
        #the feature frames carry `category` columns, so categorical support is
        #needed on CPU too; XGBoost 1.6 provides it for 'hist' as well as 'gpu_hist'
        'enable_categorical': True,
        'max_cat_to_onehot': 7, #internal one-hot encoding
        'verbosity': 1,
        'seed': SEED
    }
    
    print('---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y), 
        base_params=base_params,
        n_trials=n_trials,
        direction='minimize') #the objective is an RMSE
    print(f'Best trial: {study.best_trial.number} -> Best value (RMSE): {study.best_value:.5f}')
    print('Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    
    #tuned values override nothing in base_params (disjoint keys); merge for the final model
    model_params = {**base_params, **study.best_params}
    print('-----------------Cross-validation------------------')
    preds_test = evaluate_model(
        data=(X, X_test, y), 
        model_params=model_params)
    
    return preds_test

In [40]:
%%time
preds_all = run_experiment(
    (train[features], test[features], target), 500)

---------------Hyperparameter tuning---------------
Best trial: 492 -> Best value (RMSE): 0.06835
Best hyperparameters:
objective            - reg:squaredlogerror
learning_rate        - 0.37
max_depth            - 3
subsample            - 1.0
colsample_bytree     - 0.95
colsample_bylevel    - 0.8
colsample_bynode     - 0.75
min_child_weight     - 7
gamma                - 0.005884733628856997
alpha                - 2.30120450386214e-05
lambda               - 93.7169659586593
-----------------Cross-validation------------------


Unnamed: 0_level_0,RMSE,R2
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.068,0.4305
1,0.0689,0.1695
2,0.0715,0.3458
3,0.0683,0.3992
4,0.0651,0.2938


Average RMSE = 0.0683 (with std = 0.0021)
Average R2-score = 0.3278 (with std = 0.0919)

CPU times: user 3min 1s, sys: 6 s, total: 3min 7s
Wall time: 2min 39s


In [41]:
%%time
preds_reduced = run_experiment(
    (train[reduced_features], test[reduced_features], target), 500)

---------------Hyperparameter tuning---------------
Best trial: 391 -> Best value (RMSE): 0.06330
Best hyperparameters:
objective            - reg:squaredlogerror
learning_rate        - 0.37
max_depth            - 3
subsample            - 0.95
colsample_bytree     - 0.95
colsample_bylevel    - 0.8
colsample_bynode     - 0.7
min_child_weight     - 9
gamma                - 0.00042491885046523625
alpha                - 0.001455464664953621
lambda               - 36.54225877593997
-----------------Cross-validation------------------


Unnamed: 0_level_0,RMSE,R2
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0631,0.5092
1,0.0639,0.2868
2,0.0688,0.3935
3,0.0638,0.4759
4,0.0569,0.4596


Average RMSE = 0.0633 (with std = 0.0038)
Average R2-score = 0.4250 (with std = 0.0787)

CPU times: user 2min 59s, sys: 5.32 s, total: 3min 4s
Wall time: 2min 39s


# Generating submissions

In [42]:
#peek at the first five fold-averaged test predictions of the all-features run
preds_all[:5]

array([0.06885812, 0.25991064, 0.19867499, 0.19697079, 0.08483034],
      dtype=float32)

In [43]:
#submission file for the all-features model: campaign id + predicted click rate
sub_all = (
    test_index
    .to_frame(name='campaign_id')
    .assign(click_rate=preds_all)
)
sub_all.to_csv('02_sub_all.csv', index=False)

In [44]:
#submission file for the reduced-features model: campaign id + predicted click rate
sub_reduced = (
    test_index
    .to_frame(name='campaign_id')
    .assign(click_rate=preds_reduced)
)
sub_reduced.to_csv('02_sub_reduced.csv', index=False)