<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/analytics_olympiad22/notebooks/04_lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade lightgbm

In [16]:
import os
import gc
import time
import warnings
import subprocess

gc.enable()
#silence all Python warnings for the rest of the notebook
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
#display every column and use 4 digits of precision when showing frames/arrays
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

In [6]:
#remove cell to run future versions
#Each message states the same version the assertion checks and reports the version actually found
assert optuna.__version__ == '3.0.3', f'Change in Optuna version. Original notebook version: 3.0.3 (found {optuna.__version__})'
assert lgb.__version__ == '3.3.3', f'Change in LightGBM version. Original notebook version: 3.3.3 (found {lgb.__version__})'

In [7]:
#Check GPU availability
#Start from "available" and downgrade if nvidia-smi cannot be run for any reason
HAVE_GPU = True
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    HAVE_GPU = False

print(f'GPU available: {HAVE_GPU}')

GPU available: False


In [8]:
#One seed reused below for numpy, the CV splitters, the Optuna sampler and LightGBM
SEED = 23
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

# Data preparation

In [9]:
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/analytics_olympiad22/data'

#sample submission file, used later as the template for the submission CSVs
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

#processed test set and the two processed variants of the training set
test = pd.read_csv(f'{DATA_URL}/processed/test_proc.csv')
train_clip = pd.read_csv(f'{DATA_URL}/processed/train_clip.csv')
train_full = pd.read_csv(f'{DATA_URL}/processed/train_proc.csv')

In [10]:
#name of the label column read from the training frames and written to the submission files
TARGET = 'OUTCOME'

In [11]:
#features treated as numeric; everything else in `features` is handled as categorical
num_features = [
    'ID_COUNT', 'CREDIT_SCORE', 'ANNUAL_MILEAGE_K', 'DUIS',
    'SPEEDING_VIOLATIONS', 'PAST_ACCIDENTS', 'TOTAL_PAST_INCIDENTS',
]

#all test columns except ID, POSTAL_CODE and ANNUAL_MILEAGE are candidate model inputs
features = [col for col in test.columns
            if col not in ('ID', 'POSTAL_CODE', 'ANNUAL_MILEAGE')]

cat_features = [col for col in features if col not in num_features]

In [12]:
# train_full[cat_features] = train_full[cat_features].astype('category')
# train_clip[cat_features] = train_clip[cat_features].astype('category')
# test[cat_features] = test[cat_features].astype('category')

In [13]:
#Feature subsets compared in the experiments below.
#presumably the columns of the raw dataset (after renaming/encoding) - TODO confirm against the preprocessing notebook
original_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'CREDIT_SCORE', 'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 
                     'CHILDREN', 'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_K', 
                     'SPEEDING_VIOLATIONS', 'DUIS', 'PAST_ACCIDENTS', 'TYPE_OF_VEHICLE']

#contains none of the columns listed in num_features
cat_only_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
                     'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
                     'TYPE_OF_VEHICLE', 'IS_ID_REPEATED', 'CREDIT_SCORE_BINS', 
                     'POSTAL_CODE_REGION', 'ANNUAL_MILEAGE_RANGE', 'HAS_PRIOR_DUIS', 
                     'HAS_PRIOR_SPEEDING_VIOLATIONS', 'HAS_PAST_ACCIDENTS',
                     'HAS_PAST_INCIDENTS']

#based on F-test and Chi2-test
reduced_features = ['GENDER', 'DRIVING_EXPERIENCE', 'POSTAL_CODE_SUBREGION']

#based on mutual information score
mi_features = ['AGE', 'GENDER', 'DRIVING_EXPERIENCE', 'EDUCATION', 'INCOME', 
               'VEHICLE_OWNERSHIP', 'VEHICLE_YEAR', 'MARRIED', 'CHILDREN', 
               'TYPE_OF_VEHICLE', 'ID_COUNT', 'CREDIT_SCORE_BINS', 'POSTAL_CODE_REGION', 
               'POSTAL_CODE_SUBREGION', 'ANNUAL_MILEAGE_RANGE', 'TOTAL_PAST_INCIDENTS', 
               'HAS_PAST_INCIDENTS']

# Baseline

In [18]:
%%time
#Baseline: LightGBM (GOSS) with default hyperparameters on original_features,
#scored by log loss over a 5-fold stratified CV.
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train_full[original_features], train_full[TARGET]
feature_name = list(X.columns)
categorical_feature = [f for f in feature_name if f in cat_features]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    #cv.split yields positional indices, so rows are selected with .iloc for X as well as y
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        objective='binary',
        boosting_type='goss',
        device_type='cpu',
        random_state=SEED
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='binary_logloss',
        feature_name=feature_name,
        categorical_feature=categorical_feature,
        #callback form of early stopping; the early_stopping_rounds/verbose arguments of fit() are deprecated
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )
    val_preds = model.predict_proba(X_val)[:, 1]

    score = log_loss(y_val, val_preds)
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) Logloss = {score:.6f}')
    _ = gc.collect()

print(f'\nAvg Logloss = {np.mean(scores):.6f} +/- {np.std(scores):.6f}\n')

Fold #0: (8 rounds) Logloss = 0.681001
Fold #1: (11 rounds) Logloss = 0.680963
Fold #2: (1 rounds) Logloss = 0.681105
Fold #3: (2 rounds) Logloss = 0.681002
Fold #4: (7 rounds) Logloss = 0.681159

Avg Logloss = 0.681046 +/- 0.000074

CPU times: user 19.1 s, sys: 122 ms, total: 19.2 s
Wall time: 11.5 s


# Hyperparameter tuning

In [19]:
def objective(trial, data, base_params):
    """Optuna objective: mean 5-fold CV log loss of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters in `param_grid`.
    data : tuple
        `(X, y)` - training features (DataFrame) and target (Series).
    base_params : dict
        Fixed LGBMClassifier arguments, merged with the sampled ones.

    Returns
    -------
    float
        Mean validation log loss over the folds (lower is better).
    """
    #NOTE: no intermediate value is reported through trial.report(), so the pruner
    #configured in tune_params never stops a trial early.
    scores = []
    X, y = data
    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step=0.01),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 1000, step=5),
        # 'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 500, step=2),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 15, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.75, 1.5, step=0.05)
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        #cv.split yields positional indices, so rows are selected with .iloc for X as well as y
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = LGBMClassifier(**base_params, **param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            #callback form of early stopping; the early_stopping_rounds/verbose arguments of fit() are deprecated
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
        val_preds = model.predict_proba(X_val)[:, 1]
        scores.append(log_loss(y_val, val_preds))

    return np.mean(scores)

In [21]:
def tune_params(data, base_params, n_trials, direction):
    """Run an Optuna study over `objective` and return the finished study.

    data: `(X, y)` tuple forwarded to `objective`.
    base_params: fixed model arguments forwarded to `objective`.
    n_trials: number of trials to run.
    direction: optimisation direction passed to `optuna.create_study`.
    """
    def _trial_objective(trial):
        #bind the dataset and fixed parameters so Optuna only has to supply the trial
        return objective(trial, data, base_params)

    #seeded TPE sampler so the sequence of suggested hyperparameters is repeatable
    tpe_sampler = TPESampler(seed=SEED)
    hyperband = HyperbandPruner()
    study = optuna.create_study(direction=direction, sampler=tpe_sampler, pruner=hyperband)

    #gc_after_trial: collect garbage after every trial
    study.optimize(_trial_objective, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation and experiment setup

In [22]:
def evaluate_model(data, model_params, verbose=True):
    """Cross-validate an LGBMClassifier and collect out-of-fold and test predictions.

    Parameters
    ----------
    data : tuple
        `(X, y, X_test)` - training features, training target and test features.
    model_params : dict
        Keyword arguments for LGBMClassifier.
    verbose : bool, default True
        Print the per-fold score; the average score is always printed.

    Returns
    -------
    oof_preds : pd.Series
        Positive-class probability for every training row, keyed by row position.
    test_preds : pd.DataFrame
        One column of test probabilities per fold (`fold0`..`fold4`) plus their row-wise `mean`.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #logloss on the validation set of each fold

    X, y, X_test = data
    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        #cv.split yields positional indices, so rows are selected with .iloc for X as well as y
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = LGBMClassifier(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            #callback form of early stopping; the early_stopping_rounds/verbose arguments of fit() are deprecated
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(dict(zip(val_idx, val_preds)))

        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]

        score = log_loss(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold}: ({model.best_iteration_} rounds) Logloss = {score:.6f}')

        _ = gc.collect()

    print(f'\nAvg Logloss = {np.mean(scores):.6f} +/- {np.std(scores):.6f}')

    #folds were visited in shuffled order; restore row order
    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    return oof_preds, test_preds

In [23]:
def run_experiment(data, n_trials=5):
    """Tune hyperparameters with Optuna, then cross-validate the best configuration.

    data: `(X, y, X_test)` tuple; `(X, y)` is used for tuning and the full tuple
        is passed to `evaluate_model`.
    n_trials: number of Optuna trials.

    Returns the `(oof_preds, test_preds)` pair produced by `evaluate_model`.
    """
    X, y, _ = data

    #settings shared by every tuning trial and by the final cross-validated model
    base_params = dict(
        objective='binary',
        n_estimators=10000,
        boosting_type='goss',
        extra_trees=True,
        verbosity=-1,
        device_type='cpu',
        random_state=SEED,
    )

    print('---------------Hyperparameter tuning---------------')
    #logloss -> lower is better
    study = tune_params((X, y), base_params, n_trials, 'minimize')
    print(f'Best trial: {study.best_trial.number} -> Best value(Logloss): {study.best_value:.5f}')
    print('Best hyperparameters:')
    best_params = study.best_params
    for name, value in best_params.items():
        print(f'{name:20} - {value}')

    #tuned values are layered on top of the fixed settings
    model_params = dict(base_params)
    model_params.update(best_params)

    print('-----------------Cross-validation------------------')
    return evaluate_model(data=data, model_params=model_params)

### Trial run

In [24]:
%%time
#show Optuna's per-trial log lines for this short run
optuna.logging.set_verbosity(optuna.logging.INFO)
#3-trial run on train_full / original_features to exercise the whole pipeline
op, tp = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=3
)

[32m[I 2022-11-06 10:49:47,312][0m A new study created in memory with name: no-name-e6e4953a-433b-4715-a91e-c703b7099dbb[0m


---------------Hyperparameter tuning---------------


[32m[I 2022-11-06 10:49:53,028][0m Trial 0 finished with value: 0.6812176390865174 and parameters: {'learning_rate': 0.16, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 295, 'min_child_samples': 110, 'min_split_gain': 10.3, 'colsample_bytree': 0.55, 'top_rate': 0.25, 'other_rate': 0.35000000000000003, 'scale_pos_weight': 1.05}. Best is trial 0 with value: 0.6812176390865174.[0m
[32m[I 2022-11-06 10:49:56,736][0m Trial 1 finished with value: 0.6811322019561752 and parameters: {'learning_rate': 0.01, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 315, 'min_child_samples': 294, 'min_split_gain': 14.68, 'colsample_bytree': 0.9, 'top_rate': 0.1, 'other_rate': 0.15000000000000002, 'scale_pos_weight': 0.95}. Best is trial 1 with value: 0.6811322019561752.[0m
[32m[I 2022-11-06 10:50:01,791][0m Trial 2 finished with value: 0.6811442185514551 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'min_child_samples': 472, 'min_s

Best trial: 1 -> Best value(Logloss): 0.68113
Best hyperparameters:
learning_rate        - 0.01
reg_alpha            - 176.8
reg_lambda           - 177.0
num_leaves           - 315
min_child_samples    - 294
min_split_gain       - 14.68
colsample_bytree     - 0.9
top_rate             - 0.1
other_rate           - 0.15000000000000002
scale_pos_weight     - 0.95
-----------------Cross-validation------------------
Fold #0: (1 rounds) Logloss = 0.681123
Fold #1: (1 rounds) Logloss = 0.681123
Fold #2: (1 rounds) Logloss = 0.681138
Fold #3: (1 rounds) Logloss = 0.681138
Fold #4: (1 rounds) Logloss = 0.681138

Avg Logloss = 0.681132 +/- 0.000007
CPU times: user 34.5 s, sys: 162 ms, total: 34.7 s
Wall time: 19 s


In [25]:
#restrict Optuna logging to errors for the 100-trial experiments that follow
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [26]:
%%time
#Experiment 1: train_full, original_features
op1, tp1 = run_experiment(
    data=(train_full[original_features], train_full[TARGET], test[original_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 73 -> Best value(Logloss): 0.68094
Best hyperparameters:
learning_rate        - 0.19
reg_alpha            - 22.5
reg_lambda           - 151.9
num_leaves           - 215
min_child_samples    - 354
min_split_gain       - 1.35
colsample_bytree     - 0.5
top_rate             - 0.25
other_rate           - 0.15000000000000002
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (17 rounds) Logloss = 0.680838
Fold #1: (25 rounds) Logloss = 0.680912
Fold #2: (9 rounds) Logloss = 0.681052
Fold #3: (14 rounds) Logloss = 0.680839
Fold #4: (5 rounds) Logloss = 0.681071

Avg Logloss = 0.680942 +/- 0.000101
CPU times: user 24min 22s, sys: 5.28 s, total: 24min 27s
Wall time: 12min 52s


In [27]:
%%time
#Experiment 2: train_clip, original_features
op2, tp2 = run_experiment(
    data=(train_clip[original_features], train_clip[TARGET], test[original_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 99 -> Best value(Logloss): 0.68091
Best hyperparameters:
learning_rate        - 0.17
reg_alpha            - 11.9
reg_lambda           - 181.8
num_leaves           - 905
min_child_samples    - 62
min_split_gain       - 1.08
colsample_bytree     - 0.7
top_rate             - 0.35
other_rate           - 0.1
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (2 rounds) Logloss = 0.681083
Fold #1: (10 rounds) Logloss = 0.680983
Fold #2: (9 rounds) Logloss = 0.680846
Fold #3: (11 rounds) Logloss = 0.680812
Fold #4: (8 rounds) Logloss = 0.680822

Avg Logloss = 0.680909 +/- 0.000106
CPU times: user 20min 45s, sys: 4.64 s, total: 20min 50s
Wall time: 10min 58s


In [28]:
%%time
#Experiment 3: train_full, cat_only_features
op3, tp3 = run_experiment(
    data=(train_full[cat_only_features], train_full[TARGET], test[cat_only_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 84 -> Best value(Logloss): 0.68093
Best hyperparameters:
learning_rate        - 0.29000000000000004
reg_alpha            - 106.80000000000001
reg_lambda           - 186.4
num_leaves           - 865
min_child_samples    - 114
min_split_gain       - 0.29
colsample_bytree     - 0.5
top_rate             - 0.45000000000000007
other_rate           - 0.1
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (139 rounds) Logloss = 0.680711
Fold #1: (5 rounds) Logloss = 0.681043
Fold #2: (11 rounds) Logloss = 0.681062
Fold #3: (36 rounds) Logloss = 0.680827
Fold #4: (11 rounds) Logloss = 0.681002

Avg Logloss = 0.680929 +/- 0.000137
CPU times: user 23min 7s, sys: 4.09 s, total: 23min 12s
Wall time: 12min 9s


In [29]:
%%time
#Experiment 4: train_clip, cat_only_features
op4, tp4 = run_experiment(
    data=(train_clip[cat_only_features], train_clip[TARGET], test[cat_only_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 32 -> Best value(Logloss): 0.68092
Best hyperparameters:
learning_rate        - 0.28
reg_alpha            - 49.800000000000004
reg_lambda           - 11.8
num_leaves           - 750
min_child_samples    - 288
min_split_gain       - 2.94
colsample_bytree     - 0.5
top_rate             - 0.25
other_rate           - 0.1
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (7 rounds) Logloss = 0.680989
Fold #1: (3 rounds) Logloss = 0.681026
Fold #2: (8 rounds) Logloss = 0.680899
Fold #3: (10 rounds) Logloss = 0.680794
Fold #4: (8 rounds) Logloss = 0.680916

Avg Logloss = 0.680925 +/- 0.000080
CPU times: user 20min 13s, sys: 3.74 s, total: 20min 17s
Wall time: 10min 39s


In [30]:
%%time
#Experiment 5: train_full, mi_features
op5, tp5 = run_experiment(
    data=(train_full[mi_features], train_full[TARGET], test[mi_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 76 -> Best value(Logloss): 0.68096
Best hyperparameters:
learning_rate        - 0.12
reg_alpha            - 21.1
reg_lambda           - 99.80000000000001
num_leaves           - 290
min_child_samples    - 290
min_split_gain       - 0.78
colsample_bytree     - 0.75
top_rate             - 0.30000000000000004
other_rate           - 0.3
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (33 rounds) Logloss = 0.680832
Fold #1: (4 rounds) Logloss = 0.681023
Fold #2: (20 rounds) Logloss = 0.680963
Fold #3: (16 rounds) Logloss = 0.680899
Fold #4: (6 rounds) Logloss = 0.681087

Avg Logloss = 0.680961 +/- 0.000090
CPU times: user 20min 56s, sys: 4.92 s, total: 21min 1s
Wall time: 11min 5s


In [31]:
%%time
#Experiment 6: train_clip, mi_features
op6, tp6 = run_experiment(
    data=(train_clip[mi_features], train_clip[TARGET], test[mi_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 91 -> Best value(Logloss): 0.68094
Best hyperparameters:
learning_rate        - 0.26
reg_alpha            - 52.5
reg_lambda           - 10.5
num_leaves           - 785
min_child_samples    - 380
min_split_gain       - 1.1
colsample_bytree     - 0.7
top_rate             - 0.15000000000000002
other_rate           - 0.35000000000000003
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (1 rounds) Logloss = 0.681060
Fold #1: (7 rounds) Logloss = 0.680914
Fold #2: (10 rounds) Logloss = 0.680976
Fold #3: (10 rounds) Logloss = 0.680803
Fold #4: (25 rounds) Logloss = 0.680953

Avg Logloss = 0.680941 +/- 0.000084
CPU times: user 20min 55s, sys: 4.83 s, total: 21min
Wall time: 11min 1s


In [32]:
%%time
#Experiment 7: train_full, reduced_features
op7, tp7 = run_experiment(
    data=(train_full[reduced_features], train_full[TARGET], test[reduced_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 71 -> Best value(Logloss): 0.68101
Best hyperparameters:
learning_rate        - 0.15000000000000002
reg_alpha            - 21.200000000000003
reg_lambda           - 81.5
num_leaves           - 175
min_child_samples    - 250
min_split_gain       - 0.5700000000000001
colsample_bytree     - 0.6
top_rate             - 0.15000000000000002
other_rate           - 0.5
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (29 rounds) Logloss = 0.680903
Fold #1: (11 rounds) Logloss = 0.680983
Fold #2: (4 rounds) Logloss = 0.681126
Fold #3: (22 rounds) Logloss = 0.680950
Fold #4: (26 rounds) Logloss = 0.681096

Avg Logloss = 0.681012 +/- 0.000086
CPU times: user 14min 17s, sys: 2.74 s, total: 14min 20s
Wall time: 7min 33s


In [33]:
%%time
#Experiment 8: train_clip, reduced_features
op8, tp8 = run_experiment(
    data=(train_clip[reduced_features], train_clip[TARGET], test[reduced_features]),
    n_trials=100
)

---------------Hyperparameter tuning---------------
Best trial: 61 -> Best value(Logloss): 0.68098
Best hyperparameters:
learning_rate        - 0.26
reg_alpha            - 34.9
reg_lambda           - 160.0
num_leaves           - 895
min_child_samples    - 284
min_split_gain       - 0.28
colsample_bytree     - 0.95
top_rate             - 0.35
other_rate           - 0.3
scale_pos_weight     - 1.0
-----------------Cross-validation------------------
Fold #0: (2 rounds) Logloss = 0.681091
Fold #1: (9 rounds) Logloss = 0.680977
Fold #2: (6 rounds) Logloss = 0.681004
Fold #3: (54 rounds) Logloss = 0.680891
Fold #4: (8 rounds) Logloss = 0.680950

Avg Logloss = 0.680983 +/- 0.000066
CPU times: user 12min 1s, sys: 2.42 s, total: 12min 4s
Wall time: 6min 22s


# Generating submission files

In [34]:
#Colab-only: mount Google Drive so the submission files can be written under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
#Output folder for this notebook's submission files
NOTEBOOK = '04'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/analytics_olympiad22/submissions/nb_{NOTEBOOK}'
#create the folder (and any missing parents) unless it is already there
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [36]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: str):
    """Write one submission CSV per column of `test_preds`.

    Each file is a copy of `sample_sub` with the TARGET column replaced by that
    column's predictions, saved as `{SUBMISSION_PATH}/{expt_num}_{col}.csv`.

    test_preds: one column of test-set predictions per file to write.
    expt_num: experiment label used as the file-name prefix (e.g. '01').
    """
    for col in test_preds.columns:
        sub = sample_sub.copy()
        #assign by position: avoids index alignment silently inserting NaN and
        #raises if the number of predictions differs from the sample submission
        sub[TARGET] = test_preds[col].to_numpy()
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [37]:
#one set of files per experiment; the prefix matches the number in the tpN variable name
create_submission_files(tp1, '01')
create_submission_files(tp2, '02')
create_submission_files(tp3, '03')
create_submission_files(tp4, '04')
create_submission_files(tp5, '05')
create_submission_files(tp6, '06')
create_submission_files(tp7, '07')
create_submission_files(tp8, '08')