<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/analytics_vidhya/ml_summer_training_2022/notebooks/04_lightgbm_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade lightgbm
!pip install --upgrade optuna

In [42]:
import os
import gc
import warnings
import subprocess

gc.enable()
warnings.filterwarnings('ignore')

import scipy.stats as st
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

import lightgbm
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score

In [43]:
#remove cell to run future versions
# Guard against silently running with library versions other than the ones
# the recorded outputs were produced with.
assert optuna.__version__ == '2.10.1', (
    f'Change in Optuna version (found {optuna.__version__}). '
    f'Original notebook version: 2.10.1')
assert lightgbm.__version__ == '3.3.2', (
    f'Change in LightGBM version (found {lightgbm.__version__}). '
    f'Original notebook version: 3.3.2')

In [44]:
# Single seed reused for the baseline split, the CV folds, the LightGBM
# `random_state` and the Optuna sampler.
SEED = 23
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seed NumPy's legacy global random generator.
np.random.seed(SEED)

In [45]:
#Check GPU availability
# Probe by trying to run `nvidia-smi`; any failure to run it (missing binary,
# non-zero exit status, ...) is treated as "no GPU available".
try:
    subprocess.check_output('nvidia-smi')
except Exception:
    GPU = False
else:
    GPU = True

print(f'GPU available: {GPU}')

GPU available: False


**Data setup**

In [46]:
# Base URL of the competition data folder in the GitHub repository.
data_url = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/analytics_vidhya/ml_summer_training_2022/data/'

# Three pre-processed variants of the same train/test data are loaded so each
# can be tuned and evaluated separately below.

# Variant 1: 'all_features'
train_all = pd.read_csv(data_url + 'processed/all_features/train.csv')
test_all = pd.read_csv(data_url + 'processed/all_features/test.csv')

# Variant 2: 'original_filled'
train_original = pd.read_csv(data_url + 'processed/original_filled/train.csv')
test_original = pd.read_csv(data_url + 'processed/original_filled/test.csv')

# Variant 3: 'selected_features'
train_selected = pd.read_csv(data_url + 'processed/selected_features/train.csv')
test_selected = pd.read_csv(data_url + 'processed/selected_features/test.csv')

In [47]:
# Target labels: the 'loan_default' column of the raw training file.
# NOTE(review): assumes the processed train files keep the raw file's row
# order, since the target is paired with them positionally — confirm.
target = pd.read_csv(data_url + 'raw/train.csv', 
                     usecols=['loan_default']).iloc[:, 0] #converted to series

# 'loan_id' column of the raw test file; used as the key column of the
# submission files written at the end of the notebook.
test_index = pd.read_csv(data_url + 'raw/test.csv', 
                         usecols=['loan_id']).iloc[:, 0]

**Categorical data conversion**

In [48]:
# Columns to be treated as categoricals. They are cast to pandas 'category'
# dtype so the modelling functions can pick them up via
# `select_dtypes(include='category')`.
categorical_features = ['proof_submitted', 'last_delinq_none', 
                        'education_fill_1', 'education_fill_2', 
                        'age_range', 'loan_amount_range', 
                        'asset_cost_range', 'no_of_loans_range', 
                        'no_of_curr_loans_range', 'proof_is_aadhar']

# The all-features frames are indexed with the full list, so every listed
# column must be present in them.
train_all[categorical_features] = train_all[categorical_features].astype('category')
test_all[categorical_features] = test_all[categorical_features].astype('category')

In [49]:
# Only the categorical columns that actually exist in the 'original' variant.
# The list is derived from the train frame and applied to both train and test.
cat_cols = [f for f in train_original.columns if f in categorical_features]
train_original[cat_cols] = train_original[cat_cols].astype('category')
test_original[cat_cols] = test_original[cat_cols].astype('category')

In [50]:
# Same conversion for the 'selected' variant. Note that `cat_cols` is
# re-bound here and overwrites the list built for the 'original' variant.
cat_cols = [f for f in train_selected.columns if f in categorical_features]
train_selected[cat_cols] = train_selected[cat_cols].astype('category')
test_selected[cat_cols] = test_selected[cat_cols].astype('category')

# Baselines

In [51]:
def fit_baseline(X, y):
    """Fit a default LGBMClassifier on a hold-out split and print its report.

    The data is split 80/20 (shuffled, stratified on ``y``, seeded with
    ``SEED``), a classifier with default hyperparameters is fitted on the
    training part, and the classification report for the validation part is
    printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns of dtype 'category' are passed to LightGBM
        as categorical features.
    y : pandas.Series
        Target labels aligned with the rows of ``X``.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    # Stratify on the labels that were passed in, not on the global `target`,
    # so the function also works for data other than the notebook's globals.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=y, random_state=SEED)
    
    features = list(X.columns)
    cat_features = list(X.select_dtypes(include='category').columns)
    baseline = LGBMClassifier()
    baseline.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        feature_name=features,
        categorical_feature=cat_features,
        verbose=False)
    
    preds = baseline.predict(X_val)
    print(classification_report(y_val, preds))

**All features**

In [52]:
fit_baseline(train_all, target)

              precision    recall  f1-score   support

           0       0.61      0.78      0.69       840
           1       0.45      0.27      0.33       560

    accuracy                           0.57      1400
   macro avg       0.53      0.52      0.51      1400
weighted avg       0.55      0.57      0.55      1400



**Original features**

In [53]:
fit_baseline(train_original, target)

              precision    recall  f1-score   support

           0       0.62      0.80      0.70       840
           1       0.47      0.27      0.35       560

    accuracy                           0.59      1400
   macro avg       0.55      0.53      0.52      1400
weighted avg       0.56      0.59      0.56      1400



**Selected features**

In [54]:
fit_baseline(train_selected, target)

              precision    recall  f1-score   support

           0       0.62      0.81      0.70       840
           1       0.47      0.25      0.33       560

    accuracy                           0.59      1400
   macro avg       0.54      0.53      0.51      1400
weighted avg       0.56      0.59      0.55      1400



# Hyperparameter tuning

In [55]:
def eval_f1_macro(y_true, y_pred):
    """Custom evaluation metric: macro-averaged F1 on rounded predictions.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted scores; rounded to the nearest integer before scoring.

    Returns
    -------
    tuple
        ``('eval_f1_macro', score, True)``. The trailing ``True`` is
        presumably LightGBM's "higher is better" flag — verify against the
        LightGBM custom-metric documentation.
    """
    hard_labels = np.round(y_pred)
    macro_f1 = f1_score(y_true, hard_labels, average='macro')
    return ('eval_f1_macro', macro_f1, True)

In [74]:
def objective(trial, data, base_params):
    """Optuna objective: mean 5-fold macro-F1 of an LGBMClassifier.

    Samples one set of hyperparameters from the search space below, fits a
    classifier on each training fold of a seeded 5-fold stratified CV (with
    early stopping on the validation fold), and scores the validation fold
    with macro-F1.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    data : tuple
        ``(X, y)`` — feature DataFrame and target Series.
    base_params : dict
        Fixed LGBMClassifier parameters, merged with the sampled ones.

    Returns
    -------
    float
        Mean macro-F1 over the five validation folds.
    """
    X, y = data
    features = list(X.columns)
    cat_features = list(X.select_dtypes(include='category').columns)
    
    #Defining hyperparameter search space
    # Only 'gbdt' boosting is searched; it is still sampled through the trial
    # so that it shows up in `study.best_params`.
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.4, step=0.01),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt']),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.01),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.01),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int(
            'min_child_samples', 0, 1000, step=5),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 15, step=0.01),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.5, 0.99, step=0.01),
        'scale_pos_weight': trial.suggest_float(
            'scale_pos_weight', 1, 1.5, step=0.01),
        # A stray trailing comma used to turn this value into a 1-tuple;
        # it must be a plain float.
        'subsample': trial.suggest_float('subsample', 0.5, 0.99, step=0.01),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 50),
    }
    
    #cross-validating for each trial
    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # The split indices are positional, so use .iloc for X as well as y;
        # .loc would only be correct for a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**base_params, **param_grid)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='average_precision',
            early_stopping_rounds=50,
            feature_name=features,
            categorical_feature=cat_features,
            verbose=False)
        preds = model.predict(X_val)
        scores.append(f1_score(y_val, preds, average='macro'))
    
    return np.mean(scores)

In [75]:
def tune_params(data, base_params, n_trials=10, direction='maximize'):
    """Run an Optuna study over `objective` and return the finished study.

    Parameters
    ----------
    data : tuple
        ``(X, y)`` forwarded unchanged to `objective`.
    base_params : dict
        Fixed model parameters forwarded unchanged to `objective`.
    n_trials : int, default 10
        Number of trials to run.
    direction : str, default 'maximize'
        Optimisation direction passed to `optuna.create_study`.

    Returns
    -------
    optuna.study.Study
        The study after `n_trials` trials.
    """
    def _run_trial(trial):
        # Bind the dataset and fixed parameters; Optuna only supplies `trial`.
        return objective(trial, data, base_params)

    sampler = TPESampler(seed=SEED)
    # NOTE(review): `objective` does not report intermediate values, so this
    # pruner presumably never prunes a trial — confirm.
    pruner = HyperbandPruner()

    study = optuna.create_study(
        sampler=sampler, pruner=pruner, direction=direction)
    study.optimize(func=_run_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [76]:
def evaluate_model(data, model_params, verbose=True):
    """Cross-validate an LGBMClassifier and return majority-vote test predictions.

    For each fold of a seeded 5-fold stratified CV a model is fitted on the
    training fold with early stopping on the validation fold. Each fold's
    model predicts the validation fold (for scoring) and the full test set.
    The final test prediction for each row is the mode of the five fold
    predictions.

    Parameters
    ----------
    data : tuple
        ``(X, X_test, y)`` — training features, test features, training target.
    model_params : dict
        Keyword arguments for `LGBMClassifier`.
    verbose : bool, default True
        If True, display the per-fold score table. The average scores are
        printed regardless of this flag.

    Returns
    -------
    numpy.ndarray
        1-D array with one predicted label per row of ``X_test``.
    """
    preds_test = []
    scores_f1 = [] #validation set macro-f1 scores
    scores_acc = [] #validation set accuracy
    
    X, X_test, y = data
    features = list(X.columns)
    cat_features = list(X.select_dtypes(include='category').columns)
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # The split indices are positional, so use .iloc for X as well as y;
        # .loc would only be correct for a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        model = LGBMClassifier(**model_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='average_precision',
            early_stopping_rounds=50,
            feature_name=features,
            categorical_feature=cat_features,
            verbose=False)
        
        preds_val = model.predict(X_val)
        preds_test.append(model.predict(X_test))
        
        scores_acc.append(accuracy_score(y_val, preds_val))
        scores_f1.append(f1_score(y_val, preds_val, average='macro'))
    
    if verbose:
        scores_df = pd.DataFrame.from_dict({
            'Accuracy': scores_acc,
            'Macro-f1': scores_f1
        })
        scores_df.index.name = 'Fold'
        display(scores_df)
    
    print(f'Average Accuracy = {np.mean(scores_acc):.4f} (with std = {np.std(scores_acc):.4f})')
    print(f'Average Macro-f1 = {np.mean(scores_f1):.4f} (with std = {np.std(scores_f1):.4f})\n')
    
    # One column per fold; the row-wise mode is the majority vote across folds.
    preds_test = st.mode(np.column_stack(preds_test), axis=1).mode
    return preds_test.ravel()

In [77]:
def run_experiment(data, n_trials=5):
    """Tune hyperparameters, then cross-validate with the best ones.

    Parameters
    ----------
    data : tuple
        ``(X, X_test, y)`` — training features, test features, training target.
    n_trials : int, default 5
        Number of Optuna trials for the tuning stage.

    Returns
    -------
    numpy.ndarray
        Test-set predictions returned by `evaluate_model`.
    """
    X, X_test, y = data

    # Parameters held fixed during tuning; the sampled ones are layered on top.
    base_params = dict(
        objective='binary',
        n_estimators=2000,
        early_stopping_round=50,
        extra_trees=True,
        max_cat_to_onehot=5,
        device_type='gpu' if GPU else 'cpu',
        random_state=SEED,
    )

    print('---------------Hyperparameter tuning---------------')
    study = tune_params(
        data=(X, y),
        base_params=base_params,
        n_trials=n_trials,
        direction='maximize')
    print(f'Best trial: {study.best_trial.number} -> Best value (f1-macro): {study.best_value:.5f}')
    print('Best hyperparameters:')
    best_params = study.best_params
    for name, value in best_params.items():
        print(f'{name:20} - {value}')

    # Tuned values override the fixed base parameters on key collisions.
    model_params = {**base_params, **best_params}
    print('-----------------Cross-validation------------------')
    return evaluate_model(
        data=(X, X_test, y),
        model_params=model_params)

In [78]:
%%time
# Tune with 500 Optuna trials and cross-validate on the all-features variant;
# the result holds the test-set predictions.
preds_all = run_experiment((train_all, test_all, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 449 -> Best value (f1-macro): 0.56630
Best hyperparameters:
learning_rate        - 0.17
boosting_type        - gbdt
reg_alpha            - 134.88
reg_lambda           - 17.03
num_leaves           - 1136
max_depth            - 8
min_child_samples    - 400
min_split_gain       - 2.06
colsample_bytree     - 0.87
scale_pos_weight     - 1.02
subsample            - 0.98
subsample_freq       - 14
-----------------Cross-validation------------------


Unnamed: 0_level_0,Accuracy,Macro-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.5929,0.5801
1,0.56,0.5579
2,0.5779,0.5705
3,0.5679,0.5603
4,0.5764,0.5626


Average Accuracy = 0.5750 (with std = 0.0110)
Average Macro-f1 = 0.5663 (with std = 0.0081)

CPU times: user 9min 46s, sys: 5.96 s, total: 9min 52s
Wall time: 5min 47s


In [79]:
%%time
# Tune with 500 Optuna trials and cross-validate on the original-features
# variant; the result holds the test-set predictions.
preds_original = run_experiment((train_original, test_original, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 406 -> Best value (f1-macro): 0.56650
Best hyperparameters:
learning_rate        - 0.16
boosting_type        - gbdt
reg_alpha            - 152.66
reg_lambda           - 66.36
num_leaves           - 1181
max_depth            - 12
min_child_samples    - 675
min_split_gain       - 0.48
colsample_bytree     - 0.59
scale_pos_weight     - 1.01
subsample            - 0.91
subsample_freq       - 11
-----------------Cross-validation------------------


Unnamed: 0_level_0,Accuracy,Macro-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.585,0.5725
1,0.5757,0.5642
2,0.5879,0.5684
3,0.5836,0.5642
4,0.5843,0.5631


Average Accuracy = 0.5833 (with std = 0.0041)
Average Macro-f1 = 0.5665 (with std = 0.0035)

CPU times: user 9min 59s, sys: 7.66 s, total: 10min 7s
Wall time: 5min 52s


In [80]:
%%time
# Tune with 500 Optuna trials and cross-validate on the selected-features
# variant; the result holds the test-set predictions.
preds_selected = run_experiment((train_selected, test_selected, target), 500)

---------------Hyperparameter tuning---------------
Best trial: 480 -> Best value (f1-macro): 0.56454
Best hyperparameters:
learning_rate        - 0.11
boosting_type        - gbdt
reg_alpha            - 122.88
reg_lambda           - 40.94
num_leaves           - 1406
max_depth            - 11
min_child_samples    - 575
min_split_gain       - 1.74
colsample_bytree     - 0.56
scale_pos_weight     - 1.06
subsample            - 0.86
subsample_freq       - 31
-----------------Cross-validation------------------


Unnamed: 0_level_0,Accuracy,Macro-f1
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.5771,0.5726
1,0.5621,0.5582
2,0.5743,0.5693
3,0.5779,0.5638
4,0.5679,0.5588


Average Accuracy = 0.5719 (with std = 0.0060)
Average Macro-f1 = 0.5645 (with std = 0.0057)

CPU times: user 7min 36s, sys: 5.74 s, total: 7min 42s
Wall time: 4min 34s


# Generating submission files

In [81]:
# Submission for the all-features variant: one row per test loan_id with its
# predicted loan_default, written without the DataFrame index.
sub_all = pd.DataFrame({
    'loan_id': test_index,
    'loan_default': preds_all
})
sub_all.to_csv('04_sub_all.csv', index=False)

In [82]:
# Submission for the original-features variant: one row per test loan_id with
# its predicted loan_default, written without the DataFrame index.
sub_original = pd.DataFrame({
    'loan_id': test_index,
    'loan_default': preds_original
})
sub_original.to_csv('04_sub_original.csv', index=False)

In [83]:
# Submission for the selected-features variant: one row per test loan_id with
# its predicted loan_default, written without the DataFrame index.
sub_selected = pd.DataFrame({
    'loan_id': test_index,
    'loan_default': preds_selected
})
sub_selected.to_csv('04_sub_selected.csv', index=False)