<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/others/devfolio_oracleofdelphi/notebooks/02_lightgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade lightgbm

In [2]:
# --- Standard library ---
import os
import gc
import time
import warnings

# Make sure automatic garbage collection is on and silence every warning
# category for the rest of the session.
gc.enable()
warnings.filterwarnings('ignore')

# --- Data handling ---
import pandas as pd
import numpy as np
# Display settings: never truncate columns, 4 digits of precision in output.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

# --- Hyperparameter search ---
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

# --- Model ---
import lightgbm as lgb
from lightgbm import LGBMClassifier

# --- Validation and metric ---
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Single seed reused for CV splits, the Optuna sampler and the model.
SEED = 23
# NOTE(review): PYTHONHASHSEED set at runtime is only seen by child processes;
# the already-running interpreter fixed its hash seed at startup - confirm
# whether this line is expected to have any effect here.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
# Reproducibility guard: stop early when the installed libraries differ from
# the versions the notebook was originally run with.
# Remove this cell to run the notebook against future versions.
assert optuna.__version__ == '3.0.4', (
    'Change in Optuna version. Original notebook version: 3.0.4'
)
assert lgb.__version__ == '3.3.3', (
    'Change in LightGBM version. Original notebook version: 3.3.3'
)

# Data preparation

In [4]:
# Competition data is read straight from the GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/others/devfolio_oracleofdelphi/data'
train, test = (
    pd.read_csv(f'{DATA_URL}/{split_name}.csv')
    for split_name in ('train', 'test')
)

In [5]:
# Discard the two 'Unnamed' columns from both splits.
# NOTE(review): presumably leftover index columns from earlier CSV exports - confirm.
train = train.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
test = test.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

In [6]:
# Name of the label column and its string -> integer class encoding.
TARGET = 'Credit Score'
label_mapping = {'Standard': 0, 'Good': 1, 'Best': 2}

# Replace the string labels in the training frame with their integer codes.
train = train.assign(**{TARGET: train[TARGET].replace(label_mapping)})

**Feature sets from EDA notebook.**

In [7]:
# Every column of the test frame is a candidate predictor.
all_features = list(test.columns)

# Smallest subset; the MI and ANOVA subsets below both extend it.
top_features = ['Utlization Ratio', 'Credit Inquiries']

# NOTE(review): 'Monthly_Inhand_Salary ' carries a trailing space - presumably
# it mirrors the CSV header exactly; verify before "fixing" it.
mi_features = [
    *top_features,
    'Monthly_Inhand_Salary ',
    'Interest Rate',
    'Changed_Credit_Limit',
    'Outstanding_Debt',
    'Total_EMI_per_month',
]

anova_features = [
    *top_features,
    'Accounts of user',
    'Credit cards user have',
    'Interest Rate',
    'Num_of_Loan',
    'Delay_from_due_date',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
]

**Outlier removal**

In [8]:
# Keep only training rows whose values fall inside every allowed range.
# All conditions are combined into one boolean mask and applied once.
_inlier_mask = (
    (train['Accounts of user'] <= 10)
    & (train['Credit cards user have'] <= 10)
    & (train['Interest Rate'] <= 35)
    & (train['Num_of_Loan'] >= 0) & (train['Num_of_Loan'] < 10)
    & (train['Num_of_Delayed_Payment'] >= 0) & (train['Num_of_Delayed_Payment'] < 30)
    & (train['Changed_Credit_Limit'] >= 0)
    & (train['Total_EMI_per_month'] < 1000)
)

# Re-number rows 0..n-1 after filtering.
train = train[_inlier_mask].reset_index(drop=True)

**Original train set for comparison during modeling**

In [9]:
# Fresh, unfiltered copy of the training data: same column drop and label
# encoding as `train`, but without the outlier filter.
train_og = (
    pd.read_csv(f'{DATA_URL}/train.csv')
    .drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
    .assign(**{TARGET: lambda frame: frame[TARGET].replace(label_mapping)})
)

# Baseline

In [10]:
%%time
# Baseline: GOSS LightGBM with library-default hyperparameters, scored by
# macro-F1 over a stratified 5-fold split of the filtered training set.
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[all_features], train[TARGET]
feature_name = list(X.columns)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # cv.split yields *positional* indices, so both X and y are sliced with
    # .iloc. Using label-based .loc on X only lines up while the frame has a
    # default 0..n-1 index and would otherwise pair the wrong rows with y.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        objective='multiclass',
        num_class=3,
        boosting_type='goss',
        device_type='cpu',
        random_state=SEED
    )
    # Early stopping monitors multi_error on the validation fold.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric='multi_error',
        feature_name=feature_name,
        verbose=0
    )
    val_preds = model.predict(X_val)
    score = f1_score(y_val, val_preds, average='macro')
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) F1-macro = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg F1-macro = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

Fold #0: (41 rounds) F1-macro = 0.88647
Fold #1: (75 rounds) F1-macro = 0.88924
Fold #2: (50 rounds) F1-macro = 0.90198
Fold #3: (26 rounds) F1-macro = 0.89050
Fold #4: (15 rounds) F1-macro = 0.87343

Avg F1-macro = 0.88832 +/- 0.00913

CPU times: user 6.11 s, sys: 164 ms, total: 6.27 s
Wall time: 3.82 s


# Hyperparameter tuning

In [11]:
def objective(trial, data, model):
    """Optuna objective: mean macro-F1 over a stratified 5-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter configuration.
    data : tuple
        ``(X, y)`` - feature DataFrame and target Series with matching row order.
    model : estimator
        Classifier exposing ``set_params``/``fit``/``predict``. It is mutated
        in place: the sampled parameters are set on it and it is refit on
        every fold.

    Returns
    -------
    float
        Average macro-F1 across the validation folds.
    """
    X, y = data
    feature_name = list(X.columns)

    # Search space. The order of suggest_* calls is kept stable so a seeded
    # sampler draws the same sequence of configurations.
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.3, step=0.025),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 2000, step=5),
        'max_depth': trial.suggest_int('max_depth', 3, 13),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 1000, step=2),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 10, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95, step=0.05),
        # GOSS sampling rates; both ranges top out at 0.5, so their sum never exceeds 1.0.
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05)
    }
    # The configuration is identical for every fold, so set it once.
    model.set_params(**param_grid)

    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # cv.split returns positional indices: slice X and y with .iloc so the
        # rows stay aligned regardless of the DataFrame's index labels.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_error',
            early_stopping_rounds=100,
            feature_name=feature_name,
            verbose=False
        )
        val_preds = model.predict(X_val)
        scores.append(f1_score(y_val, val_preds, average='macro'))

    return np.mean(scores)

In [12]:
def tune_params(data, model, n_trials, direction):
    """Run an in-memory Optuna study over ``objective`` and return it.

    Parameters
    ----------
    data : tuple
        ``(X, y)`` forwarded unchanged to ``objective``.
    model : estimator
        Classifier forwarded to ``objective`` (which mutates it).
    n_trials : int
        Number of trials to run.
    direction : str
        Optimisation direction passed to ``optuna.create_study``.

    Returns
    -------
    optuna.study.Study
        The finished study.

    Notes
    -----
    A ``HyperbandPruner`` is attached, but ``objective`` never calls
    ``trial.report``/``trial.should_prune``, so no trial is actually pruned.
    """
    seeded_sampler = TPESampler(
        consider_endpoints=True,
        multivariate=True,
        group=True,
        seed=SEED,
    )
    study = optuna.create_study(
        sampler=seeded_sampler,
        pruner=HyperbandPruner(),
        direction=direction,
    )

    def _score_trial(trial):
        # Bind the fixed data/model so Optuna only has to supply the trial.
        return objective(trial, data, model)

    study.optimize(func=_score_trial, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [13]:
def cross_val_predict(data, model, n_splits=5):
    """Cross-validate ``model`` and predict the test set once per fold.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)`` - training features, training target, test features.
    model : estimator
        Already-configured classifier; it is refit on every fold.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One column of test-set predictions per fold (``fold0``, ``fold1``, ...)
        plus a ``mode`` column holding the row-wise majority vote.

    Side effects
    ------------
    Displays a table of per-fold and average F1 micro/macro/weighted scores.
    """
    scores = {
        'Fold': [str(i) for i in range(n_splits)],
        'F1-micro': [],
        'F1-macro': [],
        'F1-weighted': []
    }
    test_preds = {}

    X, y, X_test = data
    feature_name = list(X.columns)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split returns positional indices: use .iloc for X as well as y so
        # rows stay aligned even when X carries a non-default index.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_error',
            early_stopping_rounds=100,
            feature_name=feature_name,
            verbose=False
        )
        val_preds = model.predict(X_val)
        test_preds[f'fold{fold}'] = model.predict(X_test)

        # Same predictions, three averaging schemes.
        for metric, average in (('F1-micro', 'micro'),
                                ('F1-macro', 'macro'),
                                ('F1-weighted', 'weighted')):
            scores[metric].append(f1_score(y_val, val_preds, average=average))
        _ = gc.collect()

    # Append an 'Avg.' row holding the mean of the per-fold scores.
    scores['Fold'].append('Avg.')
    for metric in ['F1-micro', 'F1-macro', 'F1-weighted']:
        mean_score = np.mean(scores[metric])
        scores[metric].append(mean_score)
    score_df = pd.DataFrame.from_dict(scores).set_index('Fold')
    display(score_df)

    test_preds = pd.DataFrame.from_dict(test_preds)
    # Majority vote across folds; column 0 of mode() is the first mode per row,
    # so on a tie the smallest class label is taken.
    test_preds['mode'] = test_preds.mode(axis=1)[0].astype('int')
    return test_preds

In [14]:
def run_experiment(data, n_trials=10):
    """Tune a GOSS LightGBM classifier, then cross-validate and predict.

    Parameters
    ----------
    data : tuple
        ``(X, y, X_test)`` - training features, training target, test features.
    n_trials : int, default 10
        Number of Optuna trials for the tuning stage.

    Returns
    -------
    pandas.DataFrame
        Test-set predictions as returned by ``cross_val_predict``.

    Side effects
    ------------
    Prints the best trial, its hyperparameters and the time taken by each stage.
    """
    X, y, _ = data

    # Fixed settings; everything else is tuned. n_estimators is only a ceiling,
    # since every fit downstream uses early stopping.
    model = LGBMClassifier(
        objective='multiclass',
        n_estimators=10000,
        boosting_type='goss',
        extra_trees=True,
        verbosity=-1,
        device_type='cpu',
        random_state=SEED,
    )

    # ---- Stage 1: hyperparameter search ----
    print(f'----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        data=(X, y),
        model=model,
        n_trials=n_trials,
        direction='maximize',  # metric: f1-score -> higher is better
    )
    tuning_finished = time.time()
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:15} - {v}')
    print(f'[Time taken: {tuning_finished - tuning_started:.2f}s]\n')

    # ---- Stage 2: CV scores and test predictions with the best config ----
    print(f'-----Cross-validation and prediction-----')
    cv_started = time.time()
    # The model still holds the last trial's parameters; overwrite with the best.
    model.set_params(**study.best_params)
    test_preds = cross_val_predict(data, model)
    cv_finished = time.time()
    print(f'[Time taken: {cv_finished - cv_started:.2f}s]\n')

    return test_preds

**Trial run**

In [15]:
# Show per-trial log lines during the short trial runs below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [16]:
%%time
# Trial run of the full pipeline: outlier-filtered training set, all features,
# only 3 Optuna trials.
tp_trial1 = run_experiment(
    data=(train[all_features], train[TARGET], test[all_features]),
    n_trials=3
)

[32m[I 2022-12-11 07:33:22,384][0m A new study created in memory with name: no-name-90babd33-c2bc-4745-bc11-3edbf0c3325c[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-11 07:33:24,475][0m Trial 0 finished with value: 0.5976734935481789 and parameters: {'learning_rate': 0.17500000000000002, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 580, 'max_depth': 5, 'min_child_samples': 686, 'min_split_gain': 1.67, 'colsample_bytree': 0.65, 'top_rate': 0.35, 'other_rate': 0.25}. Best is trial 0 with value: 0.5976734935481789.[0m
[32m[I 2022-12-11 07:33:26,754][0m Trial 1 finished with value: 0.2635404263130791 and parameters: {'learning_rate': 0.025, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 615, 'max_depth': 9, 'min_child_samples': 980, 'min_split_gain': 8.45, 'colsample_bytree': 0.5, 'top_rate': 0.2, 'other_rate': 0.15000000000000002}. Best is trial 0 with value: 0.5976734935481789.[0m
[32m[I 2022-12-11 07:33:38,327][0m Trial 2 finished with value: 0.8421550038920064 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'max_depth': 13, 'min_child_samples': 140, 'min_spli

Best trial: 2 -> Best value: 0.84216
Best hyperparameters:
learning_rate   - 0.25
reg_alpha       - 125.2
reg_lambda      - 22.1
num_leaves      - 20
max_depth       - 13
min_child_samples - 140
min_split_gain  - 4.22
colsample_bytree - 0.65
top_rate        - 0.45000000000000007
other_rate      - 0.25
[Time taken: 16.16s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8924,0.8372,0.8883
1,0.8789,0.835,0.8748
2,0.8944,0.8596,0.8909
3,0.8855,0.8496,0.882
4,0.8788,0.8294,0.8728
Avg.,0.886,0.8422,0.8818


[Time taken: 5.55s]

CPU times: user 24.7 s, sys: 217 ms, total: 24.9 s
Wall time: 21.7 s


In [17]:
%%time
# Same trial run on the unfiltered training set (train_og).
tp_trial2 = run_experiment(
    data=(train_og[all_features], train_og[TARGET], test[all_features]),
    n_trials=3
)

[32m[I 2022-12-11 07:33:44,129][0m A new study created in memory with name: no-name-7702e115-1659-428b-8cdb-47a0d173f565[0m


----------Hyperparameter tuning----------


[32m[I 2022-12-11 07:33:46,592][0m Trial 0 finished with value: 0.7495582751596368 and parameters: {'learning_rate': 0.17500000000000002, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 580, 'max_depth': 5, 'min_child_samples': 686, 'min_split_gain': 1.67, 'colsample_bytree': 0.65, 'top_rate': 0.35, 'other_rate': 0.25}. Best is trial 0 with value: 0.7495582751596368.[0m
[32m[I 2022-12-11 07:33:47,458][0m Trial 1 finished with value: 0.26350433227207104 and parameters: {'learning_rate': 0.025, 'reg_alpha': 176.8, 'reg_lambda': 177.0, 'num_leaves': 615, 'max_depth': 9, 'min_child_samples': 980, 'min_split_gain': 8.45, 'colsample_bytree': 0.5, 'top_rate': 0.2, 'other_rate': 0.15000000000000002}. Best is trial 0 with value: 0.7495582751596368.[0m
[32m[I 2022-12-11 07:33:49,505][0m Trial 2 finished with value: 0.8399562433480939 and parameters: {'learning_rate': 0.25, 'reg_alpha': 125.2, 'reg_lambda': 22.1, 'num_leaves': 20, 'max_depth': 13, 'min_child_samples': 140, 'min_spl

Best trial: 2 -> Best value: 0.83996
Best hyperparameters:
learning_rate   - 0.25
reg_alpha       - 125.2
reg_lambda      - 22.1
num_leaves      - 20
max_depth       - 13
min_child_samples - 140
min_split_gain  - 4.22
colsample_bytree - 0.65
top_rate        - 0.45000000000000007
other_rate      - 0.25
[Time taken: 5.52s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.886,0.8417,0.8821
1,0.8802,0.8279,0.8757
2,0.8893,0.8436,0.8856
3,0.8761,0.8278,0.871
4,0.8966,0.8588,0.8932
Avg.,0.8856,0.84,0.8815


[Time taken: 2.90s]

CPU times: user 14.9 s, sys: 160 ms, total: 15 s
Wall time: 8.43 s


In [18]:
# Only log errors from here on, so the 100-trial experiments stay quiet.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Exp 1 - clipped data + all features

In [19]:
%%time
# Experiment 1: outlier-filtered training set, all features, 100 trials.
tp_clip_all = run_experiment(
    data=(train[all_features], train[TARGET], test[all_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 14 -> Best value: 0.88297
Best hyperparameters:
learning_rate   - 0.125
reg_alpha       - 0.8
reg_lambda      - 20.5
num_leaves      - 995
max_depth       - 12
min_child_samples - 204
min_split_gain  - 1.25
colsample_bytree - 0.9
top_rate        - 0.4
other_rate      - 0.2
[Time taken: 220.25s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9215,0.8876,0.9206
1,0.9089,0.8787,0.9077
2,0.9176,0.8974,0.9166
3,0.9049,0.8776,0.9044
4,0.9069,0.8736,0.9057
Avg.,0.912,0.883,0.911


[Time taken: 3.25s]

CPU times: user 6min 44s, sys: 8.54 s, total: 6min 53s
Wall time: 3min 43s


# Exp 2 - original data + all features

In [20]:
%%time
# Experiment 2: unfiltered training set (train_og), all features, 100 trials.
tp_og_all = run_experiment(
    data=(train_og[all_features], train_og[TARGET], test[all_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 72 -> Best value: 0.88437
Best hyperparameters:
learning_rate   - 0.3
reg_alpha       - 1.9000000000000001
reg_lambda      - 38.300000000000004
num_leaves      - 890
max_depth       - 9
min_child_samples - 452
min_split_gain  - 0.01
colsample_bytree - 0.8500000000000001
top_rate        - 0.5
other_rate      - 0.35000000000000003
[Time taken: 232.98s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9106,0.8824,0.9101
1,0.913,0.8825,0.9121
2,0.9139,0.8857,0.9133
3,0.904,0.8721,0.9024
4,0.922,0.8991,0.9219
Avg.,0.9127,0.8844,0.912


[Time taken: 3.16s]

CPU times: user 7min 16s, sys: 7.21 s, total: 7min 23s
Wall time: 3min 56s


# Exp 3 - clipped data + MI features

In [21]:
%%time
# Experiment 3: outlier-filtered training set, mi_features subset, 100 trials.
tp_clip_mi = run_experiment(
    data=(train[mi_features], train[TARGET], test[mi_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 72 -> Best value: 0.87491
Best hyperparameters:
learning_rate   - 0.275
reg_alpha       - 1.9000000000000001
reg_lambda      - 56.0
num_leaves      - 810
max_depth       - 12
min_child_samples - 112
min_split_gain  - 0.12
colsample_bytree - 0.8
top_rate        - 0.5
other_rate      - 0.35000000000000003
[Time taken: 164.99s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9215,0.8865,0.9207
1,0.9099,0.8784,0.9088
2,0.9118,0.8856,0.9109
3,0.8904,0.8537,0.8897
4,0.904,0.8703,0.9033
Avg.,0.9075,0.8749,0.9067


[Time taken: 3.24s]

CPU times: user 5min 2s, sys: 7.15 s, total: 5min 9s
Wall time: 2min 48s


# Exp 4 - clipped data + ANOVA features

In [22]:
%%time
# Experiment 4: outlier-filtered training set, anova_features subset, 100 trials.
tp_clip_anova = run_experiment(
    data=(train[anova_features], train[TARGET], test[anova_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 72 -> Best value: 0.88301
Best hyperparameters:
learning_rate   - 0.275
reg_alpha       - 1.9000000000000001
reg_lambda      - 46.0
num_leaves      - 1625
max_depth       - 11
min_child_samples - 308
min_split_gain  - 0.0
colsample_bytree - 0.95
top_rate        - 0.4
other_rate      - 0.35000000000000003
[Time taken: 180.04s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.9225,0.8906,0.9218
1,0.9109,0.8862,0.9101
2,0.9157,0.8944,0.9153
3,0.902,0.8773,0.9018
4,0.902,0.8666,0.9012
Avg.,0.9106,0.883,0.91


[Time taken: 4.32s]

CPU times: user 5min 35s, sys: 6.51 s, total: 5min 41s
Wall time: 3min 4s


# Exp 5 - clipped data + top features

In [23]:
%%time
# Experiment 5: outlier-filtered training set, only the two top_features, 100 trials.
tp_clip_top = run_experiment(
    data=(train[top_features], train[TARGET], test[top_features]),
    n_trials=100
)

----------Hyperparameter tuning----------
Best trial: 61 -> Best value: 0.83936
Best hyperparameters:
learning_rate   - 0.2
reg_alpha       - 12.200000000000001
reg_lambda      - 45.7
num_leaves      - 1275
max_depth       - 11
min_child_samples - 576
min_split_gain  - 8.43
colsample_bytree - 0.8500000000000001
top_rate        - 0.45000000000000007
other_rate      - 0.15000000000000002
[Time taken: 145.97s]

-----Cross-validation and prediction-----


Unnamed: 0_level_0,F1-micro,F1-macro,F1-weighted
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.8953,0.8394,0.892
1,0.8866,0.834,0.8825
2,0.8915,0.8553,0.8879
3,0.8788,0.8334,0.8765
4,0.8855,0.8347,0.8803
Avg.,0.8876,0.8394,0.8839


[Time taken: 2.06s]

CPU times: user 4min 26s, sys: 4.35 s, total: 4min 31s
Wall time: 2min 28s


# Submission files

In [24]:
# Colab-only: mount Google Drive at /content/drive; the submission directory
# defined in the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [29]:
# Output directory for this notebook's submission files (on the mounted Drive).
NOTEBOOK = '02'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/others/devfolio_oracleofdelphi/submissions/nb_{NOTEBOOK}'
# Create the directory (and any missing parents) unless it is already there.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [30]:
def create_submission_files(test_preds, dataset, feature_set, path=SUBMISSION_PATH):
    """Write one submission CSV per column of ``test_preds``.

    Parameters
    ----------
    test_preds : pandas.DataFrame
        Integer class predictions, one column per prediction variant.
    dataset : str
        Tag for the training data used; becomes part of the file name.
    feature_set : str
        Tag for the feature subset used; becomes part of the file name.
    path : str, default SUBMISSION_PATH
        Directory the files are written to.

    Each file is named ``{dataset}_{feature_set}_{column}.csv`` and has a
    single column (``TARGET``) with the integer codes turned back into
    their string labels.
    """
    code_to_label = {0: 'Standard', 1: 'Good', 2: 'Best'}
    for col, encoded_preds in test_preds.items():
        sub = pd.DataFrame({TARGET: encoded_preds.replace(code_to_label)})
        sub.to_csv(f'{path}/{dataset}_{feature_set}_{col}.csv', index=False)

In [31]:
# Submission files for the four experiments trained on the outlier-filtered data.
create_submission_files(tp_clip_all, 'clip', 'all')
create_submission_files(tp_clip_mi, 'clip', 'mi')
create_submission_files(tp_clip_anova, 'clip', 'anova')
create_submission_files(tp_clip_top, 'clip', 'top')

In [32]:
# Submission files for the experiment trained on the unfiltered data.
create_submission_files(tp_og_all, 'og', 'all')