<a href="https://colab.research.google.com/github/stiwari-ds/data-science-competitions/blob/main/machinehack/subscriber_prediction_hackathon/notebooks/03_lgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
%%capture
!pip install --upgrade optuna
!pip install --upgrade lightgbm

In [2]:
import os
import gc
import time
import warnings

gc.enable()
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)
np.set_printoptions(precision=4)

import optuna
from optuna.samplers import TPESampler
from optuna.pruners import HyperbandPruner

import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

# Single seed reused below for NumPy, the StratifiedKFold splits, the Optuna sampler and LightGBM.
SEED = 23
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

In [3]:
# Guard against silent library drift: the outputs recorded in this notebook were
# produced with exactly these versions. Remove this cell to run future versions.
assert optuna.__version__ == '3.0.3', (
    f'Change in Optuna version. Original notebook version: 3.0.3 (installed: {optuna.__version__})'
)
assert lgb.__version__ == '3.3.3', (
    f'Change in LightGBM version. Original notebook version: 3.3.3 (installed: {lgb.__version__})'
)

In [4]:
# Base URL of the competition data folder in the GitHub repository; the three CSVs
# below are read directly over HTTP, so this cell needs network access.
DATA_URL = 'https://raw.githubusercontent.com/stiwari-ds/data-science-competitions/main/machinehack/subscriber_prediction_hackathon/data'

train = pd.read_csv(f'{DATA_URL}/processed/train.csv') #processed dataset from notebook 00
test = pd.read_csv(f'{DATA_URL}/processed/test.csv')
# Sample submission file; copied as the template for every submission CSV written later.
sample_sub = pd.read_csv(f'{DATA_URL}/raw/submission.csv')

In [5]:
# Label column read from `train`; the same name is used for the prediction column in submissions.
TARGET = 'y_bool'

In [6]:
# Numeric columns are listed by hand; every other column of the processed
# test set is treated as categorical.
num_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
features = test.columns.tolist()
cat_features = [col for col in features if col not in num_features]

In [7]:
# Feature subset used by Exp 2 — presumably the raw dataset columns before any
# feature engineering (TODO confirm against notebook 00).
original_features = ['age', 'job', 'marital', 'education', 'default', 'balance',
                     'housing', 'loan', 'contact', 'day', 'month', 'duration', 
                     'campaign', 'pdays', 'previous', 'poutcome']

# Feature subset used by Exp 3 — contains none of the columns in `num_features`;
# the numeric inputs appear only through their *_bins / *_bool counterparts.
cat_only_features = ['age_bins', 'job_groups', 'marital', 'education', 'default',
                     'balance_bins', 'housing', 'loan', 'contact', 'day_bins', 
                     'month_bins', 'duration_bins', 'campaign_bins', 'pdays_bins',
                     'pdays_bool', 'previous_bins', 'previous_bool', 'poutcome']

# Baseline

In [8]:
%%time
scores = []
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
X, y = train[features], train[TARGET]
feature_name = list(X.columns)
categorical_feature = [f for f in feature_name if f in cat_features]
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, y_train = X.loc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.loc[val_idx], y.iloc[val_idx]

    model = LGBMClassifier(
        objective='binary',
        boosting_type='goss',
        device_type='cpu',
        random_state=SEED
    ) 
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=100,
        eval_metric='binary_logloss',
        feature_name=feature_name,
        categorical_feature=categorical_feature,
        verbose=0
    )
    val_preds = model.predict_proba(X_val)[:, 1]
    score = log_loss(y_val, val_preds)
    scores.append(score)
    print(f'Fold #{fold}: ({model.best_iteration_} rounds) Logloss = {score:.5f}')
    _ = gc.collect()

print(f'\nAvg Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}\n')

Fold #0: (9 rounds) Logloss = 0.58086
Fold #1: (4 rounds) Logloss = 0.58116
Fold #2: (10 rounds) Logloss = 0.57947
Fold #3: (4 rounds) Logloss = 0.58144
Fold #4: (7 rounds) Logloss = 0.57997

Avg Logloss = 0.58058 +/- 0.00074

CPU times: user 5.22 s, sys: 98.1 ms, total: 5.32 s
Wall time: 3.1 s


# Hyperparameter tuning

In [9]:
def objective(trial, data, model):
    """Optuna objective: mean 5-fold validation log loss for one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        data: Tuple ``(X, y)`` of a feature DataFrame and a target Series.
        model: Estimator exposing ``set_params`` / ``fit`` / ``predict_proba``
            (an ``LGBMClassifier`` in this notebook). It is mutated in place:
            the sampled parameters remain set on it after the call.

    Returns:
        Mean log loss over the validation folds (lower is better).
    """
    X, y = data

    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    # Values sampled for this trial (the search space is defined by the ranges below).
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.025, 0.3, step=0.025),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 2000, step=5),
        'max_depth': trial.suggest_int('max_depth', 3, 13),
        'min_child_samples': trial.suggest_int('min_child_samples', 0, 1000, step=2),
        'min_split_gain': trial.suggest_float('min_split_gain', 0, 10, step=0.01),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.95, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 5, step=0.05),
        'max_cat_to_onehot': trial.suggest_categorical('max_cat_to_onehot', [2, 5, 12])
    }
    # The parameters are identical for every fold, so set them once up front.
    model.set_params(**params)

    scores = []
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    for train_idx, val_idx in cv.split(X, y):
        # cv.split yields positional indices, so use .iloc for both X and y;
        # .loc only matches when X carries a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            # Callback form of early stopping: the `early_stopping_rounds` /
            # `verbose` fit arguments are deprecated in LightGBM 3.3 and removed in 4.0.
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )
        val_preds = model.predict_proba(X_val)[:, 1]
        scores.append(log_loss(y_val, val_preds))

    return np.mean(scores)

In [10]:
def tune_params(data, model, n_trials, direction):
    """Run an Optuna study over ``objective`` and return the finished study.

    Args:
        data: Tuple ``(X, y)`` forwarded unchanged to ``objective``.
        model: Estimator forwarded unchanged to ``objective`` for every trial.
        n_trials: Number of trials to run.
        direction: Optimisation direction passed to ``optuna.create_study``.

    Returns:
        The ``optuna`` study after ``n_trials`` trials.
    """
    def _evaluate(trial):
        # Bind the dataset and estimator so Optuna only has to supply the trial.
        return objective(trial, data, model)

    # NOTE(review): `objective` in this notebook never calls trial.report() /
    # trial.should_prune(), so no trial is actually stopped early — confirm
    # whether pruning was intended.
    seeded_sampler = TPESampler(seed=SEED)
    study = optuna.create_study(
        direction=direction,
        sampler=seeded_sampler,
        pruner=HyperbandPruner(),
    )
    study.optimize(_evaluate, n_trials=n_trials, gc_after_trial=True)
    return study

# Cross-validation

In [11]:
def cross_validate_predict(data, model, n_splits=5):
    """Fit ``model`` on each stratified fold and collect validation and test predictions.

    Args:
        data: Tuple ``(X, y, X_test)`` of training features, training target
            and test features.
        model: Estimator exposing ``fit`` / ``predict_proba`` / ``best_iteration_``;
            the same instance is refitted on every fold.
        n_splits: Number of stratified folds.

    Returns:
        Tuple ``(oof_preds, test_preds)``:
        ``oof_preds`` is a Series of positive-class probabilities keyed by the
        row position in ``X`` and sorted by that position;
        ``test_preds`` is a DataFrame with one column per fold (``fold0``, ...)
        plus a ``mean`` column averaging the fold predictions row-wise.
    """
    oof_preds = {}  #out-of-fold predictions on train set
    test_preds = {} #predictions on test set for each fold
    scores = [] #scores on validation set

    X, y, X_test = data
    feature_name = list(X.columns)
    categorical_feature = [f for f in feature_name if f in cat_features]

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # cv.split yields positional indices, so use .iloc for both X and y;
        # .loc only matches when X carries a default RangeIndex.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='binary_logloss',
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            # Callback form of early stopping: the `early_stopping_rounds` /
            # `verbose` fit arguments are deprecated in LightGBM 3.3 and removed in 4.0.
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
        )

        val_preds = model.predict_proba(X_val)[:, 1]
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]
        # Key each validation prediction by its row position in X.
        oof_preds.update(dict(zip(val_idx, val_preds)))

        score = log_loss(y_val, val_preds)
        scores.append(score)
        print(f'Fold #{fold}: ({model.best_iteration_} rounds) Logloss = {score:.5f}')
        _ = gc.collect()
    print(f'Avg. Logloss = {np.mean(scores):.5f} +/- {np.std(scores):.5f}')

    oof_preds = pd.Series(oof_preds).sort_index()
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1)

    return oof_preds, test_preds

In [12]:
def run_experiment(data, n_trials=10):
    """Tune a GOSS LightGBM classifier with Optuna, then cross-validate the best setup.

    Args:
        data: Tuple ``(X, y, X_test)`` of training features, training target
            and test features.
        n_trials: Number of Optuna trials for the tuning stage.

    Returns:
        Tuple ``(oof_preds, test_preds)`` as returned by ``cross_validate_predict``.
    """
    X, y, _ = data

    # Fixed settings shared by every trial; the tuned values are layered on top.
    model = LGBMClassifier(
        objective='binary',
        n_estimators=10000,
        boosting_type='goss',
        extra_trees=True,
        verbosity=-1,
        device_type='cpu',
        random_state=SEED,
    )

    print('----------Hyperparameter tuning----------')
    tuning_started = time.time()
    study = tune_params(
        data=(X, y),
        model=model,
        n_trials=n_trials,
        direction='minimize',  # log loss: lower is better
    )
    tuning_seconds = time.time() - tuning_started
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print('Best hyperparameters:')
    for name, value in study.best_params.items():
        print(f'{name:15} - {value}')
    print(f'[Time taken: {tuning_seconds:.2f}s]\n')

    print('-----Cross-validation and prediction-----')
    cv_started = time.time()
    # Overwrite whatever the last trial left on the estimator with the best trial's values.
    model.set_params(**study.best_params)
    oof_preds, test_preds = cross_validate_predict(data, model)
    cv_seconds = time.time() - cv_started
    print(f'[Time taken: {cv_seconds:.2f}s]\n')

    return oof_preds, test_preds

**Trial run**

In [13]:
# Show Optuna's per-trial log messages for the short trial run below.
optuna.logging.set_verbosity(optuna.logging.INFO)

In [14]:
%%time
# Quick 3-trial run on the full feature list to exercise the tuning + CV pipeline
# end to end before the 200-trial experiments.
op, tp = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_trials=3
)

[32m[I 2022-11-16 04:16:39,392][0m A new study created in memory with name: no-name-9791dab7-709e-4439-b2e3-00e20b7c6a75[0m


----------Hyperparameter tuning----------


[32m[I 2022-11-16 04:16:40,698][0m Trial 0 finished with value: 0.5823568784232547 and parameters: {'learning_rate': 0.17500000000000002, 'reg_alpha': 189.4, 'reg_lambda': 153.1, 'num_leaves': 580, 'max_depth': 5, 'min_child_samples': 686, 'min_split_gain': 1.67, 'colsample_bytree': 0.65, 'top_rate': 0.35, 'other_rate': 0.25, 'scale_pos_weight': 1.0, 'max_cat_to_onehot': 5}. Best is trial 0 with value: 0.5823568784232547.[0m
[32m[I 2022-11-16 04:16:41,788][0m Trial 1 finished with value: 0.5823568784232547 and parameters: {'learning_rate': 0.2, 'reg_alpha': 195.70000000000002, 'reg_lambda': 169.10000000000002, 'num_leaves': 145, 'max_depth': 6, 'min_child_samples': 288, 'min_split_gain': 8.23, 'colsample_bytree': 0.8, 'top_rate': 0.1, 'other_rate': 0.05, 'scale_pos_weight': 4.800000000000001, 'max_cat_to_onehot': 5}. Best is trial 0 with value: 0.5823568784232547.[0m
[32m[I 2022-11-16 04:16:43,307][0m Trial 2 finished with value: 0.5823568784232547 and parameters: {'learning_ra

Best trial: 0 -> Best value: 0.58236
Best hyperparameters:
learning_rate   - 0.17500000000000002
reg_alpha       - 189.4
reg_lambda      - 153.1
num_leaves      - 580
max_depth       - 5
min_child_samples - 686
min_split_gain  - 1.67
colsample_bytree - 0.65
top_rate        - 0.35
other_rate      - 0.25
scale_pos_weight - 1.0
max_cat_to_onehot - 5
[Time taken: 4.00s]

-----Cross-validation and prediction-----
Fold #0: (1 rounds) Logloss = 0.58231
Fold #1: (1 rounds) Logloss = 0.58231
Fold #2: (1 rounds) Logloss = 0.58231
Fold #3: (1 rounds) Logloss = 0.58231
Fold #4: (1 rounds) Logloss = 0.58255
Avg. Logloss = 0.58236 +/- 0.00010
[Time taken: 4.22s]

CPU times: user 11.4 s, sys: 87.3 ms, total: 11.5 s
Wall time: 8.23 s


In [15]:
# Only surface Optuna errors from here on, so the 200-trial experiments do not log every trial.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Exp 1: all processed features

In [16]:
%%time
# Experiment 1: every column of the processed test set (`features`), 200 Optuna trials.
# op1 = out-of-fold predictions, tp1 = per-fold and mean test predictions.
op1, tp1 = run_experiment(
    data=(train[features], train[TARGET], test[features]),
    n_trials=200
)

----------Hyperparameter tuning----------
Best trial: 174 -> Best value: 0.57972
Best hyperparameters:
learning_rate   - 0.225
reg_alpha       - 4.3
reg_lambda      - 163.8
num_leaves      - 1640
max_depth       - 7
min_child_samples - 968
min_split_gain  - 1.08
colsample_bytree - 0.95
top_rate        - 0.15000000000000002
other_rate      - 0.2
scale_pos_weight - 1.0
max_cat_to_onehot - 2
[Time taken: 472.37s]

-----Cross-validation and prediction-----
Fold #0: (13 rounds) Logloss = 0.58032
Fold #1: (13 rounds) Logloss = 0.58109
Fold #2: (53 rounds) Logloss = 0.57746
Fold #3: (20 rounds) Logloss = 0.58051
Fold #4: (51 rounds) Logloss = 0.57924
Avg. Logloss = 0.57972 +/- 0.00128
[Time taken: 2.71s]

CPU times: user 14min 16s, sys: 5.29 s, total: 14min 21s
Wall time: 7min 55s


# Exp 2: original features only

In [17]:
%%time
# Experiment 2: only the columns listed in `original_features`, 200 Optuna trials.
# op2 = out-of-fold predictions, tp2 = per-fold and mean test predictions.
op2, tp2 = run_experiment(
    data=(train[original_features], train[TARGET], test[original_features]),
    n_trials=200
)

----------Hyperparameter tuning----------
Best trial: 73 -> Best value: 0.58014
Best hyperparameters:
learning_rate   - 0.2
reg_alpha       - 20.400000000000002
reg_lambda      - 77.7
num_leaves      - 1565
max_depth       - 7
min_child_samples - 716
min_split_gain  - 0.04
colsample_bytree - 0.95
top_rate        - 0.25
other_rate      - 0.15000000000000002
scale_pos_weight - 1.0
max_cat_to_onehot - 2
[Time taken: 368.24s]

-----Cross-validation and prediction-----
Fold #0: (46 rounds) Logloss = 0.58049
Fold #1: (9 rounds) Logloss = 0.58170
Fold #2: (170 rounds) Logloss = 0.57886
Fold #3: (31 rounds) Logloss = 0.58090
Fold #4: (115 rounds) Logloss = 0.57876
Avg. Logloss = 0.58014 +/- 0.00116
[Time taken: 3.17s]

CPU times: user 10min 55s, sys: 4.44 s, total: 10min 59s
Wall time: 6min 11s


# Exp 3: categorical-only features

In [18]:
%%time
# Experiment 3: only the columns listed in `cat_only_features`, 200 Optuna trials.
# op3 = out-of-fold predictions, tp3 = per-fold and mean test predictions.
op3, tp3 = run_experiment(
    data=(train[cat_only_features], train[TARGET], test[cat_only_features]),
    n_trials=200
)

----------Hyperparameter tuning----------
Best trial: 131 -> Best value: 0.57983
Best hyperparameters:
learning_rate   - 0.3
reg_alpha       - 17.2
reg_lambda      - 153.5
num_leaves      - 1160
max_depth       - 8
min_child_samples - 980
min_split_gain  - 0.29
colsample_bytree - 0.5
top_rate        - 0.2
other_rate      - 0.3
scale_pos_weight - 1.0
max_cat_to_onehot - 2
[Time taken: 380.67s]

-----Cross-validation and prediction-----
Fold #0: (25 rounds) Logloss = 0.58013
Fold #1: (14 rounds) Logloss = 0.58074
Fold #2: (82 rounds) Logloss = 0.57833
Fold #3: (25 rounds) Logloss = 0.58078
Fold #4: (182 rounds) Logloss = 0.57915
Avg. Logloss = 0.57983 +/- 0.00095
[Time taken: 3.27s]

CPU times: user 11min 36s, sys: 4.04 s, total: 11min 40s
Wall time: 6min 23s


# Submission files

In [19]:
# Colab-only: mount Google Drive at /content/drive, where the submission files are written.
# The import is kept in this cell because `google.colab` is only available inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Submissions from this notebook go into their own `nb_<NOTEBOOK>` folder on the mounted Drive.
NOTEBOOK = '03'
SUBMISSION_PATH = f'/content/drive/MyDrive/data_science_competitions/machinehack/subscriber_prediction_hackathon/submissions/nb_{NOTEBOOK}'
# exist_ok=True replaces the check-then-create pattern: the cell is safe to re-run
# and has no window between the existence check and the creation.
os.makedirs(SUBMISSION_PATH, exist_ok=True)

In [21]:
def create_submission_files(test_preds: pd.DataFrame, expt_num: str):
    """Write one submission CSV per column of ``test_preds``.

    Each file is a copy of ``sample_sub`` whose ``TARGET`` column is replaced by
    one column of predictions, saved as ``{SUBMISSION_PATH}/{expt_num}_{col}.csv``.

    Args:
        test_preds: Test-set predictions, one column per variant (the per-fold
            and ``mean`` columns produced by ``cross_validate_predict``).
        expt_num: Experiment label used as the file-name prefix (called with
            strings such as ``'01'`` in this notebook; anything that formats
            into an f-string works).

    Raises:
        ValueError: If ``test_preds`` does not have the same number of rows
            as ``sample_sub``.
    """
    for col in test_preds.columns:
        sub = sample_sub.copy()
        # Assign by position rather than by index label: a length mismatch then
        # raises instead of being silently filled with NaN by index alignment.
        sub[TARGET] = test_preds[col].to_numpy()
        sub.to_csv(f'{SUBMISSION_PATH}/{expt_num}_{col}.csv', index=False)

In [22]:
# Write the per-fold and mean test predictions of each experiment, prefixed by its experiment number.
create_submission_files(tp1, '01')
create_submission_files(tp2, '02')
create_submission_files(tp3, '03')