# Setup

In [1]:
%%capture
!pip install lightgbm --upgrade

In [2]:
import gc
import time
import warnings

gc.enable()
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

import lightgbm as lgb
import optuna

SEED = 55

In [3]:
assert lgb.__version__ == '4.2.0', 'LightGBM version differs from original notebook.' 

# Data preparation

In [4]:
DATA_DIR = '/kaggle/input/playground-series-s4e1'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

original = pd.read_csv('/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv')

In [5]:
train = train.drop(['id', 'CustomerId', 'Surname'], axis=1)
test = test.drop(['id', 'CustomerId', 'Surname'], axis=1)
original = original.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [6]:
features = list(test.columns)
TARGET = 'Exited'

In [7]:
original['HasCrCard'].fillna(value=1, inplace=True)
original['Geography'].fillna(value='France', inplace=True)
original['IsActiveMember'].fillna(value=1, inplace=True)
original['Age'].fillna(value=36, inplace=True)

In [8]:
cat_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']

gender_mapping = {'Female': 0, 'Male': 1}
train['Gender'] = train['Gender'].replace(gender_mapping)
test['Gender'] = test['Gender'].replace(gender_mapping)
original['Gender'] = original['Gender'].replace(gender_mapping)

geo_mapping = {'Spain': 0, 'France': 1, 'Germany': 2}
train['Geography'] = train['Geography'].replace(geo_mapping)
test['Geography'] = test['Geography'].replace(geo_mapping)
original['Geography'] = original['Geography'].replace(geo_mapping)

train[cat_features] = train[cat_features].astype('category')
test[cat_features] = test[cat_features].astype('category')
original[cat_features] = original[cat_features].astype('category')

In [9]:
train = train.drop_duplicates(keep='first', ignore_index=True) # repeating
train = train.drop_duplicates(subset=features, keep=False, ignore_index=True) # contradictory

original = original.drop_duplicates(keep='first', ignore_index=True)

# Baseline

In [10]:
# competition metric
def comp_metric(y_true, y_pred):
    return roc_auc_score(y_true, y_pred)

In [11]:
X, y = train[features], train[TARGET]
oof_preds = {}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold, (train_ids, val_ids) in enumerate(cv.split(X, y)):
    X_train, y_train = X.iloc[train_ids], y.iloc[train_ids]
    X_val, y_val = X.iloc[val_ids], y.iloc[val_ids]
    
    model = lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        data_sample_strategy='goss',
        force_row_wise=True,
        verbosity=-1,
        n_jobs=-1,
        random_state=SEED)
    
    early_stopping_callback = lgb.early_stopping(
        stopping_rounds=50, 
        first_metric_only=True, 
        verbose=False, 
        min_delta=5e-5)
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[early_stopping_callback])
        
    val_preds = model.predict_proba(X_val)[:, 1]
    oof_preds.update(dict(zip(val_ids, val_preds)))
    
    score = comp_metric(y_val, val_preds)
    print(f'Fold #{fold}: {score:.5f} ({model.best_iteration_} rounds)')        
    _ = gc.collect()
    
oof_preds = pd.Series(oof_preds).sort_index()
print(f'\nOOF score: {comp_metric(y, oof_preds):.5f}\n')

Fold #0: 0.89046 (58 rounds)
Fold #1: 0.88984 (39 rounds)
Fold #2: 0.88885 (60 rounds)
Fold #3: 0.88639 (66 rounds)
Fold #4: 0.88688 (59 rounds)

OOF score: 0.88830



# Hyperparameter tuning

In [12]:
def objective(trial, features, model, extend, folds, seed):
    oof_preds = {}
    
    param_grid = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, step=0.01),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 200, step=0.5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 200, step=0.5),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
#         'min_child_samples': trial.suggest_int('min_child_samples', 2, 100, step=2),
#         'min_split_gain': trial.suggest_float('min_split_gain', 0, 10, step=0.01),
        'subsample': trial.suggest_float('subsample', 0.65, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.65, 1.0, step=0.05),
        'top_rate': trial.suggest_float('top_rate', 0.1, 0.5, step=0.05),
        'other_rate': trial.suggest_float('other_rate', 0.05, 0.5, step=0.05),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 4, step=0.1)
    }
    
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[TARGET])):
        X_train = train.iloc[train_ids]
        X_val = train.iloc[val_ids]
        if extend: # concat original dataset and remove duplicates, repeating followed by contradictory
            X_train = pd.concat([X_train, original], axis=0) \
                        .drop_duplicates(keep='first') \
                        .drop_duplicates(subset=features, keep=False, ignore_index=True)
        
        y_train, y_val = X_train.pop(TARGET), X_val.pop(TARGET)
        X_train, X_val = X_train[features], X_val[features]
        
        model.set_params(**param_grid)
        
        early_stopping_callback = lgb.early_stopping(
            stopping_rounds=100, 
            first_metric_only=True, 
            verbose=False, 
            min_delta=5e-5)
    
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[early_stopping_callback])
        
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(dict(zip(val_ids, val_preds)))
        
    oof_preds = pd.Series(oof_preds).sort_index()
    return comp_metric(train[TARGET], oof_preds)

In [13]:
def tune_params(features, model, extend, folds, seed, n_trials, direction):
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(
            consider_endpoints=True,
            multivariate=True,
            group=True,
            seed=seed),
        pruner=optuna.pruners.HyperbandPruner(),
        direction=direction)
    study.optimize(
        func=lambda trial: objective(
            trial, features, model, extend, folds, seed),
        n_trials=n_trials,
        gc_after_trial=True)
    
    return study

# Cross-validation framework

In [14]:
def custom_cv(features, model, extend, folds, seed, verbose=True):
    oof_preds = {}
    test_preds = {}
    scores = []
    
    X_test = test[features]
    
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    for fold, (train_ids, val_ids) in enumerate(cv.split(train, train[TARGET])):
        X_train = train.iloc[train_ids]
        X_val = train.iloc[val_ids]
        if extend: # concat original dataset and remove duplicates, repeating followed by contradictory
            X_train = pd.concat([X_train, original], axis=0) \
                        .drop_duplicates(keep='first') \
                        .drop_duplicates(subset=features, keep=False, ignore_index=True)
        
        y_train, y_val = X_train.pop(TARGET), X_val.pop(TARGET)
        X_train, X_val = X_train[features], X_val[features]
        
        early_stopping_callback = lgb.early_stopping(
            stopping_rounds=100, 
            first_metric_only=True, 
            verbose=False, 
            min_delta=5e-5)
    
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='auc',
            callbacks=[early_stopping_callback])
        
        val_preds = model.predict_proba(X_val)[:, 1]
        oof_preds.update(dict(zip(val_ids, val_preds)))
        test_preds[f'fold{fold}'] = model.predict_proba(X_test)[:, 1]
        
        score = comp_metric(y_val, val_preds)
        scores.append(score)
        if verbose:
            print(f'Fold #{fold}: {score:.5f} ({model.best_iteration_} rounds)')
            
        _ = gc.collect()
    
    test_preds = pd.DataFrame.from_dict(test_preds)
    test_preds['mean'] = test_preds.mean(axis=1) # mean of fold-wise predictions
    
    oof_preds = pd.Series(oof_preds).sort_index()
    print(f'\nOOF score: {comp_metric(train[TARGET], oof_preds):.5f}')
    print(f'Avg. score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    
    return oof_preds, test_preds

In [15]:
def run_experiment(features, extend=False, folds=5, seed=SEED, n_trials=50):
    
    base_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'n_estimators': 10000,
        'data_sample_strategy': 'goss',
        'extra_trees': True,
        'force_row_wise': True,
        'max_cat_to_onehot': 3, # for Geography column
        'verbosity': -1,
        'n_jobs': -1,
        'seed': seed
    }
    model = lgb.LGBMClassifier(**base_params)
    
    print(f'----------Hyperparameter tuning----------')
    start = time.time()
    
    study = tune_params(features, model, extend, folds, seed, n_trials, direction='maximize') 
    # metric: AUC -> higher is better
    
    end = time.time()
    
    print(f'Best trial: {study.best_trial.number} -> Best value: {study.best_value:.5f}')
    print(f'Best hyperparameters:')
    for k, v in study.best_params.items():
        print(f'{k:20} - {v}')
    print(f'\n[Time taken: {end - start:.2f}s]\n')
    
    print(f'-----Cross-validation and prediction-----')
    start = time.time()
    
    model.set_params(**study.best_params)
    oof_preds, test_preds = custom_cv(features, model, extend, folds, seed)
    
    end = time.time()
    print(f'\n[Time taken: {end - start:.2f}s]\n')
    
    return oof_preds, test_preds

In [16]:
def create_submission_files(test_preds, config, notebook='02'):
    sub = sample_sub.copy()
    sub[TARGET] = test_preds['mean']
    sub.to_csv(f'{notebook}_{config}.csv', index=False)

**Trial run:**

In [17]:
optuna.logging.set_verbosity(optuna.logging.INFO)

In [18]:
_ , _ = run_experiment(features=features, n_trials=3)

[I 2024-01-07 04:13:45,033] A new study created in memory with name: no-name-460c61c0-d500-4199-8e56-639a2c55a072


----------Hyperparameter tuning----------


[I 2024-01-07 04:17:51,881] Trial 0 finished with value: 0.8838821627324207 and parameters: {'learning_rate': 0.01, 'reg_alpha': 194.5, 'reg_lambda': 97.0, 'max_depth': 5, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.75, 'top_rate': 0.45000000000000007, 'other_rate': 0.05, 'scale_pos_weight': 1.3}. Best is trial 0 with value: 0.8838821627324207.
[I 2024-01-07 04:19:53,069] Trial 1 finished with value: 0.888469019296484 and parameters: {'learning_rate': 0.08, 'reg_alpha': 10.0, 'reg_lambda': 155.5, 'max_depth': 3, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.9500000000000001, 'top_rate': 0.5, 'other_rate': 0.5, 'scale_pos_weight': 2.5}. Best is trial 1 with value: 0.888469019296484.
[I 2024-01-07 04:22:04,311] Trial 2 finished with value: 0.8865987191796201 and parameters: {'learning_rate': 0.04, 'reg_alpha': 173.5, 'reg_lambda': 79.5, 'max_depth': 9, 'subsample': 0.65, 'colsample_bytree': 0.9500000000000001, 'top_rate': 0.25, 'other_rate': 0.3, 'scale_pos_weight': 3

Best trial: 1 -> Best value: 0.88847
Best hyperparameters:
learning_rate        - 0.08
reg_alpha            - 10.0
reg_lambda           - 155.5
max_depth            - 3
subsample            - 0.8500000000000001
colsample_bytree     - 0.9500000000000001
top_rate             - 0.5
other_rate           - 0.5
scale_pos_weight     - 2.5

[Time taken: 499.41s]

-----Cross-validation and prediction-----
Fold #0: 0.89015 (807 rounds)
Fold #1: 0.88949 (1052 rounds)
Fold #2: 0.88930 (658 rounds)
Fold #3: 0.88666 (609 rounds)
Fold #4: 0.88682 (555 rounds)

OOF score: 0.88847
Avg. score: 0.88848 +/- 0.00145

[Time taken: 143.57s]



In [19]:
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Experiments

In [20]:
op = {} # Train-set OOF predictions
tp = {} # Test-set predictions

In [21]:
model_name = 'lgbv1'
dataset = 'trn'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=features,  
    extend=False,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 37 -> Best value: 0.88895
Best hyperparameters:
learning_rate        - 0.04
reg_alpha            - 9.0
reg_lambda           - 175.0
max_depth            - 10
subsample            - 0.65
colsample_bytree     - 0.9500000000000001
top_rate             - 0.5
other_rate           - 0.35000000000000003
scale_pos_weight     - 3.2

[Time taken: 10381.39s]

-----Cross-validation and prediction-----
Fold #0: 0.88888 (984 rounds)
Fold #1: 0.89359 (941 rounds)
Fold #2: 0.88830 (1205 rounds)
Fold #3: 0.88926 (787 rounds)
Fold #4: 0.88872 (447 rounds)
Fold #5: 0.88337 (927 rounds)
Fold #6: 0.89076 (757 rounds)

OOF score: 0.88895
Avg. score: 0.88898 +/- 0.00284

[Time taken: 334.31s]



In [22]:
model_name = 'lgbv1'
dataset = 'ext'
folds = 7
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=features,  
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 35 -> Best value: 0.88913
Best hyperparameters:
learning_rate        - 0.08
reg_alpha            - 3.0
reg_lambda           - 67.5
max_depth            - 4
subsample            - 0.9500000000000001
colsample_bytree     - 0.9500000000000001
top_rate             - 0.4
other_rate           - 0.45
scale_pos_weight     - 1.2

[Time taken: 9516.90s]

-----Cross-validation and prediction-----
Fold #0: 0.88921 (635 rounds)
Fold #1: 0.89388 (767 rounds)
Fold #2: 0.88808 (511 rounds)
Fold #3: 0.88951 (686 rounds)
Fold #4: 0.88881 (491 rounds)
Fold #5: 0.88340 (682 rounds)
Fold #6: 0.89124 (462 rounds)

OOF score: 0.88913
Avg. score: 0.88916 +/- 0.00296

[Time taken: 200.00s]



In [23]:
model_name = 'lgbv1'
dataset = 'ext'
folds = 10
seed = SEED
config = f'{model_name}_{dataset}_f{folds}_s{seed}'

op[config], tp[config] = run_experiment(
    features=features,  
    extend=True,
    folds=folds, 
    seed=seed)

create_submission_files(tp[config], config)

----------Hyperparameter tuning----------
Best trial: 22 -> Best value: 0.88910
Best hyperparameters:
learning_rate        - 0.09
reg_alpha            - 0.5
reg_lambda           - 42.0
max_depth            - 5
subsample            - 0.8
colsample_bytree     - 1.0
top_rate             - 0.45000000000000007
other_rate           - 0.45
scale_pos_weight     - 1.2

[Time taken: 13068.22s]

-----Cross-validation and prediction-----
Fold #0: 0.89233 (352 rounds)
Fold #1: 0.88934 (714 rounds)
Fold #2: 0.89163 (573 rounds)
Fold #3: 0.88850 (536 rounds)
Fold #4: 0.89044 (401 rounds)
Fold #5: 0.88930 (372 rounds)
Fold #6: 0.88901 (331 rounds)
Fold #7: 0.88616 (646 rounds)
Fold #8: 0.88452 (413 rounds)
Fold #9: 0.89016 (372 rounds)

OOF score: 0.88910
Avg. score: 0.88914 +/- 0.00223

[Time taken: 258.23s]



**Time to submit!**