In [None]:
# General imports
import os
import cv2
import glob 
import json
import random
import warnings

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.svm import *
from sklearn.tree import *
from sklearn.impute import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.decomposition import *
from sklearn.preprocessing import *
from sklearn.model_selection import *


# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# One seed for NumPy's global RNG here; the shuffled CV splitters and the
# Optuna samplers further down reuse the same constant.
SEED = 42
np.random.seed(SEED)

# Display frames without row/column truncation and print floats in fixed-point form.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)

# Plain white seaborn style and a figure resolution of 200 dpi.
sns.set_style("white")
mpl.rcParams['figure.dpi'] = 200
%matplotlib inline

In [None]:
# Load the competition training table from the Kaggle input directory.
data = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')

# Preview the first rows as the cell output.
data.head()

In [None]:
# Per-column summary statistics of the training frame.
data.describe()

In [None]:
# Name of the label column in the training data.
TARGET = 'claim'
# Strategy handed to SimpleImputer when filling missing values.
IMPUTE_STRATEGY = 'mean'

In [None]:
class Preprocessor:
    """Apply a fixed sequence of transformers one after another.

    Each step must expose ``fit_transform(feat)`` and ``transform(feat)``.
    The output of one step is the input of the next, so every step is fitted
    on data that has already passed through the preceding steps.
    """

    def __init__(self, pps):
        """Store the ordered list of transformer steps (not copied)."""
        self.pps = pps

    def fit(self, feat):
        """Fit every step in order on ``feat``.

        Returns ``self`` so the call can be chained (previously returned
        ``None``; callers that ignore the return value are unaffected).
        """
        self.fit_transform(feat)
        return self

    def transform(self, feat):
        """Run ``feat`` through every already-fitted step and return the result."""
        for pp in self.pps:
            feat = pp.transform(feat)
        return feat

    def fit_transform(self, feat):
        """Fit every step on ``feat`` and return the fully transformed data.

        Done in a single pass: each step's ``fit_transform`` output feeds the
        next step. The previous implementation fitted the chain and then ran
        ``transform`` over the whole chain again, doing the work twice.
        """
        for pp in self.pps:
            feat = pp.fit_transform(feat)
        return feat

# Shared pipeline for train and test: fill NaNs using IMPUTE_STRATEGY, then apply StandardScaler.
preprocessor = Preprocessor(pps=[SimpleImputer(missing_values=np.nan, strategy=IMPUTE_STRATEGY),
                                StandardScaler()])

In [None]:
# Split the training frame into features (without id and label) and the label column.
X = data.drop(columns=['id', TARGET])
Y = data[TARGET]

# Row-wise summary features. Each one is computed over the columns present at
# that moment, so the order matters: `min_row` also sees `mv_row`, and
# `std_row` also sees both `mv_row` and `min_row`. The test cell repeats the
# same three statements in the same order.
X['mv_row'] = X.isna().sum(axis=1)
X['min_row'] = X.min(axis=1)
X['std_row'] = X.std(axis=1)

# Fit the imputer/scaler on the training features; X is rebound to the transformed output.
X = preprocessor.fit_transform(X)

In [None]:
# Sanity check: shapes of the prepared feature matrix and the labels.
X.shape, Y.shape

In [None]:
# Load the competition test table from the Kaggle input directory.
X_test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

# Preview the first rows as the cell output.
X_test.head()

In [None]:
# Keep the ids for the prediction files, then remove them from the features.
ids = X_test['id']
X_test = X_test.drop(columns=['id'])

# Same row-wise features, computed in the same order as for the training data
# (so `min_row` includes `mv_row`, and `std_row` includes both).
X_test['mv_row'] = X_test.isna().sum(axis=1)
X_test['min_row'] = X_test.min(axis=1)
X_test['std_row'] = X_test.std(axis=1)

# Transform only: reuses what the preprocessor learned from the training data.
X_test = preprocessor.transform(X_test)

# Sanity check on the prepared test matrix.
X_test.shape

In [None]:
import optuna
from optuna.samplers import TPESampler

# Restrict Optuna's own logging to the WARNING level; progress is reported by logging_callback instead.
optuna.logging.set_verbosity(optuna.logging.WARNING)

## Tuning XGB Classifier using Optuna

In [None]:
from xgboost import XGBClassifier

# Setup XGB hyperparameters for exps
def get_xgb_hyperparams(trail):
    """Build one XGBoost configuration from an Optuna trial.

    Parameters
    ----------
    trail : Optuna trial (parameter name kept as-is for existing callers)
        Must provide ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    dict
        Keyword arguments for ``XGBClassifier``: three fixed settings
        (learning rate, tree method, booster) plus eight sampled values.
    """
    xgb_params = {
        # Fixed for every trial.
        'learning_rate': 0.01,
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        # Sampled per trial. `step` is passed by keyword: Optuna treats it as
        # keyword-only and deprecates the positional form, and the float
        # suggestions below already use `step=`.
        'n_estimators': trail.suggest_int('n_estimators', 500, 4000, step=100),
        'reg_lambda': trail.suggest_int('reg_lambda', 1, 100),
        'reg_alpha': trail.suggest_int('reg_alpha', 1, 100),
        'subsample': trail.suggest_float('subsample', 0.2, 1.0, step=0.1),
        'colsample_bytree': trail.suggest_float('colsample_bytree', 0.2, 1.0, step=0.1),
        'max_depth': trail.suggest_int('max_depth', 3, 10),
        'min_child_weight': trail.suggest_int('min_child_weight', 2, 10),
        'gamma': trail.suggest_float('gamma', 0, 20)
    }
    return xgb_params

In [None]:
# Optuna objective for the XGBoost search
def objective_xgb(trail, X, Y, n_splits=5):
    """Score one sampled XGBoost configuration by cross-validated ROC AUC.

    A new classifier is trained on every shuffled stratified fold (seeded
    with SEED). The positive-class probabilities of all validation folds are
    pooled and a single ROC AUC is computed over the pooled predictions.

    Parameters
    ----------
    trail : Optuna trial used to sample the hyperparameters.
    X, Y : features and labels; both are indexed with integer index arrays.
    n_splits : number of folds.

    Returns
    -------
    The pooled out-of-fold ROC AUC.
    """
    params = get_xgb_hyperparams(trail)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    pooled_scores = []
    pooled_labels = []

    for fit_idx, holdout_idx in splitter.split(X, Y):
        fit_x, holdout_x = X[fit_idx], X[holdout_idx]
        fit_y, holdout_y = Y[fit_idx], Y[holdout_idx]

        model = XGBClassifier(**params).fit(fit_x, fit_y)
        holdout_proba = model.predict_proba(holdout_x)

        # Column 1 holds the probability of the positive class.
        pooled_scores.extend(holdout_proba[:, 1])
        pooled_labels.extend(holdout_y)

    return roc_auc_score(pooled_labels, pooled_scores)

In [None]:
# Optuna callback: report a trial only when it improves the study's best value

def logging_callback(study, frozen_trail):
    """Print a line whenever the study's best value changes.

    The last reported best value is remembered in the study's user attribute
    ``'prev_best'``; nothing is printed while the best value stays the same.
    """
    current_best = study.best_value
    if study.user_attrs.get('prev_best', None) == current_best:
        return
    study.set_user_attr('prev_best', current_best)
    print(f"Trail {frozen_trail.number} finished with best value {frozen_trail.value}")

In [None]:
%%time

# New study that maximises the objective, using a TPE sampler seeded with SEED.
study = optuna.create_study(sampler=TPESampler(seed=SEED), 
                            direction='maximize', 
                            study_name='xgb_tuning')
# Bind the training data so Optuna can call the objective with the trial alone.
objc = lambda trail : objective_xgb(trail, X, Y)

# Search under a 60*60 (one hour) time budget, printing each new best score.
study.optimize(objc, timeout=60*60, callbacks=[logging_callback])

In [None]:
# Report the best score of the study and the hyperparameters that produced it.
print(f"Best roc_auc value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

In [None]:
def cross_validate_model(class_name, class_params, X, Y, X_test, n_splits=5):
    """Cross-validate a classifier and produce the inputs for stacking.

    A fresh ``class_name(**class_params)`` model is trained on each fold of
    an unshuffled stratified split. Every model predicts its validation fold
    (out-of-fold predictions) and the whole test set; the per-fold ROC AUC is
    printed.

    Parameters
    ----------
    class_name : estimator class; instances need ``fit`` and ``predict_proba``.
    class_params : dict of keyword arguments for ``class_name``.
    X, Y : training features and labels; both are indexed with the integer
        index arrays produced by the splitter.
    X_test : test features; must expose ``shape[0]``.
    n_splits : number of folds.

    Returns
    -------
    oof_preds : list of float
        Positive-class probability for every training row, in the ORIGINAL
        row order of ``X`` / ``Y`` (element ``i`` belongs to row ``i``).
    test_preds : numpy.ndarray
        Positive-class probability for every test row, averaged over folds.
    """
    skfolds = StratifiedKFold(n_splits=n_splits, shuffle=False)

    # Filled by validation index instead of appended fold by fold. Stratified
    # validation folds are generally not contiguous blocks of rows, so
    # appending returned the predictions in fold order, and callers that pair
    # them with the training ids / labels row by row got misaligned data.
    oof_preds = np.zeros(len(Y))
    test_preds = np.zeros(X_test.shape[0])

    for i, (train_index, val_index) in enumerate(skfolds.split(X, Y)):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = Y[train_index], Y[val_index]

        print(f"---------- Fold {i+1} Started ----------")
        clf = class_name(**class_params)
        clf = clf.fit(x_train, y_train)

        # Column 1 holds the probability of the positive class.
        val_proba = clf.predict_proba(x_val)[:, 1]
        oof_preds[val_index] = val_proba

        # Accumulate test predictions; divided by n_splits on return.
        test_preds += clf.predict_proba(X_test)[:, 1]

        ra_score = roc_auc_score(y_val, val_proba)
        print(f"ROC AUC of current fold is {ra_score}")

        print(f"---------- Fold {i+1} Ended ----------")

    return oof_preds.tolist(), test_preds / n_splits

In [None]:
# Final XGBoost configuration: the values found by the optimisation above,
# followed by the fixed settings that were not part of the search.
xgb_params = dict(
    n_estimators=3600,
    reg_lambda=3,
    reg_alpha=26,
    subsample=0.6000000000000001,
    colsample_bytree=0.6000000000000001,
    max_depth=9,
    min_child_weight=5,
    gamma=13.054739572819486,
    learning_rate=0.01,
    tree_method='gpu_hist',
    booster='gbtree',
)

# Out-of-fold predictions on the training rows and fold-averaged test predictions.
xgb_oof, xgb_test = cross_validate_model(XGBClassifier, xgb_params, X, Y, X_test)

In [None]:
# Save the XGBoost out-of-fold predictions alongside the training ids;
# the stacking section reads this file back.
df = pd.DataFrame({
    'id': data['id'],
    'oof_preds': xgb_oof
})

df.to_csv('xgb_oof.csv', index=False)

# Save the XGBoost test predictions in (id, claim) layout; also read back for stacking.
df = pd.DataFrame({
    'id': ids,
    'claim': xgb_test
})

df.to_csv('xgb_preds.csv', index=False)

## Tuning CatBoost Classifier using Optuna

In [None]:
from catboost import CatBoostClassifier

# Setup CatB hyperparameters for exps
def get_catb_hyperparams(trail):
    """Build one CatBoost configuration from an Optuna trial.

    Parameters
    ----------
    trail : Optuna trial (parameter name kept as-is for existing callers)
        Must provide ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    dict
        Keyword arguments for ``CatBoostClassifier``: three fixed settings
        plus nine sampled values.
    """
    catb_params = {
        # Fixed for every trial.
        'loss_function': 'CrossEntropy',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        # Sampled per trial. Continuous ranges use suggest_float, the
        # replacement for the deprecated suggest_uniform; with no `step` or
        # `log` it draws from the same uniform range.
        'iterations': trail.suggest_int('iterations', 2000, 20000),
        'od_wait': trail.suggest_int('od_wait', 500, 2000),
        'learning_rate': trail.suggest_float('learning_rate', 0.01, 0.3),
        'reg_lambda': trail.suggest_float('reg_lambda', 1e-4, 100),
        # NOTE(review): the lower bound of 0 allows near-zero sampling rates;
        # confirm this range is intended.
        'subsample': trail.suggest_float('subsample', 0, 1),
        'random_strength': trail.suggest_float('random_strength', 10, 50),
        'depth': trail.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trail.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trail.suggest_int('leaf_estimation_iterations', 1, 15)
    }
    return catb_params

In [None]:
# Optuna objective for the CatBoost search
def objective_catb(trail, X, Y, n_splits=5):
    """Score one sampled CatBoost configuration by cross-validated ROC AUC.

    A new classifier is trained on every shuffled stratified fold (seeded
    with SEED). The positive-class probabilities of all validation folds are
    pooled and a single ROC AUC is computed over the pooled predictions.

    Parameters
    ----------
    trail : Optuna trial used to sample the hyperparameters.
    X, Y : features and labels; both are indexed with integer index arrays.
    n_splits : number of folds.

    Returns
    -------
    The pooled out-of-fold ROC AUC.
    """
    params = get_catb_hyperparams(trail)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    pooled_scores = []
    pooled_labels = []

    for fit_idx, holdout_idx in splitter.split(X, Y):
        fit_x, holdout_x = X[fit_idx], X[holdout_idx]
        fit_y, holdout_y = Y[fit_idx], Y[holdout_idx]

        model = CatBoostClassifier(**params).fit(fit_x, fit_y)
        holdout_proba = model.predict_proba(holdout_x)

        # Column 1 holds the probability of the positive class.
        pooled_scores.extend(holdout_proba[:, 1])
        pooled_labels.extend(holdout_y)

    return roc_auc_score(pooled_labels, pooled_scores)

In [None]:
%%time

# New study that maximises the objective, using a TPE sampler seeded with SEED.
# Rebinds `study` and `objc` from the previous search.
study = optuna.create_study(sampler=TPESampler(seed=SEED), 
                            direction='maximize', 
                            study_name='catb_tuning')
# Bind the training data so Optuna can call the objective with the trial alone.
objc = lambda trail : objective_catb(trail, X, Y)

# Search under a 60*15 (fifteen minute) time budget, printing each new best score.
study.optimize(objc, timeout=60*15, callbacks=[logging_callback])

In [None]:
# Report the best score of the study and the hyperparameters that produced it.
print(f"Best ROC AUC value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

In [None]:
# Final CatBoost configuration; per the original note these values come from
# the optimisation above.
# NOTE(review): the search space sets 'loss_function' while this dict sets
# 'objective' — presumably equivalent names for the same setting; confirm.
catb_params = {
    'iterations': 15585, 
    'objective': 'CrossEntropy', 
    'bootstrap_type': 'Bernoulli', 
    'od_wait': 1144, 
    'learning_rate': 0.023575206684596582, 
    'reg_lambda': 36.30433203563295, 
    'random_strength': 43.75597655616195, 
    'depth': 7, 
    'min_data_in_leaf': 11, 
    'leaf_estimation_iterations': 1, 
    'subsample': 0.8227911142845009,
    # Settings below were not part of the search space.
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0
}

# Out-of-fold predictions on the training rows and fold-averaged test predictions.
catb_oof, catb_test = cross_validate_model(CatBoostClassifier, catb_params, X, Y, X_test)

In [None]:
# Save the CatBoost out-of-fold predictions alongside the training ids;
# the stacking section reads this file back.
df = pd.DataFrame({
    'id': data['id'],
    'oof_preds': catb_oof
})

df.to_csv('catb_oof.csv', index=False)

# Save the CatBoost test predictions in (id, claim) layout; also read back for stacking.
df = pd.DataFrame({
    'id': ids,
    'claim': catb_test
})

df.to_csv('catb_preds.csv', index=False)

## Tuning LightGBM Classifier using Optuna

In [None]:
from lightgbm import LGBMClassifier

# Setup lgbm hyperparameters for exps
def get_lgbm_hyperparams(trail):
    """Build one LightGBM configuration from an Optuna trial.

    Parameters
    ----------
    trail : Optuna trial (parameter name kept as-is for existing callers)
        Must provide ``suggest_int`` and ``suggest_float``.

    Returns
    -------
    dict
        Keyword arguments for ``LGBMClassifier``: three fixed settings plus
        eight sampled values.
    """
    lgbm_params = {
        # Fixed for every trial.
        "objective": "binary",
        "learning_rate": 0.008,
        'device': 'gpu',
        # Sampled per trial. Continuous ranges use suggest_float, the
        # replacement for the deprecated suggest_uniform; with no `step` or
        # `log` it draws from the same uniform range.
        'n_estimators': trail.suggest_int("n_estimators", 500, 5000),
        "num_leaves": trail.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trail.suggest_int("min_child_samples", 2, 3000),
        'feature_fraction': trail.suggest_float('feature_fraction', 0.25, 0.7),
        'bagging_fraction': trail.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trail.suggest_int('bagging_freq', 0, 5),
        'reg_alpha': trail.suggest_int("reg_alpha", 1, 100),
        'reg_lambda': trail.suggest_int("reg_lambda", 1, 100),
    }
    return lgbm_params

In [None]:
# Optuna objective for the LightGBM search
def objective_lgbm(trail, X, Y, n_splits=5):
    """Score one sampled LightGBM configuration by cross-validated ROC AUC.

    A new classifier is trained on every shuffled stratified fold (seeded
    with SEED), with the validation fold passed as the evaluation set. The
    positive-class probabilities of all validation folds are pooled and a
    single ROC AUC is computed over the pooled predictions.

    Parameters
    ----------
    trail : Optuna trial used to sample the hyperparameters.
    X, Y : features and labels; both are indexed with integer index arrays.
    n_splits : number of folds.

    Returns
    -------
    The pooled out-of-fold ROC AUC.
    """
    params = get_lgbm_hyperparams(trail)
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    pooled_scores = []
    pooled_labels = []

    for fit_idx, holdout_idx in splitter.split(X, Y):
        fit_x, holdout_x = X[fit_idx], X[holdout_idx]
        fit_y, holdout_y = Y[fit_idx], Y[holdout_idx]

        model = LGBMClassifier(**params).fit(
            fit_x,
            fit_y,
            eval_metric='auc',
            eval_set=[(holdout_x, holdout_y)],
            verbose=1000,
        )
        holdout_proba = model.predict_proba(holdout_x)

        # Column 1 holds the probability of the positive class.
        pooled_scores.extend(holdout_proba[:, 1])
        pooled_labels.extend(holdout_y)

    return roc_auc_score(pooled_labels, pooled_scores)

In [None]:
%%time

# New study that maximises the objective, using a TPE sampler seeded with SEED.
# Rebinds `study` and `objc` from the previous search.
study = optuna.create_study(sampler=TPESampler(seed=SEED), 
                            direction='maximize', 
                            study_name='lgbm_tuning')
# Bind the training data so Optuna can call the objective with the trial alone.
objc = lambda trail : objective_lgbm(trail, X, Y)

# Search under a 60*60 (one hour) time budget, printing each new best score.
study.optimize(objc, timeout=60*60, callbacks=[logging_callback])

In [None]:
# Report the best score of the study and the hyperparameters that produced it.
print(f"Best ROC AUC value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

In [None]:
# Final LightGBM configuration; per the original note the sampled values come
# from the optimisation above. The first three entries are the same fixed
# settings used by the search space.
lgbm_params = {
    "objective": "binary",
    "learning_rate": 0.008,
    'device': 'gpu',
    'n_estimators': 3205,
    'num_leaves': 184,
    'min_child_samples': 63,
    'feature_fraction': 0.6864594334728974,
    'bagging_fraction': 0.9497327922401265,
    'bagging_freq': 1,
    'reg_alpha': 19,
    'reg_lambda': 19,
}

# Out-of-fold predictions on the training rows and fold-averaged test predictions.
lgbm_oof, lgbm_test = cross_validate_model(LGBMClassifier, lgbm_params, X, Y, X_test)

In [None]:
# Save the LightGBM out-of-fold predictions alongside the training ids;
# the stacking section reads this file back.
df = pd.DataFrame({
    'id': data['id'],
    'oof_preds': lgbm_oof
})

df.to_csv('lgbm_oof.csv', index=False)

# Save the LightGBM test predictions in (id, claim) layout; also read back for stacking.
df = pd.DataFrame({
    'id': ids,
    'claim': lgbm_test
})

df.to_csv('lgbm_preds.csv', index=False)

## Stacking

In [None]:
# Prefixes of the per-model files written by the sections above.
model_names = ['xgb', 'catb', 'lgbm']

# One column of out-of-fold predictions per base model, read from '<model>_oof.csv'.
oof_data = pd.DataFrame()

for el in model_names:
    oof_data[f'{el}_oof'] = pd.read_csv(f'{el}_oof.csv')['oof_preds']
    
# Attach the training labels; at this point Y is still the label column taken from `data`.
oof_data['claim'] = Y

oof_data.head()

In [None]:
# One column of test predictions per base model, read from '<model>_preds.csv'.
preds_data = pd.DataFrame()

for el in model_names:
    preds_data[f'{el}_preds'] = pd.read_csv(f'{el}_preds.csv')['claim']
    
preds_data.head()

In [None]:
# Rebinds X and Y: from here on they hold the stacking inputs (base-model
# out-of-fold predictions and labels as NumPy arrays), no longer the
# preprocessed training features. Re-running earlier cells after this one
# would therefore operate on the stacked data.
X = oof_data.drop(columns=['claim']).to_numpy()
Y = oof_data['claim'].to_numpy()

X.shape, Y.shape

In [None]:
# Stacked test matrix: one column of test predictions per base model.
# Note the lowercase name; `X_test` still refers to the first-level test features.
x_test = preds_data.to_numpy()

x_test.shape

In [None]:
# Second-level model: a logistic regression on the base models' predictions,
# cross-validated with an unshuffled stratified split.
n_splits = 5
skfolds = StratifiedKFold(n_splits=n_splits, shuffle=False)

# Pooled fold by fold, so these are in fold order rather than row order;
# oof_preds[k] pairs with oof_y[k].
oof_preds, oof_y = [], []

# One slot per row of the stacked test matrix that is predicted below.
# This was sized from X_test (the first-level feature matrix), which only
# worked because both matrices happen to have the same number of rows.
test_preds = np.zeros(x_test.shape[0])

for i, (train_index, val_index) in enumerate(skfolds.split(X, Y)):
    x_train, x_val = X[train_index], X[val_index]
    y_train, y_val = Y[train_index], Y[val_index]

    print(f"---------- Fold {i+1} Started ----------")
    clf = LogisticRegression(random_state=SEED)

    clf = clf.fit(x_train, y_train)
    preds = clf.predict_proba(x_val)

    # Column 1 holds the probability of the positive class.
    oof_preds.extend(preds[:, 1])
    oof_y.extend(y_val)

    # Accumulate test predictions; averaged over the folds after the loop.
    test_preds += clf.predict_proba(x_test)[:, 1]

    ra_score = roc_auc_score(y_val, preds[:, 1])

    print(f"ROC AUC of current fold is {ra_score}")

    print(f"---------- Fold {i+1} Ended ----------")

test_preds /= n_splits

In [None]:
# Start from the competition's sample submission and overwrite its 'claim'
# column with the fold-averaged stacked predictions.
# NOTE(review): assumes the sample file lists rows in the same order as test.csv — confirm.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sub['claim'] = test_preds

sub.head()

In [None]:
# Write the final submission file without the frame index.
sub.to_csv('submission.csv', index=False)