In [None]:
# General imports
import os
import cv2
import glob
import time
import json
import random
import warnings

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import tree
from sklearn import impute
from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import model_selection


warnings.filterwarnings('ignore')

SEED = 42
np.random.seed(SEED)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)

sns.set_style("white")
mpl.rcParams['figure.dpi'] = 600
%matplotlib inline

In [None]:
# Read the training split. The `id` column is removed here so that it is not
# picked up later when the feature list is built from the frame's columns.
train_csv = "../input/tabular-playground-series-sep-2021/train.csv"
train_data = pd.read_csv(train_csv)
train_data = train_data.drop(columns=['id'])

train_data.head()

In [None]:
# Number of cross-validation folds and the name of the label column
N_FOLDS, TARGET = 5, 'claim'

In [None]:
# Read the test split and drop `id`, mirroring what is done for the train split
test_csv = "../input/tabular-playground-series-sep-2021/test.csv"
test_data = pd.read_csv(test_csv)
test_data = test_data.drop(columns=['id'])

test_data.head()

In [None]:
# Every training column except the label is a model input.
# Iterating a DataFrame yields its column labels in order.
features = [name for name in train_data if name != TARGET]

len(features)

In [None]:
# Per-row count of missing values, taken before any imputation happens.
# NOTE: this cell mutates `features` and both frames in place; run it once.
for frame in (train_data, test_data):
    frame['n_missing'] = frame[features].isna().sum(axis=1)

features.append('n_missing')

In [None]:
# Imputation values: the first mode of each feature column, computed on the
# training split only and then applied to both splits.
modes = train_data[features].mode().iloc[0]

for frame in (train_data, test_data):
    frame[features] = frame[features].fillna(modes)

In [None]:
# Standardise the features: statistics are fitted on the training split and
# the same fitted scaler is then applied to both splits.
scaler = preprocessing.StandardScaler().fit(train_data[features])

for frame in (train_data, test_data):
    frame[features] = scaler.transform(frame[features])

In [None]:
# Row-wise summary statistics over the current feature columns.
# The new column names are added to `features` only after both frames are
# done, so every statistic is computed over the same set of columns.
row_stat_cols = ['min_row', 'max_row', 'mean_row', 'std_row']

for frame in (train_data, test_data):
    base = frame[features]
    frame['min_row'] = base.min(axis=1)
    frame['max_row'] = base.max(axis=1)
    frame['mean_row'] = base.mean(axis=1)
    frame['std_row'] = base.std(axis=1)

features.extend(row_stat_cols)

In [None]:
def cross_validate_model(class_name, class_params, train_data, test_data, n_splits=5,
                         feature_cols=None, target_col=None):
    """Fit one classifier per stratified fold and collect its predictions.

    Parameters
    ----------
    class_name : callable
        Estimator class; instantiated as ``class_name(**class_params)`` for
        every fold. The instance must provide ``fit`` and ``predict_proba``.
    class_params : dict
        Keyword arguments forwarded to ``class_name``.
    train_data : pandas.DataFrame
        Frame holding the feature columns and the target column.
    test_data : pandas.DataFrame
        Frame holding the same feature columns.
    n_splits : int, default 5
        Number of stratified folds.
    feature_cols : list of str, optional
        Columns used as model input. Defaults to the module-level ``features``
        list as it is at call time.
    target_col : str, optional
        Label column. Defaults to the module-level ``TARGET``.

    Returns
    -------
    oof_preds : list of float
        Out-of-fold positive-class probabilities, one per row of
        ``train_data`` and in the same row order as ``train_data``.
    test_preds : numpy.ndarray
        Positive-class probabilities for ``test_data``, averaged over folds.
    """
    if feature_cols is None:
        feature_cols = features
    if target_col is None:
        target_col = TARGET

    X = train_data[feature_cols].to_numpy()
    # Plain array, so the fold indices below are positional whatever index
    # the frame carries.
    Y = train_data[target_col].to_numpy()
    X_test = test_data[feature_cols].to_numpy()

    skfolds = model_selection.StratifiedKFold(n_splits=n_splits, shuffle=False)

    # Validation folds of a stratified split are not contiguous row ranges,
    # so predictions are written back by row position rather than appended
    # fold after fold; this keeps them aligned with `train_data`.
    oof_preds = np.zeros(X.shape[0])
    test_preds = np.zeros(X_test.shape[0])

    for i, (train_index, val_index) in enumerate(skfolds.split(X, Y)):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = Y[train_index], Y[val_index]

        print(f"{'-'*10} Fold {i+1} Started {'-'*10}")
        clf = class_name(**class_params)

        clf = clf.fit(x_train, y_train)
        val_preds = clf.predict_proba(x_val)[:, 1]

        oof_preds[val_index] = val_preds

        # Accumulate; divided by the number of folds on return
        test_preds += clf.predict_proba(X_test)[:, 1]

        ra_score = metrics.roc_auc_score(y_val, val_preds)

        print(f"ROC AUC of current fold is {ra_score}")

    ra_score = metrics.roc_auc_score(Y, oof_preds)

    print(f"\nOverall ROC AUC is {ra_score}")

    return oof_preds.tolist(), test_preds / n_splits

In [None]:
# Hyper-parameters for the three gradient-boosting libraries.
# All three configurations request GPU training
# (`tree_method='gpu_hist'`, `device='gpu'`, `task_type='GPU'`).
# A `random_state` entry is added to the LightGBM / CatBoost dicts by the
# training cells further down.

# XGBoost
xgb_params = dict(
    n_estimators=3600,
    reg_lambda=3,
    reg_alpha=26,
    subsample=0.6000000000000001,
    colsample_bytree=0.6000000000000001,
    max_depth=9,
    min_child_weight=5,
    gamma=13.054739572819486,
    learning_rate=0.01,
    tree_method='gpu_hist',
    booster='gbtree',
)

# LightGBM
lgbm_params = dict(
    objective="binary",
    learning_rate=0.008,
    device='gpu',
    n_estimators=3205,
    num_leaves=184,
    min_child_samples=63,
    feature_fraction=0.6864594334728974,
    bagging_fraction=0.9497327922401265,
    bagging_freq=1,
    reg_alpha=19,
    reg_lambda=19,
    gpu_platform_id=0,
    gpu_device_id=0,
)

# CatBoost
catb_params = dict(
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

### Lv1
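**Input**: 118 original features plus 5 engineered row-level features (`n_missing`, `min_row`, `max_row`, `mean_row`, `std_row`)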

**Models**: [XGB, CATB1, CATB2, LGBM1, LGBM2]

In [None]:
# Empty containers for the level-1 predictions: one column per model,
# out-of-fold predictions in `lv1_oof`, test predictions in `lv1_test`.
lv1_oof, lv1_test = pd.DataFrame(), pd.DataFrame()

In [None]:
# Level-1 XGBoost: cross-validate on the engineered feature frame
oof_preds, test_preds = cross_validate_model(
    class_name=XGBClassifier,
    class_params=xgb_params,
    train_data=train_data,
    test_data=test_data,
    n_splits=N_FOLDS,
)

lv1_oof['xgb'], lv1_test['xgb'] = oof_preds, test_preds

In [None]:
# Level-1 CatBoost: two runs that differ only in `random_state`; each run
# gets its own prediction column.
for column, seed in (('catb_1', 42), ('catb_2', 2021)):
    # the shared dict is updated in place, so it keeps the last seed afterwards
    catb_params['random_state'] = seed
    oof_preds, test_preds = cross_validate_model(
        class_name=CatBoostClassifier,
        class_params=catb_params,
        train_data=train_data,
        test_data=test_data,
        n_splits=N_FOLDS,
    )

    lv1_oof[column] = oof_preds
    lv1_test[column] = test_preds

In [None]:
# Level-1 LightGBM: two runs that differ only in `random_state`; each run
# gets its own prediction column.
for column, seed in (('lgbm_1', 42), ('lgbm_2', 2021)):
    # the shared dict is updated in place, so it keeps the last seed afterwards
    lgbm_params['random_state'] = seed
    oof_preds, test_preds = cross_validate_model(
        class_name=LGBMClassifier,
        class_params=lgbm_params,
        train_data=train_data,
        test_data=test_data,
        n_splits=N_FOLDS,
    )

    lv1_oof[column] = oof_preds
    lv1_test[column] = test_preds

In [None]:
# Attach the labels so the next level can be trained on this frame.
# NOTE(review): this pairs predictions and labels by row index, so it is only
# meaningful if the out-of-fold columns are in training-row order; confirm.
lv1_oof = lv1_oof.assign(**{TARGET: train_data[TARGET]})

In [None]:
# Out-of-fold predictions of seven level-1 models trained in another notebook.
# Files are numbered 1..7, while the resulting columns are named l1_m0..l1_m6.
# NOTE(review): rows are matched to `lv1_oof` purely by position after sorting
# on `id`; this assumes the training rows here are in ascending `id` order.
for model_idx in range(7):
    l1_train_pred = pd.read_csv(
        f'../input/manav-l1/L1_train_pred_{model_idx + 1}.csv'
    ).sort_values('id')

    # the last column of each file is taken as the prediction
    lv1_oof[f'l1_m{model_idx}'] = l1_train_pred.iloc[:, -1].to_numpy()

In [None]:
# Preview the assembled level-1 out-of-fold frame
lv1_oof.head(n=5)

In [None]:
# Test-set predictions of the same seven externally trained level-1 models.
# Files are numbered 1..7, while the resulting columns are named l1_m0..l1_m6.
# NOTE(review): rows are matched to `lv1_test` purely by position after
# sorting on `id`; this assumes the test rows here are in ascending `id` order.
for model_idx in range(7):
    l1_test_pred = pd.read_csv(
        f'../input/manav-l1/L1_test_pred_{model_idx + 1}.csv'
    ).sort_values('id')

    # the last column of each file is taken as the prediction
    lv1_test[f'l1_m{model_idx}'] = l1_test_pred.iloc[:, -1].to_numpy()

In [None]:
# Preview the assembled level-1 test-prediction frame
lv1_test.head(n=5)

In [None]:
# Persist the level-1 predictions (without the index column)
for frame, out_path in ((lv1_oof, 'lv1_oof.csv'), (lv1_test, 'lv1_test.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# Rebind the global feature list to the level-1 prediction columns;
# `cross_validate_model` reads this list when the level-2 models are trained.
features = [name for name in lv1_oof if name != TARGET]

len(features)

### Lv2
**Input**: Predictions from previous level along with lv1 predictions from this [notebook](https://www.kaggle.com/manabendrarout/custom-stacking-of-classifiers-gpu-tps-sep2021/).

**Models**: [XGB, CATB, LGBM]

In [None]:
# Empty containers for the level-2 predictions: one column per model,
# out-of-fold predictions in `lv2_oof`, test predictions in `lv2_test`.
lv2_oof, lv2_test = pd.DataFrame(), pd.DataFrame()

In [None]:
# Level-2 XGBoost, trained on the level-1 prediction columns
oof_preds, test_preds = cross_validate_model(
    class_name=XGBClassifier,
    class_params=xgb_params,
    train_data=lv1_oof,
    test_data=lv1_test,
    n_splits=N_FOLDS,
)

lv2_oof['xgb'], lv2_test['xgb'] = oof_preds, test_preds

In [None]:
# Level-2 CatBoost, trained on the level-1 prediction columns
catb_params['random_state'] = 42
oof_preds, test_preds = cross_validate_model(
    class_name=CatBoostClassifier,
    class_params=catb_params,
    train_data=lv1_oof,
    test_data=lv1_test,
    n_splits=N_FOLDS,
)

lv2_oof['catb'], lv2_test['catb'] = oof_preds, test_preds

In [None]:
# Level-2 LightGBM, trained on the level-1 prediction columns
lgbm_params['random_state'] = 2021
oof_preds, test_preds = cross_validate_model(
    class_name=LGBMClassifier,
    class_params=lgbm_params,
    train_data=lv1_oof,
    test_data=lv1_test,
    n_splits=N_FOLDS,
)

lv2_oof['lgbm'], lv2_test['lgbm'] = oof_preds, test_preds

In [None]:
# Persist the level-2 predictions (without the index column)
for frame, out_path in ((lv2_oof, 'lv2_oof.csv'), (lv2_test, 'lv2_test.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# Build the level-3 input: level-2 predictions plus every level-1 column.
# Both levels have a column called 'xgb', so the level-2 one is first copied
# to 'xgb_l2'; the copy loop then overwrites 'xgb' with the level-1 values.
# NOTE: run this cell only once per level-2 fit. On a second run 'xgb'
# already holds level-1 values and 'xgb_l2' would be overwritten with them.
for lv2_frame, lv1_frame in ((lv2_oof, lv1_oof), (lv2_test, lv1_test)):
    lv2_frame['xgb_l2'] = lv2_frame['xgb']

    for name in lv1_frame.columns:
        lv2_frame[name] = lv1_frame[name]

lv2_oof.head()

In [None]:
# Rebind the global feature list to the combined level-1 + level-2 columns
# that feed the final blender.
features = [name for name in lv2_oof if name != TARGET]

len(features)

### Lv3
**Input**: Predictions from previous two levels.

**Models**: [LogisticRegression]

In [None]:
# Level-3 blender: logistic regression over the level-1 and level-2
# predictions, cross-validated with the same helper as the earlier levels
# instead of a second hand-written copy of the fold loop.
# `cross_validate_model` reads the module-level `features` list, which the
# previous cell rebuilt from the `lv2_oof` columns. It prints the per-fold and
# overall ROC AUC and returns the test predictions already averaged over folds.
oof_preds, test_preds = cross_validate_model(linear_model.LogisticRegression,
                                             {'max_iter': 1000},
                                             lv2_oof,
                                             lv2_test,
                                             N_FOLDS)

In [None]:
# Fill the sample submission with the blended test predictions and save it.
# The predictions are written by position, in the row order of the file.
sample_path = '../input/tabular-playground-series-sep-2021/sample_solution.csv'
sub = pd.read_csv(sample_path).assign(claim=test_preds)

sub.to_csv('submission.csv', index=False)

In [None]:
# Preview the submission that was just written
sub.head(n=5)