In [None]:
import pandas as pd
import numpy as np
import random
import time
import gc
import os

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from sklearn import svm
from sklearn import tree
from sklearn import impute
from sklearn import ensemble
from sklearn import linear_model
from sklearn import decomposition


from matplotlib import pyplot as plt
import matplotlib as mpl
import seaborn as sns

import xgboost as xgb
import lightgbm as lgb
import catboost as cat

import warnings

print('done!')

## Data

In [None]:
# ---- Notebook-wide configuration -------------------------------------------
# N_SPLITS is the number of CV folds handed to cross_validate_model.
N_SPLITS = 5
# N_ESTIMATORS, EARLY_STOPING_ROUND, VERBOSE and N_BINS are not referenced by
# any cell visible in this notebook section (the model dicts carry their own
# n_estimators / iterations values).
N_ESTIMATORS = 1000
EARLY_STOPING_ROUND = 200
VERBOSE = 1000
# Seed passed to seed_everything() below.
SEED = 2021

N_BINS = 20

def seed_everything(seed = 42):
    """Seed the global RNGs and apply notebook display settings.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    writes ``seed`` to the ``PYTHONHASHSEED`` environment variable.  It also
    configures pandas display options, the seaborn style, the matplotlib
    figure DPI and the inline plotting backend.

    Parameters
    ----------
    seed : int, default 42
        Value used for every seeding call.
    """
    random.seed(seed)
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # presumably only influences child processes, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
    # Show every row/column and print floats in plain fixed-point notation.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    # NOTE(review): relies on pandas resolving the short name 'float_format'
    # to 'display.float_format' — confirm for the pandas version in use.
    pd.set_option('float_format', '{:f}'.format)
    
    sns.set_style("white")
    mpl.rcParams['figure.dpi'] = 600
    # IPython line magic: only valid when executed by an IPython kernel.
    # NOTE(review): some inline-backend versions reset rcParams when the
    # backend is activated, which could undo the DPI set above — verify.
    %matplotlib inline

seed_everything(SEED)

In [None]:
# Root folder of the competition files (trailing slash is required because
# file names are appended by plain string concatenation).
path = "../input/tabular-playground-series-sep-2021/"

# Dataset name -> full CSV path.
data = {
    key: path + filename
    for key, filename in (
        ("train", "train.csv"),
        ("test", "test.csv"),
        ("sample", "sample_solution.csv"),
    )
}

# Read the three CSVs in the order train, test, sample.
train, test, sample = (pd.read_csv(data[key]) for key in ("train", "test", "sample"))

# Preview the first training rows as the cell output.
train.head()

In [None]:
# Name of the label column.
TARGET   = 'claim'

# Every training column except the row identifier and the label is a feature.
features = [name for name in train.columns if name != 'id' and name != 'claim']

# Independent copy of the label column.
target   = train[TARGET].copy()

# Cell output: number of model features.
len(features)

In [None]:

# Per-row count of missing feature values, computed before any imputation.
train["n_missing"] = train[features].isnull().sum(axis="columns")
test["n_missing"]  = test[features].isnull().sum(axis="columns")

# Register the new column as a model feature (mutates the list in place).
features.append('n_missing')

# Keep an independent copy of the training counts.
n_missing = train["n_missing"].copy()

In [None]:
# Most frequent value of each feature, taken from the training data only
# (first row of .mode() when a column has several modes).
modes = train[features].mode().iloc[0]

# Replace missing values in both frames with the training modes.
train[features] = train[features].fillna(modes)
test[features]  = test[features].fillna(modes)

# Standardise the features: statistics are learned on train and then applied
# unchanged to both train and test.
scaler = StandardScaler()
scaler.fit(train[features])

train[features] = scaler.transform(train[features])
test[features]  = scaler.transform(test[features])

In [None]:
# Add row-wise summary statistics of the current feature columns to both
# frames.  The feature list is only extended afterwards, so none of the new
# columns feeds into another statistic.
for _frame in (train, test):
    _rows = _frame[features]
    for _stat in ("min", "max", "std", "mean"):
        _frame[_stat] = getattr(_rows, _stat)(axis=1)

# Register the four new columns as model features.
features.extend(['min', 'max', 'mean', 'std'])

In [None]:
def cross_validate_model(class_name, class_params, train_data, test_data, n_splits=N_SPLITS):
    """Train one model per stratified fold and collect OOF / test predictions.

    Reads the module-level ``features`` list and ``TARGET`` column name.

    Parameters
    ----------
    class_name : callable
        Estimator class; a fresh instance is built for every fold with
        ``class_name(**class_params)``.  The instance must provide ``fit`` and
        ``predict_proba``.
    class_params : dict
        Keyword arguments for the estimator constructor.
    train_data : pandas.DataFrame
        Must contain the ``features`` columns and the ``TARGET`` column.
    test_data : pandas.DataFrame
        Must contain the ``features`` columns.
    n_splits : int, default N_SPLITS
        Number of stratified folds.

    Returns
    -------
    oof_preds : numpy.ndarray, shape (len(train_data),)
        Out-of-fold positive-class probabilities.  Entry ``k`` belongs to the
        ``k``-th row of ``train_data``, so the array can be joined row-by-row
        with the training labels.
    test_preds : numpy.ndarray, shape (len(test_data),)
        Positive-class probabilities on ``test_data`` averaged over the folds.
    """
    X = train_data[features].to_numpy()
    # split() yields *positional* indices, so the target must be a plain array;
    # indexing a Series with them would look rows up by index label instead.
    y = train_data[TARGET].to_numpy()
    X_test = test_data[features].to_numpy()

    skf = StratifiedKFold(n_splits=n_splits, shuffle=False)

    # Preallocate so each prediction is stored at its own row position.
    # Stratified validation folds are not contiguous row ranges, so appending
    # fold after fold would return the predictions in a permuted order.
    oof_preds = np.zeros(X.shape[0])
    test_preds = np.zeros(X_test.shape[0])

    for i, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        xtrain, xvalid = X[train_idx], X[valid_idx]
        ytrain, yvalid = y[train_idx], y[valid_idx]

        print(f"{'-'*10} Fold {i+1} Started {'-'*10}")

        clf = class_name(**class_params)
        clf.fit(xtrain, ytrain)

        # Column 1 of predict_proba is the probability of the positive class.
        valid_proba = clf.predict_proba(xvalid)[:, 1]
        oof_preds[valid_idx] = valid_proba

        # Accumulate test predictions; they are averaged on return.
        test_preds += clf.predict_proba(X_test)[:, 1]

        roc_score = roc_auc_score(yvalid, valid_proba)
        print(f"\n roc : {roc_score}\n")

    # Every row is validated exactly once, so this scores the full OOF vector.
    roc_score = roc_auc_score(y, oof_preds)

    print(f"\n Final ROC AUC : {roc_score}")

    return oof_preds, test_preds / n_splits
        



In [None]:
# Constructor arguments for the three level-1 model families.  All three are
# configured for GPU training.  `random_state` is not part of these dicts; the
# CatBoost and LightGBM training cells add it before each run.

# XGBoost
xgb_params = dict(
    n_estimators=3600,
    reg_lambda=3,
    reg_alpha=26,
    subsample=0.6000000000000001,
    colsample_bytree=0.6000000000000001,
    max_depth=9,
    min_child_weight=5,
    gamma=13.054739572819486,
    learning_rate=0.01,
    tree_method='gpu_hist',
    booster='gbtree',
    use_label_encoder=False,
)

# LightGBM
lgbm_params = dict(
    objective="binary",
    learning_rate=0.008,
    device='gpu',
    n_estimators=3205,
    num_leaves=184,
    min_child_samples=63,
    feature_fraction=0.6864594334728974,
    bagging_fraction=0.9497327922401265,
    bagging_freq=1,
    reg_alpha=19,
    reg_lambda=19,
    gpu_platform_id=0,
    gpu_device_id=0,
)

# CatBoost
catb_params = dict(
    iterations=15585,
    objective='CrossEntropy',
    bootstrap_type='Bernoulli',
    od_wait=1144,
    learning_rate=0.023575206684596582,
    reg_lambda=36.30433203563295,
    random_strength=43.75597655616195,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8227911142845009,
    task_type='GPU',
    devices='0',
    verbose=0,
)


In [None]:
# Empty containers for the level-1 predictions: one column per base model,
# out-of-fold predictions in lv1_oof and test predictions in lv1_test.
lv1_oof, lv1_test = pd.DataFrame(), pd.DataFrame()

In [None]:
# Cross-validate the XGBoost classifier and store its predictions as the
# 'xgb' column of both level-1 frames.
oof_preds, test_preds = cross_validate_model(
    class_name=xgb.XGBClassifier,
    class_params=xgb_params,
    train_data=train,
    test_data=test,
    n_splits=N_SPLITS,
)

lv1_oof['xgb'], lv1_test['xgb'] = oof_preds, test_preds

In [None]:
# Cross-validate CatBoost twice with the same configuration but different
# seeds; each run becomes its own level-1 column.  `catb_params` is updated in
# place, so it keeps the last seed after the loop.
for _column, _seed in (('catb_1', 42), ('catb_2', 2021)):
    catb_params['random_state'] = _seed
    oof_preds, test_preds = cross_validate_model(
        cat.CatBoostClassifier, catb_params, train, test, N_SPLITS
    )
    lv1_oof[_column] = oof_preds
    lv1_test[_column] = test_preds



In [None]:
# Cross-validate LightGBM twice with the same configuration but different
# seeds; each run becomes its own level-1 column.  `lgbm_params` is updated in
# place, so it keeps the last seed after the loop.
for _column, _seed in (('lgbm_1', 42), ('lgbm_2', 2021)):
    lgbm_params['random_state'] = _seed
    oof_preds, test_preds = cross_validate_model(
        lgb.LGBMClassifier, lgbm_params, train, test, N_SPLITS
    )
    lv1_oof[_column] = oof_preds
    lv1_test[_column] = test_preds



In [None]:
# Attach the ground-truth label so the level-1 frame can be used as a
# training set for a second-stage model.
# NOTE(review): the assignment pairs rows by index, so it is only meaningful
# if every prediction column in lv1_oof is in the same row order as `train`
# — confirm against what cross_validate_model returns.
lv1_oof[TARGET] = train[TARGET]


In [None]:
# Persist both level-1 frames as CSV files in the working directory, without
# writing the row index.
for _frame, _filename in ((lv1_oof, "lv1_train.csv"), (lv1_test, "lv1_test.csv")):
    _frame.to_csv(_filename, index=False)



In [None]:
# Sanity check: read the exported level-1 training file back and preview its
# first rows as the cell output.
df = pd.read_csv("lv1_train.csv")
df.head()