In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans


In [None]:
# Read the competition train/test CSVs, then discard the 'id' column from each
# frame so that it never reaches the model as an input.
train_df = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
train_df = train_df.drop(columns=['id'])

test_df = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")
test_df = test_df.drop(columns=['id'])

In [None]:
# Model inputs are every column of train_df except the 'claim' target.
features = [name for name in train_df.columns if name != 'claim']

# Work on copies so that later in-place preprocessing leaves train_df/test_df intact.
Xtrain = train_df[features].copy()
Ytrain = train_df['claim'].copy()
Xtest = test_df.copy()

# Mean of the first feature column (pandas skips NaN by default).
# NOTE(review): not referenced by any of the visible cells — confirm it is still needed.
mean = Xtrain[features].mean().iloc[0]



In [None]:
# Fill missing values with the median of each column, then standardise every
# column to zero mean / unit variance.
#
# Both transformers work column-wise on a 2-D feature matrix, so each one is
# fitted once on the whole training frame (statistics come from the training
# data only) and then applied to the test frame. This replaces the previous
# per-column Python loops, which re-fitted the same object for every column and
# left `imputer` / `scaler` holding only the statistics of the last column.
imputer = SimpleImputer(strategy="median")
Xtrain[features] = imputer.fit_transform(Xtrain[features])
Xtest[features] = imputer.transform(Xtest[features])

scaler = StandardScaler()
Xtrain[features] = scaler.fit_transform(Xtrain[features])
Xtest[features] = scaler.transform(Xtest[features])

In [None]:
# Cross-validation settings: number of stratified folds and the seeds used to
# shuffle the splits (one full K-fold pass is run per seed).
FOLD = 5
SEEDS = [29]

# Running totals filled in by the training loop.
counter = 0      # number of models fitted so far (seeds x folds)
oof_score = 0    # sum of the per-fold validation ROC AUC scores
fet_imp = 0      # sum of the per-model feature importance vectors

# Column vectors (n_rows, 1) that accumulate predicted probabilities:
# out-of-fold predictions for the training rows and summed test predictions.
y_pred_meta_xgb = np.zeros((len(Xtrain), 1))
y_pred_final_xgb = np.zeros((len(Xtest), 1))


# Seeded stratified K-fold training.
# For every seed the training data is split into FOLD stratified folds; an
# XGBoost classifier is fitted on the training part of each split and its
# predicted probabilities are accumulated for the held-out rows
# (y_pred_meta_xgb) and for the test set (y_pred_final_xgb). Per-fold ROC AUC
# is printed and summed into oof_score / seed_score.
for sidx, seed in enumerate(SEEDS):
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(Xtrain.values, Ytrain.values)):
        counter += 1

        train_x, train_y = Xtrain.iloc[train], Ytrain.iloc[train]
        val_x, val_y = Xtrain.iloc[val], Ytrain.iloc[val]

        # NOTE(review): random_state is fixed at 42, so the loop seed only
        # changes the fold split, not the model's own sampling — confirm intended.
        # A GPU tree method (e.g. tree_method='gpu_hist') was previously
        # considered here and left disabled.
        model = XGBClassifier(
            objective='binary:logistic',
            use_label_encoder=False,
            n_estimators=2600,
            learning_rate=0.04,
            subsample=0.66,
            colsample_bytree=0.1,
            max_depth=8,
            booster='gbtree',
            gamma=5.5,
            reg_alpha=81.8,
            reg_lambda=72.0,
            random_state=42,
            verbosity=0,
            eval_metric='auc',
            n_jobs=4,
        )

        # Early stopping is driven by the last eval_set entry, i.e. the
        # validation fold; the training fold is only logged for comparison.
        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                  early_stopping_rounds=200, verbose=50)

        # iteration_range is half-open [begin, end) and best_iteration is a
        # 0-based index, so the end must be best_iteration + 1 to include the
        # best boosting round. Using best_iteration itself drops that round and,
        # when best_iteration == 0, yields (0, 0), which means "use all trees".
        best_range = (0, model.best_iteration + 1)

        # Probability of the positive class (last column) as a column vector.
        y_pred = model.predict_proba(val_x, iteration_range=best_range)[:, -1]
        y_pred_meta_xgb[val] += y_pred.reshape(-1, 1)
        y_pred_final_xgb += model.predict_proba(Xtest, iteration_range=best_range)[:, -1].reshape(-1, 1)

        fet_imp += model.feature_importances_
        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

    print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Convert the running sums from the training loop into averages.
# Feature importances, test predictions and the AUC total were accumulated once
# per fitted model, so they are divided by the model count. Each training row
# is predicted out-of-fold once per seed, so those are divided by the number of seeds.
n_models = float(counter)
fet_imp = fet_imp / n_models
y_pred_final_xgb = y_pred_final_xgb / n_models
y_pred_meta_xgb = y_pred_meta_xgb / float(len(SEEDS))
oof_score /= n_models
print("Aggregate OOF Score: {}".format(oof_score))