## Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

## Load processed datasets

In [None]:
# NOTE(security): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open("../input/tps-sep-cooking-data/TPS_Sep_Dataset.txt", 'rb') as handle:
    # Deserialize straight from the file handle. Reading the raw bytes into a
    # separate variable first would keep the whole serialized payload alive in
    # the kernel for the rest of the session.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container; train_df / test_df hold their own references to the
# objects they were assigned above.
del processed_data
gc.collect()

In [None]:
# Separate the 'claim' target from the feature columns. Each piece is an
# independent copy, so the source frames can be released right after.
Ytrain = train_df['claim'].copy()
Xtrain = train_df.drop(columns='claim')
Xtest = test_df.copy()

print(f"Xtrain: {Xtrain.shape} \nYtrain: {Ytrain.shape} \nXtest: {Xtest.shape}")

# The originals are no longer needed once the copies exist.
del train_df, test_df
gc.collect()

## Helper Function

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw ``cm`` as an annotated heat map using the pyplot state machine.

    Args:
        cm: 2-D array of integer counts, indexed as ``cm[true, predicted]``
            (rows go on the y axis, columns on the x axis).
        classes: Tick labels, applied to both axes.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Counts above half of the largest cell value are written in white,
    # everything else in black.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            if count > midpoint:
                text_colour = "white"
            else:
                text_colour = "black"
            # x is the column (predicted), y is the row (true).
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_colour)

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

## XGBoost Model

In [None]:
def xgb_train_predict(params, train, test, true_label, FOLD=5, SEEDS=[42]):
    """Train two-stage XGBoost models with stratified K-fold CV and collect predictions.

    For every seed the training rows are split into ``FOLD`` stratified folds.
    On each fold a model is first fitted with learning rate 0.02, then training
    is continued from that model with learning rate 0.008. The continued model
    predicts the held-out fold (out-of-fold predictions) and the test set.

    Args:
        params: Keyword arguments for ``XGBClassifier``. A ``learning_rate``
            entry, if present, is overridden per stage. The mapping itself is
            not modified.
        train: Training features; must support ``.iloc`` and ``.shape``.
        test: Test features; must support ``.shape``.
        true_label: Labels aligned with ``train``; must support ``.iloc``.
        FOLD: Number of stratified folds.
        SEEDS: Seeds for the fold shuffling; one output column per seed. The
            seed only controls the split, while the model's own randomness
            comes from whatever ``params`` specifies.

    Returns:
        Tuple ``(y_pred_meta_xgb, y_pred_final_xgb, oof_score)``:
        out-of-fold probabilities of the last class with shape
        ``(n_train, len(SEEDS))``, test probabilities averaged over folds with
        shape ``(n_test, len(SEEDS))``, and the mean validation ROC AUC over
        all seed/fold combinations.
    """
    counter = 0
    oof_score = 0
    y_pred_final_xgb = np.zeros((test.shape[0], len(SEEDS)))
    y_pred_meta_xgb = np.zeros((train.shape[0], len(SEEDS)))

    # Per-stage copies: assigning into ``params`` directly would leak the
    # learning-rate override back into the caller's dictionary.
    init_params = {**params, 'learning_rate': 0.02}
    refine_params = {**params, 'learning_rate': 0.008}

    for sidx, seed in enumerate(SEEDS):
        seed_score = 0

        kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        for idx, (train_idx, val_idx) in enumerate(kfold.split(train, true_label)):
            counter += 1

            train_x, train_y = train.iloc[train_idx], true_label.iloc[train_idx]
            val_x, val_y = train.iloc[val_idx], true_label.iloc[val_idx]

            # Stage 1: coarse fit with the larger learning rate.
            init_model = XGBClassifier(**init_params)

            init_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                           early_stopping_rounds=200, verbose=500)

            # Stage 2: continue boosting from stage 1 with the smaller learning rate.
            model = XGBClassifier(**refine_params)

            model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                      early_stopping_rounds=100, verbose=300, xgb_model=init_model)

            # ``best_iteration`` is a 0-based index and ``iteration_range`` is
            # half-open, so the end must be best_iteration + 1 to include the
            # best round (and to avoid (0, 0), which means "use all trees").
            # NOTE(review): training is continued from ``init_model``; confirm
            # that the installed xgboost version reports ``best_iteration``
            # relative to the combined booster and not only the continuation.
            best_range = (0, model.best_iteration + 1)

            # Column -1 of predict_proba is the probability of the last class.
            y_pred = model.predict_proba(val_x, iteration_range=best_range)[:, -1]
            y_pred_meta_xgb[val_idx, sidx] += y_pred
            y_pred_final_xgb[:, sidx] += model.predict_proba(test, iteration_range=best_range)[:, -1]

            score = roc_auc_score(val_y, y_pred)
            oof_score += score
            seed_score += score
            print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

        print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

    # Test predictions were summed over folds; turn them into a per-seed mean.
    y_pred_final_xgb = y_pred_final_xgb / float(FOLD)
    oof_score /= float(counter)
    print("Aggregate OOF Score: {}".format(oof_score))

    return y_pred_meta_xgb, y_pred_final_xgb, oof_score

In [None]:
# First XGBoost configuration: depth-4 trees trained with the GPU histogram method.
params1 = dict(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    n_estimators=8000,
    max_depth=4,
    gamma=0.2465,
    subsample=0.6423,
    colsample_bytree=0.775,
    colsample_bylevel=0.868,
    min_child_weight=366,
    reg_lambda=0.05,
    reg_alpha=10,
    verbosity=0,
    random_state=42,
)

# Cross-validated training; keeps the out-of-fold predictions, the test
# predictions and the aggregate validation score for this configuration.
y_pred_meta_xgb1, y_pred_final_xgb1, oof_score1 = xgb_train_predict(
    params1, Xtrain, Xtest, Ytrain
)

In [None]:
# Second XGBoost configuration: depth-3 trees with 0.5 row and column
# subsampling, trained with the GPU histogram method.
params2 = dict(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
    n_estimators=8000,
    max_depth=3,
    gamma=0.2465,
    subsample=0.5,
    colsample_bytree=0.5,
    colsample_bylevel=0.868,
    min_child_weight=256,
    reg_lambda=0.05,
    reg_alpha=10,
    verbosity=0,
    random_state=42,
)

# Cross-validated training; keeps the out-of-fold predictions, the test
# predictions and the aggregate validation score for this configuration.
y_pred_meta_xgb2, y_pred_final_xgb2, oof_score2 = xgb_train_predict(
    params2, Xtrain, Xtest, Ytrain
)

In [None]:
# Place the prediction columns of both configurations side by side.
meta_parts = [y_pred_meta_xgb1, y_pred_meta_xgb2]
final_parts = [y_pred_final_xgb1, y_pred_final_xgb2]
y_pred_meta_xgb = np.concatenate(meta_parts, axis=1)
y_pred_final_xgb = np.concatenate(final_parts, axis=1)
del meta_parts, final_parts
print(f"y_pred_meta_xgb: {y_pred_meta_xgb.shape} \ny_pred_final_xgb: {y_pred_final_xgb.shape}")

In [None]:
# Average the out-of-fold probabilities across columns, then label a row 1
# when its mean probability is strictly above 0.5.
y_pred_meta = y_pred_meta_xgb.mean(axis=1)
y_pred = np.greater(y_pred_meta, 0.5).astype(int)
print(classification_report(Ytrain, y_pred))

In [None]:
# Limit numpy's printed float precision for anything displayed from here on.
np.set_printoptions(precision=2)

# Confusion matrix of the thresholded out-of-fold predictions against the labels.
cnf_matrix = confusion_matrix(y_true=Ytrain, y_pred=y_pred, labels=[0, 1])

plt.figure(figsize=(12, 5))
plot_confusion_matrix(cnf_matrix, classes=[0, 1])

In [None]:
# Persist the out-of-fold / test prediction matrices together with both
# aggregate validation scores in one compressed archive.
np.savez_compressed(
    './XGB_Meta_Features.npz',
    **{
        'y_pred_meta_xgb': y_pred_meta_xgb,
        'oof_score1': oof_score1,
        'oof_score2': oof_score2,
        'y_pred_final_xgb': y_pred_final_xgb,
    },
)

## Create submission file

In [None]:
# Blend the test predictions by averaging across columns.
y_pred_final = y_pred_final_xgb.mean(axis=1)

# Reuse the sample file's layout and overwrite only the 'claim' column.
sample_path = "../input/tabular-playground-series-sep-2021/sample_solution.csv"
submit_df = pd.read_csv(sample_path)
del sample_path
submit_df['claim'] = y_pred_final

submit_df.to_csv("XGB_Submission.csv", index=False)
submit_df.head(10)