## Import libraries

In [None]:
import gc
import pickle
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

## Load processed datasets

In [None]:
# NOTE(security): unpickling can execute arbitrary code. Only load this file
# if it comes from a trusted source.
with open("../input/tps-sep-cooking-data/TPS_Sep_Dataset.txt", 'rb') as handle:
    # Deserialize straight from the file object. Reading the file into a
    # separate module-level bytes buffer first would keep the raw pickle bytes
    # alive for the rest of the session, defeating the cleanup below.
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two frames stay reachable through their own names.
del processed_data
gc.collect()

In [None]:
# Target column on its own.
Ytrain = train_df['claim'].copy()

# Every column except the target becomes the training feature matrix.
feature_mask = train_df.columns != 'claim'
Xtrain = train_df.loc[:, feature_mask].copy()

# Independent copy of the test frame so the source frame can be released below.
Xtest = test_df.copy()

print(f"Xtrain: {Xtrain.shape} \nYtrain: {Ytrain.shape} \nXtest: {Xtest.shape}")

# The copies above do not depend on the source frames any more.
del train_df, test_df
gc.collect()

In [None]:
# Columns that are passed to CatBoost as categorical features.
cat_cols = [
    'f5_bin',
    'f29_bin',
    'f40_bin',
    'f42_bin',
    'f50_bin',
    'f65_bin',
    'f70_bin',
    'f74_bin',
    'f75_bin',
    'f91_bin',
    'clusters_k',
]

# Cast the categorical columns to integers in both feature frames.
for frame in (Xtrain, Xtest):
    frame[cat_cols] = frame[cat_cols].astype(int)

# Positional indices of the categorical columns within Xtrain.
cat_cols_indices = list(map(Xtrain.columns.get_loc, cat_cols))
print(cat_cols_indices)

## Helper Function

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of integer counts, indexed as ``cm[true, predicted]``
            (rows are labelled 'True label', columns 'Predicted label').
        classes: Sequence of class labels used as tick labels on both axes.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Cells above half the maximum count get white text, the rest black.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            # x is the column (predicted), y is the row (true).
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

## CatBoost Model

In [None]:
def cb_train_predict(params, train, test, true_label, FOLD=5, SEEDS=(42,)):
    """Train CatBoost with stratified K-fold CV and collect OOF/test predictions.

    For every seed in ``SEEDS`` the training data is split into ``FOLD``
    stratified folds. On each fold two models are fitted: a first model with
    learning rate 0.07, and a second model with learning rate 0.02 that
    receives the first one through ``init_model``. Only the second model is
    used for prediction.

    Relies on the module-level ``cat_cols_indices`` for the categorical
    feature positions.

    Args:
        params: Keyword arguments for ``CatBoostClassifier``. The dict is not
            modified; any ``learning_rate`` entry is overridden per stage.
        train: Training feature frame (indexed positionally with ``.iloc``).
        test: Test feature frame passed to ``predict_proba``.
        true_label: Training target (indexed positionally with ``.iloc``).
        FOLD: Number of stratified folds per seed.
        SEEDS: Iterable of seeds for the fold shuffling.
            NOTE(review): the seed only controls the fold split here; the
            model's own randomness comes from whatever ``params`` contains.

    Returns:
        Tuple ``(y_pred_meta_cb, y_pred_final_cb, oof_score)``:
        ``y_pred_meta_cb`` has shape ``(train.shape[0], len(SEEDS))`` and holds
        the out-of-fold probabilities (last ``predict_proba`` column);
        ``y_pred_final_cb`` has shape ``(test.shape[0], len(SEEDS))`` and holds
        the test probabilities averaged over the folds of each seed;
        ``oof_score`` is the mean validation ROC AUC over all folds and seeds.
    """
    # Build the per-stage parameter sets from copies so the caller's dict is
    # left untouched (the learning rate used to be written into it in place).
    warmup_params = {**params, 'learning_rate': 0.07}
    finetune_params = {**params, 'learning_rate': 0.02}

    counter = 0
    oof_score = 0
    y_pred_final_cb = np.zeros((test.shape[0], len(SEEDS)))
    y_pred_meta_cb = np.zeros((train.shape[0], len(SEEDS)))

    for sidx, seed in enumerate(SEEDS):
        seed_score = 0

        kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        for idx, (train_idx, val_idx) in enumerate(kfold.split(train, true_label)):
            counter += 1

            train_x, train_y = train.iloc[train_idx], true_label.iloc[train_idx]
            val_x, val_y = train.iloc[val_idx], true_label.iloc[val_idx]

            # Stage 1: fit with the larger learning rate.
            init_model = CatBoostClassifier(**warmup_params)
            init_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                           early_stopping_rounds=200, verbose=500,
                           cat_features=cat_cols_indices)

            # Stage 2: fit with the smaller learning rate, starting from the
            # stage-1 model via init_model.
            model = CatBoostClassifier(**finetune_params)
            model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)],
                      early_stopping_rounds=100, verbose=200,
                      cat_features=cat_cols_indices, init_model=init_model)

            # Each training row is in exactly one validation fold per seed, so
            # column `sidx` ends up fully populated with OOF predictions.
            y_pred = model.predict_proba(val_x)[:, -1]
            y_pred_meta_cb[val_idx, sidx] += y_pred
            # Test predictions are summed over folds and averaged after the loops.
            y_pred_final_cb[:, sidx] += model.predict_proba(test)[:, -1]

            score = roc_auc_score(val_y, y_pred)
            oof_score += score
            seed_score += score
            print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

        print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

    # Each seed column accumulated FOLD test predictions.
    y_pred_final_cb = y_pred_final_cb / float(FOLD)
    # `counter` is the total number of fitted folds across all seeds.
    oof_score /= float(counter)
    print("Aggregate OOF Score: {}".format(oof_score))

    return y_pred_meta_cb, y_pred_final_cb, oof_score

In [None]:
# First CatBoost configuration.
params1 = dict(
    objective='CrossEntropy',
    eval_metric='AUC',
    iterations=8000,
    od_wait=1144,
    use_best_model=True,
    bootstrap_type='Bernoulli',
    reg_lambda=36.304,
    random_strength=43.756,
    depth=7,
    min_data_in_leaf=11,
    leaf_estimation_iterations=1,
    subsample=0.8228,
    random_state=42,
)

# Cross-validated training: OOF predictions, test predictions and mean AUC.
y_pred_meta_cb1, y_pred_final_cb1, oof_score1 = cb_train_predict(
    params1, Xtrain, Xtest, Ytrain
)

In [None]:
# Second CatBoost configuration.
params2 = dict(
    objective='CrossEntropy',
    eval_metric='AUC',
    iterations=8000,
    od_wait=1144,
    use_best_model=True,
    bootstrap_type='Bernoulli',
    reg_lambda=3.0,
    random_strength=34.756,
    depth=10,
    min_data_in_leaf=5,
    leaf_estimation_iterations=1,
    subsample=0.85,
    random_state=42,
)

# Cross-validated training: OOF predictions, test predictions and mean AUC.
y_pred_meta_cb2, y_pred_final_cb2, oof_score2 = cb_train_predict(
    params2, Xtrain, Xtest, Ytrain
)

In [None]:
# Place the per-model prediction columns side by side (one column per model/seed).
y_pred_meta_cb = np.hstack((y_pred_meta_cb1, y_pred_meta_cb2))
y_pred_final_cb = np.hstack((y_pred_final_cb1, y_pred_final_cb2))
print(f"y_pred_meta_cb: {y_pred_meta_cb.shape} \ny_pred_final_cb: {y_pred_final_cb.shape}")

In [None]:
# Average the out-of-fold probabilities across all model columns.
y_pred_meta = y_pred_meta_cb.mean(axis=1)

# Hard labels: strictly above 0.5 counts as class 1.
is_positive = y_pred_meta > 0.5
y_pred = is_positive.astype(int)

print(classification_report(Ytrain, y_pred))

In [None]:
# The same label order is used for the matrix rows/columns and the plot ticks.
class_labels = [0, 1]
cnf_matrix = confusion_matrix(Ytrain, y_pred, labels=class_labels)

np.set_printoptions(precision=2)
plt.figure(figsize=(12, 5))
plot_confusion_matrix(cnf_matrix, classes=class_labels)

In [None]:
# Save the stacked OOF/test predictions and both CV scores in one compressed archive.
meta_arrays = {
    'y_pred_meta_cb': y_pred_meta_cb,
    'oof_score1': oof_score1,
    'oof_score2': oof_score2,
    'y_pred_final_cb': y_pred_final_cb,
}
np.savez_compressed('./CB_Meta_Features.npz', **meta_arrays)

## Create submission file

In [None]:
# Start from the sample submission so the file keeps its columns and row order.
submit_df = pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")

# Final prediction is the row-wise mean over all model columns.
y_pred_final = y_pred_final_cb.mean(axis=1)
submit_df['claim'] = y_pred_final

submit_df.to_csv("CB_Submission.csv", index=False)
submit_df.head(10)