## Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import pickle
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

## Load processed datasets

In [None]:
# NOTE: pickle can execute arbitrary code while deserializing; only load
# dataset files that come from a trusted source.
# Deserialize straight from the file handle so the raw pickle bytes are not
# kept alive in a notebook-level variable for the rest of the session.
with open("../input/tps-sep-cooking-data/TPS_Sep_Dataset.txt", 'rb') as handle:
    processed_data = pickle.load(handle)

train_df = processed_data['train_df']
test_df = processed_data['test_df']

# Drop the container dict; the two datasets stay referenced by their own names.
del processed_data
gc.collect()

In [None]:
# Separate the 'claim' target from the remaining (feature) columns.
Ytrain = train_df['claim'].copy()
Xtrain = train_df.drop(columns='claim')
Xtest = test_df.copy()

print(f"Xtrain: {Xtrain.shape} \nYtrain: {Ytrain.shape} \nXtest: {Xtest.shape}")

# The original frames are no longer needed once the copies exist.
del train_df, test_df
gc.collect()

In [None]:
# Columns passed to LightGBM as categorical features.
cat_cols = [
    'f5_bin', 'f29_bin', 'f40_bin', 'f42_bin', 'f50_bin',
    'f65_bin', 'f70_bin', 'f74_bin', 'f75_bin', 'f91_bin',
    'clusters_k',
]

# Cast the categorical columns to integers in both frames.
Xtrain[cat_cols] = Xtrain[cat_cols].astype(int)
Xtest[cat_cols] = Xtest[cat_cols].astype(int)

# Positional indices of those columns within Xtrain.
cat_cols_indices = list(map(Xtrain.columns.get_loc, cat_cols))
print(cat_cols_indices)

## Helper Function

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map using pyplot.

    Parameters
    ----------
    cm : 2-D array of integer counts (each cell is formatted with 'd').
    classes : sequence of labels used as tick labels on both axes.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0)
    plt.yticks(positions, classes)

    # Counts above half of the maximum are written in white, the rest in black.
    midpoint = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            count = cm[row, col]
            text_color = "white" if count > midpoint else "black"
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

## LightGBM Model

In [None]:
def _lgb_callbacks(stopping_rounds, log_period):
    """Build the early-stopping and evaluation-logging callbacks for ``lgb.train``.

    Callbacks are used instead of the ``early_stopping_rounds`` / ``verbose_eval``
    keyword arguments because those keywords were removed from ``lgb.train`` in
    LightGBM 4.0, while the callbacks are available in both 3.x and 4.x.
    """
    # ``log_evaluation`` is the newer name of ``print_evaluation``.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    return [lgb.early_stopping(stopping_rounds), log_evaluation(log_period)]


def lgb_train_predict(params, train, test, true_label, FOLD=5, SEEDS=(42,), cat_features=None):
    """Train LightGBM with stratified K-fold CV and return OOF / test predictions.

    For every seed a ``StratifiedKFold`` split is created. Each fold is trained
    in two stages: first with ``learning_rate=0.15`` (early stopping after 200
    rounds), then training continues from that model with ``learning_rate=0.07``
    (early stopping after 100 rounds).

    Parameters
    ----------
    params : dict
        LightGBM parameters. The dict is not modified; the per-stage learning
        rate is applied to copies.
    train : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    test : pandas.DataFrame
        Test features to predict with every fold model.
    true_label : pandas.Series
        Target values aligned with ``train``.
    FOLD : int
        Number of CV folds per seed.
    SEEDS : sequence of int
        Random states for the fold splitter; one prediction column per seed.
    cat_features : list of int, optional
        Column indices treated as categorical. Defaults to the notebook-level
        ``cat_cols_indices``.

    Returns
    -------
    tuple
        ``(oof_predictions, test_predictions, mean_auc)`` where
        ``oof_predictions`` has shape ``(len(train), len(SEEDS))``,
        ``test_predictions`` has shape ``(len(test), len(SEEDS))`` and holds the
        fold-averaged test prediction per seed, and ``mean_auc`` is the
        validation ROC AUC averaged over all folds and seeds.

    Raises
    ------
    ValueError
        If ``SEEDS`` is empty.
    """
    if len(SEEDS) == 0:
        raise ValueError("SEEDS must contain at least one seed")

    if cat_features is None:
        cat_features = cat_cols_indices

    # Per-stage copies so the caller's dict is left untouched.
    coarse_params = dict(params, learning_rate=0.15)
    fine_params = dict(params, learning_rate=0.07)

    counter = 0
    oof_score = 0
    y_pred_final_lgb = np.zeros((test.shape[0], len(SEEDS)))
    y_pred_meta_lgb = np.zeros((train.shape[0], len(SEEDS)))

    for sidx, seed in enumerate(SEEDS):
        seed_score = 0

        kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

        for idx, (train_idx, val_idx) in enumerate(kfold.split(train, true_label)):
            counter += 1

            train_x, train_y = train.iloc[train_idx], true_label.iloc[train_idx]
            val_x, val_y = train.iloc[val_idx], true_label.iloc[val_idx]

            # free_raw_data=False keeps the raw data so the same Dataset objects
            # can be reused for the second training stage.
            lgtrain = lgb.Dataset(train_x, label=train_y.to_numpy(), free_raw_data=False)
            lgvalidation = lgb.Dataset(val_x, label=val_y.to_numpy(), free_raw_data=False)

            # Stage 1: higher learning rate.
            model = lgb.train(coarse_params, lgtrain, valid_sets=[lgtrain, lgvalidation],
                              categorical_feature=cat_features,
                              callbacks=_lgb_callbacks(200, 300))

            # Stage 2: continue from the stage-1 model with a lower learning rate.
            model = lgb.train(fine_params, lgtrain, valid_sets=[lgtrain, lgvalidation],
                              categorical_feature=cat_features, init_model=model,
                              callbacks=_lgb_callbacks(100, 100))

            y_pred = model.predict(val_x, num_iteration=model.best_iteration)
            y_pred_meta_lgb[val_idx, sidx] += y_pred
            y_pred_final_lgb[:, sidx] += model.predict(test, num_iteration=model.best_iteration)

            score = roc_auc_score(val_y, y_pred)
            oof_score += score
            seed_score += score
            print("\nSeed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

        print("\nSeed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))

    # Test predictions were summed over folds; turn them into a per-seed mean.
    y_pred_final_lgb = y_pred_final_lgb / float(FOLD)
    oof_score /= float(counter)
    print("Aggregate OOF Score: {}".format(oof_score))

    return y_pred_meta_lgb, y_pred_final_lgb, oof_score

In [None]:
# First LightGBM configuration: very shallow trees with strong regularisation.
params1 = dict(
    objective='binary',
    metric='AUC',
    boosting='gbdt',
    n_jobs=-1,
    n_estimators=8000,
    reg_alpha=25.0,
    reg_lambda=76.7,
    num_leaves=6,
    max_depth=2,
    colsample_bytree=0.69,
    subsample=0.98,
    subsample_freq=1,
    feature_fraction_seed=42,
    bagging_seed=42,
    random_state=42,
    min_child_samples=54,
    min_child_weight=256,
    verbosity=-1,
)

y_pred_meta_lgb1, y_pred_final_lgb1, oof_score1 = lgb_train_predict(
    params=params1, train=Xtrain, test=Xtest, true_label=Ytrain
)

In [None]:
# Second LightGBM configuration: slightly deeper trees, lighter regularisation
# and stronger row/column subsampling than the first one.
params2 = dict(
    objective='binary',
    metric='AUC',
    boosting='gbdt',
    n_jobs=-1,
    n_estimators=8000,
    reg_alpha=18.0,
    reg_lambda=17.0,
    num_leaves=7,
    max_depth=3,
    colsample_bytree=0.5,
    subsample=0.85,
    subsample_freq=1,
    feature_fraction_seed=42,
    bagging_seed=42,
    random_state=42,
    min_child_samples=20,
    min_child_weight=256,
    verbosity=-1,
)

y_pred_meta_lgb2, y_pred_final_lgb2, oof_score2 = lgb_train_predict(
    params=params2, train=Xtrain, test=Xtest, true_label=Ytrain
)

In [None]:
# Place the prediction columns of both models side by side (one column per model/seed).
y_pred_meta_lgb = np.hstack([y_pred_meta_lgb1, y_pred_meta_lgb2])
y_pred_final_lgb = np.hstack([y_pred_final_lgb1, y_pred_final_lgb2])
print(f"y_pred_meta_lgb: {y_pred_meta_lgb.shape} \ny_pred_final_lgb: {y_pred_final_lgb.shape}")

In [None]:
# Average the out-of-fold predictions across columns, then threshold at 0.5
# to obtain hard class labels for the report.
y_pred_meta = y_pred_meta_lgb.mean(axis=1)
y_pred = np.where(y_pred_meta > 0.5, 1, 0)
print(classification_report(Ytrain, y_pred))

In [None]:
# Confusion matrix of the thresholded out-of-fold predictions, rendered as a heat map.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_true=Ytrain, y_pred=y_pred, labels=[0, 1])
plt.figure(figsize=(12, 5))
plot_confusion_matrix(cm=cnf_matrix, classes=[0, 1])

In [None]:
# Persist the out-of-fold / test predictions and the CV scores in one compressed archive.
np.savez_compressed(
    './LGB_Meta_Features.npz',
    **{
        'y_pred_meta_lgb': y_pred_meta_lgb,
        'oof_score1': oof_score1,
        'oof_score2': oof_score2,
        'y_pred_final_lgb': y_pred_final_lgb,
    }
)

## Create submission file

In [None]:
# Final prediction: mean of the test predictions over all model/seed columns.
y_pred_final = y_pred_final_lgb.mean(axis=1)

# Fill the sample submission's 'claim' column with the predictions and write it out.
submit_df = (
    pd.read_csv("../input/tabular-playground-series-sep-2021/sample_solution.csv")
    .assign(claim=y_pred_final)
)
submit_df.to_csv("LGB_Submission.csv", index=False)
submit_df.head(10)