## Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import itertools
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

## Load source datasets

In [None]:
# Read the training split and use its `id` column as the row index.
train_df = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/train.csv",
    index_col='id',
)
print(f"train_df: {train_df.shape}")
train_df.head()

In [None]:
# Read the test split and use its `id` column as the row index.
test_df = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/test.csv",
    index_col='id',
)
print(f"test_df: {test_df.shape}")
test_df.head()

## Feature Engineering

In [None]:
# Split the feature columns (those present in the test set) into two groups:
# a column counts as categorical when it has fewer than 5 distinct values in
# the training data, otherwise it is treated as numeric.
cat_cols, num_cols = [], []
for col in test_df.columns:
    bucket = cat_cols if train_df[col].nunique() < 5 else num_cols
    bucket.append(col)

print(f"cat_cols: {len(cat_cols)} \nnum_cols: {len(num_cols)}")

In [None]:
# Downcast to compact dtypes: numeric features to float32 and the
# low-cardinality features to uint8, applied identically to both splits.
for frame in (train_df, test_df):
    frame[num_cols] = frame[num_cols].astype('float32')
    frame[cat_cols] = frame[cat_cols].astype('uint8')

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

# The model features are exactly the test-set columns, in test-set order.
features = test_df.columns.tolist()
print(f"Num features: {len(features)}")

# Positions of the categorical columns *within the feature matrix*.
# They are looked up in the test columns (== `features`) rather than in
# `train_df.columns`: the training frame also holds `target`, so its positions
# only match the feature matrix when `target` happens to come after every
# feature column.
cat_cols_indices = [test_df.columns.get_loc(col) for col in cat_cols]
print(f"cat_cols_indices: {cat_cols_indices}")

## Helper Function

In [None]:
def plot_confusion_matrix(cm, classes):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix; rows are drawn on the y-axis ("True label") and
        columns on the x-axis ("Predicted label"). Integer matrices are
        annotated as whole numbers, any other dtype (e.g. a normalised
        matrix) with two decimals.
    classes : sequence
        Tick labels, used for both axes.

    Returns
    -------
    None
        The plot is drawn through the pyplot state machine, so the caller
        controls the figure (size, show/save).
    """
    cm = np.asarray(cm)
    # A plain 'd' format raises on floats, so pick the cell format from the dtype.
    cell_format = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', fontweight='bold', pad=15)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than the mid-point of the colour scale get white text so
    # the annotation stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], cell_format),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontweight='bold')
    plt.xlabel('Predicted label', fontweight='bold')
    plt.tight_layout()

## Model Hyperparameters

In [None]:
FOLD = 10                  # number of cross-validation splits
SEEDS = [791, 225, 508]    # one full cross-validation pass is run per seed

# LightGBM settings shared by every fold; `learning_rate` is supplied by the
# training cell.
params = dict(
    # Task / evaluation
    objective='binary',
    metric='auc',
    # NOTE(review): `importance_type` looks like an sklearn-wrapper option;
    # confirm the native training API actually uses it.
    importance_type='gain',
    # Upper bound on boosting rounds
    n_estimators=10000,
    # Column and row sub-sampling
    colsample_bytree=0.196,
    subsample=0.5828,
    subsample_freq=1,
    # Regularisation
    reg_alpha=0.045,
    reg_lambda=0.0117,
    min_child_weight=16.843,
    min_child_samples=412,
    # Tree shape
    # NOTE(review): with max_depth=5 a tree presumably cannot reach 546
    # leaves, so `num_leaves` is likely not the binding limit. Confirm.
    num_leaves=546,
    max_depth=5,
    # Categorical-feature handling
    cat_smooth=36.40200359200525,
    cat_l2=12.979520035205597,
    # Runtime
    verbosity=0,
    force_col_wise=True,
    random_state=2021,
)

## LightGBM Model

In [None]:
# Seed-averaged, stratified K-fold LightGBM training with a two-stage
# learning-rate schedule.
#
# Results left in the namespace:
#   y_pred_meta_lgb  - out-of-fold predictions for the training rows,
#                      averaged over the seeds (shape: n_train x 1)
#   y_pred_final_lgb - test predictions averaged over every (seed, fold)
#                      model (shape: n_test x 1)
#   oof_score        - mean of the per-fold validation AUCs
#
# `seed` only drives the fold assignment; the model's own `random_state`
# stays whatever `params` specifies.
#
# NOTE(review): `early_stopping_rounds` / `verbose_eval` are accepted by
# `lgb.train` only in LightGBM < 4.0; newer releases expect the
# `lgb.early_stopping` / `lgb.log_evaluation` callbacks instead.

# Per-stage copies of the settings, so the shared `params` dict is not
# modified as a side effect of running this cell.
coarse_params = {**params, 'learning_rate': 0.03}
fine_params = {**params, 'learning_rate': 0.01}

counter = 0
oof_score = 0
y_pred_final_lgb = np.zeros((test_df.shape[0], 1))
y_pred_meta_lgb = np.zeros((train_df.shape[0], 1))


for seed in SEEDS:
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(train_df[features], train_df['target'])):
        counter += 1

        train_x, train_y = train_df[features].iloc[train], train_df['target'].iloc[train]
        val_x, val_y = train_df[features].iloc[val], train_df['target'].iloc[val]

        # free_raw_data=False keeps the raw frames attached, so the same
        # Dataset objects can be reused for the second training call.
        lgtrain = lgb.Dataset(train_x, label=train_y.to_numpy(), free_raw_data=False)
        lgvalidation = lgb.Dataset(val_x, label=val_y.to_numpy(), free_raw_data=False)

        # Stage 1: larger learning rate, more patient early stopping.
        model = lgb.train(coarse_params, lgtrain, valid_sets=[lgtrain, lgvalidation],
                          #categorical_feature=cat_cols_indices,
                          early_stopping_rounds=200, verbose_eval=500)

        # Stage 2: continue from the stage-1 booster with a smaller step.
        model = lgb.train(fine_params, lgtrain, valid_sets=[lgtrain, lgvalidation],
                          #categorical_feature=cat_cols_indices,
                          init_model=model, early_stopping_rounds=100,
                          verbose_eval=500)

        # Each training row is in exactly one validation fold per seed, so
        # the OOF buffer receives one prediction per row per seed.
        y_pred = model.predict(val_x, num_iteration=model.best_iteration)
        y_pred_meta_lgb[val] += y_pred.reshape(-1, 1)

        test_pred = model.predict(test_df, num_iteration=model.best_iteration)
        y_pred_final_lgb += test_pred.reshape(-1, 1)

        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("\nLightGBM | Seed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))

        # The Dataset objects hold references to the fold frames, so they
        # must be dropped as well for gc.collect() to reclaim that memory.
        del model, y_pred, test_pred
        del lgtrain, lgvalidation
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print("\nLightGBM | Seed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# OOF rows were accumulated once per seed; test rows once per (seed, fold).
y_pred_meta_lgb = y_pred_meta_lgb / float(len(SEEDS))
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score /= float(counter)
print("LightGBM | Aggregate OOF Score: {}".format(oof_score))

In [None]:
# Collapse the (n_rows, 1) out-of-fold probabilities to a 1-D vector, turn
# them into hard 0/1 labels at a 0.5 cut-off, and summarise against the
# true targets.
y_pred_meta = y_pred_meta_lgb.mean(axis=1)
y_pred = (y_pred_meta > 0.5).astype(int)
report = classification_report(train_df['target'], y_pred)
print(report)

In [None]:
# Confusion matrix of the thresholded out-of-fold predictions, drawn with
# the notebook's plotting helper.
class_labels = [0, 1]
cnf_matrix = confusion_matrix(train_df['target'], y_pred, labels=class_labels)

np.set_printoptions(precision=2)
plt.figure(figsize=(12, 5))
plot_confusion_matrix(cnf_matrix, classes=class_labels)

## Save meta features

In [None]:
# Persist the out-of-fold and test predictions in one compressed archive;
# each array is stored under its variable name.
meta_features = {
    'y_pred_meta_lgb': y_pred_meta_lgb,
    'y_pred_final_lgb': y_pred_final_lgb,
}
np.savez_compressed('./TPS_1021_LGB_Meta_Features.npz', **meta_features)

## Create submission files

In [None]:
# Start from the sample submission and overwrite its `target` column with
# the averaged test predictions, then write the file.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv; verify before relying on it.
submit_df = (
    pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")
    .assign(target=y_pred_final_lgb.ravel())
)
submit_df.to_csv("LGB_Submission.csv", index=False)
submit_df.head()