## Import libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## Load source datasets

In [None]:
# Read the training data and use the competition's `id` column as the index.
train_df = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/train.csv"
).set_index('id')
print(f"train_df: {train_df.shape}")
train_df.head()

In [None]:
# Read the test data and use the competition's `id` column as the index.
test_df = pd.read_csv(
    "../input/tabular-playground-series-oct-2021/test.csv"
).set_index('id')
print(f"test_df: {test_df.shape}")
test_df.head()

## Data Exploration

In [None]:
# Build one summary row per training column: cardinality, mean, range and
# the correlation with the target (Series.corr default method).
summary_columns = ['Col', 'Num Unique', 'Mean', 'Min', 'Max', 'Corr']

feats = []
for col in tqdm(train_df.columns):
    column_values = train_df[col]
    row = [col]
    row.append(column_values.nunique())
    row.append(column_values.mean())
    row.append(column_values.min())
    row.append(column_values.max())
    row.append(column_values.corr(train_df['target']))
    feats.append(row)

df = pd.DataFrame(feats, columns=summary_columns)

# Show the 25 columns with the largest absolute correlation to the target.
df.sort_values(by=['Corr'], key=abs, ascending=False).head(25)

In [None]:
# Columns with fewer than five distinct values (excluding the target itself)
# are treated as categorical; every other test column is numerical.
is_low_cardinality = df['Num Unique'] < 5
is_not_target = df['Col'] != 'target'
cat_cols = df.loc[is_low_cardinality & is_not_target, 'Col'].unique().tolist()
num_cols = [col for col in test_df.columns if col not in cat_cols]
print(f"cat_cols: {len(cat_cols)} \nnum_cols: {len(num_cols)}")

# The summary table is no longer needed; release it.
del df
gc.collect()

In [None]:
# Row counts per f22 level, split by target class.
plt.figure(figsize=(12, 5))
sns.countplot(data=train_df, x='f22', hue='target')
plt.title("Countplot - f22 vs target", pad=15, fontweight='bold');

In [None]:
# Kernel density estimate of f179, one curve per target class.
plt.figure(figsize=(12, 5))
sns.kdeplot(x='f179', hue='target', data=train_df)
# The figure is a KDE plot, so the title says so (it previously read
# "Countplot", copied from the f22 cell).
plt.title("KDE plot - f179 vs target", fontweight='bold', pad=15);

## Feature Engineering

In [None]:
# Add row-wise aggregates of the numerical columns to both frames.
# `num_cols` is a fixed list of names, so the new columns never feed back
# into the later aggregates.
for frame in (train_df, test_df):
    numeric_part = frame[num_cols]
    frame['mean'] = numeric_part.mean(axis=1)
    frame['std'] = numeric_part.std(axis=1)
    frame['min'] = numeric_part.min(axis=1)
    frame['max'] = numeric_part.max(axis=1)

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

# Model inputs are exactly the test columns (train_df additionally holds
# the 'target' column).
features = test_df.columns.tolist()
print(f"Num features: {len(features)}")

# Positions of the categorical columns *within `features`*. The models are
# fitted on train_df[features], so positions must be taken from that list;
# positions in train_df.columns only coincide while every categorical column
# happens to precede 'target'.
feature_positions = {name: position for position, name in enumerate(features)}
cat_cols_indices = [feature_positions[col] for col in cat_cols]
print(f"cat_cols_indices: {cat_cols_indices}")

## Quantile Transformation

In [None]:
# Map every numerical column to a normal distribution. A fresh transformer
# is fitted per column on the training values only and then applied to both
# the train and the test column.
for col in tqdm(num_cols):
    # QuantileTransformer expects 2-D input: one column of shape (n_rows, 1).
    train_column = train_df[col].values.reshape(-1, 1)
    test_column = test_df[col].values.reshape(-1, 1)

    transformer = QuantileTransformer(
        n_quantiles=3000,
        random_state=42,
        output_distribution="normal",
    )
    transformer.fit(train_column)

    # Flatten back to 1-D before writing the column in place.
    train_df[col] = transformer.transform(train_column).ravel()
    test_df[col] = transformer.transform(test_column).ravel()

print(f"train_df: {train_df.shape} \ntest_df: {test_df.shape}")

## Compress data

In [None]:
# Representable ranges of the candidate down-cast dtypes.
INT8_MIN    = np.iinfo(np.int8).min
INT8_MAX    = np.iinfo(np.int8).max
INT16_MIN   = np.iinfo(np.int16).min
INT16_MAX   = np.iinfo(np.int16).max
INT32_MIN   = np.iinfo(np.int32).min
INT32_MAX   = np.iinfo(np.int32).max

FLOAT16_MIN = np.finfo(np.float16).min
FLOAT16_MAX = np.finfo(np.float16).max
FLOAT32_MIN = np.finfo(np.float32).min
FLOAT32_MAX = np.finfo(np.float32).max


def memory_usage(data, detail=1):
    """Print and return the memory footprint of ``data`` in megabytes.

    Args:
        data: pandas object exposing ``memory_usage()`` (e.g. a DataFrame).
        detail: when truthy, the per-entry byte counts are additionally shown
            with ``display`` (available inside a notebook session).

    Returns:
        Total memory usage in MB (bytes divided by 1024 * 1024).
    """
    usage = data.memory_usage()
    if detail:
        display(usage)
    memory = usage.sum() / (1024*1024)
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory


def _compress_rate(before, after):
    """Return the fraction of memory saved; 0.0 when ``before`` is zero."""
    if not before:
        return 0.0
    return (before - after) / before


def compress_dataset(data):
    """Down-cast the float64 and int64 columns of ``data`` to smaller dtypes.

    Each float64 column is cast to float16, else float32, when its min/max
    lie strictly inside that dtype's range. Each int64 column is cast to
    int8, int16 or int32 in the same way (int8 is only chosen when the values
    lie inside *half* the int8 range). Columns of any other dtype are left
    untouched. A report is printed per column.

    NOTE: only the value range is checked. Casting to float16/float32 reduces
    precision, so the compression of float columns is lossy.

    Args:
        data: DataFrame to compress. It is modified in place.

    Returns:
        The same ``data`` object, after compression.
    """
    memory_before_compress = memory_usage(data, 0)
    print()
    length_interval      = 50
    length_float_decimal = 4
    separator = '='*length_interval

    print(separator)
    for col in data.columns:
        col_series = data[col]
        col_dtype = col_series.dtype

        if col_dtype == 'object':
            continue

        # str(col) keeps the report working for non-string column labels.
        print("Name: {0:24s} Type: {1}".format(str(col), col_dtype))

        is_float64 = col_dtype == 'float64'
        is_int64 = col_dtype == 'int64'
        if not (is_float64 or is_int64):
            # Nothing to compress; skipping here also avoids calling
            # min()/max() on dtypes that may not support them.
            continue

        col_min = col_series.min()
        col_max = col_series.max()

        if is_float64:
            print(" variable min: {0:15s} max: {1:15s}".format(str(np.round(col_min, length_float_decimal)), str(np.round(col_max, length_float_decimal))))
            if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
                data[col] = col_series.astype(np.float16)
                print("  float16 min: {0:15s} max: {1:15s}".format(str(FLOAT16_MIN), str(FLOAT16_MAX)))
                print("compress float64 --> float16")
            elif (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                data[col] = col_series.astype(np.float32)
                print("  float32 min: {0:15s} max: {1:15s}".format(str(FLOAT32_MIN), str(FLOAT32_MAX)))
                print("compress float64 --> float32")
            memory_after_compress = memory_usage(data, 0)
            print("Compress Rate: [{0:.2%}]".format(_compress_rate(memory_before_compress, memory_after_compress)))
            print(separator)
            continue

        # int64 column
        print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
        # Name of the dtype actually chosen; None means the column stays int64.
        target_name = None
        if (col_min > INT8_MIN/2) and (col_max < INT8_MAX/2):
            target_name = 'int8'
            data[col] = col_series.astype(np.int8)
            print("     int8 min: {0:15s} max: {1:15s}".format(str(INT8_MIN), str(INT8_MAX)))
        elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
            target_name = 'int16'
            data[col] = col_series.astype(np.int16)
            print("    int16 min: {0:15s} max: {1:15s}".format(str(INT16_MIN), str(INT16_MAX)))
        elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
            target_name = 'int32'
            data[col] = col_series.astype(np.int32)
            print("    int32 min: {0:15s} max: {1:15s}".format(str(INT32_MIN), str(INT32_MAX)))
        memory_after_compress = memory_usage(data, 0)
        print("Compress Rate: [{0:.2%}]".format(_compress_rate(memory_before_compress, memory_after_compress)))
        # Report the cast that was really applied (int32 casts and untouched
        # columns used to be reported as int8).
        if target_name is None:
            print("no compression (int64 kept)")
        else:
            print("compress (int64) ==> ({0})".format(target_name))
        print(separator)

    print()
    memory_after_compress = memory_usage(data, 0)
    print("Compress Rate: [{0:.2%}]".format(_compress_rate(memory_before_compress, memory_after_compress)))

    return data

In [None]:
# Down-cast the float64/int64 columns of the training frame; compress_dataset
# modifies the frame in place, prints a per-column report and returns it.
train_df = compress_dataset(train_df)

In [None]:
# Down-cast the float64/int64 columns of the test frame; compress_dataset
# modifies the frame in place, prints a per-column report and returns it.
test_df = compress_dataset(test_df)

## Model Hyperparameters

In [None]:
# Number of StratifiedKFold splits used by every model loop.
FOLD = 5
# One full cross-validation pass is run per seed. The seed only drives the
# fold shuffling; each model keeps the fixed 'random_state' from its own
# parameter dict below.
SEEDS = [29, 47]

# Keyword arguments for XGBClassifier(**xgb_params).
xgb_params = {
    'eval_metric': 'auc', 
    'objective': 'binary:logistic', 
    'tree_method': 'gpu_hist', 
    'gpu_id': 0, 
    'predictor': 'gpu_predictor', 
    'n_estimators': 10000, 
    'learning_rate': 0.01063045229441343, 
    'gamma': 0.24652519525750877, 
    'max_depth': 4, 
    'min_child_weight': 366, 
    'subsample': 0.6423040816299684, 
    'colsample_bytree': 0.7751264493218339, 
    'colsample_bylevel': 0.8675692743597421, 
    'lambda': 0, 
    'alpha': 10,
    'random_state': 2021
}

# Keyword arguments for CatBoostClassifier(**cb_params).
# NOTE(review): 'od_wait' is set here while the CatBoost fit() call also
# passes early_stopping_rounds=200 - confirm which patience is intended.
cb_params = {
    'eval_metric' : 'AUC',
    'iterations': 15585, 
    'objective': 'CrossEntropy',
    'bootstrap_type': 'Bernoulli', 
    'od_wait': 1144, 
    'learning_rate': 0.023575206684596582, 
    'reg_lambda': 36.30433203563295, 
    'random_strength': 43.75597655616195, 
    'depth': 7, 
    'min_data_in_leaf': 11, 
    'leaf_estimation_iterations': 1, 
    'subsample': 0.8227911142845009,
    'task_type' : 'GPU',
    'devices' : '0',
    'verbose' : 0,
    'random_state': 2021
}

# Keyword arguments for LGBMClassifier(**lgb_params).
# NOTE(review): 'num_leaves' (1400) exceeds 2**max_depth (256) - presumably
# max_depth is the binding limit; confirm this is intended.
lgb_params = {
    'metric' : 'auc',
    'objective' : 'binary',
    'device_type': 'gpu', 
    'n_estimators': 10000, 
    'learning_rate': 0.12230165751633416, 
    'num_leaves': 1400, 
    'max_depth': 8, 
    'min_child_samples': 300, 
    'reg_alpha': 10, 
    'reg_lambda': 65, 
    'min_split_gain': 5.157818977461183, 
    'subsample': 0.5, 
    'subsample_freq': 1, 
    'colsample_bytree': 0.2,
    'random_state': 2021
}

## XGBoost Model

In [None]:
# Repeated stratified K-fold training of XGBoost.
#   y_pred_meta_xgb  - out-of-fold predictions for the training rows,
#                      averaged over the seeds (each row is validated once
#                      per seed).
#   y_pred_final_xgb - test predictions averaged over every fitted model.
counter = 0
oof_score = 0
y_pred_final_xgb = np.zeros((test_df.shape[0], 1))
y_pred_meta_xgb = np.zeros((train_df.shape[0], 1))


for sidx, seed in enumerate(SEEDS):
    seed_score = 0
    
    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)

    for idx, (train, val) in enumerate(kfold.split(train_df[features], train_df['target'])):
        counter += 1

        train_x, train_y = train_df[features].iloc[train], train_df['target'].iloc[train]
        val_x, val_y = train_df[features].iloc[val], train_df['target'].iloc[val]

        model = XGBClassifier(**xgb_params)

        model.fit(train_x, train_y, eval_set=[(train_x, train_y), (val_x, val_y)], 
                  early_stopping_rounds=200, verbose=1000)
        
        # iteration_range is half-open [begin, end) and best_iteration is a
        # 0-based index, so the end must be best_iteration + 1 to include the
        # best tree. Without the +1 the best round is dropped, and a best
        # iteration of 0 would yield (0, 0), which means "use every tree".
        best_range = (0, model.best_iteration + 1)

        y_pred = model.predict_proba(val_x, iteration_range=best_range)[:,-1]
        y_pred_meta_xgb[val] += y_pred.reshape(-1, 1)
        # Select the test columns by `features` so the matrix layout is
        # explicitly the one the model was trained on.
        test_pred = model.predict_proba(test_df[features], iteration_range=best_range)[:,-1]
        y_pred_final_xgb += test_pred.reshape(-1, 1)
        
        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print("\nXGBoost | Seed-{} | Fold-{} | OOF Score: {}\n".format(seed, idx, score))
        
        # Free the per-fold objects before the next fit.
        del model, y_pred, test_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()
    
    print("\nXGBoost | Seed: {} | Aggregate OOF Score: {}\n\n".format(seed, (seed_score / FOLD)))


# Every training row received one out-of-fold prediction per seed; the test
# set received one prediction per fitted model (counter = seeds * folds).
y_pred_meta_xgb = y_pred_meta_xgb / float(len(SEEDS))
y_pred_final_xgb = y_pred_final_xgb / float(counter)
oof_score /= float(counter)
print("XGBoost | Aggregate OOF Score: {}".format(oof_score))

## CatBoost Model

In [None]:
# Repeated stratified K-fold training of CatBoost.
#   y_pred_meta_cb  - out-of-fold predictions, averaged over the seeds.
#   y_pred_final_cb - test predictions, averaged over every fitted model.
counter = 0
oof_score = 0
y_pred_final_cb = np.zeros((test_df.shape[0], 1))
y_pred_meta_cb = np.zeros((train_df.shape[0], 1))


for sidx, seed in enumerate(SEEDS):
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)
    fold_indices = kfold.split(train_df[features], train_df['target'])

    for idx, (train, val) in enumerate(fold_indices):
        counter += 1

        train_x = train_df[features].iloc[train]
        train_y = train_df['target'].iloc[train]
        val_x = train_df[features].iloc[val]
        val_y = train_df['target'].iloc[val]

        model = CatBoostClassifier(**cb_params)
        model.fit(
            train_x, train_y,
            eval_set=[(val_x, val_y)],
            cat_features=cat_cols_indices,
            early_stopping_rounds=200,
            verbose=500,
        )

        # Probability of the last class, as a column vector for accumulation.
        y_pred = model.predict_proba(val_x)[:, -1]
        y_pred_meta_cb[val] += y_pred.reshape(-1, 1)
        y_pred_final_cb += model.predict_proba(test_df)[:, -1].reshape(-1, 1)

        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print(f"\nCatBoost | Seed-{seed} | Fold-{idx} | OOF Score: {score}\n")

        # Free the per-fold objects before the next fit.
        del model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print(f"\nCatBoost | Seed: {seed} | Aggregate OOF Score: {seed_score / FOLD}\n\n")


# One out-of-fold prediction per row per seed; one test prediction per model.
y_pred_meta_cb = y_pred_meta_cb / float(len(SEEDS))
y_pred_final_cb = y_pred_final_cb / float(counter)
oof_score = oof_score / float(counter)
print(f"CatBoost | Aggregate OOF Score: {oof_score}")

## LightGBM Model

In [None]:
# Repeated stratified K-fold training of LightGBM.
#   y_pred_meta_lgb  - out-of-fold predictions, averaged over the seeds.
#   y_pred_final_lgb - test predictions, averaged over every fitted model.
counter = 0
oof_score = 0
y_pred_final_lgb = np.zeros((test_df.shape[0], 1))
y_pred_meta_lgb = np.zeros((train_df.shape[0], 1))


for sidx, seed in enumerate(SEEDS):
    seed_score = 0

    kfold = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=seed)
    fold_indices = kfold.split(train_df[features], train_df['target'])

    for idx, (train, val) in enumerate(fold_indices):
        counter += 1

        train_x = train_df[features].iloc[train]
        train_y = train_df['target'].iloc[train]
        val_x = train_df[features].iloc[val]
        val_y = train_df['target'].iloc[val]

        model = LGBMClassifier(**lgb_params)
        model.fit(
            train_x, train_y,
            eval_metric='auc',
            eval_set=[(train_x, train_y), (val_x, val_y)],
            categorical_feature=cat_cols_indices,
            early_stopping_rounds=200,
            verbose=200,
        )

        # Predict with the iteration count recorded in best_iteration_.
        y_pred = model.predict_proba(val_x, num_iteration=model.best_iteration_)[:, -1]
        y_pred_meta_lgb[val] += y_pred.reshape(-1, 1)
        y_pred_final_lgb += model.predict_proba(
            test_df, num_iteration=model.best_iteration_
        )[:, -1].reshape(-1, 1)

        score = roc_auc_score(val_y, y_pred)
        oof_score += score
        seed_score += score
        print(f"\nLightGBM | Seed-{seed} | Fold-{idx} | OOF Score: {score}\n")

        # Free the per-fold objects before the next fit.
        del model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    print(f"\nLightGBM | Seed: {seed} | Aggregate OOF Score: {seed_score / FOLD}\n\n")


# One out-of-fold prediction per row per seed; one test prediction per model.
y_pred_meta_lgb = y_pred_meta_lgb / float(len(SEEDS))
y_pred_final_lgb = y_pred_final_lgb / float(counter)
oof_score = oof_score / float(counter)
print(f"LightGBM | Aggregate OOF Score: {oof_score}")

## Save meta features

In [None]:
# Store the out-of-fold ("meta") and test ("final") predictions of all three
# models in one compressed archive, keyed by variable name.
meta_feature_arrays = {
    'y_pred_meta_xgb': y_pred_meta_xgb,
    'y_pred_meta_cb': y_pred_meta_cb,
    'y_pred_meta_lgb': y_pred_meta_lgb,
    'y_pred_final_xgb': y_pred_final_xgb,
    'y_pred_final_cb': y_pred_final_cb,
    'y_pred_final_lgb': y_pred_final_lgb,
}
np.savez_compressed('./TPS_1021_Meta_Features.npz', **meta_feature_arrays)

## Create submission files

In [None]:
# Fill the sample submission with the averaged XGBoost test predictions.
# The (n, 1) prediction array is flattened to 1-D before assignment.
submission_template = "../input/tabular-playground-series-oct-2021/sample_submission.csv"
submit_df = pd.read_csv(submission_template)
submit_df['target'] = y_pred_final_xgb.reshape(-1)
submit_df.to_csv("XGB_Submission.csv", index=False)
submit_df.head()

In [None]:
# Overwrite the target column with the averaged CatBoost test predictions
# (flattened from (n, 1) to 1-D) and write that submission.
submit_df['target'] = y_pred_final_cb.reshape(-1)
submit_df.to_csv("CB_Submission.csv", index=False)
submit_df.head()

In [None]:
# Overwrite the target column with the averaged LightGBM test predictions
# (flattened from (n, 1) to 1-D) and write that submission.
submit_df['target'] = y_pred_final_lgb.reshape(-1)
submit_df.to_csv("LGB_Submission.csv", index=False)
submit_df.head()

In [None]:
# Weighted average of the three models' test predictions.
blended_pred = (y_pred_final_xgb * 0.55) + (y_pred_final_cb * 0.15) + (y_pred_final_lgb * 0.3)
# The prediction arrays have shape (n, 1); flatten to 1-D before assigning,
# as the single-model submission cells do, instead of relying on pandas
# accepting a 2-D array for a single column.
submit_df['target'] = blended_pred.ravel()
submit_df.to_csv("WA_Submission.csv", index=False)
submit_df.head()