In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
sns.set_style("darkgrid")
%matplotlib inline


from sklearn.cluster import KMeans
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import optuna

import eli5
from eli5.sklearn import PermutationImportance

# Load data

In [None]:
# Read the train and test tables; each file's `id` column becomes the index.
X_full, X_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-mar-2021/train.csv',
        '/kaggle/input/tabular-playground-series-mar-2021/test.csv',
    )
)

# Specify target

In [None]:
target = "target"
# Rows without a label cannot be used for training, so drop them first.
X_full.dropna(subset=[target], axis=0, inplace=True)
# Split the label column off; X_full holds only features from here on.
y_full = X_full.pop(target)

# Categorical features

In [None]:
# Columns stored with the generic `object` dtype are treated as categorical.
cat_features = [name for name, dtype in X_full.dtypes.items() if dtype == "object"]

# Numerical features

In [None]:
# Select every numeric column regardless of bit width. Comparing dtypes with
# the "int"/"float" aliases only matches the platform default width, so
# e.g. int32 or float32 columns would be silently left out.
num_features = X_full.select_dtypes(include="number").columns.tolist()

# Make some EDA plots

In [None]:
nc = 4
# Ceiling division gives exactly the rows needed (no spare row when the
# feature count is a multiple of nc); at least one row so the grid is never empty.
nr = max(1, int(np.ceil(len(num_features) / nc)))

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so axes[i, j] indexing below always works.
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18,4*nr), squeeze=False)

for count, feature in enumerate(num_features):
    # Two-sample Kolmogorov-Smirnov statistic between train and test values.
    ks_score = stats.ks_2samp(X_full[feature], X_test[feature])[0]
    i, j = divmod(count, nc)
    sns.kdeplot(X_full[feature], color='Blue', ax=axes[i, j])
    sns.kdeplot(X_test[feature], color='Red', ax=axes[i, j])

    axes[i, j].legend(["Train", "Test"], facecolor="White")
    axes[i, j].set_title(f"{feature} ks stat : {np.round(ks_score,3)}")

# Hide the grid cells that no feature was drawn into.
for unused_ax in axes.ravel()[len(num_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()

As the plots show, the train and test distributions of the numerical features are very similar.

# Encode categorical features

In [None]:
def loo_encode(X_full, X_test, column, y=None):
    """Add a `<column>_loo` target-encoded feature to both frames.

    A LeaveOneOutEncoder is fitted on the training column and the target,
    then used to transform the column in both frames. Both frames are
    modified in place.

    Parameters
    ----------
    X_full : DataFrame holding the training features; gains the new column.
    X_test : DataFrame holding the test features; gains the new column.
    column : name of the categorical column to encode.
    y : target values aligned with `X_full`. When omitted, the notebook-level
        `y_full` is used, which is what the original implementation always did.

    Returns
    -------
    The name of the column that was added.
    """
    if y is None:
        y = y_full
    loo = LeaveOneOutEncoder()
    new_feature = f"{column}_loo"
    loo.fit(X_full[column], y)
    # NOTE(review): transform() is called without the target for the training
    # rows too, which presumably yields the plain per-category target mean
    # rather than a leave-one-out value — confirm against the encoder docs
    # before relying on this as a leak-free encoding.
    X_full[new_feature] = loo.transform(X_full[column])
    X_test[new_feature] = loo.transform(X_test[column])
    return new_feature

# Encode every categorical column and remember the names of the new columns.
loo_features = [loo_encode(X_full, X_test, cat_col) for cat_col in cat_features]

In [None]:
nc = 4
# Ceiling division gives exactly the rows needed (no spare row when the
# feature count is a multiple of nc); at least one row so the grid is never empty.
nr = max(1, int(np.ceil(len(loo_features) / nc)))

# squeeze=False keeps `axes` two-dimensional even when there is a single row,
# so axes[i, j] indexing below always works.
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18,4*nr), squeeze=False)

for count, feature in enumerate(loo_features):
    # Two-sample Kolmogorov-Smirnov statistic between train and test values.
    ks_score = stats.ks_2samp(X_full[feature], X_test[feature])[0]
    i, j = divmod(count, nc)
    sns.kdeplot(X_full[feature], color='Blue', ax=axes[i, j])
    sns.kdeplot(X_test[feature], color='Red', ax=axes[i, j])

    axes[i, j].legend(["Train", "Test"], facecolor="White")
    axes[i, j].set_title(f"{feature} ks stat : {np.round(ks_score,3)}")

# Hide the grid cells that no feature was drawn into.
for unused_ax in axes.ravel()[len(loo_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()

Likewise, the train and test distributions of the leave-one-out encoded categorical features are very similar.

# Label encode 'cat16'

In [None]:
def label_encode(X_full, X_test, column):
    """Add a `<column>_le` integer-coded feature to both frames.

    A single LabelEncoder is fitted on the distinct values of `column` found
    in either frame, then applied to both. Both frames are modified in place.

    Parameters
    ----------
    X_full : DataFrame holding the training features; gains the new column.
    X_test : DataFrame holding the test features; gains the new column.
    column : name of the categorical column to encode.

    Returns
    -------
    The name of the column that was added.
    """
    le = LabelEncoder()
    new_feature = f"{column}_le"
    # Fit once on the values of both frames so that a category present only
    # in the test set can still be transformed. (An earlier fit on the
    # training column alone was immediately overwritten and has been removed.)
    le.fit(X_full[column].unique().tolist() + X_test[column].unique().tolist())
    X_full[new_feature] = le.transform(X_full[column])
    X_test[new_feature] = le.transform(X_test[column])
    return new_feature

# Columns that additionally get an integer label encoding.
le_list = ['cat16']
le_features = [label_encode(X_full, X_test, le_col) for le_col in le_list]

# K-means cluster 'cat16_loo'

In [None]:
# (source column, number of clusters) pairs to turn into cluster-id features.
clusters = [
    ("cat16_loo", 2)
]

kmeans_features = []
for column, n_clusters in clusters:
    kmeans_feature = f"{column}_kmeans"
    # A fixed random_state keeps the cluster ids reproducible between runs;
    # without it the labelling of the clusters can change on every execution.
    # 2021 matches the seed used by the modelling cells of this notebook.
    kmeans = KMeans(n_clusters=n_clusters, random_state=2021)
    # KMeans expects a 2-D array, hence the reshape to a single-column matrix.
    X_full[kmeans_feature] = kmeans.fit_predict( np.array(X_full[column]).reshape(-1, 1) )
    # The test rows are assigned to the clusters learned on the training rows.
    X_test[kmeans_feature] = kmeans.predict( np.array(X_test[column]).reshape(-1, 1) )
    kmeans_features.append(kmeans_feature)

# Specify all features to use

In [None]:
# Model inputs: numeric columns plus every engineered categorical feature.
my_features = [*num_features, *loo_features, *le_features, *kmeans_features]

# Optuna hyperparameter optimization

In [None]:
def objective(trial, data=X_full[my_features], target=y_full):
    """Optuna objective: validation ROC AUC of an LGBMClassifier.

    One stratified 80/20 split of `data`/`target` is made with a fixed seed,
    a LightGBM classifier is trained on the larger part with hyperparameters
    suggested by `trial`, and the ROC AUC on the held-out part is returned.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    data : feature frame to split. The default is evaluated once, when this
        cell runs, so it reflects `X_full[my_features]` at definition time.
    target : labels aligned with `data`.

    Returns
    -------
    ROC AUC on the validation part (higher is better).
    """
    seed = 2021
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # n_splits=1, so the splitter yields exactly one (train, valid) index pair.
    # The split is made on the function's own arguments; previously they were
    # ignored and the notebook globals were used instead.
    train_index, valid_index = next(split.split(data, target))
    X_train, y_train = data.iloc[train_index], target.iloc[train_index]
    X_valid, y_valid = data.iloc[valid_index], target.iloc[valid_index]

    lgbm_params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.01, 0.02, 0.05, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'random_state': seed,
        'boosting_type': 'gbdt',
        'metric': 'AUC',
        #'device': 'gpu'
    }

    model = LGBMClassifier(**lgbm_params)

    # NOTE(review): newer LightGBM releases reportedly dropped the
    # `early_stopping_rounds` / `verbose` fit arguments in favour of
    # callbacks — confirm against the installed version before upgrading.
    model.fit(
            X_train,
            y_train,
            early_stopping_rounds=100,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

    # Score with the probability in the second predict_proba column.
    y_valid_pred = model.predict_proba(X_valid)[:,1]

    return roc_auc_score(y_valid, y_valid_pred)

In [None]:
#study = optuna.create_study(direction = 'maximize')
#study.optimize(objective, n_trials = 10)
#print('Number of finished trials:', len(study.trials))
#print('Best trial:', study.best_trial.params)
#print('Best value:', study.best_value)

# Optuna visualization

In [None]:
#optuna.visualization.plot_optimization_history(study)

In [None]:
#optuna.visualization.plot_param_importances(study)

# Fit model with Optuna best parameters

In [None]:
seed = 2021
#paramsLGBM = study.best_trial.params
# Hard-coded hyperparameters used in place of the study lookup above.
paramsLGBM = dict(
    reg_alpha=1.9553269755200153,
    reg_lambda=6.667487742284949,
    num_leaves=173,
    min_child_samples=86,
    max_depth=23,
    learning_rate=0.01,
    colsample_bytree=0.15433885172555964,
    n_estimators=3473,
)
paramsLGBM.update(boosting_type='gbdt', metric='AUC', random_state=seed)

# Single stratified 80/20 split; n_splits=1 means exactly one index pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_index, valid_index = next(split.split(X_full[my_features], y_full))
X_train, X_valid = X_full[my_features].iloc[train_index], X_full[my_features].iloc[valid_index]
y_train, y_valid = y_full.iloc[train_index], y_full.iloc[valid_index]

# Train on the larger part; the held-out part drives early stopping.
lgbm_clf = LGBMClassifier(**paramsLGBM)
lgbm_clf.fit(
    X_train,
    y_train,
    early_stopping_rounds=100,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

In [None]:
#perm = PermutationImportance(lgbm_clf, random_state=seed).fit(X_valid, y_valid)
#eli5.show_weights(perm, feature_names = X_valid.columns.tolist())

# Make predictions

In [None]:
# Keep only the second predict_proba column as the test-set prediction.
test_features = X_test[my_features]
test_preds = lgbm_clf.predict_proba(test_features)[:, 1]

# Save predictions to file

In [None]:
# Write one row per test id. The identifier column is named `id`, the same
# name the input CSVs use for the column that was loaded as the index,
# instead of the inconsistent capitalised `Id`.
output = pd.DataFrame({'id': X_test.index,
                       target: test_preds})
output.to_csv('submission.csv', index=False)

# Acknowledgement

This notebook is a combination of different ideas I have learnt from:
* https://www.kaggle.com/craigmthomas/tps-mar-2021-stacked-starter/comments
* https://www.kaggle.com/dmitryuarov/catboost-vs-xgb-vs-lgbm-tps-mar-21
* And many other Kagglers