In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from category_encoders import LeaveOneOutEncoder

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

import optuna

# Load data

In [None]:
# Directory that holds the competition CSV files, relative to the notebook.
base_dir = "../input/tabular-playground-series-apr-2021"

# Read the train and test tables (in that order) from `base_dir`.
X_full, X_test = (
    pd.read_csv(os.path.join(base_dir, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

# Specify target

In [None]:
# Name of the label column.
target = "Survived"

# Detach the label from the training frame; X_full no longer contains it
# after this line, so this cell cannot be re-run without reloading the data.
y_full = X_full.pop(item=target)

# Get Categorical and Numerical features

In [None]:
# Split the columns by dtype using pandas' dtype predicates instead of
# comparing against the strings "object"/"int"/"float": those comparisons
# depend on the platform's default integer width and miss pandas string,
# nullable-integer and float32 columns.
cat_features = [
    col
    for col, dtype in X_full.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or pd.api.types.is_integer_dtype(dtype)
]
num_features = [
    col
    for col, dtype in X_full.dtypes.items()
    if pd.api.types.is_float_dtype(dtype)
]

# Check cardinality of categorical features

In [None]:
# Number of distinct values per categorical column.
X_full.loc[:, cat_features].nunique()

# Check for missing values (NaN) in each column

In [None]:
# Count of missing entries per column in the training frame.
X_full.isna().sum()

In [None]:
# Count of missing entries per column in the test frame.
X_test.isna().sum()

# Remove "PassengerId", "Name", "Ticket" features

In [None]:
# Drop the identifier and free-text columns from the training frame.
X_full.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)

# For the test frame, set the ids aside first (they are needed for the
# submission file), then drop the same free-text columns.
PassengerId = X_test.pop('PassengerId')
X_test.drop(columns=['Name', 'Ticket'], inplace=True)

# "Cabin": Extract the first letter and fill NaN

In [None]:
# Same treatment for both frames: keep only the first character of "Cabin"
# and mark rows without a cabin with the placeholder letter 'N'.
for frame in (X_full, X_test):
    frame['Cabin'] = frame['Cabin'].str[0].fillna('N')

# Update the categorical and numerical feature lists (columns were dropped above)

In [None]:
# Recompute the feature lists from the current columns of X_full.
# pandas' dtype predicates are used instead of comparing against the strings
# "object"/"int"/"float": those comparisons depend on the platform's default
# integer width and miss pandas string, nullable-integer and float32 columns.
cat_features = [
    col
    for col, dtype in X_full.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or pd.api.types.is_integer_dtype(dtype)
]
num_features = [
    col
    for col, dtype in X_full.dtypes.items()
    if pd.api.types.is_float_dtype(dtype)
]

# Preprocessing

In [None]:
# Imputation: each imputer is fitted on the training frame only and the
# fitted statistics are then applied to both the training and test frames.
num_imputer = SimpleImputer(strategy='mean').fit(X_full[num_features])
X_full[num_features] = num_imputer.transform(X_full[num_features])
X_test[num_features] = num_imputer.transform(X_test[num_features])

cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_full[cat_features])
X_full[cat_features] = cat_imputer.transform(X_full[cat_features])
X_test[cat_features] = cat_imputer.transform(X_test[cat_features])

# Encode every categorical feature with its own LeaveOneOutEncoder, fitted on
# the training column together with the target. (A LabelEncoder per column is
# the alternative that was tried here before.)
# NOTE(review): transform() is called without the target for the training
# rows as well as the test rows, and the encoder sees all of y_full,
# including rows that are later used for validation - confirm this is the
# intended encoding.
for feature in cat_features:
    loo = LeaveOneOutEncoder().fit(X_full[feature], y_full)
    X_full[feature] = loo.transform(X_full[feature])
    X_test[feature] = loo.transform(X_test[feature])


# Make some EDA plots

In [None]:
# Train-vs-test density plot for every feature, laid out on an nr x nc grid.
eda_features = num_features + cat_features
n = len(eda_features)
nc = 3
# Ceiling division: exactly enough rows for all features (and at least one),
# so no blank row is added when n is a multiple of nc.
nr = max(1, int(np.ceil(n / nc)))

# squeeze=False keeps `axes` two-dimensional even when the grid has one row,
# so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(18, 4*nr), squeeze=False)

for count, feature in enumerate(eda_features):
    # First element of the ks_2samp result is the KS statistic.
    ks_score = stats.ks_2samp(X_full[feature], X_test[feature])[0]
    i, j = divmod(count, nc)
    ax = axes[i, j]
    sns.kdeplot(X_full[feature], color='Blue', ax=ax)
    sns.kdeplot(X_test[feature], color='Red', ax=ax)

    ax.legend(["Train", "Test"], facecolor="White")
    ax.set_title(f"{feature} ks stat : {np.round(ks_score,3)}")

# Hide the grid cells that are left over when n is not a multiple of nc.
for ax in axes.ravel()[n:]:
    ax.set_visible(False)

plt.tight_layout()

# Specify features to use

In [None]:
# Model inputs: all numerical features followed by all categorical ones.
my_features = [*num_features, *cat_features]

In [None]:
# Preview the first rows of the selected model inputs.
X_full.loc[:, my_features].head()

# Optuna hyperparameter optimization

In [None]:
def objective(trial, data=X_full[my_features], target=y_full):
    """Optuna objective: hold-out accuracy of a LightGBM classifier.

    Parameters
    ----------
    trial
        Optuna trial used to sample the LightGBM hyperparameters.
    data
        Feature table to split into train/validation parts. Defaults to
        ``X_full[my_features]`` as it is when this cell is executed (default
        arguments are bound at definition time, so re-run this cell after
        changing ``X_full`` or ``my_features``).
    target
        Labels aligned row-for-row with ``data``. Defaults to ``y_full``.

    Returns
    -------
    The ``accuracy_score`` of the fitted model on the 20% stratified
    hold-out split.
    """
    seed = 2021
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # Split the frames that were passed in (previously the arguments were
    # ignored and the notebook globals were used instead). With n_splits=1
    # the splitter yields exactly one (train, valid) index pair.
    train_index, valid_index = next(split.split(data, target))
    X_train, y_train = data.iloc[train_index], target.iloc[train_index]
    X_valid, y_valid = data.iloc[valid_index], target.iloc[valid_index]

    # The order of the suggest_* calls is kept as-is so the sampler sees the
    # parameters in the same sequence.
    lgbm_params = {
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 11, 333),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.005, 0.01, 0.02, 0.05, 0.1]),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'random_state': seed,
        'boosting_type': 'gbdt',
        'metric': 'binary_logloss',
        #'device': 'gpu'
    }

    model = LGBMClassifier(**lgbm_params)

    # Early stopping monitors the validation split with the configured metric.
    # NOTE(review): `early_stopping_rounds` and `verbose` are passed as fit()
    # keyword arguments; newer LightGBM releases are reported to expect
    # callbacks instead - confirm against the installed version.
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=100,
        eval_set=[(X_valid, y_valid)],
        verbose=False,
    )

    y_valid_pred = model.predict(X_valid)
    return accuracy_score(y_valid, y_valid_pred)

In [None]:
# Seed the sampler so repeated runs of the notebook explore the same trials;
# 2021 matches the seed used for the data split and the model.
study = optuna.create_study(
    direction='maximize',  # the objective returns accuracy
    sampler=optuna.samplers.TPESampler(seed=2021),
)
study.optimize(objective, n_trials=20)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

# Fit model with Optuna best parameters

In [None]:
seed = 2021

# Best hyperparameters from the study, plus the fixed settings that the
# objective also used.
paramsLGBM = {
    **study.best_trial.params,
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',
    'random_state': seed,
}

# Hold-out split built with the same splitter settings as in the objective.
# The resulting frames are not used by the fit below; they remain available
# for a fit with eval_set / early stopping.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
train_index, valid_index = next(split.split(X_full[my_features], y_full))
X_train, y_train = X_full[my_features].iloc[train_index], y_full.iloc[train_index]
X_valid, y_valid = X_full[my_features].iloc[valid_index], y_full.iloc[valid_index]

lgbm_clf = LGBMClassifier(**paramsLGBM)

# Final model: trained on every labelled row, with no eval_set and no early
# stopping.
lgbm_clf.fit(X_full[my_features], y_full, verbose=False)

# Make predictions

In [None]:
# Class predictions for the test rows, using the same feature columns as in training.
test_preds = lgbm_clf.predict(X_test.loc[:, my_features])

# Save predictions to file

In [None]:
# Two-column submission table: the passenger id set aside earlier and the
# predicted label, stored under the target column's name.
output = pd.DataFrame(
    data={
        'PassengerId': PassengerId,
        target: test_preds,
    }
)
output.to_csv(path_or_buf='submission.csv', index=False)