In this notebook, we use Optuna to tune the hyperparameters of a LightGBM regression model.

Dataset: https://www.kaggle.com/c/tabular-playground-series-feb-2021

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import lightgbm as lgb
import optuna
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Folder holding the competition CSVs, and the seed reused by later cells.
PATH = '../input/tabular-playground-series-feb-2021/'
SEED = 25

# Read the training table and drop 'id' right away so it is never picked up
# as a model input when the feature list is built from df.columns.
df = pd.read_csv(PATH + 'train.csv').drop(columns='id')

In [None]:
# Every column except the label is a model input.
# Compare names with `!=`: the previous `x not in 'target'` was a substring
# test on the string 'target' and would also have excluded any column whose
# name happens to be a substring of it (e.g. 'get' or 't').
features = [x for x in df.columns if x != 'target']
# Kept as a one-element list so that df[target] is a one-column DataFrame.
target = ['target']

# Split the inputs by naming convention: names containing 'cont' / 'cat'.
cont_features = [x for x in features if 'cont' in x]
cat_features = [x for x in features if 'cat' in x]

# Store the 'cat' columns with pandas' categorical dtype.
df[cat_features] = df[cat_features].astype('category')

Test Set

In [None]:
# Read the test table, pull the 'id' column out for the submission file,
# and give the categorical columns the same dtype treatment as the train set.
test_df = pd.read_csv(PATH + 'test.csv')
ids = test_df.pop('id')  # removes the column from test_df and returns it
test_df[cat_features] = test_df[cat_features].astype('category')

# EDA

Target Distribution

In [None]:
# Histogram of the regression target with a KDE curve drawn on top.
target_fig, target_ax = plt.subplots()
sns.histplot(data=df, x='target', kde=True, ax=target_ax)

We have a bimodal distribution in our target.

Distributions

In [None]:
# One histogram per categorical feature on a 2x5 grid.
fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20,10))
panels = ax.ravel()

# Pair features with panels directly instead of computing `i*5+j` by hand:
# zip stops at the shorter sequence, so a dataset with fewer than 10
# categorical columns no longer raises IndexError.
for feature, panel in zip(cat_features, panels):
    sns.histplot(x=feature, data=df, ax=panel)

# Hide any panel that did not receive a feature.
for panel in panels[len(cat_features):]:
    panel.set_visible(False)

Some categorical features, such as 'cat4', 'cat6', and 'cat7', are heavily imbalanced.

In [None]:
# One histogram per continuous feature on a 4x4 grid.
fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(20,15))
panels = ax.ravel()

# zip pairs each feature with a panel and stops when the features run out.
# This replaces the earlier try/except IndexError, which used an exception
# as loop control and would also have hidden an IndexError raised inside
# the plotting call itself.
for feature, panel in zip(cont_features, panels):
    sns.histplot(x=feature, data=df, ax=panel)

# Hide the grid cells that have no feature to show.
for panel in panels[len(cont_features):]:
    panel.set_visible(False)

We can see several multimodal distributions in our continuous features.

Continuous Features x Target

In [None]:
# Scatter plot of each continuous feature against the target on a 4x4 grid.
fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(20,20))
panels = ax.ravel()

# zip pairs each feature with a panel and stops when the features run out.
# This replaces the earlier try/except IndexError, which used an exception
# as loop control and would also have hidden an IndexError raised inside
# the plotting call itself.
for feature, panel in zip(cont_features, panels):
    sns.scatterplot(x=feature, y='target', data=df, ax=panel)

# Hide the grid cells that have no feature to show.
for panel in panels[len(cont_features):]:
    panel.set_visible(False)

In [None]:
# Pearson correlation heatmap of the numeric columns (continuous features and target).
f, heat_ax = plt.subplots(figsize=(12,12))
heat_ax.set_title('Pearson Correlation of Features', y=1.05, size=15)

# Restrict to numeric columns explicitly: the categorical columns cannot be
# correlated, and pandas >= 2.0 raises on them instead of silently skipping
# them as older versions did.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, square=True, fmt='.2f', ax=heat_ax)

There is no linear correlation between the target and the continuous features. Let's look at mutual information instead, which can also pick up non-linear dependence.

In [None]:
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Mutual information between each continuous feature and the target;
# seeded so the estimate is repeatable.
mi_corr = mutual_info_regression(
    X=df[cont_features].to_numpy(),
    y=df['target'].to_numpy(),
    random_state=SEED,
)

In [None]:
# Bar chart of the mutual-information score of every continuous feature.
f, mi_ax = plt.subplots(figsize=(12,4))
mi_ax.set_title('Mutual Information Correlation')
sns.barplot(x=cont_features, y=mi_corr, ax=mi_ax)

# LGBM Tuning

In [None]:
# 80/20 hold-out split used by the Optuna objective.
# Sample the training rows once and take the complement as validation rows.
# Features and target are then sliced from the same two frames, so they are
# aligned by construction. The previous `df.iloc[x_train.index]` passed index
# *labels* to a *positional* indexer, which is only correct while df has a
# default RangeIndex.
train_part = df.sample(frac=0.8, random_state=SEED)
val_part = df.drop(train_part.index)

x_train, y_train = train_part[features], train_part[target]
x_val, y_val = val_part[features], val_part[target]

In [None]:
def objective(trial):
    """Train one LightGBM regressor with trial-sampled hyperparameters.

    Reads the notebook-level hold-out split (x_train, y_train, x_val, y_val).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameter values.

    Returns
    -------
    float
        RMSE of the fitted model on the validation split (lower is better).
    """
    params = {
        # Fixed settings.
        'objective': 'regression',
        'metric': 'rmse',
        'verbose': -1,
        'boosting_type': 'gbdt',
        # Search space. suggest_float replaces the deprecated
        # suggest_uniform / suggest_loguniform (log=True == log-uniform).
        'learning_rate': trial.suggest_float('learning_rate', 0.002, 0.02),
        'max_depth': trial.suggest_int('max_depth', 12, 30),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 20.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16,  102),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.8),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 9),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 500)
    }

    gbm = lgb.LGBMRegressor(**params)
    # No `verbose=` here: fit() stopped accepting it in LightGBM 4.0, and
    # logging is already silenced through params['verbose'] = -1.
    gbm.fit(x_train, y_train)

    val_pred = gbm.predict(x_val)
    # Take the square root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases no longer accept.
    return np.sqrt(mean_squared_error(y_val, val_pred))

In [None]:
# Minimise the validation RMSE returned by `objective` over 15 trials.
# The sampler is seeded with SEED so that a Restart & Run All reproduces the
# same sequence of trials; TPESampler is what create_study uses by default,
# so only the seeding changes.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=15)

In [None]:
# Hyperparameter values of the trial with the lowest validation RMSE.
study.best_params

In [None]:
# Objective value of every trial, in the order the trials were run.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig

In [None]:
# Estimated importance of each tuned hyperparameter for the objective value.
importance_fig = optuna.visualization.plot_param_importances(study)
importance_fig

# LGBM Inference

5-Fold Cross-Validation

In [None]:
# Train one model per fold with the best hyperparameters found by Optuna,
# score it on the held-out fold, and collect its test-set predictions.
best_params = study.best_params

scores = []  # validation RMSE of each fold
preds = []   # test-set predictions of each fold

kf = KFold(n_splits=5)

# NOTE(review): best_params only holds the tuned keys, so n_estimators stays
# at the LightGBM default of 100 trees; a 500-round early-stopping patience
# can never trigger with that budget. Consider raising n_estimators if early
# stopping is meant to pick the number of trees.
for k, (train_idx, val_idx) in enumerate(kf.split(df)):
    # Fold-local names, so the hold-out split that `objective` reads
    # (x_train, y_train, x_val, y_val) is not overwritten by this loop.
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]
    x_fold_train, y_fold_train = fold_train[features], fold_train[target]
    x_fold_val, y_fold_val = fold_val[features], fold_val[target]

    gbm = lgb.LGBMRegressor(**best_params)
    # Early stopping goes through a callback: fit() stopped accepting
    # `early_stopping_rounds=` and `verbose=` in LightGBM 4.0.
    # verbose=False keeps the callback itself quiet.
    gbm.fit(x_fold_train,
            y_fold_train,
            eval_set=[(x_fold_val, y_fold_val)],
            eval_metric='rmse',
            callbacks=[lgb.early_stopping(stopping_rounds=500, verbose=False)])

    val_pred = gbm.predict(x_fold_val)
    # Explicit square root instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_fold_val, val_pred))
    print(f'Fold {k} CV: {rmse:.4f}')
    scores.append(rmse)
    preds.append(gbm.predict(test_df))

print(f'CV: {np.mean(scores):.4f}')

In [None]:
# Average the per-fold test predictions element-wise into one prediction per test row.
predictions = np.asarray(preds).mean(axis=0)

# Feature Importance Plot

In [None]:
# Feature importances of `gbm`, which at this point is the model fitted in
# the last cross-validation fold.
f_importance = gbm.booster_.feature_importance()
x_df = df.drop(columns='target')

# Pair each importance with its column name, rank from most to least
# important, and keep the top 15 rows.
ranked_pairs = sorted(zip(f_importance, x_df.columns), reverse=True)
importances = pd.DataFrame(
    ranked_pairs,
    columns=['Feature Importance', 'Feature Name'],
).head(15)

In [None]:
# Horizontal bar chart of the top-ranked features.
f, imp_ax = plt.subplots(figsize=(12,6))
sns.barplot(data=importances, x='Feature Importance', y='Feature Name', ax=imp_ax)

# Submission

In [None]:
# Two-column submission file: the test ids next to the fold-averaged predictions.
submission = ids.to_frame(name='id').assign(target=predictions)
submission.to_csv('submission.csv', index=False)