# Feb 2021 Tabular Playground: LGBM with Optuna Tuning

 This notebook is a response to the Kaggle [Tabular Playground Series - Feb 2021 competition](https://www.kaggle.com/c/tabular-playground-series-feb-2021). The approach I have taken is as follows:
 
* Setup including reading in the data 
* Examination of the Data 
* Evaluation of Models (spoiler LGBM wins)
* Tuning of the model
* Execution of tuned model
* Submission of Results


## Setup and Read Data

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path


import os
# Collect the full path of every file found under /kaggle/input, then print
# them one per line so the available data files show up in the cell output.
input_files = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]
for input_file in input_files:
    print(input_file)
        
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

        
# Folder that holds the competition CSV files; the read cells below join
# file names onto this path.
input_path = Path('/kaggle/input') / 'tabular-playground-series-feb-2021'

## Read in the data files

In [None]:
# Show every column when a frame is rendered instead of eliding the middle.
pd.set_option('display.max_columns', None)

# Training data, with the `id` column used as the index.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head())

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Test data, read the same way as the training frame (`id` as the index).
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head())

In [None]:
# Summary statistics of the test frame, for comparison with the training frame.
test.describe()

In [None]:
# Sample submission file; it is copied later and its `target` column is
# overwritten with the model predictions.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'))
display(submission.head())

## Pull out the target

In [None]:
# Remove the `target` column from `train` and keep it as a separate Series.
# NOTE: `pop` mutates `train`, so running this cell a second time raises a
# KeyError unless `train` is re-read first.
target = train.pop('target')

## Identify Categorical Columns

In [None]:
# Split the feature columns by dtype using the public pandas API
# (`DataFrame._get_numeric_data` is private and not a stable interface).
# Boolean columns are included so the selection matches what the private
# helper returned.
cols = train.columns
num_cols = train.select_dtypes(include=['number', 'bool']).columns

# Every remaining column is treated as categorical. Sorting gives a
# deterministic order, which a plain set difference does not guarantee.
cat_features = sorted(set(cols) - set(num_cols))

## Let's Look at Our Features

In [None]:
# Box plot of every numeric feature in the training frame.
boxplot = train[num_cols].boxplot(figsize=(12, 9))

In [None]:
# Box plot of the same numeric features, this time for the test frame.
boxplot = test[num_cols].boxplot(figsize=(12, 9))

Hmm, the data looks very symmetrical between test and train (at least for the numerical features), outliers and all.

## Look at Correlation

In [None]:
fig, ax = plt.subplots(figsize=(20,10))

# The categorical columns still hold strings at this point, and pandas >= 2.0
# raises when `corr()` meets them, so correlate the numeric columns only.
# The matrix is computed once and reused.
cor = train.select_dtypes(include='number').corr()

# Boolean mask hiding the upper triangle (diagonal included). Building it from
# ones rather than from the correlation values means a correlation of exactly
# zero is still masked.
matrix = np.triu(np.ones(cor.shape, dtype=bool))

sns.heatmap(cor, annot=True, mask=matrix, cmap='coolwarm', linewidths=.5, ax=ax)

We see some strong correlations that could be explored further.

## Review Categorical Data

In [None]:
# One count plot per categorical feature of the training frame, laid out on a
# 5x2 grid with a fixed category order so the panels are comparable.
# NOTE(review): the order has no 'M' - confirm that this is intentional.
category_order = ('A','B','C','D','E','F','G','H','I','J','K','L','N')

plt.figure(figsize=(20,20))
for panel, col in enumerate(cat_features, start=1):
    plt.subplot(5, 2, panel)
    sns.countplot(x=col, data=train, order=category_order)
plt.tight_layout()

In [None]:
# One count plot per categorical feature of the test frame, laid out on a
# 5x2 grid with a fixed category order so the panels are comparable.
# NOTE(review): the order has no 'M' - confirm that this is intentional.
category_order = ('A','B','C','D','E','F','G','H','I','J','K','L','N')

plt.figure(figsize=(20,20))
for panel, col in enumerate(cat_features, start=1):
    plt.subplot(5, 2, panel)
    sns.countplot(x=col, data=test, order=category_order)
plt.tight_layout()

Again we see symmetry between the train and test data.

## We need to encode the categoricals.

There are different strategies to accomplish this, and different approaches will have different performance when using different algorithms. For this starter notebook, we'll use simple encoding.

In [None]:
# Replace each categorical column with integer codes, in place, in both frames.
for feature in cat_features:
    le = LabelEncoder()
    # Fit on the labels of both frames: an encoder fitted on `train` alone
    # raises a ValueError in `transform(test[...])` for any label that only
    # occurs in `test`. LabelEncoder sorts its classes, so when `test` holds
    # no extra labels the codes are the same as with a train-only fit.
    le.fit(pd.concat([train[feature], test[feature]]))
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])

In [None]:
# Check the encoded training frame: the categorical columns now hold integers.
train.head()

In [None]:
# (rows, columns) of the training features.
train.shape

In [None]:
# (rows, columns) of the test features.
test.shape

## Make a validation split

In [None]:

# Hold out 20% of the rows for validation. The seed is fixed so the split,
# and every score computed on it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.80, random_state=42)

## Model Evaluation

In [None]:
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LassoLars
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import TheilSenRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostClassifier
from xgboost import XGBRegressor

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values and show the RMSE in the title.

    Parameters
    ----------
    name : label used as the title prefix.
    y : true target values (array-like).
    yhat : predicted target values (array-like, same length as ``y``).
    num_to_plot : only the first ``num_to_plot`` points are drawn; the score
        is still computed on all points.
    lims : (low, high) limits applied to both axes and to the reference line.
    figsize : size of the matplotlib figure.

    Raises
    ------
    ValueError : if ``y`` and ``yhat`` do not hold the same number of values.
    """
    # Plain arrays make the `[:num_to_plot]` slices positional regardless of
    # the index carried by a pandas Series.
    y_true = np.asarray(y, dtype=float).ravel()
    y_pred = np.asarray(yhat, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y and yhat differ in length: {y_true.shape[0]} != {y_pred.shape[0]}')

    # RMSE computed directly: `mean_squared_error(..., squared=False)` is no
    # longer accepted by recent scikit-learn releases.
    score = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

    plt.figure(figsize=figsize)
    plt.scatter(y_true[:num_to_plot], y_pred[:num_to_plot])
    plt.plot(lims, lims)  # reference line: perfect predictions lie on it
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
def FitAndScoreModel(df,name, model,X_tr,y_tr,X_tst,y_tst):
    """Fit ``model``, score it on the hold-out data and add a row to ``df``.

    Parameters
    ----------
    df : results DataFrame; it is not modified.
    name : value stored in the ``'Model'`` column.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_tr, y_tr : training features and target.
    X_tst, y_tst : hold-out features and target.

    Returns
    -------
    A new DataFrame holding the rows of ``df`` plus one row for this model.
    The ``'MSE'`` column holds the root mean squared error (the column name
    is kept for compatibility with existing callers).

    Raises
    ------
    ValueError : if the predictions and ``y_tst`` differ in length.
    """
    model.fit(X_tr,y_tr)
    Y_pred = model.predict(X_tst)

    y_true = np.asarray(y_tst, dtype=float).ravel()
    y_hat = np.asarray(Y_pred, dtype=float).ravel()
    if y_true.shape != y_hat.shape:
        raise ValueError(
            f'predictions and y_tst differ in length: {y_hat.shape[0]} != {y_true.shape[0]}')
    # RMSE computed directly: `mean_squared_error(..., squared=False)` is no
    # longer accepted by recent scikit-learn releases.
    score = float(np.sqrt(np.mean((y_true - y_hat) ** 2)))

    # `DataFrame.append` was removed in pandas 2.0; build the row as its own
    # frame and concatenate instead.
    row = pd.DataFrame([{'Model': name, 'MSE': score}])
    if df.empty:
        # Concatenating with an empty frame is deprecated and would turn the
        # score column into object dtype; keep the declared column order.
        return row.reindex(columns=df.columns.union(row.columns, sort=False))
    return pd.concat([df, row], ignore_index=True)

In [None]:
# Empty results table with the two columns that FitAndScoreModel fills in.
result_columns = ['Model', 'MSE']
dResults = pd.DataFrame(columns=result_columns)

In [None]:
# Candidate models for the baseline comparison. Despite the variable name,
# every entry is a regressor. SVR is listed but disabled.
classifiers = [
    DummyRegressor(strategy='median'),
   # SVR(),
    SGDRegressor(),
    BayesianRidge(),
    LassoLars(),
    ARDRegression(),
    LinearRegression(),
    LGBMRegressor(),
    RandomForestRegressor(n_estimators=50, n_jobs=-1)]

 
# The comparison loop is commented out, so running this cell only builds the
# list above; no model is fitted or scored and `dResults` stays empty.
#for item in classifiers:
#    print(item)
#    clf = item
#    dResults=FitAndScoreModel(dResults,item,item,X_train,y_train,X_test,y_test) 
    

In [None]:
#dResults.sort_values(by='MSE', ascending=True,inplace=True)
#dResults.set_index('MSE',inplace=True)
#dResults.head(dResults.shape[0])

LightGBM is the best of the models tested. Let's tune it with Optuna.

## LGBM Tuning

In [None]:
import optuna
import sklearn
import lightgbm as lgb

optuna.logging.set_verbosity(optuna.logging.ERROR)

import warnings
warnings.filterwarnings('ignore')

def objective(trial):
    """Optuna objective: train one LightGBM model and return its validation RMSE.

    The hyper-parameters are sampled from ``trial``. The model is fitted on
    ``X_train``/``y_train`` with early stopping on ``X_test``/``y_test``, and
    the RMSE on that same hold-out set is returned (lower is better).
    """
    list_bins = [25, 50, 75, 100, 125, 150, 175, 200, 225, 250,500,750,1000]

    # `colsample_bytree` and `subsample` are not sampled: LightGBM treats them
    # as aliases of `feature_fraction` and `bagging_fraction`, so sampling
    # both names spends search dimensions on values that are ignored.
    param = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02,0.05]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,50,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        # suggest_float replaces the deprecated suggest_uniform.
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 400),
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 256),
        # Sampled under its own name. It was previously registered as
        # 'cat_smooth', which tied it to that value and kept it out of
        # `study.best_params`.
        'cat_l2' : trial.suggest_int('cat_l2', 1, 256),
        'max_bin': trial.suggest_categorical('max_bin', list_bins)
    }

    # The categorical columns are given by position and assumed to be the
    # first len(cat_features) columns of the frame.
    # NOTE(review): they are passed through the `cat_feature` constructor
    # alias rather than `fit(categorical_feature=...)` - confirm LightGBM
    # honours this for DataFrame input.
    model = LGBMRegressor(**param, objective='regression', metric='rmse',
                          boosting_type='gbdt', verbose=-1, random_state=42,
                          n_estimators=20000,
                          cat_feature=list(range(len(cat_features))))

    # Early stopping goes through a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of `fit` were removed in LightGBM 4.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)],
              callbacks=[lgb.early_stopping(stopping_rounds=150, verbose=False)])

    preds = model.predict(X_test)

    # Square root taken explicitly: `squared=False` is no longer accepted by
    # recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))

    return rmse


# A seeded sampler makes the sequence of sampled hyper-parameters repeatable.
study = optuna.create_study(direction="minimize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=400)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
    


In [None]:
# Hyper-parameters of the best trial; they are reused for the final K-fold model.
params = study.best_params
params

In [None]:
# Objective value (validation RMSE) reached by the best trial.
study.best_value

In [None]:
# Visualize how much each hyper-parameter contributed to the objective value.
optuna.visualization.plot_param_importances(study)

In [None]:
# plot_optimization_history: shows the score of every trial as well as the
# best score so far at each point.
optuna.visualization.plot_optimization_history(study)

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb

# K-fold training with the tuned hyper-parameters. Each fold contributes
#   * out-of-fold predictions for its validation rows (`oof`),
#   * an equal share of the averaged test predictions (`predictions`),
#   * one table of feature importances.
n_fold = 20
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
train_columns = train.columns.values

oof = np.zeros(len(train))
predictions = np.zeros(len(test))
# Per-fold tables are collected here and concatenated once after the loop,
# instead of growing a DataFrame on every iteration.
fold_importances = []

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, target.values)):

    strLog = "fold {}".format(fold_)
    print(strLog)

    X_tr, X_val = train.iloc[trn_idx], train.iloc[val_idx]
    y_tr, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    # The categorical columns are given by position and assumed to be the
    # first len(cat_features) columns of the frame.
    model = LGBMRegressor(**params, objective='regression', metric='rmse',
                          boosting_type='gbdt', random_state=42, verbose=-1,
                          n_estimators=20000,
                          cat_feature=list(range(len(cat_features))))

    # Early stopping goes through a callback: the `early_stopping_rounds` and
    # `verbose` keyword arguments of `fit` were removed in LightGBM 4.
    model.fit(X_tr, y_tr,
              eval_set=[(X_tr, y_tr), (X_val, y_val)], eval_metric='rmse',
              callbacks=[lgb.early_stopping(stopping_rounds=400, verbose=False)])

    oof[val_idx] = model.predict(X_val, num_iteration=model.best_iteration_)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = train_columns
    fold_importance_df["importance"] = model.feature_importances_[:len(train_columns)]
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)

    # Every fold adds 1/n_splits of its test prediction, so `predictions`
    # ends up as the mean over all folds.
    predictions += model.predict(test, num_iteration=model.best_iteration_) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Score the out-of-fold predictions so the cross-validated error is visible.
print("CV RMSE: {:.5f}".format(float(np.sqrt(np.mean((target.values - oof) ** 2)))))



In [None]:
# Mean importance of each feature across the folds, highest first.
mean_importance = (
    feature_importance_df[["Feature", "importance"]]
    .groupby("Feature")
    .mean()
)
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)

# Keep at most the 3014 highest-ranked feature names.
# NOTE(review): 3014 is presumably far more than the number of features
# here, so nothing is dropped - confirm.
cols = ranked_importance.index[:3014]
best_features = feature_importance_df[feature_importance_df["Feature"].isin(cols)]

# One bar per feature; the rows fed to seaborn are the per-fold values sorted
# by importance, which also fixes the order of the bars.
plt.figure()
plot_data = best_features.sort_values(by="importance", ascending=False)
sns.barplot(data=plot_data, x="importance", y="Feature")
plt.title('LightGBM Features (averaged over folds)')
plt.tight_layout()

## Submission of Results

In [None]:
# New frame based on the sample submission, with `target` replaced by the
# fold-averaged predictions; `submission` itself is left untouched.
LGBMsubmission = submission.assign(target=predictions)

# Write the file without the DataFrame index (the header row is written).
LGBMsubmission.to_csv('submission_LGBM.csv', index=False)
LGBMsubmission.head()