In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder #Encode Categorical Features
import lightgbm as lgb #Gradient Boosting Machine
import matplotlib.pyplot as plt #Visualization
import seaborn as sns #Visualization
from sklearn.model_selection import KFold #N-Fold Validation
from sklearn.metrics import mean_squared_error #Evaluation Metric
import optuna #hyperparams Tuning

In [None]:
# Load the competition training data into a DataFrame
trainSet = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')

# Data Preprocessing

In [None]:
# Preview the first rows of the training frame
trainSet.head()

In [None]:
# Plot the distribution of the target column with a kernel density estimate overlaid
sns.displot(data=trainSet, x="target", kde=True)

In [None]:
# Fraction of training rows whose target is below 5
len(trainSet[trainSet.target < 5])/len(trainSet)

In [None]:
# Fraction of training rows whose target is above 10
len(trainSet[trainSet.target > 10])/len(trainSet)

In [None]:
# Based on the distribution plot, drop rows whose target is < 5 or > 10 to limit the influence of outliers.
# `between` is inclusive on both ends, so rows exactly equal to 5 or 10 are kept, matching the rule above.
# `.copy()` yields an independent frame, so the column assignments in later cells
# do not operate on a slice of the original (avoids SettingWithCopyWarning).
trainSet = trainSet[trainSet.target.between(5, 10)].copy()

In [None]:
# Label-encode every categorical column; one encoder per column is kept
# so the same mapping can be applied to the test set later.
cat_feat = ["cat" + str(val) for val in range(10)]

labelEnc = [LabelEncoder() for _ in cat_feat]

for column, encoder in zip(cat_feat, labelEnc):
    trainSet[column] = encoder.fit_transform(trainSet[column])

In [None]:
# Pairwise correlation between all features and the target (the id column is excluded)
corr = trainSet.drop(columns='id').corr()

# Boolean mask over the upper triangle (diagonal included): those cells are hidden in the plot
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Draw the annotated heatmap on an explicitly created axes
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, mask=mask, cmap='BrBG', vmin=-1, vmax=1, annot=True, ax=ax)

From the correlation matrix, no single feature is highly correlated with the target, so this notebook uses all of the features.

In [None]:
# Apply a natural-log transform to all continuous columns in one vectorised step.
# NOTE(review): np.log gives NaN / -inf for non-positive inputs — confirm the cont columns are strictly positive.
cont_var = ["cont%d" % val for val in range(14)]
trainSet[cont_var] = np.log(trainSet[cont_var])

In [None]:
# Separate the feature matrix from the regression target
X = trainSet.drop(columns=['target', 'id'])
y = trainSet['target']

# Optuna Hyperparams Tuning on Light GBM Model

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated RMSE of a LightGBM model.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters below.

    Returns
    -------
    float
        RMSE averaged over the validation folds (the study minimizes this value).

    Notes
    -----
    Reads the module-level feature frame ``X`` and target series ``y``.
    """
    # Define the search spaces, for your guidance, visit the optuna official sample codes https://optuna.org/#code_examples
    params = {
        'num_iterations' : trial.suggest_int('num_iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.05),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 256),
        'num_leaves': trial.suggest_int('num_leaves', 15, 256),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 25.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 25.0),
        # Integer bounds for an integer parameter (the upper bound was the float 25.0).
        # NOTE(review): per the LightGBM docs bagging only takes effect when
        # bagging_fraction < 1.0, which is not set here — confirm this is intended.
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 25),
        'random_state': 47,
        'boosting_type': 'gbdt', 
        'verbose': -1
    }

    # Use 5 folds cross-validation
    N_FOLDS = 5
    kf = KFold(n_splits = N_FOLDS)
    fold_scores = []

    for folds, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"folds: {folds}")
        fold_train = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])

        model = lgb.train(params, fold_train)
        y_pred = model.predict(X.iloc[val_idx])

        # RMSE = sqrt(MSE); avoids the `squared=False` keyword, which newer
        # scikit-learn releases deprecate and then remove.
        fold_rmse = np.sqrt(mean_squared_error(y.iloc[val_idx], y_pred))
        fold_scores.append(fold_rmse)

        print(fold_rmse)

    return float(np.mean(fold_scores))

In [None]:
import warnings
warnings.filterwarnings("ignore")

# Start the hyperparameter tuning with warnings suppressed.
# Without `n_trials` (or `timeout`) Study.optimize keeps creating trials until it is
# interrupted, so the budget is capped here; raise N_TRIALS for a more thorough search.
N_TRIALS = 50

# A seeded sampler makes the sequence of suggested hyperparameters reproducible.
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=47))
study.optimize(objective, n_trials=N_TRIALS)

In [None]:
# Keep the best trial's sampled hyperparameters and show them
best_params = study.best_params
print(best_params)

In [None]:
# Objective value (mean cross-validated RMSE) of the best trial
study.best_value

In [None]:
# To save time the tuning is not re-run here; these are the best hyperparameters
# found by an earlier Optuna run. This assignment overrides the value read from the study.
best_params = dict(
    num_iterations=748,
    learning_rate=0.021972728143721563,
    min_data_in_leaf=251,
    num_leaves=201,
    lambda_l1=10.618325636467706,
    lambda_l2=5.65105835287371,
    bagging_freq=0,
    feature_fraction=0.20664741485758317,
    random_state=47,
    boosting_type='gbdt',
    verbose=-1,
    metric='rmse',
)

# End Hyperparam Tuning

In [None]:
# Cross-validation bookkeeping for the final training run
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS)

lgbm_models = []                                  # one trained booster per fold
rmse_score = 0                                    # running mean of the per-fold RMSE
eval_results = [dict() for _ in range(N_FOLDS)]   # per-fold evaluation history

In [None]:
#Train our LGBM using the best parameter

import warnings
warnings.filterwarnings("ignore")

# Reset the accumulators so that re-running this cell does not double-count folds
rmse_score = 0
lgbm_models = []

for folds, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"folds: {folds}")
    # Fold-local names: the global `trainSet` DataFrame must not be rebound to an lgb.Dataset
    fold_train = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    fold_valid = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])

    # The record_evaluation callback stores the per-iteration metrics in eval_results[folds];
    # it replaces the `evals_result=` keyword, which newer LightGBM releases no longer accept.
    model = lgb.train(
        best_params,
        fold_train,
        valid_sets=[fold_train, fold_valid],
        callbacks=[lgb.record_evaluation(eval_results[folds])],
    )
    lgbm_models.append(model)
    y_pred = model.predict(X.iloc[val_idx])

    # RMSE = sqrt(MSE); avoids the `squared=False` keyword deprecated in newer scikit-learn
    fold_rmse = np.sqrt(mean_squared_error(y.iloc[val_idx], y_pred))
    rmse_score += fold_rmse/N_FOLDS

    print(fold_rmse)

In [None]:
# Mean RMSE across the validation folds of the final training run
print(rmse_score)

In [None]:
# Plot the recorded metric per boosting iteration for the 5th fold's model (index 4)
lgb.plot_metric(eval_results[4])

In [None]:
# Feature importance of the 5th fold's model (index 4)
lgb.plot_importance(lgbm_models[4])

# Predict the Test Set

In [None]:
testSet = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')

# Re-use the encoders fitted on the training data, column by column.
# NOTE(review): LabelEncoder.transform raises on labels it did not see during fit —
# confirm every test category also occurs in the (filtered) training data.
for column, encoder in zip(cat_feat, labelEnc):
    testSet[column] = encoder.transform(testSet[column])

In [None]:
# Apply the same natural-log transform to the continuous test columns as was used for training.
# NOTE(review): np.log gives NaN / -inf for non-positive inputs — confirm the cont columns are strictly positive.
cont_var = ["cont%d" % val for val in range(14)]
testSet[cont_var] = np.log(testSet[cont_var])

In [None]:
# Pull the id column out of the test frame: it is kept for the submission file and
# removed from the features in the same step.
# NOTE: the name `id` shadows the Python builtin; it is kept because a later cell reads it.
id = testSet.pop('id')

In [None]:
# Accumulator for the summed test-set predictions, one entry per test row
y_pred = np.zeros(len(testSet))

In [None]:
# Sum the test-set predictions of every fold model.
# The accumulator is re-initialised here so that re-running this cell does not
# keep adding onto the sum from a previous run.
y_pred = np.zeros(len(testSet))
for model in lgbm_models:
    y_pred += model.predict(testSet)

In [None]:
# Average over the number of models actually summed, so the result stays correct
# even if lgbm_models does not hold exactly N_FOLDS boosters.
y_pred = pd.DataFrame(y_pred/len(lgbm_models))

# Create Submission File as in sample_submission.csv

In [None]:
# Put ids and averaged predictions side by side and name the two columns as the submission format expects
submFile = pd.concat([id, y_pred], axis=1).set_axis(['id', 'target'], axis=1)

In [None]:
# Write the submission to disk without the DataFrame index column
submFile.to_csv('submFile.csv', index=False)