# <center>Tabular Playground Series - May/2021</center>
## <center>LightGBM Tuned with Hyperopt</center>
---
    
I didn't perform an extensive EDA, since there are already great notebooks covering it. My suggestion:
- [[TPS-May] Categorical EDA](https://www.kaggle.com/subinium/tps-may-categorical-eda) by [@subinium](https://www.kaggle.com/subinium)
    
Hyperparameter tuning with Hyperopt:
- [Approaching (Almost) Any Machine Learning Problem](https://github.com/abhi1thakur/approachingalmost) by [@abhishek](https://www.kaggle.com/abhishek)
    
My other notebooks in this competition:
- [Tabular Playground Series - May/2021: Neural Network with Keras](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-nn-with-keras-first-nn)
- [Tabular Playground Series - May/2021: Model Stacking using Logistic Regression as Meta-Learner](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-model-stacking-meta-learner-lr)

## Importing Libraries and Datasets

In [None]:
import pandas as pd       
import matplotlib as mat
import matplotlib.pyplot as plt    
import numpy as np
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier

from hyperopt.pyll.base import scope
from hyperopt import hp, fmin, tpe, Trials
from hyperopt import space_eval


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Training set, indexed by `id`; split into the feature matrix and the `target` column.
df_train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv', index_col = 'id')
X_train = df_train.drop(columns = 'target')
Y_train = df_train['target'].copy()

# Test set, indexed by `id` like the training set.
X_test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv', index_col = 'id')

In [None]:
# Display the training frame (features plus the `target` column).
df_train

## Exploring the Data

In [None]:
# Per-column dtype and non-null count summary of the training frame.
df_train.info()

In [None]:
# Summary statistics, transposed so that each column becomes one row.
df_train.describe().T

In [None]:
# Number of distinct values per column, in ascending order.
df_train.nunique().sort_values()

In [None]:
# Class balance of the training target, with every bar labelled by its sample count.
class_order = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
n_samples = len(df_train)

fig, ax = plt.subplots(figsize=(6,4))
sns.countplot(x="target", data=df_train, palette="BuPu", order = class_order, ax = ax)

ax.set_xlabel("Class", fontsize= 12)
ax.set_ylabel("N_Samples", fontsize= 12)
ax.set_title("Number of Samples per Class", fontsize= 13)
# No class can hold more rows than the whole frame, so the row count is a safe upper limit.
ax.set_ylim(0, n_samples)

for bar in ax.patches:
    count = bar.get_height()
    if np.isnan(count):
        # Skip bars without a finite height (e.g. a class that is absent from the data).
        continue
    # Label with an integer count, centred on the bar and lifted 3% of the axis height.
    label_xy = (bar.get_x() + bar.get_width()/2, count + 0.03*n_samples)
    ax.annotate(str(int(round(count))), label_xy, ha = "center")

plt.show()

In [None]:
features = X_train.columns

# Two plots per row; derive the number of rows so the grid fits however many
# feature columns there are instead of assuming exactly 25 x 2.
n_grid_cols = 2
n_grid_rows = (len(features) + n_grid_cols - 1) // n_grid_cols

plt.figure(figsize=(15,60))

# NOTE: `sns.distplot` is deprecated in recent seaborn releases
# (`histplot`/`displot` replace it); kept here to preserve the rendered plots.
for i,col in enumerate(features):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.distplot(df_train.loc[:,col])
    plt.ylabel('')

# Solve the layout once, after all axes exist, rather than once per subplot.
plt.tight_layout()
plt.show()

## Base LightGBM

In [None]:
def cv_function (X_train, Y_train, model, splits = 10):
    """Run stratified K-fold CV for `model` and return out-of-fold class probabilities.

    For every fold the model is fitted on the training part with early stopping
    (patience of 100 rounds) monitored on the held-out part, and the held-out
    rows are then scored with `predict_proba`. The logloss of each fold and the
    mean over all folds are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    Y_train : pandas.Series
        Class labels aligned row-by-row with `X_train`.
    model : estimator
        Classifier whose `fit` accepts `early_stopping_rounds`, `eval_set` and
        `verbose`, and which exposes `predict_proba` (e.g. `LGBMClassifier`).
        The same object is refitted in place on every fold.
    splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    numpy.ndarray of shape (len(X_train), n_classes)
        Out-of-fold predicted probabilities, one row per row of `X_train`.
    """
    kfold = StratifiedKFold(n_splits = splits)
    logloss = []

    # Size the out-of-fold buffer from the data instead of hard-coding (100000, 4),
    # so the helper also works on subsamples or other datasets.
    n_classes = np.unique(Y_train).size
    cv_pred = np.zeros((len(X_train), n_classes))

    for train_idx, test_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xtest, ytest = X_train.iloc[test_idx], Y_train.iloc[test_idx]

        # Fit the model for the current fold.
        # NOTE: the held-out fold also drives early stopping, so the fold scores
        # are slightly optimistic.
        # NOTE(review): `early_stopping_rounds`/`verbose` as fit arguments were
        # removed in LightGBM 4.0 (callbacks are used there) - confirm the
        # installed version before upgrading.
        model.fit(xtrain, ytrain,
            early_stopping_rounds = 100, eval_set = [(xtest,ytest)], verbose = False)

        # Store the held-out predictions at their original row positions.
        preds = model.predict_proba(xtest)
        cv_pred[test_idx] = preds

        # Compute, report and collect the logloss of this fold.
        fold_logloss = metrics.log_loss(ytest,preds)
        print("LogLoss: {0:0.5f}". format(fold_logloss))
        logloss.append(fold_logloss)

    # Mean of the per-fold loglosses.
    print (np.mean(logloss))
    return cv_pred

In [None]:
# Baseline LightGBM classifier with a fixed seed, evaluated on multi-class logloss.
lgbm_model = LGBMClassifier(
    n_estimators = 2000,
    learning_rate = 0.02,
    random_state = 42,
    num_class = 4,
    metric = 'multi_logloss',
)

In [None]:
#lgbm_cvpred = cv_function(X_train, Y_train, lgbm_model)
#Recorded output (mean fold logloss): 1.094724908866833

## Hyperparameter Tuning with Hyperopt

In [None]:
def objective(params):
    """Hyperopt objective: out-of-fold logloss of an LGBMClassifier configured with `params`.

    A classifier with the fixed base settings is created, the sampled `params`
    are applied on top via `set_params`, and it is evaluated with 5-fold
    `cv_function` on the notebook-level `X_train` / `Y_train`. The logloss of
    the combined out-of-fold predictions is printed and returned (lower is better).
    """
    base_settings = dict(n_estimators = 2000, learning_rate = 0.02, random_state = 42,
                         num_class = 4, metric = 'multi_logloss', verbosity = -1)
    candidate = LGBMClassifier(**base_settings)
    candidate.set_params(**params)

    oof_proba = cv_function(X_train, Y_train, candidate, splits = 5)
    oof_logloss = metrics.log_loss(Y_train, oof_proba)
    print("Logloss: {0:0.6f}".format(oof_logloss))

    return oof_logloss
    
# Hyperopt search space for the LGBMClassifier parameters tuned by `objective`.
# `hp.quniform` yields floats, so the integer-valued parameters are cast with `scope.int`.
params_lgbm = {
    "max_depth": scope.int(hp.quniform("max_depth", 3, 25, 1)),
    # NOTE(review): LightGBM only applies row subsampling when `subsample_freq`
    # (bagging_freq) is > 0; it is not set in this notebook, so this dimension
    # may have no effect - confirm against the LightGBM parameter docs.
    "subsample": hp.uniform("subsample",0.4,1),
    "colsample_bytree": hp.uniform("colsample_bytree",0.4,1),
    # Fractional grid 0.1, 0.2, ..., 1.0: this must stay a float. Casting it
    # with `scope.int` truncates nearly every draw to 0, collapsing the dimension.
    "min_child_weight": hp.quniform("min_child_weight", 0.1, 1.0, 0.1),
    "min_child_samples": scope.int(hp.quniform("min_child_samples", 20, 100, 5)),
    "num_leaves": scope.int(hp.quniform("num_leaves", 7, 256, 1)),
    "reg_alpha": hp.uniform('reg_alpha', 0.0, 1),
    "reg_lambda": hp.uniform('reg_lambda', 0.0, 1),

}

#Uncomment to run hyperopt

#trials = Trials()

#best = fmin(
#    fn=objective,
#    space = params_lgbm, 
#    algo=tpe.suggest, 
#    max_evals=50, 
#    trials=trials
#)

#print("Best: {}".format(best))
#trials.results

In [None]:
#best_params_lgbm = space_eval(params_lgbm, best)
#print(best_params_lgbm)

# Best parameters, hard-coded so the Hyperopt search does not have to be re-run.
best_params_lgbm = dict(
    colsample_bytree = 0.4083405369693822,
    max_depth = 17,
    min_child_samples = 95,
    min_child_weight = 0,
    num_leaves = 10,
    reg_alpha = 0.6966573230086442,
    reg_lambda = 0.5138577842412738,
    subsample = 0.9800623921808034,
)

In [None]:
# Build the tuned model as a separate estimator. `set_params` mutates the
# estimator it is called on, so aliasing `lgbm_model` would silently turn the
# baseline model into the tuned one as well.
lgbm_tuned = LGBMClassifier(**lgbm_model.get_params())
lgbm_tuned = lgbm_tuned.set_params(**best_params_lgbm)
lgbm_tuned

In [None]:
#lgbm_tuned_cvpred = cv_function(X_train, Y_train, lgbm_tuned)
#Recorded output (mean fold logloss): 1.0918248375370458

## Making Predictions

In [None]:
def prediction (X_train, Y_train, model, X_test):
    """Fit `model` on 10 stratified folds and return averaged test and out-of-fold probabilities.

    For every fold the model is fitted on the training part with early stopping
    (patience of 100 rounds) monitored on the validation part. Its test-set
    probabilities are added to a running average over the folds, and its
    validation probabilities are written into the out-of-fold matrix. The
    validation logloss of each fold is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with `.iloc`.
    Y_train : pandas.Series
        Class labels aligned row-by-row with `X_train`.
    model : estimator
        Classifier whose `fit` accepts `early_stopping_rounds`, `eval_set` and
        `verbose`, and which exposes `predict_proba` (e.g. `LGBMClassifier`).
        The same object is refitted in place on every fold.
    X_test : pandas.DataFrame
        Test features to predict.

    Returns
    -------
    y_pred : numpy.ndarray of shape (len(X_test), n_classes)
        Test probabilities averaged over the folds.
    train_oof : numpy.ndarray of shape (len(X_train), n_classes)
        Out-of-fold probabilities for the training rows.
    """
    kfold = StratifiedKFold(n_splits = 10)

    # Size both buffers from the data instead of hard-coding (50000, 4) and
    # (100000, 4), so the helper also works on subsamples or other datasets.
    n_classes = np.unique(Y_train).size
    y_pred = np.zeros((len(X_test), n_classes))
    train_oof = np.zeros((len(X_train), n_classes))

    for train_idx, val_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xval, yval = X_train.iloc[val_idx], Y_train.iloc[val_idx]

        # Fit the model for the current fold.
        # NOTE(review): `early_stopping_rounds`/`verbose` as fit arguments were
        # removed in LightGBM 4.0 (callbacks are used there) - confirm the
        # installed version before upgrading.
        model.fit(xtrain, ytrain,
            early_stopping_rounds = 100, eval_set = [(xval,yval)], verbose = False)

        # Each fold contributes an equal share to the averaged test prediction.
        y_pred += model.predict_proba(X_test)/kfold.n_splits

        # Out-of-fold predictions go back to their original row positions.
        val_pred = model.predict_proba(xval)
        train_oof[val_idx] = val_pred

        # Report the validation logloss of this fold.
        fold_logloss = metrics.log_loss(yval,val_pred)
        print("Logloss: {0:0.5f}". format(fold_logloss))

    return y_pred, train_oof

In [None]:
# Fold-averaged test probabilities and out-of-fold training probabilities from the tuned model.
lgbm_pred, train_oof  = prediction (X_train, Y_train, lgbm_tuned, X_test)

In [None]:
# Overall logloss of the out-of-fold predictions (trailing comment: presumably the value from an earlier run).
print("Logloss: {0:0.6f}".format(metrics.log_loss(Y_train,train_oof))) #Logloss: 1.091825

In [None]:
# Wrap the out-of-fold probabilities in a frame with one column per class.
# NOTE(review): assumes the probability columns are ordered Class_1..Class_4 - confirm against lgbm_tuned.classes_.
train_oof = pd.DataFrame(train_oof, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
train_oof

In [None]:
# Wrap the averaged test probabilities in a frame with one column per class.
# NOTE(review): assumes the probability columns are ordered Class_1..Class_4 - confirm against lgbm_tuned.classes_.
pred_test = pd.DataFrame(lgbm_pred, columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4'])
pred_test

In [None]:
# Save the out-of-fold probabilities to CSV without the row index.
train_oof.to_csv('lgbm_train_oof.csv', index=False)
train_oof

In [None]:
# Attach the test ids on a new frame so that `pred_test` itself is left
# unchanged (plain assignment would only alias it and add `id` to both).
output = pred_test.assign(id = X_test.index)
output.to_csv('submission.csv', index=False)

output