# <center>Tabular Playground Series - June/2021</center>
## <center>Starter - EDA + Base LightGBM</center>
---
At first glance, this competition looks very similar to the previous one: there are more classes and more features, but the feature distributions are similar, as the distribution plots below show. This notebook provides a baseline score for further approaches.
    
My other notebooks in this competition:
- [Tabular Playground Series - June/2021: Simple Neural Network with Keras](https://www.kaggle.com/jonaspalucibarbosa/tps06-21-simple-nn-with-keras)
- [Tabular Playground Series - June/2021: Keras Neural Network with Embedding Layer](https://www.kaggle.com/jonaspalucibarbosa/tps06-21-keras-nn-with-embedding)
- [Tabular Playground Series - June/2021: Wide and Deep Neural Network with Keras](https://www.kaggle.com/jonaspalucibarbosa/tps06-21-wide-and-deep-nn-w-keras)
- [Tabular Playground Series - June/2021: LightAutoML with KNN Features](https://www.kaggle.com/jonaspalucibarbosa/tps06-21-lightautoml-w-knn-feats)
- [Tabular Playground Series - June/2021: Keras Neural Network with Skip Connections](https://www.kaggle.com/jonaspalucibarbosa/tps06-21-keras-nn-with-skip-connections)

## Importing Libraries and Datasets

In [None]:
import pandas as pd       
import matplotlib as mat
import matplotlib.pyplot as plt    
import numpy as np
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition training set, using the `id` column as the index.
df_train = pd.read_csv('../input/tabular-playground-series-jun-2021/train.csv', index_col='id')

# Separate predictors from the label column. `drop` returns a new frame,
# so df_train keeps its `target` column for the exploration cells.
X_train = df_train.drop(columns='target')
Y_train = df_train['target'].copy()

# Read the competition test set with the same index column.
X_test = pd.read_csv('../input/tabular-playground-series-jun-2021/test.csv', index_col='id')

In [None]:
# Render the training frame (features plus `target`) as the cell output.
df_train

## Exploring the Data

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
df_train.info()

In [None]:
# Same summary for the test frame, to compare its columns and dtypes with the training data.
X_test.info()

In [None]:
# Summary statistics of the numeric columns, transposed so that each column becomes one row.
df_train.describe().T

In [None]:
# Number of distinct values per column, listed in ascending order.
df_train.nunique().sort_values()

In [None]:
# Bar chart of how many training rows fall into each target class.
class_order = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5', 'Class_6', 'Class_7', 'Class_8', 'Class_9']

fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(x="target", data=df_train, palette="BuPu", order=class_order, ax=ax)

ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("N_Samples", fontsize=12)
ax.set_title("Number of Samples per Class", fontsize=13)
# Fixed upper limit leaves headroom for the labels written above the bars.
ax.set_ylim(0, 100000)

# Write each bar's height just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(bar_height, (bar.get_x() + 0.18, bar_height + 3000))

plt.show()

In [None]:
# Plot the distribution of every feature on a three-column grid of subplots.
features = X_train.columns

n_cols = 3
# Ceiling division: the row count follows the number of features, so the grid
# always has a slot for each of them (75 features -> 25 rows).
n_rows = (len(features) + n_cols - 1) // n_cols

plt.figure(figsize=(15,70))

for i, col in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    # NOTE(review): distplot is deprecated in recent seaborn releases; it is kept
    # here because its replacements do not render exactly the same figure.
    sns.distplot(df_train.loc[:, col])
    plt.ylabel('')

# Lay the figure out once, after all subplots exist; doing it per subplot
# repeats the same expensive pass for every feature.
plt.tight_layout()

plt.show()

## CV with Base LightGBM

In [None]:
def cv_function (X_train, Y_train, model, splits = 10):
    """Cross-validate `model` with stratified K-fold and return out-of-fold probabilities.

    For each fold the model is refitted on the training part, with early
    stopping monitored on the held-out part, and the held-out log loss is
    printed. The mean log loss over all folds is printed at the end.

    Parameters
    ----------
    X_train : feature table supporting positional row selection via ``.iloc``.
    Y_train : labels aligned with ``X_train``, also supporting ``.iloc``.
    model : estimator whose ``fit`` accepts ``early_stopping_rounds``,
        ``eval_set`` and ``verbose`` and which implements ``predict_proba``.
    splits : number of stratified folds (default 10).

    Returns
    -------
    numpy.ndarray of shape ``(len(X_train), n_classes)``. Each row holds the
    class probabilities predicted by the fold model that did not train on it.
    """
    kfold = StratifiedKFold(n_splits = splits)
    logloss = []

    # Size the buffer from the data instead of assuming a fixed number of
    # rows and classes, so the function works for any training set.
    n_classes = len(np.unique(Y_train))
    cv_pred = np.zeros((len(X_train), n_classes))

    for train_idx, test_idx in kfold.split(X=X_train, y=Y_train):
        xtrain = X_train.iloc[train_idx]
        ytrain = Y_train.iloc[train_idx]
        xtest = X_train.iloc[test_idx]
        ytest = Y_train.iloc[test_idx]

        # Fit the model for the current fold.
        # NOTE(review): the held-out fold doubles as the early-stopping set,
        # so the fold score below is measured on data that steered training.
        model.fit(xtrain, ytrain,
            early_stopping_rounds = 100, eval_set = [(xtest,ytest)], verbose = False)

        # Store the held-out predictions at their original row positions.
        preds = model.predict_proba(xtest)
        cv_pred[test_idx] = preds

        # Calculate and record the log loss of this fold.
        fold_logloss = metrics.log_loss(ytest,preds)
        print("LogLoss: {0:0.5f}". format(fold_logloss))
        logloss.append(fold_logloss)

    print (np.mean(logloss))
    return cv_pred

In [None]:
# Baseline multiclass LightGBM: up to 2000 trees at a small learning rate, with
# row/column subsampling and L1/L2 regularisation.
lgbm_model = LGBMClassifier(n_estimators = 2000, learning_rate = 0.02, random_state = 42, num_class = 9, metric = 'multi_logloss',
                           # LightGBM ignores `subsample` unless `subsample_freq` is positive,
                           # so row bagging is switched on explicitly (re-sampled every iteration).
                           subsample = 0.8, subsample_freq = 1, colsample_bytree = 0.8, reg_alpha = 0.5, reg_lambda = 0.5, max_depth = 20)

In [None]:
# Cross-validate the baseline model and keep the array returned by cv_function.
lgbm_cvpred = cv_function(X_train, Y_train, lgbm_model)
# Scores noted from earlier runs (presumably the mean log loss printed by cv_function):
#1.7502096540231844
#1.7495114276370374 after adding subsample and colsample_bytree
#1.7489962868771634 after adding reg_alpha and reg_lambda

## Making Predictions

In [None]:
def prediction (X_train, Y_train, model, X_test, splits = 10):
    """Train `model` on stratified folds and return averaged test predictions.

    For each fold the model is refitted on the training part, with early
    stopping monitored on the validation part. Its class probabilities for
    ``X_test`` are averaged over the folds, its probabilities for the
    validation rows are stored as out-of-fold predictions, and the
    validation log loss is printed.

    Parameters
    ----------
    X_train : feature table supporting positional row selection via ``.iloc``.
    Y_train : labels aligned with ``X_train``, also supporting ``.iloc``.
    model : estimator whose ``fit`` accepts ``early_stopping_rounds``,
        ``eval_set`` and ``verbose`` and which implements ``predict_proba``.
    X_test : feature table to predict on.
    splits : number of stratified folds (default 10).

    Returns
    -------
    tuple ``(y_pred, train_oof)`` of numpy arrays: the fold-averaged class
    probabilities with shape ``(len(X_test), n_classes)`` and the out-of-fold
    class probabilities with shape ``(len(X_train), n_classes)``.
    """
    kfold = StratifiedKFold(n_splits = splits)

    # Size the buffers from the data instead of assuming fixed row and class counts.
    n_classes = len(np.unique(Y_train))
    y_pred = np.zeros((len(X_test), n_classes))
    train_oof = np.zeros((len(X_train), n_classes))

    for train_idx, val_idx in kfold.split(X=X_train, y=Y_train):
        xtrain = X_train.iloc[train_idx]
        ytrain = Y_train.iloc[train_idx]
        xval = X_train.iloc[val_idx]
        yval = Y_train.iloc[val_idx]

        # Fit the model for the current fold.
        model.fit(xtrain, ytrain,
            early_stopping_rounds = 100, eval_set = [(xval,yval)], verbose = False)

        # Each fold contributes an equal share to the averaged test prediction.
        y_pred += model.predict_proba(X_test)/kfold.n_splits

        # Out-of-fold predictions go back to their original row positions.
        val_pred = model.predict_proba(xval)
        train_oof[val_idx] = val_pred

        # Report the log loss of this fold.
        fold_logloss = metrics.log_loss(yval,val_pred)
        print("Logloss: {0:0.5f}". format(fold_logloss))

    return y_pred, train_oof

In [None]:
# Refit across the folds: fold-averaged test probabilities plus out-of-fold train probabilities.
lgbm_pred, train_oof  = prediction (X_train, Y_train, lgbm_model, X_test)

In [None]:
# Log loss of the out-of-fold predictions against the full set of training labels.
print("Logloss: {0:0.6f}".format(metrics.log_loss(Y_train,train_oof)))

In [None]:
# Wrap the out-of-fold probability matrix in a DataFrame with one named column per class.
train_oof = pd.DataFrame(
    data=train_oof,
    columns=[
        'Class_1', 'Class_2', 'Class_3',
        'Class_4', 'Class_5', 'Class_6',
        'Class_7', 'Class_8', 'Class_9',
    ],
)
train_oof

In [None]:
# Wrap the test probability matrix in a DataFrame with one named column per class.
pred_test = pd.DataFrame(
    data=lgbm_pred,
    columns=[
        'Class_1', 'Class_2', 'Class_3',
        'Class_4', 'Class_5', 'Class_6',
        'Class_7', 'Class_8', 'Class_9',
    ],
)
pred_test

In [None]:
# Save the out-of-fold probabilities to disk (the index is not written), then display them.
train_oof.to_csv('lgbm_train_oof.csv', index=False)
train_oof

In [None]:
# Build the submission on a new frame: `assign` returns a copy, so adding the
# `id` column does not modify pred_test as a side effect.
output = pred_test.assign(id = X_test.index)
output.to_csv('submission.csv', index=False)

output