# <center>Tabular Playground Series - May/2021</center>
## <center>Model Stacking using Logistic Regression as Meta-Learner</center>
---

Models used for stacking:
- [LightGBM](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-lgbm-tuned-w-hyperopt)
- [Neural Network](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-nn-with-keras-first-nn)
- KNN (w/Log Transformation and Standard Scaler applied to features)
- Logistic Regression (w/Log Transformation applied to features)
- Multinomial Naive Bayes (w/Log Transformation applied to features)
    

My other notebooks in this competition:
- [Tabular Playground Series - May/2021: LightGBM Tuned with Hyperopt](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-lgbm-tuned-w-hyperopt)
- [Tabular Playground Series - May/2021: Neural Network with Keras](https://www.kaggle.com/jonaspalucibarbosa/tps05-21-nn-with-keras-first-nn)    

## Importing Libraries and Datasets

In [None]:
import pandas as pd       
import matplotlib as mat
import matplotlib.pyplot as plt    
import numpy as np
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Competition training data, indexed by row id; 'target' is split off as the label.
df_train = pd.read_csv(
    '../input/tabular-playground-series-may-2021/train.csv',
    index_col='id',
)
X_train = df_train.copy().drop(columns=['target'])
Y_train = df_train['target'].copy()

# Competition test data, also indexed by row id.
X_test = pd.read_csv(
    '../input/tabular-playground-series-may-2021/test.csv',
    index_col='id',
)

In [None]:
def _read_pair(oof_path, test_path):
    """Read one base model's out-of-fold file and its test submission (indexed by 'id')."""
    return pd.read_csv(oof_path), pd.read_csv(test_path, index_col='id')


# LightGBM
oof_lgb, test_lgb = _read_pair(
    "../input/tps05-21-lgbm-tuned-w-hyperopt/lgbm_train_oof.csv",
    "../input/tps05-21-lgbm-tuned-w-hyperopt/submission.csv",
)

# KNN
oof_knn, test_knn = _read_pair(
    "../input/tps05-21-knn/knn_train_oof.csv",
    "../input/tps05-21-knn/submission.csv",
)

# Neural network
oof_nn, test_nn = _read_pair(
    "../input/tps05-21-nn-with-keras-first-nn/nn_train_oof.csv",
    "../input/tps05-21-nn-with-keras-first-nn/submission.csv",
)

# Logistic regression
oof_lr, test_lr = _read_pair(
    "../input/tps05-21-logistic-regression/lr_train_oof.csv",
    "../input/tps05-21-logistic-regression/submission.csv",
)

# Multinomial naive Bayes
oof_nb, test_nb = _read_pair(
    "../input/tps05-21-multinomialnb/nb_train_oof.csv",
    "../input/tps05-21-multinomialnb/submission.csv",
)

In [None]:
def _prefix_columns(prefix, *frames):
    """Relabel each frame's columns as ``<prefix>_1`` .. ``<prefix>_4``.

    Every frame must have exactly four columns, otherwise pandas raises on
    the assignment.
    """
    for frame in frames:
        frame.columns = [f"{prefix}_{i:d}" for i in range(1, 5)]


# Tag every base model's columns with a model-specific prefix so they stay
# distinguishable once the frames are concatenated side by side.
_prefix_columns("lgb", oof_lgb, test_lgb)
_prefix_columns("knn", oof_knn, test_knn)
_prefix_columns("nn", oof_nn, test_nn)
_prefix_columns("lr", oof_lr, test_lr)
_prefix_columns("nb", oof_nb, test_nb)

In [None]:
# Meta-features from all five base models, placed side by side (column-wise).
allpredictions_train = pd.concat(
    [oof_lgb, oof_knn, oof_nn, oof_lr, oof_nb],
    axis="columns",
)
allpredictions_test = pd.concat(
    [test_lgb, test_knn, test_nn, test_lr, test_nb],
    axis="columns",
)

In [None]:
# Display the combined out-of-fold meta-features for the training set.
allpredictions_train

In [None]:
# Display the combined base-model predictions for the test set.
allpredictions_test

In [None]:
# Pairwise correlation between all base-model output columns (training OOF).
plt.figure(figsize=(16, 10))
sns.heatmap(
    allpredictions_train.corr(),
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    center=0,
    cmap='rocket',
)
plt.show()

In [None]:
# Pairwise correlation between all base-model output columns (test predictions).
plt.figure(figsize=(16, 10))
sns.heatmap(
    allpredictions_test.corr(),
    annot=True,
    fmt=".2f",
    vmin=-1,
    vmax=1,
    center=0,
    cmap='rocket',
)
plt.show()

In [None]:
# Four-model stack: everything except naive Bayes.
n4predictions_train = pd.concat(
    [oof_lgb, oof_knn, oof_nn, oof_lr],
    axis="columns",
)
n4predictions_test = pd.concat(
    [test_lgb, test_knn, test_nn, test_lr],
    axis="columns",
)

# Three-model stack: LightGBM, KNN and the neural network only.
n3predictions_train = pd.concat(
    [oof_lgb, oof_knn, oof_nn],
    axis="columns",
)
n3predictions_test = pd.concat(
    [test_lgb, test_knn, test_nn],
    axis="columns",
)

## Testing Combinations

In [None]:
def cv_function(X_train, Y_train, model):
    """Cross-validate ``model`` over 10 stratified folds and collect out-of-fold probabilities.

    For every fold the model is re-fitted on the training part, its class
    probabilities for the held-out part are stored, and the fold's log loss
    is printed. The mean log loss over all folds is printed at the end.

    Parameters
    ----------
    X_train : object supporting ``len`` and ``.iloc`` (e.g. a pandas DataFrame)
        Feature rows.
    Y_train : object supporting ``.iloc`` (e.g. a pandas Series)
        Class labels aligned with ``X_train``.
    model : estimator exposing ``fit`` and ``predict_proba``
        It is re-fitted in place on every fold.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X_train), n_classes)`` holding, for every row,
        the probabilities predicted by the fold model that did not see it.
        ``n_classes`` is the number of columns returned by ``predict_proba``.
    """
    kfold = StratifiedKFold(n_splits=10)
    logloss = []

    # Allocated on the first fold so the buffer matches the actual number of
    # rows and classes instead of a hard-coded (100000, 4).
    cv_pred = None

    for train_idx, test_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xtest, ytest = X_train.iloc[test_idx], Y_train.iloc[test_idx]

        # Fit the model for the current fold.
        model.fit(xtrain, ytrain)

        # Predict class probabilities for the held-out rows.
        preds = model.predict_proba(xtest)
        if cv_pred is None:
            cv_pred = np.zeros((len(X_train), preds.shape[1]))
        cv_pred[test_idx] = preds

        # Score the fold and keep the value for the final average.
        fold_logloss = metrics.log_loss(ytest, preds)
        print("LogLoss: {0:0.4f}".format(fold_logloss))
        logloss.append(fold_logloss)

    print(np.mean(logloss))
    return cv_pred

In [None]:
# Meta-learner used for stacking: logistic regression with fixed seed, all cores.
lr_model = LogisticRegression(
    C=3.0,
    random_state=42,
    n_jobs=-1,
)

In [None]:
#stacking_5_cvpred = cv_function(allpredictions_train, Y_train, lr_model) #1.090460058588747
#stacking_4_cvpred = cv_function(n4predictions_train, Y_train, lr_model) #1.0903964690708698
#stacking_3_cvpred = cv_function(n3predictions_train, Y_train, lr_model) #1.0904649018880122
#1.0904597097069197 after update

## Making Predictions

In [None]:
def prediction(X_train, Y_train, model, X_test):
    """Fit ``model`` on 20 stratified folds; return averaged test and out-of-fold probabilities.

    On every fold the model is re-fitted on the training part, then used to
    predict (a) the whole test set, which is averaged over the folds, and
    (b) the held-out validation rows, which fill the out-of-fold array. The
    validation log loss of each fold is printed.

    Parameters
    ----------
    X_train : object supporting ``len`` and ``.iloc`` (e.g. a pandas DataFrame)
        Training feature rows.
    Y_train : object supporting ``.iloc`` (e.g. a pandas Series)
        Class labels aligned with ``X_train``.
    model : estimator exposing ``fit`` and ``predict_proba``
        It is re-fitted in place on every fold.
    X_test : object supporting ``len`` and accepted by ``model.predict_proba``
        Test feature rows.

    Returns
    -------
    tuple of numpy.ndarray
        ``(y_pred, train_oof)`` where ``y_pred`` has shape
        ``(len(X_test), n_classes)`` and is the mean of the fold models' test
        probabilities, and ``train_oof`` has shape
        ``(len(X_train), n_classes)`` and holds the out-of-fold probabilities.
        ``n_classes`` is the number of columns returned by ``predict_proba``.
    """
    kfold = StratifiedKFold(n_splits=20)

    # Both buffers are allocated on the first fold so they match the actual
    # data instead of the hard-coded (50000, 4) / (100000, 4) shapes.
    y_pred = None
    train_oof = None

    for train_idx, val_idx in kfold.split(X=X_train, y=Y_train):
        xtrain, ytrain = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        xval, yval = X_train.iloc[val_idx], Y_train.iloc[val_idx]

        # Fit the model for the current fold.
        model.fit(xtrain, ytrain)

        test_pred = model.predict_proba(X_test)
        val_pred = model.predict_proba(xval)

        if y_pred is None:
            n_classes = test_pred.shape[1]
            y_pred = np.zeros((len(X_test), n_classes))
            train_oof = np.zeros((len(X_train), n_classes))

        # Each fold contributes an equal share to the final test prediction.
        y_pred += test_pred / kfold.n_splits

        # Out-of-fold predictions on the training set.
        train_oof[val_idx] = val_pred

        # Report the validation log loss of this fold.
        fold_logloss = metrics.log_loss(yval, val_pred)
        print("Logloss: {0:0.5f}".format(fold_logloss))

    return y_pred, train_oof

In [None]:
# Alternative stacks (5 and 4 base models) kept for reference; the 3-model stack is the one run.
#stack_pred, train_oof = prediction (allpredictions_train, Y_train, lr_model, allpredictions_test)
#stack_pred, train_oof = prediction (n4predictions_train, Y_train, lr_model, n4predictions_test)
stack_pred, train_oof = prediction(
    X_train=n3predictions_train,
    Y_train=Y_train,
    model=lr_model,
    X_test=n3predictions_test,
)

In [None]:
# Wrap the out-of-fold array in a DataFrame with one named column per class.
train_oof = pd.DataFrame(
    data=train_oof,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'],
)
train_oof

In [None]:
# Wrap the averaged test predictions in a DataFrame with one named column per class.
pred_test = pd.DataFrame(
    data=stack_pred,
    columns=['Class_1', 'Class_2', 'Class_3', 'Class_4'],
)
pred_test

In [None]:
# Persist the stacked out-of-fold predictions (row index is not written).
train_oof.to_csv(path_or_buf='stack_train_oof.csv', index=False)
train_oof

In [None]:
# `output` is the same object as `pred_test`, so the 'id' column is added to
# both; being a new column, it is appended after the class columns.
pred_test['id'] = X_test.index
output = pred_test
output.to_csv(path_or_buf='submission.csv', index=False)

output