# **Ensembles galore!**

**The idea of ensembling is very simple: rather than trying to produce a single very strong learner, you produce many weaker learners - each of which is specialised in one aspect of the data - and aggregate their predictions.**

This is what led me to produce this notebook. First, as you can see below, I have built a denoising autoencoder + MLP - this stage of the model aims to remove noise from the data (noise that likely crept in during the CTGAN process used to generate the data).

In the second stage I have 3 boosted-tree models, which are combined using a voting classifier - all of these models were tuned using Optuna and thus are highly specialised too.

I do not have the energy to continue the research, so I have written this notebook with clean code to allow you all to take over - I would love to see where you take it!

# Import Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from statistics import mean

import warnings
warnings.filterwarnings("ignore")

# When True the autoencoder is trained and the encoder weights are written to
# disk; when False previously saved encoder weights are loaded instead.
TRAINING = True
# Integer seed constant (presumably meant as the notebook-wide random state -
# NOTE(review): confirm where it is consumed).
RS = 69420
# Location of the competition's training CSV.
DATA_PATH = "../input/tabular-playground-series-mar-2021/train.csv"

In [None]:
# Load the training table; the first CSV column becomes the row index.
train = pd.read_csv(DATA_PATH, index_col=0)

# Every column whose name contains 'cat' is treated as categorical.
cat_features = [name for name in train.columns if 'cat' in name]

# Swap each categorical column for integer codes. A single encoder object is
# re-fitted per column, so afterwards `le` only remembers the last column.
le = LabelEncoder()
for col in cat_features:
    train[col] = le.fit_transform(train[col])

# All columns but the last are features; the last column is the label.
X, y = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()

# Stage 1: AutoEncoder + MLP

**Create the AutoEncoder**

In [None]:
def create_autoencoder(input_dim, output_dim, noise=0.05):
    """Build a noisy-input autoencoder with an auxiliary label head.

    Args:
        input_dim: Shape handed to the Keras ``Input`` layer; also used as the
            unit count of the reconstruction layer.
        output_dim: Number of sigmoid units in the label head.
        noise: Value passed to ``GaussianNoise`` (its noise strength).

    Returns:
        A ``(autoencoder, encoder)`` tuple. ``autoencoder`` is compiled and
        has two outputs (the reconstruction named ``decoded`` and the label
        prediction named ``label_output``). ``encoder`` maps the input to the
        64-unit hidden representation and is not compiled here.
    """
    inputs = Input(input_dim)

    # Encoder: normalise, corrupt with Gaussian noise, project to 64 units.
    hidden = BatchNormalization()(inputs)
    hidden = GaussianNoise(noise)(hidden)
    hidden = Dense(64, activation='relu')(hidden)

    # Decoder: reconstruct the input from the dropout-regularised code.
    reconstruction = Dropout(0.2)(hidden)
    reconstruction = Dense(input_dim, name='decoded')(reconstruction)

    # Label head stacked on top of the reconstruction.
    label = Dense(32, activation='relu')(reconstruction)
    label = BatchNormalization()(label)
    label = Dropout(0.2)(label)
    label = Dense(output_dim, activation='sigmoid', name='label_output')(label)

    encoder = Model(inputs=inputs, outputs=hidden)
    autoencoder = Model(inputs=inputs, outputs=[reconstruction, label])

    # One loss per named output: MSE for reconstruction, BCE for the label.
    losses = {'decoded': 'mse', 'label_output': 'binary_crossentropy'}
    autoencoder.compile(optimizer=Adam(0.001), loss=losses)
    return autoencoder, encoder

**Create the MLP**

In [None]:
# An area of further research is to tune the Dense Layers, Dropouts, Learning rate and Label Smoothing

def create_model(input_dim, output_dim, encoder):
    """Build and compile the MLP that sits on top of the encoder.

    Args:
        input_dim: Shape handed to the Keras ``Input`` layer.
        output_dim: Number of sigmoid output units.
        encoder: Callable Keras model applied to the inputs; its output is
            concatenated with the raw inputs.

    Returns:
        The compiled Keras model (binary cross-entropy loss, AUC metric).
    """
    inputs = Input(input_dim)

    # Feed the network both the encoder's representation and the raw features.
    encoded = encoder(inputs)
    net = Concatenate()([encoded, inputs])
    net = BatchNormalization()(net)
    net = Dropout(0.2)(net)

    # Three identical Dense -> BatchNorm -> swish -> Dropout stages.
    for _ in range(3):
        net = Dense(64)(net)
        net = BatchNormalization()(net)
        net = Lambda(tf.keras.activations.swish)(net)
        net = Dropout(0.3)(net)

    outputs = Dense(output_dim, activation='sigmoid')(net)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=Adam(0.00001),
        loss=BinaryCrossentropy(label_smoothing=0),
        metrics=[tf.keras.metrics.AUC(name='auc')],
    )
    return model

In [None]:
# One input unit per feature column, a single output unit, noise level 0.1.
autoencoder, encoder = create_autoencoder(
    input_dim=X.shape[-1],
    output_dim=1,
    noise=0.1,
)

**Train the AutoEncoder**

In [None]:
# Tune the number of Epochs
if not TRAINING:
    # Reuse encoder weights written by an earlier training run.
    encoder.load_weights('encoder.hdf5')
else:
    # The autoencoder has two targets: the inputs themselves (reconstruction)
    # and the labels (auxiliary classification head).
    autoencoder.fit(
        X,
        (X, y),
        epochs=5,
        batch_size=32,
        validation_split=0.1,
        callbacks=[
            EarlyStopping(monitor='val_loss',
                          patience=5,
                          restore_best_weights=True),
        ],
    )
    encoder.save_weights('./encoder.hdf5')

# Freeze the encoder so later training of the MLP leaves its weights alone.
encoder.trainable = False

**Merge the AutoEncoder to the MLP**

In [None]:
# Stack the MLP on the (frozen) encoder: one input per feature, one output.
model_fn = create_model(
    input_dim=X.shape[-1],
    output_dim=1,
    encoder=encoder,
)

In [None]:
# Hold out 20% of the rows, stratified on the label. Seeding the split with RS
# keeps the hold-out set - and every score computed on it - identical between
# runs; without a seed the partition changes on each execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RS
)

In [None]:
%%time
# Train the MLP; a further 10% of the training rows is held back for
# validation, and training halts once val_loss stops improving for 5 epochs.
history = model_fn.fit(
    x=X_train,
    y=y_train,
    epochs=1000,
    batch_size=32,
    validation_split=0.1,
    callbacks=[
        EarlyStopping(monitor='val_loss',
                      patience=5,
                      restore_best_weights=True),
    ],
)

**There is an issue with Tensorflow that after a model is finished training, the GPU memory is not released - the following code releases it manually!**

**I suggest you leave it commented out when submitting the final notebook - some weird CUDA errors show up otherwise**

In [None]:
# from numba import cuda
# cuda.select_device(0)
# cuda.close()
# import gc
# del cuda
# gc.collect()

# Stage 2: Boosted Ensemble

In [None]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score

In [None]:
# Here are some tuned parameters - feel free to run Optuna for CatBoost as well; I couldn't be bothered

# Keyword arguments forwarded to XGBClassifier.
XGB_Params = dict(
    n_estimators=450,
    max_depth=15,
    reg_lambda=5,
    min_child_weight=0,
    subsample=0.8832278322447424,
    learning_rate=0.014255981518563889,
    colsample_bytree=0.28,
)

# Keyword arguments forwarded to LGBMClassifier.
LGBM_Params = dict(
    lambda_l1=0.048263765268859345,
    lambda_l2=0.002059552723754179,
    num_leaves=138,
    feature_fraction=0.4090885438608842,
    bagging_fraction=0.833157756558512,
    bagging_freq=1,
    min_child_samples=97,
)

In [None]:
# Tune the number of models in the ensemble with the range
# GPU turned off when submitting due to kaggle submission taking the damn piss

# Draw every member's seed from one generator seeded with RS: the members
# still get different seeds (which is what diversifies the ensemble), but the
# whole ensemble is now identical from run to run. Previously the seeds came
# from the unseeded global NumPy RNG, so results could not be reproduced.
seed_rng = np.random.RandomState(RS)

estimators = []

# Each round adds one LightGBM, one XGBoost and one CatBoost classifier.
for i in range(3):
    estimators.append((f"model_lgbm{i}",
                       LGBMClassifier(**LGBM_Params,
                                      random_seed=seed_rng.randint(0, 100000))))

    estimators.append((f"model_xgb{i}",
                       XGBClassifier(**XGB_Params,
                                     objective='binary:logistic',
                                     random_state=seed_rng.randint(0, 100000))))

    estimators.append((f"model_cat{i}",
                       CatBoostClassifier(random_seed=seed_rng.randint(0, 100000),
                                          verbose=False)))

**You might notice the NN is missing, this is because Sklearn seems to have a fit when you include it :( I manually take the mean later to ensemble the NN**

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft voting: the ensemble averages the members' predicted probabilities.
clf = VotingClassifier(estimators, voting='soft', verbose=1)

In [None]:
%%time
# Fit every member of the voting ensemble on the training split.
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict
# y_pred = clf.predict(X_test)
# y_pred = y_pred.reshape(-1,1)
# print(f"Testing Precision (Pure Ensemble): {precision_score(y_test, y_pred, 'weighted')}")

# y_nnpred = model_fn.predict(X_test)
# # By default the neural net outputs probabilities, so this line converts them into binary labels with a threshold of 0.5 - maybe experiment with it
# y_nnpred[:] = y_nnpred[:]>0.5
# print(f"Testing Precision (Pure NN): {precision_score(y_test, y_nnpred, 'weighted')}")

# y_pred = np.average((y_nnpred, y_pred), axis=0)
# # Compute Metrics
# print(f"Testing Precision (Ensemble): {precision_score(y_test, y_pred, 'weighted')}")

# Produce Submission

In [None]:
test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv", index_col=0)
cat_cols = [c for c in train.columns if 'cat' in c]

# `train` was integer-encoded in place, so its original category values are
# gone; re-read just the categorical columns to recover the training mapping.
raw_train_cats = pd.read_csv(DATA_PATH, usecols=cat_cols)

for col in cat_cols:
    # Fit on the TRAINING categories only. Re-fitting the encoder on the test
    # column assigns different integers whenever the two category sets differ,
    # which silently feeds the trained models mis-aligned codes.
    le.fit(raw_train_cats[col])
    # Reuse the training mapping; a category never seen in training has no
    # learned meaning and is encoded as -1.
    test[col] = pd.Categorical(test[col], categories=le.classes_).codes

submission = pd.DataFrame(index=test.index)

# Probability of the positive class from each stage, both shaped (n, 1).
nn_pred = model_fn.predict(test.values)
clf_pred = clf.predict_proba(test.values)[:, 1]
clf_pred = clf_pred.reshape(-1,1)

# Equal-weight blend of the two stages, flattened to 1-D for the column.
submission['target'] = np.average((nn_pred, clf_pred), axis=0).ravel()

submission.to_csv("submission.csv")