# Approach overview
-  Build Ensemble that includes multiple model categories: Logistic Regression, Random Forests, XGBoost, Adaboost, and Neural Networks.
-  Split the training dataset into K stratified folds. For each fold and model category, train a separate model using Grid Search.
-  Combine all models into ensemble using Averaging.

### We experimented with: 
1. Which model categories to include in the ensemble  
2. How many stratified folds to use: 1, 5, 10, 20, 40 
3. How to build the ensemble: Averaging vs. Max voting
4. Oversampling techniques such as SMOTE and ADASYN: including models trained with SMOTE data in the ensemble worked for the Public leaderboard, but not for Private
5. Feature standardization: did not seem to improve anything.


## Lessons Learned
-  Ensembling is the way to go, of course.
-  Increasing the number of stratified folds improved performance.
-  Improvements in training data accuracy (on validation set) did not necessarily translate to better accuracies in the Public dataset. A prime example of this was the LR method, which did not perform as well in training validation accuracy as other methods such as NN. However, LR was an integral part of the overall Ensemble; whenever we removed it, the Public dataset accuracy ended up much worse.
-  Ensembling using Averaging always worked better than Max voting.
-  We kind of "overfitted" to the Public Leaderboard, i.e., our best performing model in Public was not the best in Private. 
-  Adding models trained with oversampled data, using either SMOTE or ADASYN, decreased accuracy in Private dataset. 
-  Gini impurity appeared to work better than Entropy for tree-based models.

## Initialization

In [None]:
"""
Created on Tue Mar 26 14:35:22 2019

@author: nikosc
"""

import pandas as pd, numpy as np, time, sys, h5py
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from keras.layers import Input, Dense , Dropout , TimeDistributed , LSTM , GRU, concatenate, BatchNormalization
from keras.models import Model
from keras.optimizers import SGD , Adadelta, RMSprop, Adam, Adamax
from keras.models import  load_model
from keras.callbacks import EarlyStopping
from keras.utils import  to_categorical 
from keras.regularizers import l1, l2, l1_l2
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import pickle
from sklearn.svm import SVC


##########################################
# Initialize problem parameters
class Args:
    """Bundle of project-wide settings: project name, directories and fold count."""

    def __init__(self):
        name = 'MLchallenge_DontOverfit'
        self.project = name
        # Note that dataPath ends with a slash while the other two paths do not.
        self.dataPath = '/home/ubuntu/{}/'.format(name)
        self.modelsPath = '/home/ubuntu/{}/Models'.format(name)
        self.resultsPath = '/home/ubuntu/{}/Results'.format(name)
        # Number of stratified folds the training data is split into;
        # one version of every model is trained per fold.
        self.CV_folds = 40


args = Args()
##########################################


########################################## 
############
# LOAD DATA
train = pd.read_csv('{}{}'.format(args.dataPath, 'TTT_train.csv'))
test = pd.read_csv('{}{}'.format(args.dataPath, 'TTT_test_features.csv'), index_col='ID')
# NOTE(review): `test` uses the 'ID' column as its index but `train` does not;
# if the training CSV also carries an 'ID' column it stays in X as a feature
# -- confirm against the data files.
# Features are every column except the target.
X = train[[column for column in train.columns if column != 'label']]
y = train['label']
# One-hot encoded targets, used by the Keras models.
y_cat = to_categorical(y)
# Fixed (seeded) stratified splits; each entry is a (train_indices, test_indices)
# pair and one version of every model is trained per entry.
folds = list(
    StratifiedKFold(
        n_splits=args.CV_folds,
        shuffle=True,
        random_state=1,
    ).split(X, y)
)
############
###########################################


## Some functions for model training and validation

In [None]:
##################################################
# function to fit a model on every fold, and store trained model
def fitValidateSave( model, modelType ):
    """Fit `model` on every fold, score it on the held-out part and pickle it.

    The same estimator object is re-fitted for each fold and pickled right after
    fitting to '<modelsPath>/<modelType>_fold<k>.h5' (the file is a pickle despite
    the .h5 extension; the loading code in this file relies on that name).

    Parameters
    ----------
    model : estimator exposing fit(X, y) and score(X, y). It may be a
        GridSearchCV wrapper or a plain estimator.
    modelType : str, tag used in the file names and in the printed summary.

    Returns
    -------
    The estimator as fitted on the last fold.
    """
    accuracies = []
    for foldIndex, (trainIdx, testIdx) in enumerate(folds):
        X_fold      = np.take( X, trainIdx, axis=0)
        y_fold      = np.take( y, trainIdx, axis=0)
        X_fold_test = np.take( X, testIdx, axis=0)
        y_fold_test = np.take( y, testIdx, axis=0)
        #
        model.fit(X_fold, y_fold)
        #
        accuracies.append( model.score(X_fold_test, y_fold_test) )
        print( '{}: {}'.format(foldIndex, accuracies[-1]) )
        # Only search wrappers such as GridSearchCV expose best_params_; plain
        # estimators do not, so the attribute must not be accessed blindly.
        bestParams = getattr(model, 'best_params_', None)
        if bestParams is not None:
            print(bestParams)
        #
        modelPath = '{}/{}_fold{}.h5'.format( args.modelsPath, modelType, foldIndex )
        # Context manager guarantees the file is flushed and closed.
        with open( modelPath, 'wb' ) as modelFile:
            pickle.dump( model, modelFile )
    print( 'Average accuracy for {} is:  {}'.format( modelType, np.mean(accuracies)) )  
    return model
##################################################


##################################################
# Compute accuracies across folds using an already trained model.
def validateAcrossFolds( modelType ):
    """Re-score the stored per-fold models of `modelType` on their held-out folds.

    Model types whose name contains 'NN' are loaded with Keras `load_model` and
    scored with `evaluate` (second returned value); every other type is
    unpickled and scored with `score`.

    Parameters
    ----------
    modelType : str, tag used in the stored model file names.

    Returns
    -------
    The model loaded for the last fold, or None when there are no folds.
    """
    accuracies = []
    model = None
    for foldInd, (_, testIdx) in enumerate(folds):
        X_fold_test = np.take( X, testIdx, axis=0)
        y_fold_test = np.take( y, testIdx, axis=0)
        modelPath = '{}/{}_fold{}.h5'.format( args.modelsPath, modelType, foldInd )
        #
        if 'NN' in modelType:
            # The Keras models were trained on one-hot targets.
            y_fold_test = to_categorical(y_fold_test)
            model = load_model( modelPath )
            accuracies.append( model.evaluate(X_fold_test, y_fold_test, batch_size=512, verbose=0 )[1] )
        else:
            # SECURITY: unpickling executes arbitrary code; only load model files
            # that were produced by this project.
            with open( modelPath, 'rb' ) as modelFile:
                model = pickle.load( modelFile )
            accuracies.append( model.score(X_fold_test, y_fold_test) )
        print( '{}: {}'.format(foldInd, accuracies[-1]) )
        #
    print( 'Average accuracy for {} is:  {}'.format( modelType, np.mean(accuracies)) )  
    return model
##################################################


## Logistic Regression

In [None]:
# Search grid for the logistic-regression models: only the inverse
# regularisation strength C has more than one candidate value.
parameters = dict(
    penalty=["l2"],
    C=[3., 4., 5.],
    fit_intercept=[True],
    class_weight=['balanced'],
    solver=['lbfgs'],
    multi_class=["multinomial"],
    random_state=[77],
)
# 4-fold inner cross-validation on all available cores.
LR = GridSearchCV(estimator=LogisticRegression(), param_grid=parameters, cv=4, n_jobs=-1)

# Train, score and store one grid-searched model per outer fold.
LR = fitValidateSave(LR, 'LR')

## Random Forests

In [None]:
# Search grid for the random forests: only max_depth has more than one
# candidate value.
parameters = dict(
    criterion=["gini"],
    max_depth=[15, 30],
    min_samples_split=[5],
    min_samples_leaf=[1],
    max_features=[None],
    random_state=[77],
    n_estimators=[200],
)
# 4-fold inner cross-validation on all available cores.
RF_gini = GridSearchCV(estimator=RandomForestClassifier(), param_grid=parameters, cv=4, n_jobs=-1)

# Train, score and store one grid-searched forest per outer fold.
RF_gini = fitValidateSave(RF_gini, 'RF_gini')

## Adaboost

In [None]:
# AdaBoost over randomised-split, gini-criterion decision trees (no grid search).
AB_gini = AdaBoostClassifier( base_estimator = DecisionTreeClassifier( 
                             criterion         = 'gini', 
                             splitter          = 'random',
                             max_depth         = 30, 
                             min_samples_split = 5, 
                             min_samples_leaf  = 1,
                             max_features      = None,
                             random_state      = 77 
                            ),
                            learning_rate= 1,
                            n_estimators = 200
                         )
# The per-fold training helper defined in this file is `fitValidateSave`;
# the previous name `fitValidateAndSave` does not exist and raised NameError.
# NOTE(review): make sure fitValidateSave tolerates estimators without
# `best_params_` (this one is not wrapped in GridSearchCV).
AB_gini = fitValidateSave( AB_gini, 'AB_gini' )

## XGBOOST

In [None]:
# Gradient-boosted trees with fixed hyper-parameters (no grid search); the
# number of classes is taken from the one-hot target matrix.
XGB = XGBClassifier(  max_depth=6,  
                      learning_rate=0.1, 
                      n_estimators=100, 
                      verbosity=1, 
                      objective='multi:softmax', 
                      num_class=y_cat.shape[-1],
                      booster='gbtree', 
                      n_jobs=4, 
                      gamma=0, 
                      min_child_weight=1,
                      max_delta_step=0, 
                      subsample=.7, 
                      colsample_bytree=.6, 
                      colsample_bylevel=.6, 
                      colsample_bynode=.6, 
                      reg_alpha=.0, 
                      reg_lambda=.0, 
                      scale_pos_weight=1, 
                      base_score=0.1, 
                      random_state=77 
                      )
# The per-fold training helper defined in this file is `fitValidateSave`;
# the previous name `fitValidateAndSave` does not exist and raised NameError.
# NOTE(review): make sure fitValidateSave tolerates estimators without
# `best_params_` (this one is not wrapped in GridSearchCV).
XGB = fitValidateSave( XGB, 'XGB' )

## Neural Nets (MLP)

In [None]:
##################################################
def saveToH5( data , filePath , fillvalue=0 ):
    """Write `data` to a new HDF5 file as a gzip-compressed dataset named 'dataset'.

    Parameters
    ----------
    data : array-like content of the dataset.
    filePath : destination path; the file is opened in 'w' mode.
    fillvalue : forwarded to h5py `create_dataset`.
    """
    # The context manager closes the file even if create_dataset raises.
    with h5py.File( filePath , 'w') as h5f:
        h5f.create_dataset('dataset', data=data, fillvalue=fillvalue,
                           compression='gzip', compression_opts=4 )
##################################################

##################################################
def loadFromH5( filePath ):
    """Read the whole 'dataset' entry of the HDF5 file at `filePath` and return it."""
    # `[:]` materialises the data in memory before the file is closed; the
    # context manager closes the file even if the read raises.
    with h5py.File( filePath , 'r') as h5f:
        return h5f['dataset'][:]
##################################################

##################################################
def buildMLP():
    """Build and compile the MLP classifier used for the 'NN' ensemble members.

    The architecture is driven by module-level settings that must be assigned
    before calling: `nodes` (units per hidden layer), `layers` (number of hidden
    layers), `drops` (dropout rate after each hidden layer), `reg` (regulariser
    of the first hidden layer) and `optimizer`. Input width comes from `X`, the
    number of output classes from `y_cat`.

    Returns
    -------
    A compiled Keras Model with softmax output, categorical cross-entropy loss
    and the 'categorical_accuracy' metric.
    """
    main_input = Input( shape=( X.shape[-1], ) ,  name = 'features' )
    # Dropout with rate 0.8 applied directly to the input features.
    x = Dropout(0.8) (main_input)
    # Only the first hidden layer is regularised.
    x = Dense( nodes, activation='relu',
                           kernel_regularizer   =reg,
                           activity_regularizer =reg,
                           bias_regularizer     =reg
              )(x)
    x = Dropout( drops ) (x)

    for _ in range(layers-1):
        # Every further hidden layer also sees the raw input features again.
        x = concatenate([x, main_input])
        x = Dense( nodes, activation='relu' )(x)
        x = Dropout( drops ) (x)

    output =  Dense( y_cat.shape[-1], activation='softmax', name = 'output' )(x)     

    # `inputs`/`outputs` are the documented keyword names of Model; the
    # singular `input`/`output` spellings are legacy aliases.
    model = Model(inputs=main_input, outputs=output)
    # NOTE(review): the module-level `optimizer` instance is shared by every
    # model built with it -- confirm that reusing one optimizer object across
    # models is intended for the Keras version in use.
    model.compile( optimizer=optimizer , 
                   loss='categorical_crossentropy',
                   metrics=['categorical_accuracy']
                   )
    return model
############

############
# TRAIN MODELS
def trainMLP(modelName='NN'):
    """Train one MLP per fold with accuracy-based early stopping.

    For every fold a fresh model from `buildMLP()` is trained one epoch at a
    time (batch size from the module-level `batchSize`). After each epoch the
    held-out fold is scored; whenever the accuracy strictly improves, the model
    is saved to '<modelsPath>/<modelName>_fold<k>.h5'. Training of a fold stops
    after 20 consecutive epochs without improvement or after 1000 epochs.

    Parameters
    ----------
    modelName : str, tag used in the checkpoint file names and in the summary.
    """
    maxEpochs = 1000
    patience = 20
    accuracies = []
    for foldInd, (trainIdx, testIdx) in enumerate(folds):
        model = buildMLP()
        X_fold = np.take( X, trainIdx, axis=0)
        y_fold = np.take( y_cat, trainIdx, axis=0)
        X_fold_test = np.take( X, testIdx, axis=0)
        y_fold_test = np.take( y_cat, testIdx, axis=0)
        checkpointPath = '{}/{}_fold{}.h5'.format( args.modelsPath, modelName, foldInd )

        bestAccuracy = -np.inf
        no_improvement = 0
        for _ in range(maxEpochs):
            # validation_data is not passed to fit(): the held-out fold is scored
            # explicitly right below, so it would only be evaluated twice.
            model.fit(X_fold, 
                      y_fold, 
                      batch_size=batchSize,
                      shuffle=True,
                      epochs=1, 
                      verbose=0
                      )
            # Index 1 of evaluate() is the compiled metric (categorical accuracy).
            accuracy = model.evaluate( x=X_fold_test, y=y_fold_test, batch_size=512, verbose=2)[1]
            #
            if accuracy > bestAccuracy:
                # Checkpoint on every strict improvement, including the very first
                # epoch, so that a model file exists even if epoch 1 is the best.
                bestAccuracy = accuracy
                no_improvement = 0
                model.save( checkpointPath )
            else:
                no_improvement += 1
                if no_improvement >= patience:
                    break
        # Record the fold result whether training stopped early or ran out of epochs.
        accuracies.append( bestAccuracy )
        print( '{}: {}'.format( foldInd, accuracies[-1] ) )
    print( 'Average accuracy for {} is:  {}'.format( modelName, np.mean(accuracies)) )  
################################################


###########
# Hyper-parameters shared by both MLP variants. They are plain module-level
# names because buildMLP() and trainMLP() read them as globals.
layers = 2
nodes = 512
drops = 0.5
batchSize = 128
lr = 0.003

# The two variants (stored as 'NN0' and 'NN1') differ only in the L2 strength
# of the regulariser; a new Adam instance is created for each variant.
for index_, l2_strength in enumerate((0.0005, 0.001)):
    optimizer = Adam(lr=lr)
    reg = l1_l2(l1=0.0001, l2=l2_strength)
    trainMLP(modelName='NN{}'.format(index_))
###########

## Some more functions to generate the Ensemble prediction on Test dataset

In [None]:
##################################################
def genTestPredictionsPerModelInstance( modelInstance, testData=None ):
    """Sum the test-set class scores of all per-fold models of one model type.

    The per-fold outputs (Keras `predict` for names containing 'NN',
    `predict_proba` otherwise) are added up, not averaged, and the total is
    written to '<resultsPath>/predictions_<modelInstance>.h5'.

    Parameters
    ----------
    modelInstance : str, tag of the stored per-fold model files.
    testData : feature table to predict on. Defaults to the module-level `test`
        frame loaded at the top of the file (the name `testData` used previously
        was never defined there).

    Returns
    -------
    The model loaded for the last fold, or None when there are no folds.
    """
    if testData is None:
        testData = test
    predictions = np.zeros(( testData.shape[0], y_cat.shape[-1] ))
    model = None
    for foldIndex in range(len(folds)):
        print(foldIndex)
        modelPath = '{}/{}_fold{}.h5'.format( args.modelsPath, modelInstance, foldIndex )
        if 'NN' in modelInstance:
            model = load_model( modelPath )
            predictions += model.predict(testData, batch_size=1024) 
        else:
            # SECURITY: unpickling executes arbitrary code; only load model files
            # that were produced by this project.
            with open( modelPath, 'rb' ) as modelFile:
                model = pickle.load( modelFile )
            # NOTE(review): assumes predict_proba columns line up with the
            # one-hot columns of y_cat -- confirm the labels are 0..K-1.
            predictions += model.predict_proba(testData)
    #
    predictionsPath = '{}/predictions_{}.h5'.format( args.resultsPath, modelInstance )
    saveToH5( predictions, predictionsPath ) 
    return model
##################################################


##################################################
# Get predictions 
def generateEnsemblePredictions(  ensemble = ( 'LR', 'RF_gini', 'AB_gini', 'NN', 'GB' ),                   
                                  mode='sum',
                                  testData=None
                                ): 
    """Combine stored per-model test predictions and write the submission file.

    For every entry of `ensemble` the file
    '<resultsPath>/predictions_<name>.h5' is loaded. The per-model class scores
    are reduced across models (sum or element-wise max), the arg-max class is
    taken per row and the result is written to '<resultsPath>/v0.1' as CSV with
    index label 'ID' and column header 'label'.

    Parameters
    ----------
    ensemble : sequence of str, model tags whose stored predictions are combined.
    mode : 'sum' to add the class scores of all models, 'max' to take the
        per-class maximum over models.
    testData : table whose row count and index are used for the output.
        Defaults to the module-level `test` frame loaded at the top of the file
        (the name `testData` used previously was never defined there).

    Raises
    ------
    ValueError : if `mode` is neither 'sum' nor 'max'.
    """
    reducers = { 'sum': np.sum, 'max': np.max }
    # Fail early on a bad mode instead of hitting an unbound variable after
    # all prediction files have been loaded.
    if mode not in reducers:
        raise ValueError( "mode must be 'sum' or 'max', got {!r}".format(mode) )
    if testData is None:
        testData = test
    #                      
    ensemblePredictions = np.zeros(( testData.shape[0], len(ensemble), y_cat.shape[-1] ))
    for modelInstanceIndex, modelInstance in enumerate(ensemble) :
        print(modelInstance)
        predictionsPath  = '{}/predictions_{}.h5'.format( args.resultsPath, modelInstance ) 
        ensemblePredictions[:,modelInstanceIndex] = loadFromH5(predictionsPath)
    #
    # Reduce over the model axis, then pick the best-scoring class per row.
    classScores = reducers[mode]( ensemblePredictions, axis=1 )
    classPredictions = np.argmax( classScores, axis=1 )      
    # Build the output column directly instead of copying the whole test table.
    predictions = pd.Series( classPredictions, index=testData.index, name='predictions' )
    predictions.to_csv( '{}/v0.1'.format(args.resultsPath ), index_label='ID', header=['label']   )
##################################################

## Build Ensemble predictions

In [None]:

## Generate test predictions per modelInstance
# Produce and store the fold-summed test predictions of every ensemble member.
for modelInstance in ('LR', 'RF_gini', 'AB_gini', 'NN0', 'NN1'):
    print(modelInstance)
    genTestPredictionsPerModelInstance(modelInstance)

In [None]:
# Generate Ensemble Predictions
# Combine the stored per-model predictions by summing the class scores
# ('sum' is also the function's default mode).
generateEnsemblePredictions(
    ensemble=['LR', 'RF_gini', 'AB_gini', 'NN0', 'NN1'],
    mode='sum',
)