In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from keras.layers import Dropout

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import re
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')
%matplotlib inline



Using TensorFlow backend.


In [None]:
# Before loading the following dataset, be sure to run the preprocessing notebook

In [2]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    "1_dataset_ML_ready.csv",
    index_col=0,
)

# My Custom helper function

I have created a custom grid-search function that is fault tolerant: a parameter combination that the estimator rejects (incompatible arguments) is skipped instead of crashing the whole search.

In [3]:
#Create all tuples of parameters
from itertools import product #Returns the cartesian product of lists (same as nested for loops)
def all_params_grid(dico):
    """Expand a dictionary of candidate values into every parameter combination.

    Parameters
    ----------
    dico : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dictionary per element of the cartesian product of the candidate
        values; keys are the parameter names converted with ``str``.
    """
    # Names and value lists are both read in the dictionary's iteration order,
    # so position i of a combination always belongs to names[i].
    names = [str(name) for name in dico]
    return [dict(zip(names, combination)) for combination in product(*dico.values())]

In [4]:
from sklearn.model_selection import cross_validate
#Custom training method with cross validation
def trainRawModel(strModel, params, x_train, y_train, x_test, y_test):
    """Fit one estimator configuration and score it by CV and on a hold-out set.

    Parameters
    ----------
    strModel : str
        Name of the estimator class. It is resolved with ``eval``, so the
        class must already be imported in the notebook namespace.
    params : dict
        Keyword parameters forwarded to ``set_params``.
    x_train, y_train
        Data used for the fit and for the 10-fold cross-validation.
    x_test, y_test
        Hold-out data used for the final ROC AUC.

    Returns
    -------
    dict
        ``'model_raw'``: the estimator fitted on ``x_train``/``y_train``;
        ``'cv_raw'``: the ``cross_validate`` result (recall, ROC AUC, F1);
        ``'roc_predict_raw'``: ROC AUC of ``predict(x_test)`` against ``y_test``.

    Raises
    ------
    Exception
        Whatever the estimator raises for an invalid parameter combination;
        the grid-search helper relies on this to skip such combinations.
    """
    # NOTE(review): eval() executes arbitrary code -- only pass trusted class names.
    model_raw = eval(strModel)()
    model_raw.set_params(**params)

    # Fit first so that an incompatible parameter combination fails fast,
    # before the more expensive cross-validation starts.
    model_raw.fit(x_train, y_train)

    # cross_validate fits clones of the estimator, so `model_raw` stays fitted
    # on the full training data and does not need to be refitted afterwards.
    cv = cross_validate(model_raw, x_train, y_train, cv=10,
                        scoring=['recall', 'roc_auc', 'f1'],
                        return_train_score=False, n_jobs=-1)

    y_pred_raw = model_raw.predict(x_test)
    roc_predict_raw = roc_auc_score(y_test, y_pred_raw, average='macro', sample_weight=None)

    return {
        'model_raw': model_raw,
        'cv_raw': cv,
        'roc_predict_raw': roc_predict_raw
    }

In [5]:
# Custom Gridsearch function 
def custom_gridSearch(strModel, param_dict, x_train, y_train, x_test, y_test):
    """Fault-tolerant grid search: try every combination, keep the best one.

    Parameters
    ----------
    strModel : str
        Name of the estimator class, forwarded to ``trainRawModel``.
    param_dict : dict
        Maps each parameter name to the list of values to try.
    x_train, y_train, x_test, y_test
        Training and hold-out data, forwarded to ``trainRawModel``.

    Returns
    -------
    dict or str
        The ``trainRawModel`` result with the highest ``'roc_predict_raw'``,
        or the empty string ``''`` when no combination could be trained.
    """
    best_model = ''  # '' means "nothing trained yet"
    n_failed = 0

    for param in all_params_grid(param_dict):
        try:
            model = trainRawModel(strModel, param, x_train, y_train, x_test, y_test)
        except Exception:
            # Incompatible parameter combinations are expected and skipped.
            # KeyboardInterrupt is not an Exception, so the search can still
            # be interrupted by the user.
            n_failed += 1
            continue

        if best_model == '' or best_model['roc_predict_raw'] < model['roc_predict_raw']:
            best_model = model

    if n_failed:
        print('custom_gridSearch: skipped {} failing parameter combination(s) for {}'.format(
            n_failed, strModel))

    return best_model

In [6]:
#Apply adaptive boosting classifier on a already tuned model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

def ada_boost(model, x_train, y_train, x_test, y_test):
    """Wrap an already tuned model in AdaBoost and tune the number of estimators.

    Parameters
    ----------
    model : dict
        Result of the grid search; its ``'model_raw'`` entry is used as the
        base estimator.
    x_train, y_train
        Data used by the 10-fold ``GridSearchCV``.
    x_test, y_test
        Hold-out data used for the final ROC AUC.

    Returns
    -------
    dict
        ``'initial_model'`` (the input), ``'model_adaBoost'`` (best boosted
        estimator, or ``None`` if boosting failed) and ``'adaBoost_score'``
        (hold-out ROC AUC, or ``0`` if boosting failed).
    """
    try:
        adaGS_Param = {
            'algorithm': ['SAMME'],
            'base_estimator': [model['model_raw']],
            'n_estimators': [50, 80, 100, 500]
        }
        adaBGS = GridSearchCV(AdaBoostClassifier(), adaGS_Param, cv=10,
                              error_score=-1, scoring='roc_auc')
        adaBGS.fit(x_train, y_train)

        adaBest = adaBGS.best_estimator_
        # Score on the test set passed in, not on a notebook-level global.
        y_pred_adaB = adaBest.predict(x_test)
        adaScore = roc_auc_score(y_test, y_pred_adaB, average='macro', sample_weight=None)
    except Exception:
        # Best effort: not every base estimator can be boosted.
        adaBest = None
        adaScore = 0

    return {
        'initial_model': model,
        'model_adaBoost': adaBest,
        'adaBoost_score': adaScore
    }

In [7]:
#Apply a bagging classifier on a already tuned model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

def baggingModel(model, x_train, y_train, x_test, y_test):
    """Wrap an already tuned model in a bagging ensemble and tune its sampling.

    Parameters
    ----------
    model : dict
        Result of the grid search; its ``'model_raw'`` entry is used as the
        base estimator.
    x_train, y_train
        Data used by the 5-fold ``GridSearchCV``.
    x_test, y_test
        Hold-out data used for the final ROC AUC.

    Returns
    -------
    dict
        ``'initial_model'`` (the input), ``'model_bagging'`` (best bagged
        estimator, or ``None`` if bagging failed) and ``'bagging_score'``
        (hold-out ROC AUC, or ``0`` if bagging failed).
    """
    params = {
        'base_estimator': [model['model_raw']],
        'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
        'max_samples': [0.05, 0.1, 0.2, 0.5]
    }
    try:
        GS = GridSearchCV(BaggingClassifier(), params, cv=5, scoring='roc_auc')
        GS.fit(x_train, y_train)

        bestBag = GS.best_estimator_
        # Score on the test set passed in, not on a notebook-level global.
        y_pred_bagging = bestBag.predict(x_test)
        model_bagging_score = roc_auc_score(y_test, y_pred_bagging,
                                            average='macro', sample_weight=None)
    except Exception:
        # Best effort: report "no bagged model" instead of aborting.
        bestBag = None  # a trailing comma here would store the tuple (None,)
        model_bagging_score = 0

    return {
        'initial_model': model,
        'model_bagging': bestBag,
        'bagging_score': model_bagging_score
    }

In [8]:
# Function to perform the predictions
import pickle
from collections import defaultdict

#Available Classifiers
# classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 
#               'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", 
#               "PassiveAggressiveClassifier", "SGDClassifier" ]

def prediction_pipeline(classifiers_list, X_train_scaled, y_train, X_test_scaled, y_test):
    """Grid-search every classifier and checkpoint the results after each one.

    Parameters
    ----------
    classifiers_list : iterable of str
        Estimator class names; each must be a key of the module-level
        ``model_parameters`` dictionary.
    X_train_scaled, y_train, X_test_scaled, y_test
        Data forwarded to ``custom_gridSearch``.

    Returns
    -------
    collections.defaultdict
        ``result[name][name + '_raw']`` holds the best model found for each
        classifier. The same object is pickled to
        ``zz_model_<i>_out_of_<n>.pickle`` after the i-th classifier.
    """
    classifiers_list = list(classifiers_list)
    total = len(classifiers_list)
    All_classifiers = defaultdict(dict)

    for compt, classifier in enumerate(classifiers_list, start=1):
        model = custom_gridSearch(classifier, model_parameters[classifier],
                                  X_train_scaled, y_train, X_test_scaled, y_test)
        All_classifiers[classifier][classifier + '_raw'] = model

        # Store intermediate versions; one file per completed classifier so
        # that an interrupted run can be resumed from the last checkpoint.
        with open("zz_model_{}_out_of_{}.pickle".format(compt, total), "wb") as pickling_on:
            pickle.dump(All_classifiers, pickling_on)

    return All_classifiers
        

In [9]:
#take a sample for testing the code
import random
# Seeded so that the test sample is the same on every run, and capped at the
# dataset size so that small datasets do not raise ValueError.
sample_size = min(500, len(data))
indices = random.Random(101).sample(range(len(data)), sample_size)
data = data.iloc[indices]

In [10]:
# Separate the target column from the feature columns.
Y = data['readmitted']
X = data.drop(columns='readmitted')

In [11]:
#Split the dataset into training testing
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=101,
)

In [42]:
#Construct a dictionary of model parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier

# Search grid per classifier: each entry maps a parameter name to the list of
# values to try. Some combinations are incompatible on purpose; the custom
# grid search skips the ones the estimator rejects.
model_parameters = {
    'DecisionTreeClassifier':{
        'criterion': ['gini', 'entropy'],
        'max_depth':[None], #integer
        'min_samples_split': [2, 4, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10 , 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],
        'class_weight':['balanced'],
    },
    #-------------
    "LogisticRegression":{
        'penalty':['l2','l1'], # The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties
        'dual':[False], #Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.
        'tol':[0.0001], #Tolerance for stopping criteria
        'C':[1.0], #Like in support vector machines, smaller values specify stronger regularization.
        'fit_intercept':[True],
        'intercept_scaling':[1],
        'class_weight':['balanced'],
        'random_state':[101],
        'solver':['liblinear', #Good choice for small datasets, one-vs-rest only, handles L1
                  'newton-cg',  #multinomial problems (multi-class), L2
                  'lbfgs',  #multinomial problems (multi-class), L2
                  'sag', #Stochastic Average Gradient descent #Good choice for small datasets, multinomial problems, L2
                  'saga', #Good choice for small datasets, multinomial problems, handles L1
                 ],
        'max_iter':[100, 500], #Useful only for the newton-cg, sag and lbfgs solvers
        'multi_class':['ovr',
                       'multinomial',
                       'auto'], #'ovr', 'multinomial', 'auto'
        'verbose':[0],
        'warm_start':[False],
        'n_jobs':[-1]
    },
    #-------------
    'SVC':{
        'C':[0.5, 1.0, 2, 5],
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
        'degree':[3,4,5], #degree of the polynomial (only for polynomial kernel)
        'gamma': ['auto', 'scale'],
        'coef0': [0.0, 1, 2],
        'shrinking':[True],
        'probability':[False],  #Change this to True if probabilities need to be calculated
        'tol': [0.001],
        'cache_size':[200],
        'class_weight':['balanced'],
        'verbose':[False],
        'max_iter':[-1],
        'decision_function_shape':['ovr', 'ovo'],
        'random_state': [101]
    },
    #-------------
    'RandomForestClassifier':{
        'n_estimators':[100, 200, 1000],
        'criterion': ['gini', 'entropy'],
        'max_depth':[None], #integer
        'min_samples_split': [2, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10, 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],
        'class_weight':['balanced', 'balanced_subsample'],
        'bootstrap':[True],
        'n_jobs':[-1]
    },
    #-------------
    "GradientBoostingClassifier": {
        'loss':['deviance'],
        'learning_rate':[0.1],
        'n_estimators':[100, 200, 1000],
        'subsample':[1.0],
        'criterion': ['friedman_mse'],
        'min_samples_split':[2, 5, 10, 20],
        'min_samples_leaf': [2, 5, 10, 20],
        'min_weight_fraction_leaf':[0.0],
        'max_depth':[3, 4, 5, 10],
        'min_impurity_decrease': [0.0],
        'min_impurity_split': [None],
        'init': [None],
        'random_state':[101],
        'max_features':[None, 'sqrt', 'log2'],
        'verbose':[0],
        'max_leaf_nodes':[None],
        'warm_start':[False],
        'presort':['auto'],
        'validation_fraction': [0.1],
        'n_iter_no_change':[5], #if the score is not improving for 5 iterations ==> early stopping. Default None
        'tol':[0.0001]
    },
    #-------------------------------
    "SGDClassifier":{
        'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'penalty':['l2', 'l1', 'elasticnet'],
        'alpha': [0.0001],
        'l1_ratio': [0.15, 0.4],
        'fit_intercept':[True],
        'max_iter':[1000],
        'tol': [0.001],
        'shuffle':[True],
        'verbose':[0],
        'epsilon':[0.1],
        'random_state':[101],
        'learning_rate':['optimal', 'adaptive', 'invscaling', 'constant'],
        # eta0 must be > 0 for the 'constant', 'invscaling' and 'adaptive'
        # schedules; with 0.0 those three schedules were always rejected and
        # silently skipped. 'optimal' ignores eta0.
        'eta0':[0.01],
        'power_t':[0.5],
        'early_stopping':[True],
        'validation_fraction':[0.1],
        'n_iter_no_change': [10],
        'class_weight':['balanced'],
        'warm_start':[False],
        'average':[False],
        'n_jobs':[-1]
    },
    #-------------------------------
    "PassiveAggressiveClassifier": {
        'C':[1.0],
        'fit_intercept':[True],
        'max_iter':[1000],
        'tol': [0.001],
        'early_stopping':[True],
        'validation_fraction':[0.1],
        'n_iter_no_change': [10],
        'shuffle':[True],
        'verbose':[0],
        'loss':['hinge', 'squared_hinge'],
        'n_jobs':[-1],
        'random_state':[101],
        'warm_start':[False],
        'class_weight':['balanced'],
        'average':[False]
    },
    #-------------------------------
    "XGBClassifier": {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 500],
        'verbosity':[0],
        'silent': [None],
        "objective": ['binary:logistic'],
        'booster': ['gbtree'],
        'nthread': [None],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'min_child_weight': [1, 5, 10],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'colsample_bylevel': [1],
        'colsample_bynode': [1],
        'reg_alpha': [1],
        'reg_lambda': [1],
        'scale_pos_weight': [1],
        'base_score': [0.5],
        'random_state': [101],
        'seed': [None],
        'missing': [None],
        'n_jobs':[-1]
    }

}

# Stacking Scikit-learn classifiers

In [140]:
#Available Classifiers
#classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", "PassiveAggressiveClassifier", "SGDClassifier", ]

# Names of the estimator classes to stack, in the order used for the
# meta-feature columns. Line continuations are implicit inside brackets.
classifiers = [
    "XGBClassifier",
    "DecisionTreeClassifier",
    "LogisticRegression",
    "SVC",
    "RandomForestClassifier",
    "GradientBoostingClassifier",
    "PassiveAggressiveClassifier",
    "SGDClassifier",
]

In [263]:
#Stacking Class
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from itertools import product #Returns the cartesian product of lists (same as nested for loops)
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, classification_report
from collections import defaultdict


# Implementing a stacking class
class Stacking():
    """Two-stage stacking of classifiers on a training set split in two folds.

    Workflow: ``_split_and_scale()`` -> ``train_fold1()`` -> ``train_fold2()``
    -> ``predict()``.

    * Fold 1 is used to grid-search and fit every base classifier.
    * The fold-1 models predict fold 2; those predictions (one column per
      classifier) are the training set of the second-stage (meta) models.
    * ``predict`` pushes a test set through both stages and scores every
      second-stage model.

    Parameters
    ----------
    param_dict : dict
        Maps a classifier name to its parameter grid (name -> list of values).
    classifiers : list of str
        Names of the estimator classes. They are resolved with ``eval`` and
        must therefore be imported in the namespace where this class lives.
    x_train, y_train
        Training features and labels.
    """

    def __init__(self, param_dict, classifiers,  x_train, y_train):
        self.x_train = x_train
        self.y_train = y_train
        self.classifiers = classifiers
        self.param_dict = param_dict

        # All state is created per instance. Mutable class-level attributes
        # would be shared by every Stacking object.
        self._x_train_scaled_fold1 = None
        self._y_test_fold1 = None  # labels of fold 1 (training labels despite the name)
        self._x_train_scaled_fold2 = None
        self._y_test_fold2 = None  # labels of fold 2 (training labels despite the name)
        self._scaler = None
        # Best fold-1 model per classifier name.
        self._trainin_f1_models = {}
        # Predictions of the fold-1 models on fold 2, per classifier name.
        self._training_set_meta_classifier = {}
        # Best second-stage model per classifier name.
        self._trainin_f2_models = {}
        # Predictions of the fold-1 models on the last test set given to predict().
        self._testing_set_meta_classifier = {}
        # Scores of the second-stage models from the last predict() call.
        self._final_results_testing = defaultdict(dict)

    def _split_and_scale(self):
        """Split the training data in two halves and standardise both.

        The scaler is fitted on fold 1 only and then applied to fold 2; it is
        kept so that ``predict`` can apply the same transformation.
        """
        X_train_f1, X_train_f2, y_train_f1, y_train_f2 = train_test_split(
            self.x_train, self.y_train, test_size=0.5, random_state=101)

        Scaler = StandardScaler()
        self._x_train_scaled_fold1 = Scaler.fit_transform(X_train_f1)
        self._x_train_scaled_fold2 = Scaler.transform(X_train_f2)

        self._y_test_fold1 = y_train_f1
        self._y_test_fold2 = y_train_f2
        self._scaler = Scaler

    def _all_params_grid(self, dict_params):
        '''Generate all the combinations of parameters from a dictionary.

        Returns a list with one ``{name: value}`` dictionary per element of
        the cartesian product of the value lists; names are converted with ``str``.
        '''
        names = [str(name) for name in dict_params]
        return [dict(zip(names, combination)) for combination in product(*dict_params.values())]

    def _meta_feature_columns(self):
        '''Classifiers that have a fold-1 model, in the order of ``self.classifiers``.'''
        return [name for name in self.classifiers if name in self._trainin_f1_models]

    def _train_raw_model_stacking(self, strModel, params, x_train, y_train):
        '''Fit one estimator configuration and cross-validate it (no ensemble).

        Returns a dictionary with ``'model_raw'`` (estimator fitted on the
        given data) and ``'cv_raw'`` (10-fold ``cross_validate`` result for
        recall, ROC AUC and F1). Estimator errors are propagated.
        '''
        # NOTE(review): eval() executes arbitrary code -- only pass trusted class names.
        model_raw = eval(strModel)()
        model_raw.set_params(**params)

        # Fit first so that an invalid parameter combination fails fast.
        model_raw.fit(x_train, y_train)
        # cross_validate fits clones, so `model_raw` stays fitted on the full
        # data and no second fit is required.
        cv = cross_validate(model_raw, x_train, y_train, cv=10,
                            scoring=['recall', 'roc_auc', 'f1'],
                            return_train_score=False, n_jobs=-1)

        return {
            'model_raw': model_raw,
            'cv_raw': cv
        }

    def _custom_gridSearch_stacking(self, strModel, param_dict, x_train, y_train):
        '''Fault-tolerant grid search ranked by mean cross-validated ROC AUC.

        Returns the best ``_train_raw_model_stacking`` result, or ``None``
        when no parameter combination could be trained.
        '''
        best_model = None
        best_score = float('-inf')

        for param in self._all_params_grid(param_dict):
            try:
                model = self._train_raw_model_stacking(strModel, param, x_train, y_train)
                score = model['cv_raw']['test_roc_auc'].mean()
            except Exception:
                # Incompatible combinations are expected and skipped;
                # KeyboardInterrupt still stops the search.
                continue

            if score != score:
                # NaN mean (a CV fold failed): rank it below every real score,
                # otherwise a NaN first candidate could never be replaced.
                score = float('-inf')
            if best_model is None or score > best_score:
                best_model = model
                best_score = score

        return best_model

    def train_fold1(self):
        """Grid-search every base classifier on fold 1 and predict fold 2.

        A classifier is registered only when both its model and its fold-2
        predictions exist, so the two dictionaries always share the same keys.
        Classifiers that fail are reported and left out.
        """
        import pickle  # local import: this cell does not import pickle itself

        if self._scaler is None:
            self._split_and_scale()

        for classifier in self.classifiers:
            try:
                print('training training fold 1: ' + classifier)
                model = self._custom_gridSearch_stacking(classifier, self.param_dict[classifier],
                                                         self._x_train_scaled_fold1, self._y_test_fold1)
                if model is None:
                    raise ValueError('no parameter combination could be fitted')
                fold2_predictions = model['model_raw'].predict(self._x_train_scaled_fold2)
            except Exception as error:
                print('error f1 with: ' + classifier + ' (' + repr(error) + ')')
                continue

            self._trainin_f1_models[classifier] = model
            self._training_set_meta_classifier[classifier] = fold2_predictions

            # Checkpoint each model in its own file (a single shared file name
            # would be overwritten by every classifier). A failed checkpoint
            # must not discard the trained model.
            try:
                with open("stacking_f1_{}.pickle".format(classifier), "wb") as pickling_on:
                    pickle.dump(model, pickling_on)
            except Exception as error:
                print('could not checkpoint: ' + classifier + ' (' + repr(error) + ')')

    def train_fold2(self):
        """Grid-search every classifier on the fold-2 predictions of stage one."""
        columns = self._meta_feature_columns()
        if not columns:
            print('error f2: no fold-1 model available, run train_fold1 first')
            return

        # Fix the column order so that training and prediction agree.
        dataset = pd.DataFrame(self._training_set_meta_classifier)[columns]

        for classifier in self.classifiers:
            try:
                print('training training fold 2: ' + classifier)
                model = self._custom_gridSearch_stacking(classifier, self.param_dict[classifier],
                                                         dataset, self._y_test_fold2)
                if model is None:
                    raise ValueError('no parameter combination could be fitted')
            except Exception as error:
                print('error f2 with: ' + classifier + ' (' + repr(error) + ')')
                continue

            self._trainin_f2_models[classifier] = model

    def predict(self, x_test, y_test):
        """Score every second-stage model on a test set.

        Parameters
        ----------
        x_test
            Unscaled test features; the scaler fitted on fold 1 is applied here.
        y_test
            True labels, used only for scoring.

        Returns
        -------
        collections.defaultdict
            ``result[name]['roc_auc_score']`` and
            ``result[name]['classification_report']`` for each second-stage model.

        Raises
        ------
        RuntimeError
            If the data was never split and scaled.
        """
        if self._scaler is None:
            raise RuntimeError('call _split_and_scale() and train the folds before predict()')

        # Only the features are transformed; the second positional argument of
        # StandardScaler.transform is `copy`, not the labels.
        scaled_test = self._scaler.transform(x_test)

        # First stage: one column of predictions per fold-1 model, in the same
        # column order as the second-stage training set.
        columns = self._meta_feature_columns()
        self._testing_set_meta_classifier = {
            name: self._trainin_f1_models[name]['model_raw'].predict(scaled_test)
            for name in columns
        }
        dataset = pd.DataFrame(self._testing_set_meta_classifier)[columns]

        # Second stage: start from a clean result so that entries of an
        # earlier predict() call cannot leak into this one.
        self._final_results_testing = defaultdict(dict)
        for model2, trained in self._trainin_f2_models.items():
            pred = trained['model_raw'].predict(dataset)

            self._final_results_testing[model2]['roc_auc_score'] = roc_auc_score(y_test, pred)
            self._final_results_testing[model2]['classification_report'] = classification_report(y_test, pred)

        return self._final_results_testing
                
    

In [264]:
# Training
# Build the stacker from the parameter grids, the classifier names and the
# training split.
s = Stacking(model_parameters, classifiers, X_train, y_train)
# Split the training data into two halves and standardise them (the scaler is
# fitted on the first half). Must run before the folds are trained.
s._split_and_scale()
# Stage 1: grid-search each classifier on fold 1 and predict fold 2.
s.train_fold1()
# Stage 2: grid-search each classifier on the stage-1 predictions for fold 2.
s.train_fold2()

In [265]:
# Prediction
# Scores every second-stage model on the held-out test split; the result holds
# a ROC AUC and a classification report per classifier.
s.predict(X_test, y_test)


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# Remaining work

In [None]:
# Create a voting classifier on the models trained in f2