In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from keras.layers import Dropout

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import re
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')
%matplotlib inline



Using TensorFlow backend.


In [2]:
# Before loading the following dataset, be sure to run the preprocessing notebook

In [3]:
# Load the ML-ready dataset; the first CSV column is used as the DataFrame index.
data = pd.read_csv("1_dataset_ML_ready.csv", index_col=0 )

# My Custom helper function

I have created a custom gridSearch function that is 'fault tolerant': it prevents the grid search from crashing when a parameter combination contains incompatible arguments.

In [4]:
#Create all tuples of parameters
from itertools import product #Returns the cartesian product of lists (same as nested for loops)
def All_params_grid(dico):
    """Expand a grid of candidate values into every parameter combination.

    Parameters
    ----------
    dico : dict
        Maps each parameter name to an iterable of candidate values.

    Returns
    -------
    list of dict
        One dictionary per element of the cartesian product of the candidate
        values. Keys are the parameter names converted with ``str``.
    """
    # Key order matches the order of dico.values(), so zip pairs them correctly.
    names = [str(name) for name in dico]
    return [dict(zip(names, combo)) for combo in product(*dico.values())]

In [16]:
from sklearn.model_selection import cross_validate
#Custom training method with cross validation
def trainRawModel(strModel,params, x_train, y_train, x_test, y_test):
    '''Train a single model (no ensemble method) and score it.

    Parameters
    ----------
    strModel : str
        Name of the estimator class; it is resolved with ``eval`` and
        instantiated without arguments.
    params : dict
        Keyword arguments forwarded to ``set_params`` on the new estimator.
    x_train, y_train
        Data used for the 10-fold cross-validation and for the final fit.
    x_test, y_test
        Hold-out data used to compute the returned ROC AUC.

    Returns
    -------
    dict
        ``'model_raw'``: the estimator fitted on the full training data,
        ``'cv_raw'``: the result of ``cross_validate`` (recall, roc_auc, f1),
        ``'roc_predict_raw'``: ROC AUC of ``predict(x_test)`` against ``y_test``.

    Any exception raised by the estimator (for example incompatible
    parameters) propagates to the caller.
    '''
    # NOTE(review): eval() executes arbitrary code. Only pass trusted class
    # names that are already imported in the notebook.
    model_raw = eval(strModel)()
    model_raw.set_params(**params)

    # cross_validate fits clones of the estimator, so no prior fit is needed.
    cv = cross_validate(model_raw, x_train, y_train, cv=10, scoring=['recall', 'roc_auc', 'f1'],
                        return_train_score=False, n_jobs=5)

    # Single fit on the full training set (the original fitted twice).
    model_raw.fit(x_train, y_train)

    # The score is computed from hard class predictions, not from probabilities.
    y_pred_raw = model_raw.predict(x_test)
    roc_predict_raw = roc_auc_score(y_test, y_pred_raw, average='macro', sample_weight=None)

    return {
        'model_raw': model_raw,
        'cv_raw': cv,
        'roc_predict_raw': roc_predict_raw
    }

In [6]:
# Custom Gridsearch function 
def custom_gridSearch(strModel, param_dict, x_train, y_train, x_test, y_test):
    """Fault-tolerant grid search over every combination in ``param_dict``.

    Each combination is trained with ``trainRawModel``. A combination that
    raises (for example because its arguments are incompatible) is reported
    with a warning and skipped, so the search continues.

    Parameters
    ----------
    strModel : str
        Name of the estimator class, forwarded to ``trainRawModel``.
    param_dict : dict
        Maps each parameter name to a list of candidate values.
    x_train, y_train, x_test, y_test
        Data forwarded unchanged to ``trainRawModel``.

    Returns
    -------
    dict or str
        The ``trainRawModel`` result with the highest ``'roc_predict_raw'``
        (the first one wins on ties), or ``''`` if every combination failed.
    """
    import warnings  # local import: this cell does not import warnings itself

    best_model = None
    for param in All_params_grid(param_dict):
        try:
            # Use the data passed by the caller, not the notebook-level globals.
            model = trainRawModel(strModel, param, x_train, y_train, x_test, y_test)
        except Exception as err:
            # Deliberate best-effort: report the failure and try the next combination.
            warnings.warn("Skipping {} with parameters {}: {!r}".format(strModel, param, err))
            continue

        if best_model is None or best_model['roc_predict_raw'] < model['roc_predict_raw']:
            best_model = model

    # Keep the historical sentinel for "no combination succeeded".
    return '' if best_model is None else best_model

In [7]:
#Apply an adaptive boosting classifier on an already tuned model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

def ada_boost(model, x_train, y_train, x_test, y_test):
    """Wrap an already tuned model in AdaBoost and score it on the test data.

    A 10-fold ``GridSearchCV`` (scoring ``roc_auc``) searches the number of
    boosting rounds, using ``model['model_raw']`` as the base estimator.

    Parameters
    ----------
    model : dict
        Result object whose ``'model_raw'`` entry is the tuned estimator.
    x_train, y_train
        Data used to fit the grid search.
    x_test, y_test
        Data used to compute the returned ROC AUC.

    Returns
    -------
    dict
        ``'initial_model'``: the ``model`` argument,
        ``'model_adaBoost'``: the best boosted estimator, or ``None`` on failure,
        ``'adaBoost_score'``: ROC AUC of its test predictions, or ``0`` on failure.
    """
    import warnings  # local import: this cell does not import warnings itself

    try:
        adaGS_Param = {
            'algorithm': ['SAMME'],
            'base_estimator': [model['model_raw']],
            'n_estimators': [50, 80, 100, 500]
        }
        adaBGS = GridSearchCV(AdaBoostClassifier(), adaGS_Param, cv=10,
                              error_score=-1, scoring='roc_auc')
        adaBGS.fit(x_train, y_train)

        adaBest = adaBGS.best_estimator_
        # Predict on the x_test argument (the original read the global X_test).
        y_pred_adaB = adaBest.predict(x_test)
        adaScore = roc_auc_score(y_test, y_pred_adaB, average='macro', sample_weight=None)
    except Exception as err:
        # Deliberate best-effort: boosting is optional, so report and fall back.
        warnings.warn("AdaBoost could not be applied: {!r}".format(err))
        adaBest = None
        adaScore = 0

    return {
        'initial_model': model,
        'model_adaBoost': adaBest,
        'adaBoost_score': adaScore
    }

In [8]:
#Apply a bagging classifier on an already tuned model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

def baggingModel(model, x_train, y_train, x_test, y_test):
    """Wrap an already tuned model in a bagging ensemble and score it.

    A 5-fold ``GridSearchCV`` (scoring ``roc_auc``) searches ``max_features``
    and ``max_samples``, using ``model['model_raw']`` as the base estimator.

    Parameters
    ----------
    model : dict
        Result object whose ``'model_raw'`` entry is the tuned estimator.
    x_train, y_train
        Data used to fit the grid search.
    x_test, y_test
        Data used to compute the returned ROC AUC.

    Returns
    -------
    dict
        ``'initial_model'``: the ``model`` argument,
        ``'model_bagging'``: the best bagging estimator, or ``None`` on failure,
        ``'bagging_score'``: ROC AUC of its test predictions, or ``0`` on failure.
    """
    import warnings  # local import: this cell does not import warnings itself

    try:
        params = {
            'base_estimator': [model['model_raw']],
            'max_features': [0.2, 0.4, 0.6, 0.8, 1.0],
            'max_samples': [0.05, 0.1, 0.2, 0.5]
        }
        GS = GridSearchCV(BaggingClassifier(), params, cv=5, scoring='roc_auc')
        GS.fit(x_train, y_train)

        bestBag = GS.best_estimator_
        # Predict on the x_test argument (the original read the global X_test).
        y_pred_bagging = bestBag.predict(x_test)
        model_bagging_score = roc_auc_score(y_test, y_pred_bagging, average='macro', sample_weight=None)
    except Exception as err:
        # Deliberate best-effort: bagging is optional, so report and fall back.
        warnings.warn("Bagging could not be applied: {!r}".format(err))
        bestBag = None  # the original's trailing comma produced the tuple (None,)
        model_bagging_score = 0

    return {
        'initial_model': model,
        'model_bagging': bestBag,
        'bagging_score': model_bagging_score
    }

In [9]:
# Function to perform the predictions
import pickle
from collections import defaultdict
from datetime import datetime

#Available Classifiers
# classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 
#               'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", 
#               "PassiveAggressiveClassifier", "SGDClassifier" ]

def prediction_pipeline(classifiers_list, X_train_scaled, y_train, X_test_scaled, y_test):
    """Run the custom grid search for every classifier and checkpoint the results.

    For each name in ``classifiers_list`` the parameter grid is read from the
    notebook-level ``model_parameters`` dictionary and passed to
    ``custom_gridSearch``. After each classifier, all results so far are
    pickled to ``zz_model_<i>_out_of_<n>.pickle`` in the working directory,
    and the elapsed time is printed.

    Parameters
    ----------
    classifiers_list : list of str
        Estimator class names; each must be a key of ``model_parameters``.
    X_train_scaled, y_train, X_test_scaled, y_test
        Data forwarded unchanged to ``custom_gridSearch``.

    Returns
    -------
    collections.defaultdict
        ``result[name][name + '_raw']`` holds the best model object found for
        each classifier.
    """
    All_classifiers = defaultdict(dict)
    total = len(classifiers_list)

    # enumerate keeps the checkpoint counter in step with the loop, so each
    # classifier gets its own file instead of overwriting the first one.
    for compt, classifier in enumerate(classifiers_list, start=1):
        print('Current classifier ' + classifier)
        start = datetime.now()

        model = custom_gridSearch(classifier, model_parameters[classifier],
                                  X_train_scaled, y_train, X_test_scaled, y_test)
        All_classifiers[classifier][classifier + '_raw'] = model

        # Store intermediate versions; the with-block closes the file on error too.
        with open("zz_model_{}_out_of_{}.pickle".format(compt, total), "wb") as pickling_on:
            pickle.dump(All_classifiers, pickling_on)

        # Elapsed wall-clock time (end - start) in whole seconds.
        elapsed = int((datetime.now() - start).total_seconds())
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)
        print(classifier + " ended after " + str(hours) + " hours, " + str(minutes)
              + " minutes and  " + str(seconds) + " seconds.")

    return All_classifiers
        

In [10]:
# #take a sample for testing the code
# import random
# indecies = random.sample(range(len(data)), 500)
# data = data.iloc[indecies]

In [11]:
# Feature matrix: every column of `data` except the 'readmitted' target.
X = data.drop('readmitted', axis=1)
# Target vector: the 'readmitted' column.
Y = data['readmitted']

In [12]:
#Split the dataset into training testing
from sklearn.model_selection import train_test_split
# First split: hold out 10% of the rows as a validation set (fixed seed 101).
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.1, random_state=101)

# Second split: 20% of the remaining rows become the test set, i.e. roughly
# 72% train / 18% test / 10% validation of the full dataset.
# Note that X_train and y_train are rebound here to the smaller training subset.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=101)

In [13]:
#Scale the data
from sklearn.preprocessing import StandardScaler
Scaler = StandardScaler()
# Fit the scaler on the training features only, then apply the same fitted
# scaler to the test features.
X_train_scaled = Scaler.fit_transform(X_train)
X_test_scaled = Scaler.transform(X_test)
# NOTE(review): X_validation is not scaled in this cell; it must go through
# Scaler.transform before being given to a model trained on the scaled data.

In [17]:
#Construct a dictionary of model parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from xgboost import XGBClassifier
# Number of parallel jobs placed in the grids of the estimators below that
# take an 'n_jobs' entry (XGBClassifier uses -1 instead).
n_jobs = 5
# Parameter grids keyed by estimator class name. Each inner dictionary maps a
# parameter name to the list of candidate values to try; prediction_pipeline
# looks up model_parameters[<classifier name>] and hands it to custom_gridSearch.
# NOTE(review): some grids combine values that the estimator may reject (e.g.
# penalty 'l1' with the 'newton-cg'/'lbfgs' solvers); this presumably relies on
# the fault-tolerant grid search skipping failing combinations - confirm.
model_parameters = {
    'DecisionTreeClassifier':{    
        'criterion': ['gini', 'entropy'],
        'max_depth':[None], #integer
        'min_samples_split': [2, 4, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10 , 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],    
        'class_weight':['balanced'],    
    },
    #-------------
    "LogisticRegression":{
        'penalty':['l2','l1'], # The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties
        'dual':[False], #Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features.
        'tol':[0.0001], #Tolerance for stopping criteria
        'C':[1.0], #Like in support vector machines, smaller values specify stronger regularization.
        'fit_intercept':[True], 
        'intercept_scaling':[1], 
        'class_weight':['balanced'], 
        'random_state':[101], 
        'solver':[
#             'liblinear', #Good choice for small datasets, one-vs-rest only, handels L1
                  'newton-cg',  #multinomial problems (multi-class), L2
                  'lbfgs',  #multinomial problems (multi-class), L2
#                   'sag', #Stochastic Average Gradient descent #Good choice for small datasets, multinomial problems, L2
#                   'saga', #Good choice for small datasets, multinomial problems, handels L1
                 ], 
        'max_iter':[100, 500], #Useful only for the newton-cg, sag and lbfgs solvers
        'multi_class':[
#             'ovr', 
#                        'multinomial', 
                       'auto'], #‘ovr’, ‘multinomial’, ‘auto’
        'verbose':[0], 
        'warm_start':[False], 
        'n_jobs':[n_jobs]
    },
    #-------------
    'SVC':{
        'C':[0.5, 1.0, 2, 5], 
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 
        'degree':[3,4,5], #degree of the polynomial (only for polynomial kernel)
        'gamma': ['auto', 'scale'],
        'coef0': [0.0, 1, 2], 
        'shrinking':[True], 
        'probability':[False],  #Change this to True if probabilities need to be calculated
        'tol': [0.001], 
        'cache_size':[200], 
        'class_weight':['balanced'], 
        'verbose':[False], 
        'max_iter':[-1], 
        'decision_function_shape':['ovr', 'ovo'], 
        'random_state': [101]    
    },
    #-------------
    'RandomForestClassifier':{
        'n_estimators':[100, 200, 1000],
        'criterion': ['gini', 'entropy'],
        'max_depth':[None], #integer
        'min_samples_split': [2, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10, 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],    
        'class_weight':['balanced', 'balanced_subsample'],    
        'bootstrap':[True],
        'n_jobs':[n_jobs]
    },
    #-------------
    "GradientBoostingClassifier": {
        'loss':['deviance'], 
        'learning_rate':[0.1], 
        'n_estimators':[100, 200, 1000], 
        'subsample':[1.0], 
        'criterion': ['friedman_mse'], 
        'min_samples_split':[2, 5, 10, 20], 
        'min_samples_leaf': [2, 5, 10, 20], 
        'min_weight_fraction_leaf':[0.0], 
        'max_depth':[3, 4, 5, 10], 
        'min_impurity_decrease': [0.0], 
        'min_impurity_split': [None], 
        'init': [None], 
        'random_state':[101], 
        'max_features':[None, 'sqrt', 'log2'], 
        'verbose':[0], 
        'max_leaf_nodes':[None], 
        'warm_start':[False], 
        'presort':['auto'], 
        'validation_fraction': [0.1], 
        'n_iter_no_change':[5], #if the score is not improving for 5 iterations ==>Early stopping. Default None
        'tol':[0.0001]
    },
    #-------------------------------
    "SGDClassifier":{
        'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'], 
        'penalty':['l2', 'l1', 'elasticnet'], 
        'alpha': [0.0001], 
        'l1_ratio': [0.15, 0.4], 
        'fit_intercept':[True], 
        'max_iter':[1000], 
        'tol': [0.001], 
        'shuffle':[True], 
        'verbose':[0], 
        'epsilon':[0.1], 
        'n_jobs':[n_jobs], 
        'random_state':[101], 
        'learning_rate':['optimal', 'adaptive', 'invscaling', 'constant'], 
        'eta0':[0.0], 
        'power_t':[0.5], 
        'early_stopping':[True], 
        'validation_fraction':[0.1], 
        'n_iter_no_change': [10], 
        'class_weight':['balanced'], 
        'warm_start':[False], 
        'average':[False]
    },
    #-------------------------------    
    "PassiveAggressiveClassifier": {
        'C':[1.0], 
        'fit_intercept':[True], 
        'max_iter':[1000], 
        'tol': [0.001], 
        'early_stopping':[True], 
        'validation_fraction':[0.1], 
        'n_iter_no_change': [10], 
        'shuffle':[True], 
        'verbose':[0], 
        'loss':['hinge', 'squared_hinge'], 
        'n_jobs':[n_jobs], 
        'random_state':[101], 
        'warm_start':[False], 
        'class_weight':['balanced'], 
        'average':[False]
    },
    #-------------------------------    
    "XGBClassifier": {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 500],
        'verbosity':[0],
        'silent': [None],
        "objective": ['binary:logistic'],
        'booster': ['gbtree'],
        'nthread': [None],
#         'gamma': [0.5, 1, 1.5, 2, 5],        
#         'min_child_weight': [1, 5, 10],        
#         'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'colsample_bylevel': [1],
        'colsample_bynode': [1],
        'reg_alpha': [1],
        'reg_lambda': [1],
        'scale_pos_weight': [1],
        'base_score': [0.5],
        'random_state': [101],
        'seed': [None],
        'missing': [None],        
        'n_jobs':[-1]                   
    }    
    
}

# Using Scikit-learn classifiers

In [18]:
#Available Classifiers
#classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", "PassiveAggressiveClassifier", "SGDClassifier", ]

# Estimator class names handed to prediction_pipeline; each name is used as a
# key into model_parameters. "XGBClassifier" is currently commented out.
classifiers =['DecisionTreeClassifier', "LogisticRegression", 
#               "XGBClassifier", 
              'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", 
              "PassiveAggressiveClassifier", "SGDClassifier" ]

In [19]:
import warnings
# Show each distinct warning only the first time it is raised.
warnings.filterwarnings(action="once")

In [None]:
#running the prediction pipeline
# Runs the grid search for every name in `classifiers`; prediction_pipeline
# writes a pickle checkpoint to the working directory after each classifier.
prediction_pipeline(classifiers, X_train_scaled, y_train, X_test_scaled, y_test)

In [None]:
# from sklearn.ensemble import VotingClassifier

# #classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", "PassiveAggressiveClassifier", "SGDClassifier", ]


# classifiers = [('lr', lr['model_raw']), ('svc', svc['model_raw']), 
#                ('rf', rf['model_raw']), ('gb', gb['model_raw']), 
#                ('sgd', sgd['model_raw']), ('passive_aggressive', p_agg['model_raw']),
#               ('GaussianNB', nb['model_raw'])]

# Voting_classifier = VotingClassifier(estimators=classifiers, voting='hard', n_jobs=-1)

# Voting_classifier.fit(X_train_scaled, y_train)