In [1]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


import re
import math
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# plt.style.use('seaborn-whitegrid')
%matplotlib inline

In [2]:
#Loading the dataset
data = pd.read_csv("1_dataset_ML_no_OHE_ready.csv", index_col=0 )

In [3]:
data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,diag_1,diag_2,diag_3,metformin,...,change,diabetesMed,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,readmitted
1,Caucasian,Female,2,1,1,7,276,250.01,other,No,...,Ch,Yes,3,59,0,18,0,0,0,1
2,AfricanAmerican,Female,3,1,1,7,648,250,other,No,...,No,Yes,2,11,5,13,2,0,1,0
3,Caucasian,Male,4,1,1,7,8,other,403,No,...,Ch,Yes,2,44,1,16,0,0,0,0
4,Caucasian,Male,5,1,1,7,197,other,250,No,...,Ch,Yes,1,51,0,8,0,0,0,0
5,Caucasian,Male,6,2,1,2,414,411,250,No,...,No,Yes,3,31,6,16,0,0,0,1


In [4]:
X = data.drop("readmitted", axis=1)
Y = data["readmitted"]

In [5]:
X.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2',
       'diag_3', 'metformin', 'repaglinide', 'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change',
       'diabetesMed', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient'],
      dtype='object')

In [6]:
cat_columns = ['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2',
       'diag_3', 'metformin', 'repaglinide', 'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change',
       'diabetesMed']

continuous_columns = [col for col in X.columns if col not in cat_columns]


In [7]:
# Use label encoder on the categorical columns
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [8]:
le = defaultdict(LabelEncoder)
fit = X[cat_columns].apply(lambda x: le[x.name].fit_transform(x))
#fit.apply(lambda x: le[x.name].inverse_transform(x))

In [9]:
#Reconstruct the dataset
dataset = pd.concat( [fit, X[continuous_columns]], axis=1)

In [10]:
#Split the dataset into training testing
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(dataset, Y, test_size=0.1, random_state=101)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=101)

# Custom functions for performing fault tolerance GridSearch


In [11]:
#Create all tuples of parameters
from itertools import product #Returns the cartesian product of lists (same as nested for loops)
def All_params_grid(dico):
    #Create list of keys and a list of values
    keys = list(dico.keys())
    values = dico.values()
    #Create all possible combinations of parameters
    params = []    
    for value in product(*values):#Loop trough all the combination of values
        subParam =dict() #reconstruct the dictionary of parameters
        for i in range(len(keys)):           
            subParam[str(keys[i])] = value[i] #Attribute the respective value to the respective key
        params.append(subParam)
    return(params)

In [12]:
from sklearn.model_selection import cross_validate
#Custom training method with cross validation
def trainRawModel(strModel,params, x_train, y_train, x_test, y_test):
    '''This function is used to train the model without any ensemble method'''    
    #Instanciate the model
    model = eval(strModel)()
    #Set parameters
    model.set_params(**params)
    
    #Train Base model No ensemble           
    model_raw = model.fit(x_train,y_train)        
    cv = cross_validate(model_raw, x_train, y_train, cv=5, scoring=['recall', 'roc_auc', 'f1'], 
                        return_train_score=False,n_jobs=5)        
    model_raw.fit(x_train,y_train)

    #Predict model        
    y_pred_raw = model_raw.predict(x_test)

    roc_predict_raw = roc_auc_score(y_test, y_pred_raw, average='macro', sample_weight=None)
    obj = {
        'model_raw': model_raw,
        'cv_raw': cv,
        'roc_predict_raw': roc_predict_raw
    }        
    return(obj) 

In [13]:
# Custom Gridsearch function 
def custom_gridSearch(strModel, param_dict, x_train, y_train, x_test, y_test):
    params = All_params_grid(param_dict)
    best_model = ''
    
    for param in params:  
        try:
            model = trainRawModel(strModel,param, X_train, y_train, X_test, y_test)
        
            if best_model == '':
                best_model = model
            elif best_model['roc_predict_raw'] < model['roc_predict_raw']:
                best_model = model
        except:
            pass
        
    return best_model 

In [14]:
# Function to perform the predictions
import pickle
from collections import defaultdict
from datetime import datetime

#Available Classifiers
# classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 
#               'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", 
#               "PassiveAggressiveClassifier", "SGDClassifier" ]

def prediction_pipeline(classifiers_list, X_train_scaled, y_train, X_test_scaled, y_test):
    #Train all models    
    All_classifiers= defaultdict(dict)
    compt = 1
    for classifier in classifiers_list:
        print('Current classifier ' + classifier)
        
        start = datetime.now()
        
        model = custom_gridSearch(classifier, model_parameters[classifier], X_train_scaled, y_train, X_test_scaled, y_test)
        All_classifiers[classifier][classifier + '_raw'] = model

        #Store intermediate versions
        pickling_on = open("zz_model_{}_out_of_{}.pickle".format(compt, len(classifiers)),"wb")
        pickle.dump(All_classifiers, pickling_on)
        pickling_on.close()
        
        
        end = datetime.now()
        diff = (start - end)
        hours = diff//3600
        minutes = (diff - (hours * 3600))//60
        seconds = diff - (hours * 3600)-(minutes * 60)

        print(classifier + " ended after " + str(hours) + " hours, " + str(minutes) + " minutes and  " + str(seconds)+ " seconds.")
        

In [15]:
#Construct a dictionary of model parameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

n_jobs = 5
model_parameters = {
    'DecisionTreeClassifier':{    
        'criterion': ['gini', 'entropy'],
        'max_depth':[None], #integer
        'min_samples_split': [2, 4, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10 , 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],    
        'class_weight':['balanced'],    
    },    
    #-------------
    'RandomForestClassifier':{
        'n_estimators':[100, 200, 1000],
        'criterion': ['gini'], # 'entropy'
        'max_depth':[None], #integer
        'min_samples_split': [2, 8, 20], #integer or proportion of samples
        'min_samples_leaf':[1, 5, 10, 20],
        'max_features':['auto', 'log2', None],
        'max_leaf_nodes' :[None], #int
        'min_impurity_decrease' :[1e-7],    
        'class_weight':['balanced'],  # 'balanced_subsample'  
        'bootstrap':[True],
        'n_jobs':[n_jobs]
    },
    #-------------
    "GradientBoostingClassifier": {
        'loss':['deviance'], 
        'learning_rate':[0.1], 
        'n_estimators':[100, 200, 1000], 
        'subsample':[1.0], 
        'criterion': ['friedman_mse'], 
        'min_samples_split':[2, 5, 10, 20], 
        'min_samples_leaf': [2, 5, 10, 20], 
        'min_weight_fraction_leaf':[0.0], 
        'max_depth':[3, 4, 5, 10], 
        'min_impurity_decrease': [0.0], 
        'min_impurity_split': [None], 
        'init': [None], 
        'random_state':[101], 
        'max_features':[None, 'sqrt', 'log2'], 
        'verbose':[0], 
        'max_leaf_nodes':[None], 
        'warm_start':[False], 
        'presort':['auto'], 
        'validation_fraction': [0.1], 
        'n_iter_no_change':[5], #if the score is not improving for 5 iterations ==>Early stopping. Default None
        'tol':[0.0001]
    },
#     "LGBMClassifier":{
        
#     }
}

In [16]:
#Available Classifiers
#classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", "PassiveAggressiveClassifier", "SGDClassifier", ]

classifiers =['DecisionTreeClassifier', 'RandomForestClassifier', "GradientBoostingClassifier",  
#              "LGBMClassifier"
             ]

In [17]:
import warnings
warnings.filterwarnings(action="once")

In [None]:
#running the prediction pipeline
prediction_pipeline(classifiers, X_train, y_train, X_test, y_test)

Current classifier DecisionTreeClassifier
DecisionTreeClassifier ended after -1 day, 23:59:59.974290 hours, 0:00:00.000013 minutes and  0:00:00.000028 seconds.
Current classifier RandomForestClassifier
RandomForestClassifier ended after -1 day, 23:59:50.464639 hours, 0:00:00.000033 minutes and  0:00:00.000016 seconds.
Current classifier GradientBoostingClassifier
