In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd
import pickle


In [5]:
#Create all tuples of parameters
from itertools import product #Returns the cartesian product of lists (same as nested for loops)
def All_params_grid(dico):
    #Create list of keys and a list of values
    keys = list(dico.keys())
    values = dico.values()
    #Create all possible combinations of parameters
    params = []    
    for value in product(*values):#Loop trough all the combination of values
        subParam =dict() #reconstruct the dictionary of parameters
        for i in range(len(keys)):           
            subParam[str(keys[i])] = value[i] #Attribute the respective value to the respective key
        params.append(subParam)
    return(params)

In [6]:
from sklearn.model_selection import cross_validate
#Custom training method with cross validation
def trainRawModel(strModel,params, x_train, y_train, x_test, y_test):
    '''This function is used to train the model without any ensemble method'''    
    #Instanciate the model
    model = eval(strModel)()
    #Set parameters
    model.set_params(**params)
    
    #Train Base model No ensemble           
    model_raw = model.fit(x_train,y_train)        
    cv = cross_validate(model_raw, x_train, y_train, cv=5, scoring=['recall', 'roc_auc', 'f1'], 
                        return_train_score=False,n_jobs=5)        
    model_raw.fit(x_train,y_train)

    #Predict model        
    y_pred_raw = model_raw.predict(x_test)

    roc_predict_raw = roc_auc_score(y_test, y_pred_raw, average='macro', sample_weight=None)
    obj = {
        'model_raw': model_raw,
        'cv_raw': cv,
        'roc_predict_raw': roc_predict_raw
    }        
    return(obj) 

In [9]:
from random import shuffle
from random import seed
seed(12345678)

# Custom Gridsearch function 
def custom_gridSearch(strModel, param_dict, x_train, y_train, x_test, y_test):
    params = All_params_grid(param_dict)
    
    params = shuffle(params)
    
    best_model = ''
    counter  = 0
    print('testing param list : 1/' + str(len(params)))
    for param in params:  
        
        pickling_on = open("Xgboost_last_set_of_paraneters.pickle" ,"wb")
        pickle.dump(param , pickling_on)
        pickling_on.close()
        
        counter += 1
        if counter%10 == 0:
            print('testing param list : {}/{} \t  Best auc : {}'.format(str(counter), 
                                                                        str(len(params)), 
                                                                        str(best_model['roc_predict_raw']) 
                                                                       ))
            
                   
            
        try:
            model = trainRawModel(strModel,param, X_train, y_train, X_test, y_test)
        
            if best_model == '':
                best_model = model
                
                pickling_on = open("Xgboost_best_model_{}.pickle".format(str(counter)) ,"wb")
                pickle.dump(best_model, pickling_on)
                pickling_on.close()
                
            elif best_model['roc_predict_raw'] < model['roc_predict_raw']:
                best_model = model
                
                pickling_on = open("Xgboost_best_model_{}.pickle".format(str(counter)) ,"wb")
                pickle.dump(best_model, pickling_on)
                pickling_on.close()  
                
        except:
            pass
        
        
        
    return best_model 

In [10]:
# Function to perform the predictions
import pickle
from collections import defaultdict
from datetime import datetime

#Available Classifiers
# classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 
#               'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", 
#               "PassiveAggressiveClassifier", "SGDClassifier" ]

def prediction_pipeline(classifiers_list, X_train_scaled, y_train, X_test_scaled, y_test):
    #Train all models    
    All_classifiers= defaultdict(dict)
    compt = 1
    for classifier in classifiers_list:
        print('Current classifier ' + classifier)
               
        try:
            model = custom_gridSearch(classifier, model_parameters[classifier], X_train_scaled, y_train, X_test_scaled, y_test)
            All_classifiers[classifier][classifier + '_raw'] = model
        except:
            pass

In [15]:
from xgboost import XGBClassifier
model_parameters = {
"XGBClassifier": {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1],
        'n_estimators': [100, 500, 700],
        'verbosity':[0],
        'silent': [None],
        "objective": ['binary:logistic'],
        'booster': ['gbtree'],
        'nthread': [None],
        'gamma': [0],        
         'min_child_weight': [1, 5, 10],        
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'colsample_bylevel': [1],
        'colsample_bynode': [1],
        'reg_alpha': [0, 1, 2 , 6],
        'reg_lambda': [1, 2, 3, 6],
        'scale_pos_weight': [1],
        'base_score': [0.5],
        'random_state': [101],
        'seed': [None],
        'missing': [None], 
#         'tree_method':['gpu_hist'], #  default 'auto', 'gpu_exact'
        'updater':['grow_gpu']   
    }    
}

In [5]:
#Loading the dataset
data = pd.read_csv("1_dataset_ML_no_OHE_ready.csv", index_col=0 )

In [6]:
# data = data.sample(10000)

In [7]:
X = data.drop("readmitted", axis=1)
Y = data["readmitted"]

In [8]:
cat_columns = ['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2',
       'diag_3', 'metformin', 'repaglinide', 'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change',
       'diabetesMed']

continuous_columns = [col for col in X.columns if col not in cat_columns]

In [9]:
# Use label encoder on the categorical columns
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

le = defaultdict(LabelEncoder)
fit = X[cat_columns].apply(lambda x: le[x.name].fit_transform(x))
#fit.apply(lambda x: le[x.name].inverse_transform(x))

In [10]:
#Reconstruct the dataset
dataset = pd.concat( [fit, X[continuous_columns]], axis=1)

In [11]:
#Split the dataset into training testing
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(dataset, Y, test_size=0.1, random_state=101)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=101)

In [None]:
import warnings
warnings.filterwarnings(action="once")

#Available Classifiers
#classifiers =['DecisionTreeClassifier', "LogisticRegression", "XGBClassifier", 'SVC', 'RandomForestClassifier', "GradientBoostingClassifier", "PassiveAggressiveClassifier", "SGDClassifier", ]

classifiers =["XGBClassifier"]

#running the prediction pipeline
prediction_pipeline(classifiers, X_train, y_train, X_test, y_test)

Current classifier XGBClassifier
testing param list : 1/6480
testing param list : 2/6480
Best auc : 0.631316416902169
testing param list : 4/6480
Best auc : 0.6314886060040901
testing param list : 6/6480
Best auc : 0.6317978962279184
