# Import 

In [43]:
# EDA tools
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
np.random.seed(10)
import warnings
warnings.filterwarnings("ignore")

#Single model classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Ensemble classification models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#Model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score,confusion_matrix, classification_report, roc_auc_score,
f1_score, recall_score, precision_score, SCORERS)
import time

# Class imbalance, hyperparameter tuning and pipelines
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from collections import Counter
from imblearn.over_sampling import SMOTEN

# Function

In [44]:
def distbox_plot(data):
    """Draw a box plot stacked above a distribution plot and print summary stats.

    The two panels share the x axis; the box plot gets 15% of the figure
    height and the distribution plot the remaining 85%.
    Layout adapted from:
    https://python-graph-gallery.com/24-histogram-with-a-boxplot-on-top-seaborn/

    Args:
        data: Values to plot. Must also provide ``describe()``, whose result
            is printed after plotting.

    Returns:
        None.
    """
    # One figure, two vertically stacked axes.
    panel_heights = {"height_ratios": (.15, .85)}
    figure, (box_axis, dist_axis) = plt.subplots(
        2, sharex=True, figsize=(14, 4), gridspec_kw=panel_heights
    )

    sns.boxplot(data, ax=box_axis)
    # NOTE(review): seaborn has deprecated distplot in favour of
    # histplot/displot - confirm the installed version before upgrading.
    sns.distplot(data, ax=dist_axis)

    # The shared x label is only needed on the lower panel.
    box_axis.set(xlabel='')

    # Descriptive statistics alongside the figure.
    print(data.describe())

def preds(data=[]):
    """Fit a default decision tree and return its class probabilities on the test set.

    Args:
        data: Either ``[X, y]``, which is split here with
            ``train_test_split(..., random_state=123)``, or an existing
            ``[X_train, X_test, y_train, y_test]`` split.

    Returns:
        The result of ``predict_proba`` on the test features.
    """
    if len(data) != 2:
        # Anything that is not a two-element [X, y] is read as a ready-made split.
        X_train, X_test, y_train, _ = data[0], data[1], data[2], data[3]
    else:
        X_train, X_test, y_train, _ = train_test_split(data[0], data[1], random_state=123)

    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    return tree.predict_proba(X_test)

def baseline_models(data=(), verbose=False):
    """Fit four default classifiers and tabulate their test-set metrics.

    The metric calls use scikit-learn's default (binary) averaging and the
    ROC AUC uses the second ``predict_proba`` column, so a binary target is
    assumed.

    Args:
        data: Either ``[X, y]``, which is split here with
            ``train_test_split(..., random_state=123)``, or an existing
            ``[X_train, X_test, y_train, y_test]`` split.
        verbose: When true, print each model and its classification report.

    Returns:
        A DataFrame with one row per model ('DecisionTree',
        'LogisticRegression', 'RandomForest', 'Gradient Boosting') and the
        columns 'f1', 'roc_auc', 'accuracy', 'precision', 'recall'.
    """
    # Row label -> estimator; keeping both in one mapping stops the labels
    # and the model list from drifting apart.
    models = {
        'DecisionTree': DecisionTreeClassifier(),
        'LogisticRegression': LogisticRegression(),
        'RandomForest': RandomForestClassifier(),
        'Gradient Boosting': GradientBoostingClassifier(),
    }
    metric_names = ['f1', 'roc_auc', 'accuracy', 'precision', 'recall']

    # Split only when raw (X, y) was supplied; otherwise reuse the given split.
    if len(data) == 2:
        X_train, X_test, y_train, y_test = train_test_split(data[0], data[1], random_state=123)
    else:
        X_train, X_test, y_train, y_test = data[0], data[1], data[2], data[3]

    records = []
    for clf in models.values():
        clf.fit(X_train, y_train)
        test_preds = clf.predict(X_test)
        # ROC AUC is a ranking metric: it needs continuous scores for the
        # positive class. Feeding it hard labels collapses the curve to a
        # single threshold and understates the model.
        positive_scores = clf.predict_proba(X_test)[:, 1]
        records.append({
            'f1': f1_score(y_test, test_preds),
            'roc_auc': roc_auc_score(y_test, positive_scores),
            'accuracy': accuracy_score(y_test, test_preds),
            'precision': precision_score(y_test, test_preds),
            'recall': recall_score(y_test, test_preds),
        })
        if verbose:
            print('Classification Model: ', clf, '\n')
            print(classification_report(y_test, test_preds), '\n')

    # One row per model, one column per metric.
    return pd.DataFrame(records, index=list(models), columns=metric_names)

def plot_feat_importance(clf, index, top_n=10):
    """Plot the ``top_n`` most important features of a model as a horizontal bar chart.

    Args:
        clf: Estimator exposing ``feature_importances_``.
        index: Feature labels, one per importance score, in the same order.
        top_n: Number of highest-scoring features to show (default 10).

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    scores = pd.DataFrame(clf.feature_importances_, index=index, columns=['Score'])
    # Ascending sort + tail keeps the largest scores; barh draws rows
    # bottom-up, so the most important feature ends up at the top.
    top_scores = scores.sort_values(by='Score', ascending=True).tail(top_n)
    top_scores.plot(kind='barh', title=f'Top {top_n} Important Features', legend=False)
    plt.xlabel('Importance Score')
    plt.ylabel('Feature')
    plt.show()
    
def heatmap(data, figsize=(8,8), annot=False):
    """Plot the correlation matrix of ``data`` as a heatmap to spot multicollinearity.

    Args:
        data: Object providing ``corr()`` (e.g. a DataFrame).
        figsize: Figure size passed to ``plt.figure``.
        annot: Whether to write the correlation value in each cell.

    Returns:
        None.
    """
    plt.figure(figsize=figsize)
    palette = sns.diverging_palette(220, 20, sep=20, as_cmap=True)
    correlations = data.corr()
    # The colour scale is clipped at +/-0.75 and centred on zero.
    sns.heatmap(
        correlations,
        vmin=-0.75,
        vmax=0.75,
        center=0,
        cmap=palette,
        annot=annot,
    )

def grid_pipe(pipedict, hyperdict, scoring='accuracy', display=True, X=None, y=None):
    """Run a 5-fold grid search for every pipeline and collect scores and fitted searches.

    Args:
        pipedict: Mapping of model name -> pipeline/estimator to tune.
        hyperdict: Mapping of the same model names -> parameter grid.
        scoring: Scoring passed through to ``GridSearchCV``.
        display: When true, print the best score and parameters per model.
        X: Training features. When omitted, the module-level ``X_trainres``
            is used (the original behaviour).
        y: Training labels. When omitted, the module-level ``y_trainres``
            is used (the original behaviour).

    Returns:
        A tuple ``(model_scores, fitted_models)``: ``model_scores`` is a list
        of ``(name, best_score_, elapsed_seconds)`` tuples and
        ``fitted_models`` maps each name to its fitted ``GridSearchCV``.
    """
    # Label the printed score with the metric actually optimised; a
    # non-string scoring (callable, list, ...) gets a generic label.
    score_label = scoring if isinstance(scoring, str) else 'score'

    model_scores = []
    fitted_models = {}
    for name, pipeline in pipedict.items():
        # Construct grid search
        model = GridSearchCV(estimator=pipeline,
                             param_grid=hyperdict[name],
                             scoring=scoring,
                             cv=5, verbose=2, n_jobs=-1, return_train_score=True)

        # Fit using grid search; fall back to the notebook globals only
        # when the caller did not pass data explicitly.
        start = time.perf_counter()
        model.fit(X_trainres if X is None else X,
                  y_trainres if y is None else y)
        elapsed = time.perf_counter() - start

        # Record score, wall-clock fit time and the fitted search itself.
        model_scores.append((name, model.best_score_, elapsed))
        fitted_models[name] = model

        if display:
            print(f'The {name} model has been fitted.')
            print(f'Best {score_label}: {model.best_score_:.3f}')
            print('Best params:\n', model.best_params_, '\n')

    return model_scores, fitted_models

# Process

In [45]:
# Directory prefix for the input CSV and for the CSV files written at the end
# of this notebook. It is concatenated with file names, so it must keep its
# trailing slash.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
path='/home/o/oananbeh/notebook/Experment2/src/TabTransformer-multiclass_discharge/'

In [46]:
# Load the full dataset (all columns) from the directory given by `path`.
data=pd.read_csv(path+'discharge_final_ML.csv')#,usecols = ['G.Patient','G.Gender','PO.OUTCOME']


In [47]:
# #Create list with categorical features' names.
# nonNumericalFeatures=['C_Diabetes', 'C_HTN', 'C.Heart ischemic', 'C.Heart failure', 'C.Cardiomyopathies', 'C.End stage renal', 'C.Hemodialysis', 'C.COPD',
#    'C.Lung Interstitial Disease ', 'C.Bronchial Asthma', 'C.Cerebrovascular', 'C.Neurologic (dementia)', 'C.History of psychiatric', 'C.Cirrhosis', 'C.liver disease', 'C.Obesity',
#    'C.Sick cell', 'C.Cancer','C.Solid organ transplant', 'IS.Hematopoietic cell transplant', 'IS.HIV', 'IS.corticosteroids',
#    'IS.Other immunosuppressing agents', 'IS.Other immunodeficiencies', 'IS.Pregnancy','IS.Smoker', 
#     'CSA_Fever', 'CSA_SOB', 'CSA.Chest pain', 'CSA.Confusion', 'CSA.Hemoptysis', 'CSA.Diarrhea', 'CSA_Cough', 'CSA.Myalgia', 'CSA.Headache',
#    'PEFF.Nasal Cannula', 'PEFF.Mask', 'PEFF.HFNC',
#    'PEFF.If patient need prone position', 'PEFF.If patient intubated', 'PEFF.If Patient required Psychiatric Consultation', 'PEFF.Presence of thrombo-embolic', 'PEFF.Confirmed DVT',
#    'PEFF.Confirmed Pulmonary embolism', 'PEFF.Confirmed Myocardial infarction', 'PEFF.Confirmed CVA/TIA','MPA_Antibiotics','MPA_Favipiravir',
#    'MPA.Kaletra +Ribavirin +Interferon','MPA.Hydroxychloroquine', 'MPA_Dexamethasone',
#    'MPA_Convalescent_plasma_transfusion','MPA_Clexan_or_Heparine',
#    'MPA_Tocilizumab',  'Presence of consolidation', 'Presence of ground glass opacities','OUTCOME'] 

In [48]:
# Names of the columns handled as categorical features, grouped by prefix.
nonNumericalFeatures = [
    # C_* / C.* columns
    'C_Diabetes',
    'C_HTN',
    'C.Heart ischemic',
    'C.Heart failure',
    'C.Cardiomyopathies',
    'C.End stage renal',
    'C.Hemodialysis',
    'C.COPD',
    'C.Lung Interstitial Disease ',  # the trailing space is part of this string - keep it
    'C.Bronchial Asthma',
    'C.Cerebrovascular',
    'C.Neurologic (dementia)',
    'C.History of psychiatric',
    'C.Cirrhosis',
    'C.liver disease',
    'C.Obesity',
    'C.Sick cell',
    'C.Cancer',
    'C.Solid organ transplant',
    # IS.* columns
    'IS.Hematopoietic cell transplant',
    'IS.HIV',
    'IS.corticosteroids',
    'IS.Other immunosuppressing agents',
    'IS.Other immunodeficiencies',
    'IS.Pregnancy',
    'IS.Smoker',
    # CSA_* / CSA.* columns
    'CSA_Fever',
    'CSA_SOB',
    'CSA.Chest pain',
    'CSA.Confusion',
    'CSA.Hemoptysis',
    'CSA.Diarrhea',
    'CSA_Cough',
    'CSA.Myalgia',
    'CSA.Headache',
    # PEFF.* columns
    'PEFF.Nasal Cannula',
    'PEFF.Mask',
    'PEFF.HFNC',
    'PEFF.If patient need prone position',
    'PEFF.If patient intubated',
    'PEFF.If Patient required Psychiatric Consultation',
    'PEFF.Presence of thrombo-embolic',
    'PEFF.Confirmed DVT',
    'PEFF.Confirmed Pulmonary embolism',
    'PEFF.Confirmed Myocardial infarction',
    'PEFF.Confirmed CVA/TIA',
    # Remaining columns
    'MPA_Antibiotics',
    'Anticoagulant',
    'Immunomodulators',
    'antiviral',
    'XrayResult',
]

In [49]:
# Column names that appear in neither nonNumericalFeatures nor numericalFeatures.
other = [
    'PatientId',
    'G.Nationality',
    'OUTCOME',
    'Gender',
]

In [50]:
# Names of the columns handled as numerical features.
numericalFeatures = [
    'AGE',
    # ABGF_* columns
    'ABGF_Ph',
    'ABGF_Pa_O2',
    'ABGF_Pa_CO2',
    'ABGF_HCO3',
    # PEFF_* columns
    'PEFF_Temperature',
    'PEFF_Respiratory_Rate',
    'PEFF_Pulse',
    'PEFF_BP_Systolic',
    'PEFF_BP_Diastolic',
    'PEFF_Glasgow',
    # Remaining columns
    'WBC',
    'PNN',
    'Lymphocytes',
    'Hemoglobin',
    'Platelets',
    'Creatinine',
    'ALT',
    'LDH',
    'FERRITIN',
    'D_DIMER',
    'CRP',
    'PROCALCITONI',
    'TROPONIN',
    'Pro_BNP',
    'PTT',
    'Vitamin_D',
    'IL6',
]

In [51]:
# Remove the 'Blood Group' column from the dataset in place.
data.drop(columns=['Blood Group'], inplace=True)

In [52]:
# Rename the target column 'PO.OUTCOME' to 'OUTCOME' in place; every later
# cell refers to the target by the new name.
data.rename(columns={'PO.OUTCOME':'OUTCOME'},inplace=True)

In [53]:
# Recode the OUTCOME labels 'discharge' -> '1' and 'died' -> '0'.
# The replacements are the *strings* '1' and '0', not integers.
# NOTE(review): the class counts recorded further down show labels such as
# 'discharge=<1', so these whole-value replacements presumably matched
# nothing in that run - confirm whether this cell is still needed.
outcome_codes = {'discharge': '1', 'died': '0'}
data['OUTCOME'] = data['OUTCOME'].replace(outcome_codes)

In [54]:
# Treat the literal string 'FALSE' as a missing value.
# np.nan is passed explicitly instead of None: pandas releases before 1.4
# silently ignored an explicit value=None and forward-filled ('pad') the
# matching cells instead of nulling them.
data.replace('FALSE', np.nan, inplace=True)
# data.dropna(inplace=True)

In [55]:
# Encode Gender as 1 for 'male' and 0 for 'female'. Series.map turns any
# value that is not a key of the mapping into NaN.
gender_codes = {'male': 1, 'female': 0}
data['Gender'] = data['Gender'].map(gender_codes)

# Main

In [56]:
# Convert the numerical feature columns to float.
data[numericalFeatures] = data[numericalFeatures].astype('float')

In [57]:
# Impute missing values in the categorical features with each column's mode
# (its most frequent value).
# NOTE(review): the [1:] slice skips the first entry ('C_Diabetes'); it is
# kept as found - confirm whether that column should be imputed as well.
for featureName in nonNumericalFeatures[1:]:
    # mode() returns a Series (several entries when values tie). Take its
    # first entry explicitly: int() on the whole Series raises when there
    # is a tie and is deprecated even for a single element.
    featureMode = int(data[featureName].mode().iloc[0])
    data[featureName] = data[featureName].fillna(featureMode)

In [58]:
# Impute missing values in every numerical feature with that column's mean.
for featureName in numericalFeatures:
    column = data[featureName]
    data[featureName] = column.fillna(column.mean())


In [59]:
# Fill missing OUTCOME labels with the most frequent label.
# mode() returns a Series indexed 0..k; handing that Series to
# DataFrame.fillna aligns its index with the column labels, so nothing in
# 'OUTCOME' was actually filled. Use the scalar modal value instead.
data['OUTCOME'] = data['OUTCOME'].fillna(data['OUTCOME'].mode().iloc[0])

In [60]:
# Cast the categorical features to int64 first, then to pandas' category
# dtype, so the categories are integers.
data[nonNumericalFeatures] = (
    data[nonNumericalFeatures]
    .astype('int64')
    .astype('category')
)

In [61]:
# Show the (rows, columns) shape of the cleaned dataset.
data.shape

(248, 92)

In [62]:
# Separate the target series (OUTCOME) from the feature frame, which keeps
# every other column, and report the class balance before resampling.
target = data['OUTCOME']
final_df = data.drop(columns='OUTCOME')
print(f"Original class counts: {Counter(target)}")

Original class counts: Counter({'discharge=<1': 85, 'discharge=<2': 80, 'discharge=<3': 37, 'discharge>4': 34, 'discharge=<4': 12})


In [63]:
# Oversample the minority classes with SMOTEN and report the new class counts.
# The whole dataset (final_df / target) is resampled here: despite the
# "train" in the result names, no train/test split happens before this step.
# NOTE(review): SMOTEN is documented for purely categorical data, yet
# final_df still contains the float columns from numericalFeatures.
# SMOTENC (already imported) is presumably the intended sampler for mixed
# data - confirm.
sampler = SMOTEN(random_state=0)
X_trainres, y_trainres = sampler.fit_resample(final_df, target)
print(f"Class counts after resampling {Counter(y_trainres)}")

Class counts after resampling Counter({'discharge=<1': 85, 'discharge=<2': 85, 'discharge=<3': 85, 'discharge=<4': 85, 'discharge>4': 85})


In [64]:
# Re-attach the resampled target to the resampled features, column-wise.
SMOTE_dataSet = pd.concat((X_trainres, y_trainres), axis='columns')

In [65]:
# SMOTE_dataSet.to_csv(path+("test.csv"))

In [66]:
# c=SMOTE_dataSet.isnull ().sum ()
# c.to_csv(path+'final_df2nullValues.csv')

In [67]:
# Map original column names (dots, spaces, dashes) to underscore-style names.
# rename() skips keys that are not present in the frame.
# NOTE(review): earlier cells already index `data` by several of the target
# names (e.g. 'AGE', 'ABGF_Ph'), so part of this mapping is presumably a
# no-op for this input - confirm.
column_renames = {
    'G.AGE': 'AGE',
    'ABGF.Ph': 'ABGF_Ph',
    'ABGF.Pa O2': 'ABGF_Pa_O2',
    'ABGF.Pa CO2': 'ABGF_Pa_CO2',
    'ABGF.HCO3-': 'ABGF_HCO3',
    'PEFF.Temperature': 'PEFF_Temperature',
    'PEFF.Respiratory Rate': 'PEFF_Respiratory_Rate',
    'PEFF.Pulse': 'PEFF_Pulse',
    'PEFF.BP_Systolic': 'PEFF_BP_Systolic',
    'PEFF.BP_Diastolic': 'PEFF_BP_Diastolic',
    'PEFF_Glasgow Score': 'PEFF_Glasgow',
    'D-DIMER': 'D_DIMER',
    'Pro-BNP': 'Pro_BNP',
    'Vitamin D': 'Vitamin_D',
    'C.Diabetes': 'C_Diabetes',
    'C.HTN': 'C_HTN',
    'CSA.Fever': 'CSA_Fever',
    'CSA.SOB': 'CSA_SOB',
    'CSA.Cough': 'CSA_Cough',
    'MPA.Antibiotics': 'MPA_Antibiotics',
    'MPA.Favipiravir': 'MPA_Favipiravir',
    'MPA.Dexamethasone': 'MPA_Dexamethasone',
    'MPA.Convalescent plasma transfusion': 'MPA_Convalescent_plasma_transfusion',
    'MPA.Clexan or Heparine': 'MPA_Clexan_or_Heparine',
    'MPA.Tocilizumab': 'MPA_Tocilizumab',
}
SMOTE_dataSet.rename(columns=column_renames, inplace=True)

In [68]:
# Write the resampled dataset with a header row and without the index column.
SMOTE_dataSet.to_csv(path+'SMOT_discharge_DataSet.csv',index=False) 

In [69]:
# Write the same data again to a second file, this time with neither the
# index column nor a header row.
SMOTE_dataSet.to_csv(path+'SMOT_discharge_DataSet2.csv',index=False,header=False) 