In [2]:
import pandas as pd
from sklearn.preprocessing import  MinMaxScaler 
from sklearn.neural_network import MLPClassifier,MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import roc_auc_score


# Make the project-local data_prep module importable, then alias it as EHR.
import sys
# NOTE(review): sys.path entries are searched as directories (or zip archives), and this
# string ends in "data_prep.py" — confirm it names the directory that holds data_prep.py
# rather than the file itself.
sys.path.append("/data_prep.py")  # path contains data_prep.py
import data_prep as EHR 

In [2]:
# Read both input tables through the data_prep loader: the encounter-level records
# and the table of plausible value ranges.
data, plausible_values = (
    EHR.import_data("DUMMY_DATA.csv"),
    EHR.import_data("PLAUSIBLE_EHR.csv"),
)

In [3]:
# Fill missing values with the per-column median.
# numeric_only=True keeps the median restricted to numeric columns: the frame still
# contains text columns at this point (e.g. "Ethnicity"), and recent pandas versions
# raise a TypeError when asked for the median of such columns instead of skipping them.
# Columns without a computed median are left unchanged by fillna.
data = data.fillna(data.median(numeric_only=True))

In [4]:
# Run the data_prep capitalization helper on the "Ethnicity" column
# (presumably it normalises letter case — see data_prep.capitalization_fix).
data = EHR.capitalization_fix(data, ["Ethnicity"])

In [5]:
# Reshape the plausible-range table so that every variable becomes a column:
#   1. discard rows that are completely empty,
#   2. remove the "Unit of Measure" column,
#   3. index the rows by "Variable Name",
#   4. transpose, leaving the remaining columns of the table as rows.
plausible_values = (
    plausible_values
    .dropna(how='all')
    .drop("Unit of Measure", axis=1)
    .set_index('Variable Name')
    .transpose()
)
plausible_values.head()

Variable Name,Albumin,Arterial_partial_pressure_co2,Arterial_ph,Bilirubin,BUN,Calcium,Creatinine,glucose,Hco3,heartrate,...,Resprate_Rate,Sodium,Oxygen_saturation,Temperature,WBC,Urineoutput,Systolic_blood_pressure_max,Systolic_blood_pressure_min,Diastolic_blood_pressure_min,Diastolic_blood_pressure_max
Minmum Plausible value,0.5,5.0,6.3,1.0,0.5,0.0,0.1,0.0,1.0,1.0,...,1.0,1.0,21.0,20.0,0.0,0.0,1.0,1.0,1.0,1.0
Maximum Plausible value,6.5,250.0,8.5,1200.0,100.0,5.3,33.9,1620.0,80.0,300.0,...,80.0,215.0,100.0,46.0,300.0,30000.0,250.0,250.0,350.0,350.0


In [6]:
# One-hot encode "Gender" with the data_prep helper; the preview below shows the
# resulting indicator columns ("F" and "M") at the right-hand end of the frame.
data = EHR.One_hot_encode(data, "Gender")
data.head()

Unnamed: 0,encounter_id,Hospital Death,Height,Weight,Ethnicity,Age,Diagnosis code,Albumin,Arterial_partial_pressure_co2,Arterial_ph,...,Oxygen_saturation,Temperature,WBC,Urineoutput,Systolic_blood_pressure_max,Systolic_blood_pressure_min,Diastolic_blood_pressure_min,Diastolic_blood_pressure_max,F,M
0,34937,1,160.44,57.79,hispanic,56.0,355.5,3.0,34.0,7.0,...,68.0,22.0,300.0,259.0,66.0,52.8,2.4,3.6,0,1
1,483670,0,160.44,57.79,asian,53.0,355.5,3.0,29.0,7.0,...,17.0,87.0,228.0,47.0,118.0,94.4,3.2,4.8,0,0
2,150725,1,175.14,57.79,african american,56.0,249.0,1.0,127.0,7.0,...,41.0,45.0,172.0,281.0,171.0,136.8,2.4,3.6,0,0
3,278547,0,183.08,57.79,african american,56.0,355.5,6.0,68.0,8.0,...,14.0,60.0,172.0,86.0,177.0,141.6,4.0,6.0,0,0
4,457468,0,162.62,73.88,african american,58.0,355.5,6.0,225.0,7.0,...,14.0,79.0,172.0,15.0,158.0,126.4,0.8,1.2,0,0


In [7]:
# Replace the "Ethnicity" text labels with integer codes using the data_prep helper;
# the preview below shows the column holding integers afterwards.
data = EHR.Label_encode(data, "Ethnicity")
data.head()

Unnamed: 0,encounter_id,Hospital Death,Height,Weight,Ethnicity,Age,Diagnosis code,Albumin,Arterial_partial_pressure_co2,Arterial_ph,...,Oxygen_saturation,Temperature,WBC,Urineoutput,Systolic_blood_pressure_max,Systolic_blood_pressure_min,Diastolic_blood_pressure_min,Diastolic_blood_pressure_max,F,M
0,34937,1,160.44,57.79,3,56.0,355.5,3.0,34.0,7.0,...,68.0,22.0,300.0,259.0,66.0,52.8,2.4,3.6,0,1
1,483670,0,160.44,57.79,1,53.0,355.5,3.0,29.0,7.0,...,17.0,87.0,228.0,47.0,118.0,94.4,3.2,4.8,0,0
2,150725,1,175.14,57.79,0,56.0,249.0,1.0,127.0,7.0,...,41.0,45.0,172.0,281.0,171.0,136.8,2.4,3.6,0,0
3,278547,0,183.08,57.79,0,56.0,355.5,6.0,68.0,8.0,...,14.0,60.0,172.0,86.0,177.0,141.6,4.0,6.0,0,0
4,457468,0,162.62,73.88,0,58.0,355.5,6.0,225.0,7.0,...,14.0,79.0,172.0,15.0,158.0,126.4,0.8,1.2,0,0


In [8]:
# Derive the "age_by_range" column with the data_prep helper and immediately
# label-encode it, so the new column holds integer codes.
data = EHR.Label_encode(EHR.group_age(data), "age_by_range")

data.head()

Unnamed: 0,encounter_id,Hospital Death,Height,Weight,Ethnicity,Age,Diagnosis code,Albumin,Arterial_partial_pressure_co2,Arterial_ph,...,Temperature,WBC,Urineoutput,Systolic_blood_pressure_max,Systolic_blood_pressure_min,Diastolic_blood_pressure_min,Diastolic_blood_pressure_max,F,M,age_by_range
0,34937,1,160.44,57.79,3,56.0,355.5,3.0,34.0,7.0,...,22.0,300.0,259.0,66.0,52.8,2.4,3.6,0,1,1
1,483670,0,160.44,57.79,1,53.0,355.5,3.0,29.0,7.0,...,87.0,228.0,47.0,118.0,94.4,3.2,4.8,0,0,1
2,150725,1,175.14,57.79,0,56.0,249.0,1.0,127.0,7.0,...,45.0,172.0,281.0,171.0,136.8,2.4,3.6,0,0,1
3,278547,0,183.08,57.79,0,56.0,355.5,6.0,68.0,8.0,...,60.0,172.0,86.0,177.0,141.6,4.0,6.0,0,0,1
4,457468,0,162.62,73.88,0,58.0,355.5,6.0,225.0,7.0,...,79.0,172.0,15.0,158.0,126.4,0.8,1.2,0,0,1


In [9]:

# Fit one MinMaxScaler (target range [0, 1]) per column, keyed by column name.
# Only the fitted scaler is needed here, so fit() is used instead of fit_transform():
# this skips the unused transform and avoids assigning to `_`, which IPython reserves
# for the previous cell output.
# Rows with a missing value in the column are ignored when fitting.
# NOTE(review): the scalers are fitted on the full `data` frame, i.e. before the
# train/test split done in a later cell, so held-out rows influence the fitted ranges.
# `columns` also still contains the "Hospital Death" target at this point.
columns = list(data.columns)
transforms = {
    column: MinMaxScaler(feature_range=(0, 1)).fit(data[[column]].dropna())
    for column in columns
}

In [10]:
def transformations(dataset, columns, transforms):
    """Return a copy of ``dataset`` with the given columns rescaled.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to transform. It is not modified; a copy is returned.
    columns : iterable
        Labels of the columns to rescale. Each label must be a key of ``transforms``.
    transforms : dict
        Maps a column label to an already fitted transformer exposing
        ``transform(X)`` for a 2-D array of shape (n_rows, 1).

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataset`` in which every listed column has been replaced by the
        output of its transformer. Columns not listed are left as they are.

    Raises
    ------
    KeyError
        If a listed column has no entry in ``transforms`` or is missing from ``dataset``.
    """
    dataset_copy = dataset.copy()
    for c in columns:
        # Exact lookup by label. (A previous substring-based lookup was dead code and
        # failed outright for non-string labels such as integer column names.)
        scaler = transforms[c]
        # Transformers expect a 2-D (n_rows, 1) array; -1 lets numpy infer the row count.
        column_2d = dataset_copy[c].values.reshape(-1, 1)
        # Flatten back to 1-D so the result is assigned as an ordinary column.
        dataset_copy[c] = scaler.transform(column_2d).reshape(-1)
    return dataset_copy

In [11]:
# Hold out 20% of the rows as the test set; random_state=0 makes the split repeatable.
# The target column is still part of both frames at this point.
X_train, X_test = train_test_split(
    data,
    test_size=0.2,
    random_state=0,
)

In [12]:
# Build the scaled feature matrices: rescale each split with the fitted scalers
# (the target column is still present while scaling), then drop the target.
training_data_transformed, test_data_transformed = (
    transformations(X_train, columns, transforms).drop("Hospital Death", axis=1),
    transformations(X_test, columns, transforms).drop("Hospital Death", axis=1),
)

# Pull the unscaled target out of each split before it is removed from the features.
y_train, y_test = X_train["Hospital Death"], X_test["Hospital Death"]

# Unscaled feature matrices, without the target column.
X_train, X_test = (
    X_train.drop("Hospital Death", axis=1),
    X_test.drop("Hospital Death", axis=1),
)

In [13]:
# LogisticRegression
clf = LogisticRegression(random_state=256).fit(training_data_transformed, y_train)
prediction =clf.predict_proba(test_data_transformed)[:,1]
score = roc_auc_score(y_test,prediction)
print("roc_auc_score: ")
score

roc_auc_score: 


0.5383989421218595

In [14]:
# Multi-layer Perceptron regressor
clf =MLPRegressor(hidden_layer_sizes=(400,200,128), max_iter=1000, learning_rate='constant',
                 activation='relu', solver='adam',random_state=0).fit(training_data_transformed, y_train)
prediction =clf.predict(X_test)
score = roc_auc_score(y_test,prediction)
print("roc_auc_score: ")
score

roc_auc_score: 


0.49465974977113214

In [15]:
#  Support Vector Machine
clf = svm.SVC(cache_size=7000,random_state=0).fit(training_data_transformed, y_train)
prediction =clf.predict(X_test)
score = roc_auc_score(y_test,prediction)
print("roc_auc_score: ")
score

roc_auc_score: 


0.5

In [16]:
# Gradient Boosting Regressor
clf= GradientBoostingRegressor(random_state=0).fit(training_data_transformed, y_train)
prediction =clf.predict(X_test)
score = roc_auc_score(y_test,prediction)
print("roc_auc_score: ")
score

roc_auc_score: 


0.4299155731868578

In [17]:
# Ridge Regressor
clf = Ridge(alpha=200, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=0, solver='auto', tol=0.001).fit(training_data_transformed, y_train)
prediction =clf.predict(test_data_transformed)
score = roc_auc_score(y_test,prediction)
print("roc_auc_score: ")
score

roc_auc_score: 


0.5198860746617842