In [1]:
import pandas as pd

# Outcome window, in days relative to each row's anchor_time. A row is labelled
# positive when death_datetime falls strictly between anchor_time + targetStart
# and anchor_time + targetEnd (both bounds exclusive).
targetStart = 0
targetEnd = 7

dirPath = '/superbugai-data/yash/chapter_1/workspace/EHRQC/data/icd_cohort_test/'
dataDf = pd.read_csv(dirPath + 'data_matrix.csv')

# Parse both timestamp columns with the vectorised parser instead of calling
# pd.to_datetime once per element through .apply; missing values become NaT.
timestampFormat = '%Y-%m-%d %H:%M:%S'
dataDf['anchor_time'] = pd.to_datetime(dataDf['anchor_time'], format=timestampFormat)
dataDf['death_datetime'] = pd.to_datetime(dataDf['death_datetime'], format=timestampFormat)

windowOpens = dataDf['anchor_time'] + pd.Timedelta(days=targetStart)
windowCloses = dataDf['anchor_time'] + pd.Timedelta(days=targetEnd)

# Comparisons against NaT evaluate to False, so rows without a death_datetime
# (or without an anchor_time) are already labelled False and the column is a
# plain bool Series. The former `dataDf.target.fillna(False, inplace=True)` was
# therefore a no-op, and as an in-place call on a chained attribute access it
# does not update dataDf under pandas copy-on-write, so it has been removed.
dataDf['target'] = (dataDf['death_datetime'] > windowOpens) & (dataDf['death_datetime'] < windowCloses)


# Columns removed from the feature matrix: the identifier, demographic columns,
# the two raw timestamps, and the label itself (keeping it would leak the answer).
dropCols = [
    'person_id',
    'age',
    'gender',
    'ethnicity_WHITE',
    'ethnicity_BLACK',
    'ethnicity_UNKNOWN',
    'ethnicity_OTHER',
    'ethnicity_HISPANIC',
    'ethnicity_ASIAN',
    'ethnicity_UNABLE_TO_OBTAIN',
    'ethnicity_AMERICAN_INDIAN',
    'anchor_time',
    'death_datetime',
    'target',
]

# Base names of the vitals and lab measurements.
# NOTE(review): neither list is used in the cells visible here — presumably they
# are consumed further down the notebook; confirm before removing.
vitalsCols = ['heartrate', 'sysbp', 'diabp', 'meanbp', 'resprate', 'tempc', 'spo2', 'gcseye', 'gcsverbal', 'gcsmotor']
labsCols = ['chloride_serum', 'creatinine', 'sodium_serum', 'hemoglobin', 'platelet_count', 'urea_nitrogen', 'glucose_serum', 'bicarbonate', 'potassium_serum', 'anion_gap', 'leukocytes_blood_manual', 'hematocrit']

# Feature matrix: every remaining column of dataDf.
X = dataDf.drop(columns=dropCols)

# Label vector: True when death falls inside the outcome window.
y = dataDf["target"]

In [2]:
# Show how many rows fall into each class of the target Series `y`.
y.value_counts()

target
False    2514
True      437
Name: count, dtype: int64

In [3]:
def buildMLPModel(X, y, layerSize):
    """Cross-validate a two-hidden-layer MLP classifier and return the scores.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed unchanged to scikit-learn's ``cross_validate``.
    y : array-like
        Target labels aligned with ``X``.
    layerSize : int
        Number of units in each of the two hidden layers.

    Returns
    -------
    dict
        The result of ``cross_validate`` with ``cv=5`` and the scorers
        ``accuracy``, ``balanced_accuracy``, ``average_precision``, ``f1`` and
        ``roc_auc``.
    """
    from sklearn.model_selection import cross_validate
    from sklearn.neural_network import MLPClassifier

    mlp = MLPClassifier(
        random_state=1,
        max_iter=1000,
        hidden_layer_sizes=(layerSize, layerSize),
        learning_rate_init=0.00001,
    )

    # Deliberately no mlp.fit(X, y) here: cross_validate fits its own clone of
    # the estimator on every fold, and the estimator itself is never returned,
    # so a preliminary fit on the full data only added one extra training run.
    # NOTE: an MCC-F1 scorer was previously sketched here and left disabled.
    return cross_validate(
        mlp,
        X,
        y,
        cv=5,
        scoring=['accuracy', 'balanced_accuracy', 'average_precision', 'f1', 'roc_auc'],
    )


In [4]:
# Display all rows of the feature columns at integer positions 80 through 89 of X.
X.iloc[:, 80:90]

Unnamed: 0,potassium_serum_max,anion_gap_max,leukocytes_blood_manual_max,hematocrit_max,chloride_serum_avg,creatinine_avg,sodium_serum_avg,hemoglobin_avg,platelet_count_avg,urea_nitrogen_avg
0,-1.032513,-0.678093,-0.021259,-0.472858,-0.007083,-0.012787,-0.012926,-0.013854,0.405787,-0.921470
1,0.285747,0.082170,-0.010688,0.446159,-0.008696,-0.012383,-0.001477,-0.013210,0.343875,-0.427899
2,-0.373383,-0.831807,0.035511,1.047937,-0.031119,-0.012699,-0.036804,-0.011740,0.351605,-0.562265
3,0.615312,-0.605390,-0.001853,-0.050755,-0.013939,-0.012971,-0.005157,-0.013735,1.225756,-0.878818
4,-0.702948,1.073005,-0.015500,0.143288,-0.005470,-0.011875,-0.016197,-0.013785,-0.063600,0.362122
...,...,...,...,...,...,...,...,...,...,...
2946,-0.702948,-0.393514,0.000146,-0.297642,-0.015955,-0.012491,-0.023556,-0.014199,-0.735650,-0.707966
2947,-0.263528,0.021931,0.047161,-1.073812,-0.017031,-0.010571,-0.030371,-0.013607,0.125970,0.706917
2948,1.604007,1.683708,-0.045485,1.040011,-0.026148,-0.009795,-0.053664,-0.012925,-0.681798,2.135206
2949,1.164587,0.645097,0.008443,1.320753,-0.009825,-0.010595,-0.006384,-0.011776,-1.018369,2.220513


In [5]:
# Cross-validate an MLP with two hidden layers of 150 units each; the bare
# trailing expression displays the returned score dictionary.
mlpScores = buildMLPModel(X, y, 150)
mlpScores



{'fit_time': array([39.46194911, 39.73794484, 40.91249561, 40.09673738, 39.04583216]),
 'score_time': array([0.01528883, 0.01414251, 0.01390457, 0.01400828, 0.01344228]),
 'test_accuracy': array([0.90186125, 0.91355932, 0.91186441, 0.92372881, 0.92881356]),
 'test_balanced_accuracy': array([0.74077128, 0.782946  , 0.77719888, 0.79366331, 0.81759326]),
 'test_average_precision': array([0.71278652, 0.72105932, 0.68754659, 0.7649783 , 0.78750137]),
 'test_f1': array([0.60810811, 0.67096774, 0.66233766, 0.70198675, 0.73417722]),
 'test_roc_auc': array([0.87052684, 0.87710518, 0.88722835, 0.9076118 , 0.89084571])}