In [1]:
import os.path
import numpy as np
import scipy as sp
import pandas as pd
from constants import PROCESSED_PATH
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
from sklearn.utils import shuffle
from sklearn.preprocessing import normalize, OneHotEncoder, LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sklearn.metrics as skm

In [3]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [4]:
datafn = 'HOUR_00001.csv'

In [5]:
df = pd.read_csv(os.path.join(PROCESSED_PATH, datafn), na_values=['?'])
df.replace('!.+', np.nan, regex=True, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,ETHNICITY,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,MACROCYTES,PEEP,ATYPICAL LYMPHOCYTES,METAMYELOCYTES,MYELOCYTES,ANISOCYTOSIS,MICROCYTES,SODIUM.2,TSTAGE,STAGE
0,3,145834,76,1,WHITE,,,,,,...,,5.0,,,,,,,1,2
1,4,185777,47,0,WHITE,53.6,,116.0,63.0,37.4444,...,,,,,,,,,0,0
2,6,107064,65,0,WHITE,,,,,,...,,,,,,,,,9,3
3,9,150750,41,1,UNKNOWN/NOT SPECIFIED,,,165.14285714285714,86.71428571428571,35.5,...,,,,,,,,,37,1
4,11,194540,50,0,WHITE,,,111.0,55.0,36.8889,...,,,,,,,,,0,0


In [7]:
check_for_nan_columns = set(df.columns) - {'SUBJECT_ID', 'HADM_ID', 'AGE', 'GENDER', 'ETHNICITY','P TSTAGE','P STAGE','TSTAGE','STAGE'}
df = df.astype({k: np.float64 for k in check_for_nan_columns}, inplace=True)

In [8]:
# drop rows where all features=nan
row_nan_bool = np.logical_not(np.all(np.isnan(df.iloc[:,5:-1]), axis=1))
df = df[row_nan_bool]

In [9]:
rest, test = train_test_split(df, test_size=0.20)
devel, valid = train_test_split(rest, test_size=0.25)
devel.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,AGE,GENDER,ETHNICITY,P WEIGHT,P HEIGHT,P SYSTOLIC BP,P DIASTOLIC BP,P TEMPERATURE,...,MACROCYTES,PEEP,ATYPICAL LYMPHOCYTES,METAMYELOCYTES,MYELOCYTES,ANISOCYTOSIS,MICROCYTES,SODIUM.2,TSTAGE,STAGE
6892,15046,191832,71,0,WHITE,58.534004,,,,37.888889,...,,,,,,,,,0,0
16695,51515,141750,59,1,BLACK/AFRICAN AMERICAN,74.0,,,,36.666667,...,,,,,,,,,0,0
15042,40175,103117,85,1,WHITE,,,,,,...,,,,,,,,,0,0
12811,28010,188702,36,1,BLACK/AFRICAN AMERICAN,,,143.0,82.5,37.166683,...,,,,,,,,,0,0
14582,31858,188159,67,1,ASIAN - VIETNAMESE,57.6,,169.0,78.0,,...,,,,,,,,,7,3


In [10]:
#drop columns where all rows=nan
check_nan = devel.isna().sum()
devel.drop(labels=check_nan[(check_nan == devel.shape[0])].keys(), axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [11]:
devel = devel[devel.columns[[0,1,4,2,3] + list(range(5,len(devel.columns)))]]

In [12]:
data3 = devel.iloc[:,3:-2]
devel.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ETHNICITY', 'AGE', 'GENDER', 'P WEIGHT',
       'P HEIGHT', 'P SYSTOLIC BP', 'P DIASTOLIC BP', 'P TEMPERATURE',
       ...
       'FIBRINOGEN', 'TIDAL VOLUME', 'CREATININE.1', 'PEEP',
       'ATYPICAL LYMPHOCYTES', 'METAMYELOCYTES', 'MYELOCYTES', 'SODIUM.2',
       'TSTAGE', 'STAGE'],
      dtype='object', length=173)

In [13]:
# calculate Kruskal-Wallis H-test for each feature
dfs_by_class = [data3.loc[devel['STAGE'] == c] for c in [0,1,2,3]]
kruskals = {}
for col in data3:
    kruskals[col] = sp.stats.kruskal(*[np.asarray(c[col].dropna()) for c in dfs_by_class])[1]
           
devel_kruskal = devel[list(devel.columns[:3])+[k for k, v in kruskals.items() if v > 0.05]+list(devel.columns[-2:])]
print(devel.head())

       SUBJECT_ID  HADM_ID               ETHNICITY  AGE  GENDER   P WEIGHT  \
6892        15046   191832                   WHITE   71       0  58.534004   
16695       51515   141750  BLACK/AFRICAN AMERICAN   59       1  74.000000   
15042       40175   103117                   WHITE   85       1        NaN   
12811       28010   188702  BLACK/AFRICAN AMERICAN   36       1        NaN   
14582       31858   188159      ASIAN - VIETNAMESE   67       1  57.600000   

       P HEIGHT  P SYSTOLIC BP  P DIASTOLIC BP  P TEMPERATURE  ...    \
6892        NaN            NaN             NaN      37.888889  ...     
16695       NaN            NaN             NaN      36.666667  ...     
15042       NaN            NaN             NaN            NaN  ...     
12811       NaN          143.0            82.5      37.166683  ...     
14582       NaN          169.0            78.0            NaN  ...     

       FIBRINOGEN  TIDAL VOLUME  CREATININE.1  PEEP  ATYPICAL LYMPHOCYTES  \
6892          NaN    

In [14]:
means = devel_kruskal.mean()
devel_kruskal.fillna(means, inplace=True)
print(devel_kruskal.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


       SUBJECT_ID  HADM_ID               ETHNICITY  P HEART RATE  \
6892        15046   191832                   WHITE          81.0   
16695       51515   141750  BLACK/AFRICAN AMERICAN         111.5   
15042       40175   103117                   WHITE          92.0   
12811       28010   188702  BLACK/AFRICAN AMERICAN          76.0   
14582       31858   188159      ASIAN - VIETNAMESE          73.5   

       P WHITE BLOOD CELLS     P MCH  P CALCIUM  P MONOCYTES  P NEUTROPHILS  \
6892              9.400357  30.38048   8.385105     4.443768      78.173111   
16695             9.400357  30.38048   8.385105     4.443768      78.173111   
15042             9.400357  30.38048   8.385105     4.443768      78.173111   
12811             9.400357  30.38048   8.385105     4.443768      78.173111   
14582             9.400357  30.38048   8.385105     4.443768      78.173111   

       P EOSINOPHILS  ...    TROPONIN T  TEMPERATURE.1     OXYGEN    AMYLASE  \
6892        0.989254  ...      1.222

In [15]:
# calculate VIFs
features = devel_kruskal[devel_kruskal.columns[3:-2]]
print(features.columns)
done = -1
while done != 0:
    vifs = {}
    for i, n in enumerate(features):
        if i in range(3,features.shape[1]):
            vifs[n] = variance_inflation_factor(np.asarray(features), i)

    drop_vifs = [k for k,v in vifs.items() if v >= 5 or np.isnan(v)]
    print(drop_vifs)
    features.drop(labels=drop_vifs, axis=1, inplace=True)
    done = len(drop_vifs)

Index(['P HEART RATE', 'P WHITE BLOOD CELLS', 'P MCH', 'P CALCIUM',
       'P MONOCYTES', 'P NEUTROPHILS', 'P EOSINOPHILS', 'P PO2',
       'P ASPARATE AMINOTRANSFERASE (AST)', 'P GLUCOSE.1', 'P RBC',
       'P EPITHELIAL CELLS', 'P CREATINE KINASE (CK)', 'P CREATINE KINASE',
       'P LACTATE DEHYDROGENASE (LD)', 'P SODIUM.1', 'P LIPASE',
       'P TROPONIN T', 'P TEMPERATURE.1', 'P OXYGEN', 'P AMYLASE',
       'P CREATININE.1', 'P PEEP', 'P ATYPICAL LYMPHOCYTES', 'P MYELOCYTES',
       'HEART RATE', 'WHITE BLOOD CELLS', 'MCH', 'CALCIUM', 'MONOCYTES',
       'NEUTROPHILS', 'EOSINOPHILS', 'PO2', 'ASPARATE AMINOTRANSFERASE (AST)',
       'GLUCOSE.1', 'RBC', 'EPITHELIAL CELLS', 'CREATINE KINASE (CK)',
       'CREATINE KINASE', 'LACTATE DEHYDROGENASE (LD)', 'SODIUM.1', 'LIPASE',
       'TROPONIN T', 'TEMPERATURE.1', 'OXYGEN', 'AMYLASE', 'CREATININE.1',
       'PEEP', 'ATYPICAL LYMPHOCYTES', 'MYELOCYTES'],
      dtype='object')


  vif = 1. / (1. - r_squared_i)


['P CALCIUM', 'P MONOCYTES', 'P NEUTROPHILS', 'P EOSINOPHILS', 'P PO2', 'P ASPARATE AMINOTRANSFERASE (AST)', 'P GLUCOSE.1', 'P RBC', 'P EPITHELIAL CELLS', 'P CREATINE KINASE (CK)', 'P CREATINE KINASE', 'P LACTATE DEHYDROGENASE (LD)', 'P SODIUM.1', 'P LIPASE', 'P TROPONIN T', 'P TEMPERATURE.1', 'P OXYGEN', 'P AMYLASE', 'P CREATININE.1', 'P PEEP', 'P ATYPICAL LYMPHOCYTES', 'P MYELOCYTES', 'HEART RATE', 'WHITE BLOOD CELLS', 'MCH', 'CALCIUM', 'MONOCYTES', 'NEUTROPHILS', 'EOSINOPHILS', 'PO2', 'ASPARATE AMINOTRANSFERASE (AST)', 'GLUCOSE.1', 'RBC', 'EPITHELIAL CELLS', 'CREATINE KINASE (CK)', 'CREATINE KINASE', 'LACTATE DEHYDROGENASE (LD)', 'SODIUM.1', 'LIPASE', 'TROPONIN T', 'TEMPERATURE.1', 'OXYGEN', 'AMYLASE', 'CREATININE.1', 'PEEP', 'ATYPICAL LYMPHOCYTES', 'MYELOCYTES']
[]


In [16]:
devel_vif = devel_kruskal[list(devel_kruskal.columns[:3]) + list(features.columns) + list(devel_kruskal.columns[-2:])]
devel_vif.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,P HEART RATE,P WHITE BLOOD CELLS,P MCH,TSTAGE,STAGE
6892,15046,191832,WHITE,81.0,9.400357,30.38048,0,0
16695,51515,141750,BLACK/AFRICAN AMERICAN,111.5,9.400357,30.38048,0,0
15042,40175,103117,WHITE,92.0,9.400357,30.38048,0,0
12811,28010,188702,BLACK/AFRICAN AMERICAN,76.0,9.400357,30.38048,0,0
14582,31858,188159,ASIAN - VIETNAMESE,73.5,9.400357,30.38048,7,3


In [17]:
devel_vif

Unnamed: 0,SUBJECT_ID,HADM_ID,ETHNICITY,P HEART RATE,P WHITE BLOOD CELLS,P MCH,TSTAGE,STAGE
6892,15046,191832,WHITE,81.000000,9.400357,30.38048,0,0
16695,51515,141750,BLACK/AFRICAN AMERICAN,111.500000,9.400357,30.38048,0,0
15042,40175,103117,WHITE,92.000000,9.400357,30.38048,0,0
12811,28010,188702,BLACK/AFRICAN AMERICAN,76.000000,9.400357,30.38048,0,0
14582,31858,188159,ASIAN - VIETNAMESE,73.500000,9.400357,30.38048,7,3
4496,9794,146798,BLACK/AFRICAN AMERICAN,118.500000,9.400357,25.40000,0,0
18045,60074,165473,WHITE,75.500000,9.400357,30.38048,0,0
20998,80596,133963,HISPANIC OR LATINO,80.500000,9.400357,30.38048,0,0
5431,11753,160746,UNABLE TO OBTAIN,87.598062,9.400357,27.20000,0,0
12612,27657,114460,BLACK/AFRICAN AMERICAN,74.750000,9.400357,30.38048,0,0


In [18]:
test.fillna(means, inplace=True)
valid.fillna(means, inplace=True)

test = test[devel_vif.columns]
valid = valid[devel_vif.columns]

train = devel_vif.drop(['SUBJECT_ID','HADM_ID','ETHNICITY','TSTAGE'], axis=1)
testv = test.drop(['SUBJECT_ID','HADM_ID','ETHNICITY','TSTAGE'], axis=1)

x_train = train.values[:, :-2]
y_train = train.values[:, -1]
x_train = normalize(x_train, axis=0)
ohe = LabelBinarizer()
ohe.fit(y_train.reshape(-1, 1))

x_test = testv.values[:, :-2]
x_test = normalize(x_test, axis=0)
y_test = testv.values[:, -1]

ohe_y_train = ohe.transform(y_train.reshape(-1,1))
ohe_y_test = ohe.transform(y_test.reshape(-1,1))
print(ohe_y_train.shape)
print(ohe_y_test.shape)

(14342, 4)
(4781, 4)


In [19]:
def runModel(model, x, y, x_test, y_test):
    m = OneVsRestClassifier(model)
    m.fit(x,y)
    y_predict = m.predict(x_test)
    
    return m, y_predict

In [20]:
def runMetrics(y_test, y_predict):
    TP = {}
    FP = {}
    TN = {}
    FN = {}

    for i in range(y_predict.shape[1]): 
        TP[i] = np.sum(np.logical_and(y_predict[:,i]==1,y_test[:,i]==1))
        FP[i] = np.sum(np.logical_and(y_predict[:,i]==1,y_test[:,i]!=y_predict[:,i]))
        TN[i] = np.sum(np.logical_and(y_predict[:,i]==0,y_test[:,i]==0))
        FN[i] = np.sum(np.logical_and(y_predict[:,i]==0,y_test[:,i]!=y_predict[:,i]))
    
    out = {}
    for i in range(y_predict.shape[1]): 
        out[i] = [TP[i], FP[i], TN[i], FN[i]]
    return out

In [21]:
def multiclass_auc(y_test, y_score):
    n_classes = 4
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = skm.roc_curve(y_test[:, i], y_score[:, i])
        roc_auc[i] = skm.auc(fpr[i], tpr[i])
    roc_auc['avg'] = sum(roc_auc.values())/n_classes
    return roc_auc

In [22]:
models = {
    'SVC': SVC(),
    'SGDClassifier': SGDClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
#     'GaussianProcessClassifier': GaussianProcessClassifier(), # this one was giving me an out of memory error
    'MLPClassifier': MLPClassifier(),
}

In [23]:
rawScores = {}
accScores = {}
for n, m in models.items():
    print(n)
    m, y_predict = runModel(m, x_train, ohe_y_train, x_test, ohe_y_test)
    accScores[n] = skm.accuracy_score(ohe_y_test, y_predict)
    rawScores[n] = runMetrics(ohe_y_test, y_predict)
    if n in {'SVC', 'SGDClassifier', 'GradientBoostingClassifier'}:
        aucs = multiclass_auc(ohe_y_test, m.decision_function(x_test))
    else:
        aucs = multiclass_auc(ohe_y_test, m.predict_proba(x_test))
        
    for k in rawScores[n].keys():
        rawScores[n][k].append(aucs[k])
    

SVC




SGDClassifier
GradientBoostingClassifier




DecisionTreeClassifier
GaussianNB
KNeighborsClassifier
MLPClassifier


In [24]:
print(rawScores)

{'SVC': {0: [3461, 1320, 0, 0, 0.487149886615359], 1: [0, 0, 4187, 594, 0.5122730368729891], 2: [0, 0, 4492, 289, 0.5010618646914007], 3: [0, 0, 4344, 437, 0.49791685103944106]}, 'SGDClassifier': {0: [3461, 1320, 0, 0, 0.4915484664617863], 1: [0, 0, 4187, 594, 0.4716603178509078], 2: [0, 0, 4492, 289, 0.4980638397520236], 3: [0, 0, 4344, 437, 0.5011881508358935]}, 'GradientBoostingClassifier': {0: [3318, 1270, 50, 143, 0.5062059485347552], 1: [1, 10, 4177, 593, 0.5119147851414391], 2: [9, 184, 4308, 280, 0.5045852372691784], 3: [0, 2, 4342, 437, 0.49163237333063625]}, 'DecisionTreeClassifier': {0: [2640, 1006, 314, 821, 0.49691792965774473], 1: [34, 240, 3947, 560, 0.49973824705135905], 2: [14, 215, 4277, 275, 0.5038076149217217], 3: [8, 97, 4247, 429, 0.49412061561542575]}, 'GaussianNB': {0: [2835, 1051, 269, 626, 0.5118677383485242], 1: [94, 723, 3464, 500, 0.5028422912349352], 2: [98, 1127, 3365, 191, 0.5466192107768675], 3: [0, 0, 4344, 437, 0.5113750100088078]}, 'KNeighborsClassif

In [25]:
with open("RAW_OUTPUT_"+datafn, 'w') as fout:
    header = ',' + ',,,,'.join("CLASS %d (%d)" % (c, np.sum(ohe_y_test[:,c])) for c in [0,1,2,3]) + '\n'
    header+= 'MODEL,' + ','.join("TP,FP,TN,FN" for c in [0,1,2,3]) + '\n'
    fout.write(header)
    for m in rawScores:
        fout.write(m+',')
        for c in rawScores[m]:
            for i in rawScores[m][c][:-1]:
                fout.write(str(i)+',')
        fout.write('\n')

In [26]:
# AUC,PPV,NPV,SEN,SPE,F1
# TP,FP,TN,FN
def calcScores(rs):
#     print('?',rs)
    auc = rs[4]
    ppv = rs[0]/(rs[0]+rs[1])
    npv = rs[2]/(rs[2]+rs[3])
    sen = rs[0]/(rs[0]+rs[3])
    spe = rs[2]/(rs[0]+rs[1])
    f1  = (2*rs[0])/((2*rs[0])+rs[1]+rs[3])
    return (auc,ppv,npv,sen,spe,f1)

In [27]:
with open("OUTPUT_"+datafn, 'w') as fout:
    header = ',AVG (%d),,,,,,' % ohe_y_test.shape[0] + ',,,,,,'.join("CLASS %d (%d)" % (c, np.sum(ohe_y_test[:,c])) for c in [0,1,2,3]) + '\n'
    header+= 'MODEL,ACC,AUC,PPV,NPV,SEN,SPE,F1,' + ','.join("AUC,PPV,NPV,SEN,SPE,F1" for c in [0,1,2,3]) + '\n'
    fout.write(header)
    for m in rawScores:
        calcedScores = [calcScores(rawScores[m][k]) for k in rawScores[m]]
        print(calcedScores)
        avgScores = [0 for i in calcedScores[0]]
        print(avgScores)
        for i, c in enumerate(calcedScores):
            for j, _ in enumerate(c):
                avgScores[j] += calcedScores[i][j]
        for i, v in enumerate(avgScores):
            avgScores[i] = v/len(calcedScores)       
        
        fout.write(m+',')
        fout.write(str(accScores[m])+',')
        for v in avgScores:
            fout.write(str(v)+',')
        for c in calcedScores:
            for i in c:
                fout.write(str(i)+',')
        fout.write('\n')

[(0.487149886615359, 0.7239071323990797, nan, 1.0, 0.0, 0.839844697888862), (0.5122730368729891, nan, 0.8757582095795858, 0.0, inf, 0.0), (0.5010618646914007, nan, 0.9395523948964651, 0.0, inf, 0.0), (0.49791685103944106, nan, 0.9085965279230287, 0.0, inf, 0.0)]
[0, 0, 0, 0, 0, 0]
[(0.4915484664617863, 0.7239071323990797, nan, 1.0, 0.0, 0.839844697888862), (0.4716603178509078, nan, 0.8757582095795858, 0.0, inf, 0.0), (0.4980638397520236, nan, 0.9395523948964651, 0.0, inf, 0.0), (0.5011881508358935, nan, 0.9085965279230287, 0.0, inf, 0.0)]
[0, 0, 0, 0, 0, 0]
[(0.5062059485347552, 0.7231909328683522, 0.25906735751295334, 0.9586824617162669, 0.010897994768962511, 0.8244502422661201), (0.5119147851414391, 0.09090909090909091, 0.8756813417190775, 0.0016835016835016834, 379.72727272727275, 0.003305785123966942), (0.5045852372691784, 0.046632124352331605, 0.9389712292938099, 0.031141868512110725, 22.321243523316063, 0.03734439834024896), (0.49163237333063625, 0.0, 0.9085582757899142, 0.0, 217

  import sys
  
  if __name__ == '__main__':
