In [1]:
'''
Script for Mortality prediction from ICU data 
Author : srinivasan@cs.toronto.edu
'''

import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

feature_max = {}
feature_min = {}
feature_mean = {}
feature_std = {}

def min_max_normalize(feature, feature_name, test=False):
    if not test:
        max_val = feature.max()
        min_val = feature.min()
        feature_max[feature_name] = max_val
        feature_min[feature_name] = min_val
    else:
        max_val = feature_max[feature_name]
        min_val = feature_min[feature_name]

    feature = (feature - min_val) / (max_val - min_val)
    return feature


def mean_std_normalize(feature, feature_name, test=False):
    if not test:
        mean = feature.mean()
        std = feature.std()
        feature_mean[feature_name] = mean
        feature_std[feature_name] = std
    else:
        mean = feature_mean[feature_name]
        std = feature_std[feature_name]

    feature = (feature - mean) / (std)
    return feature

def normalize_data(data, features, type='min_max', test=False):
    for feature in features:
        if type=='min_max':
            data[feature] = min_max_normalize(data[feature], feature, test)
        elif type=='mean_std':
            data[feature] = mean_std_normalize(data[feature], feature, test)
    return data

def predict_mortality():
    mimicdir = os.path.expanduser("~/Coursework/ML4Health/Assignment")
    data = pd.read_csv(os.path.join(mimicdir, 'adult_icu.gz'), compression='gzip')
    print(list(data))

    train_data = data.loc[data['train'] == 1]
    test_data = data.loc[data['train'] == 0]
    
    feature_cols = ['age', 'first_hosp_stay', 'first_icu_stay', 'adult_icu', 'eth_asian', 'eth_black', 'eth_hispanic', 'eth_other', 'eth_white', 'admType_ELECTIVE', 'admType_EMERGENCY', 'admType_NEWBORN', 'admType_URGENT',
     'heartrate_min', 'heartrate_max', 'heartrate_mean', 'sysbp_min', 
     'sysbp_max', 'sysbp_mean', 'diasbp_min', 'diasbp_max', 'diasbp_mean', 
     'meanbp_min', 'meanbp_max', 'meanbp_mean', 'resprate_min', 'resprate_max', 
     'resprate_mean', 'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max', 
     'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean', 'aniongap', 'albumin', 
     'bicarbonate', 'bilirubin', 'creatinine','chloride', 'glucose', 'hematocrit', 
     'hemoglobin', 'lactate', 'magnesium', 'phosphate', 'platelet', 'potassium', 'ptt', 
     'inr', 'pt', 'sodium', 'bun', 'wbc']

    normalize_features = ['age', 'heartrate_min','heartrate_max','heartrate_mean', 'sysbp_min', 
     'sysbp_max', 'sysbp_mean', 'diasbp_min', 'diasbp_max', 'diasbp_mean', 
     'meanbp_min', 'meanbp_max', 'meanbp_mean', 'resprate_min', 'resprate_max', 
     'resprate_mean', 'tempc_min', 'tempc_max', 'tempc_mean', 'spo2_min', 'spo2_max', 
     'spo2_mean', 'glucose_min', 'glucose_max', 'glucose_mean','aniongap', 'albumin', 
     'bicarbonate', 'bilirubin', 'creatinine','chloride', 'glucose', 'hematocrit', 
     'hemoglobin', 'lactate', 'magnesium', 'phosphate', 'platelet', 'potassium','ptt', 
     'inr', 'pt', 'sodium', 'bun', 'wbc']
    mortality_target = ['mort_icu']

    train_X = train_data.loc[:,feature_cols]
    train_X = normalize_data(train_X, normalize_features,type='min_max', test=False)
    train_Y = train_data.loc[:,mortality_target]

    print(train_X.shape)
    print(train_Y.shape)

    test_X = test_data.loc[:,feature_cols]
    test_X = normalize_data(test_X, normalize_features,type='min_max', test=True)
    test_Y = test_data.loc[:,mortality_target]

    classifier = LogisticRegression(penalty='l2', C=1.0, random_state=0, solver='lbfgs', multi_class='ovr')
    classifier.fit(train_X, train_Y)
    W = classifier.coef_
    influence_weights = W.argsort(axis=-1)
    
    min_influence = influence_weights[0][0:5]
    max_influence = influence_weights[0][-5:][::-1]

    print(max_influence)
    print(np.array(feature_cols)[max_influence])
    print(np.array(feature_cols)[min_influence])
    print(np.array(feature_cols)[[27,46,7,29,10]])


    pred_Y = classifier.predict_proba(test_X)[:,1]
    print(pred_Y.shape)
    auc = roc_auc_score(test_Y, pred_Y)
    

    print(auc)


In [2]:

def predict_mortality_from_notes():
    mimicdir = os.path.expanduser("~/Coursework/ML4Health/Assignment")
    data = pd.read_csv(os.path.join(mimicdir, 'adult_notes.gz'), compression='gzip')
    print(list(data))

    train_data = data.loc[data['train'] == 1]
    test_data = data.loc[data['train'] == 0]
    train_data = data.loc[0:100]
    print(train_data)
    for i in range(0, train_data.shape[0]):
        text = train_data.iloc[i]['chartext']
        try:
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = tokenizer.tokenize(text)
        except:
            tokens = ['test']
        stop = set(stopwords.words('english'))
        clean_tokens = [k for k in tokens if k not in stop]
        train_data = train_data.set_value(i,'chartext', (' ').join(clean_tokens))
    
    print(train_data)
    vectorizer = TfidfVectorizer()
    train_X = vectorizer.fit_transform(train_data.loc[:,'chartext'].values.astype('U'))
    train_Y = train_data.loc[:,'mort_icu']
    train_X = train_X.todense()
    
    classifier = LogisticRegression(penalty='l2', C=1.0, random_state=0, solver='lbfgs', multi_class='ovr')
    classifier.fit(train_X, train_Y)
    W = classifier.coef_
    influence_weights = W.argsort(axis=-1)

    for i in range(0, test_data.shape[0]):
        text = test_data.iloc[i]['chartext']
        try:
            tokenizer = RegexpTokenizer(r'\w+')
            tokens = tokenizer.tokenize(text)
        except:
            tokens = ['test']
        stop = set(stopwords.words('english'))
        clean_tokens = [i for i in tokens if i not in stop]
        test_data = test_data.set_value(i,'chartext', (' ').join(clean_tokens))

    test_X = vectorizer.transform(test_data.loc[:,'chartext'].values.astype('U'))
    test_Y = test_data.loc[:,'mort_icu'].values.astype('int')

    pred_Y = classifier.predict_proba(test_X)[:,1]
    print(pred_Y.shape)
    print(test_Y)
    auc = roc_auc_score(test_Y, pred_Y)
    
    print(auc)
    

if __name__ == '__main__':
    #predict_mortality()
    predict_mortality_from_notes()


['subject_id', 'hadm_id', 'icustay_id', 'chartext', 'train', 'mort_icu']
     subject_id  hadm_id  icustay_id  \
0             3   145834      211552   
1             6   107064      228232   
2             9   150750      220597   
3            12   112213      232669   
4            13   143045      263738   
5            17   194023      277042   
6            21   109451      217847   
7            21   111970      216859   
8            25   129635      203487   
9            26   197661      244882   
10           31   128652      254478   
11           32   175413      295037   
12           35   166707      282039   
13           36   122659      211200   
14           36   165660      241249   
15           38   185910      248910   
16           41   101757      237024   
17           41   101757      261027   
18           43   146828      225852   
19           44   181750      291554   
20           59   104130      224440   
21           61   176332      252348   
22     



     subject_id  hadm_id  icustay_id  \
0             3   145834      211552   
1             6   107064      228232   
2             9   150750      220597   
3            12   112213      232669   
4            13   143045      263738   
5            17   194023      277042   
6            21   109451      217847   
7            21   111970      216859   
8            25   129635      203487   
9            26   197661      244882   
10           31   128652      254478   
11           32   175413      295037   
12           35   166707      282039   
13           36   122659      211200   
14           36   165660      241249   
15           38   185910      248910   
16           41   101757      237024   
17           41   101757      261027   
18           43   146828      225852   
19           44   181750      291554   
20           59   104130      224440   
21           61   176332      252348   
22           61   189535      217135   
23           62   116009      216609   


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[index, col] = value


(9990,)
[                   0                    0                    0 ...,
 -9223372036854775808 -9223372036854775808 -9223372036854775808]


ValueError: multiclass format is not supported

In [4]:
mimicdir = os.path.expanduser("~/Coursework/ML4Health/Assignment")
data = pd.read_csv(os.path.join(mimicdir, 'adult_notes.gz'), compression='gzip')
print(list(data))

train_data = data.loc[data['train'] == 1]
test_data = data.loc[data['train'] == 0]
train_data = train_data[0:100]

print(train_data)
for i in range(0, train_data.shape[0]):
    text = train_data.iloc[i]['chartext']
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
    except:
        tokens = ['test']
    stop = set(stopwords.words('english'))
    clean_tokens = [k for k in tokens if k not in stop]
    train_data = train_data.set_value(i,'chartext', (' ').join(clean_tokens))

print(train_data)
vectorizer = TfidfVectorizer()
train_X = vectorizer.fit_transform(train_data.loc[:,'chartext'].values.astype('U'))
train_Y = train_data.loc[:,['mort_icu']]
train_X = train_X.todense()

classifier = LogisticRegression(penalty='l2', C=1.0, random_state=0, solver='lbfgs', multi_class='ovr')
classifier.fit(train_X, train_Y)
W = classifier.coef_
influence_weights = W.argsort(axis=-1)

for i in range(0, test_data.shape[0]):
    text = test_data.iloc[i]['chartext']
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)
    except:
        tokens = ['test']
    stop = set(stopwords.words('english'))
    clean_tokens = [i for i in tokens if i not in stop]
    test_data = test_data.set_value(i,'chartext', (' ').join(clean_tokens))

test_X = vectorizer.transform(test_data.loc[:,'chartext'].values.astype('U'))
test_Y = test_data.loc[:,['mort_icu']]

pred_Y = classifier.predict_proba(test_X)[:,1]
print(pred_Y.shape)
print(test_Y)
auc = roc_auc_score(test_Y, pred_Y)

print(auc)

['subject_id', 'hadm_id', 'icustay_id', 'chartext', 'train', 'mort_icu']
     subject_id  hadm_id  icustay_id  \
0             3   145834      211552   
1             6   107064      228232   
2             9   150750      220597   
3            12   112213      232669   
4            13   143045      263738   
5            17   194023      277042   
7            21   111970      216859   
8            25   129635      203487   
9            26   197661      244882   
10           31   128652      254478   
11           32   175413      295037   
12           35   166707      282039   
13           36   122659      211200   
14           36   165660      241249   
16           41   101757      237024   
18           43   146828      225852   
20           59   104130      224440   
21           61   176332      252348   
24           64   172056      232593   
25           65   143430      244776   
27           68   170467      294232   
28           71   111944      211832   
29     



    subject_id   hadm_id  icustay_id  \
0          3.0  145834.0    211552.0   
1          6.0  107064.0    228232.0   
2          9.0  150750.0    220597.0   
3         12.0  112213.0    232669.0   
4         13.0  143045.0    263738.0   
5         17.0  194023.0    277042.0   
7         21.0  111970.0    216859.0   
8         25.0  129635.0    203487.0   
9         26.0  197661.0    244882.0   
10        31.0  128652.0    254478.0   
11        32.0  175413.0    295037.0   
12        35.0  166707.0    282039.0   
13        36.0  122659.0    211200.0   
14        36.0  165660.0    241249.0   
16        41.0  101757.0    237024.0   
18        43.0  146828.0    225852.0   
20        59.0  104130.0    224440.0   
21        61.0  176332.0    252348.0   
24        64.0  172056.0    232593.0   
25        65.0  143430.0    244776.0   
27        68.0  170467.0    294232.0   
28        71.0  111944.0    211832.0   
29        73.0  194730.0    284305.0   
31        83.0  158569.0    254066.0   


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [3]:
print(test_X.shape)
print(test_Y.shape)
print(data.shape)
print(train_data.shape)
print(test_data.shape)

(9990, 10474)
(9990, 1)
(27348, 6)
(101, 6)
(9990, 6)
