In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics, preprocessing
from scipy import stats
import math
from sklearn.pipeline import Pipeline
# If pandas is not installed, please uncomment the following line:
#!pip install pandas
#!pip install sklearn


Import data

In [55]:
def read_mimic(file, dict_of_str_cols):
    col_names = pd.read_csv(file, nrows=0).columns
    dict_of_str_cols.update({col: object for col in col_names if col not in dict_of_str_cols})
    return pd.read_csv(file, dtype=dict_of_str_cols, low_memory=False, nrows=100000000)


In [None]:
import time
start = time.time()
charts = pd.read_csv(path + 'CHARTEVENTS.csv', nrows=100000000)
print(time.time() - start)

In [None]:
start = time.time()
read_mimic(path + 'CHARTEVENTS.csv', numeric_cols)
print(time.time() - start)

In [49]:
#full
path = "payload/" # change as needed
numeric_cols = {'ROW_ID':float,
                'SUBJECT_ID':float,
                'HADM_ID':float,
                'HAS_CHARTEVENTS_DATA':float,
                'HOSPITAL_EXPIRE_FLAG':float,
                'EXPIRE_FLAG':float,
                'SEQ_NUM':float,
                'ITEMID':float,
                'AMOUNT':float, 
                'RATE':float, 
                'RATEUOM':float,
                'CGID':float,
                'ORDERID':float,
                'LINKORDERID':float,
                'STOPPED':float,
                'NEWBOTTLE':float,
                'ORIGINALAMOUNT':float,
                'ORIGINALRATE':float,
                'ORIGINALRATEUOM':float,
                'ORIGINALSITE':float,
                'VALUENUM':float,
                'CGID':float,
                'ISERROR':float,
                #'STORETIME':float, # could be wrong
                #'CHARTTIME':float, # could be wrong
                'RESULTSTATUS':float,
                'STOPPED': float,
                'WARNING':float,
                'ERROR':float,
                'VALUENUM':float,
                'ICUSTAY_ID':float} 

In [None]:
patients = pd.read_csv(path + 'PATIENTS.csv')
print('Patients')
diagnoses = pd.read_csv(path + 'DIAGNOSES_ICD.csv')
print('Diagnoses')
input_cv = pd.read_csv(path + 'INPUTEVENTS_CV.csv')
print('Input_cv')
input_mv = pd.read_csv(path + 'INPUTEVENTS_MV.csv')
print('Input_mv')
chartevents = pd.read_csv(path + 'CHARTEVENTS.csv', low_memory=False)
print('Chartevents')
labevents = pd.read_csv(path + 'MICROBIOLOGYEVENTS.csv')
print('Microbiology')
prescriptions = pd.read_csv(path + 'PRESCRIPTIONS.csv')
print('Prescriptions')
procedureevents = pd.read_csv(path + 'PROCEDUREEVENTS_MV.csv')
print('Procedure Events MV')
procedures_icd = pd.read_csv(path + 'PROCEDURES_ICD.csv')
print('Procedure Events ICD')
services = pd.read_csv(path + 'SERVICES.csv') 
print('Services')

#### ICD9 Feature Engineering

In [None]:
ccs = open("ccs.txt", "r").read()

# Create dictionary. Key is ICD9 code for a diagnosis. Value is general description of diagnosis.
ccs = ccs[ccs.find('Tuberculosis'):]
icd9={}

def update_icd9(cur_value, section):
    while section:
        if section[:4] == '\n\n':
            print('new value')
            section = section[4:]
            cur_value = section[:section.find('\n')]
            section = section[section.find('\n'):]
        elif section[0] == ' ':
            section = section[1:]
        elif section[:2] == '\n':
            section = section[2:]
        else:
            if section.find(' ') >= 0: # not end of document
                if -1 < section.find('\n') < section.find(' '): # if end of line
                    cur_key = section[:section.find('\n')]
                else: # if not end of line
                    cur_key = section[:section.find(' ')]
                section = section[section.find(' '):]
                icd9[cur_key] = cur_value

            else: # end of section
                cur_key = section
                icd9[cur_key] = cur_value
                section = ""
            
for section in ccs.split(sep='\n\n'): # for each family of codes
    cur_value = section[:section.find('\n')] # get the name for that family
    section = section[section.find('\n')+1:] # and for all the codes under that family
    update_icd9(cur_value, section) # add those codes as keys to a dictionary, where their values
                                    # are the name for the family of codes

diagnoses.ICD9_CODE = diagnoses.ICD9_CODE.apply(lambda x: icd9.get(x,-1))

#### Create LOS feature

In [None]:
# commented out for mortality classification:
# admissions = admissions[pd.isnull(admissions['DEATHTIME'])]
df = admissions[['SUBJECT_ID',
                 'HADM_ID',
                 'ADMISSION_TYPE',
                 'ADMITTIME',
                 'ADMISSION_LOCATION',
                 'INSURANCE',
                 'LANGUAGE',
                 'RELIGION',
                 'MARITAL_STATUS',
                 'ETHNICITY']].copy()

df = pd.merge(df, # drop DOD_HOSP too if not classifying mortality
              diagnoses.drop(columns = ['SEQ_NUM','SUBJECT_ID']),
              on='HADM_ID',
              how='left') 

df['LOS'] = (pd.to_datetime(admissions['DISCHTIME']) - pd.to_datetime(admissions['ADMITTIME'])).astype('timedelta64[h]') 
df.columns

Note to self: There are negative LOS values for when a patient dies prior to arriving to the hospital. I keep these in for mortality classification. But these values kinda lead to meaningless LOS values.

#### Extracting age feature

In [None]:
# For mortality classification, I'm keeping DOD_HOSP so I can create a boolean response for death
# NB: DOD includes ALL deaths (before & after), while DOD_HOSP only includes deaths occuring inside the hospital. 
df = pd.merge(df, # drop DOD_HOSP too if not classifying mortality
              patients.drop(columns = ['DOD', 'DOD_SSN','ROW_ID','EXPIRE_FLAG']),
              on='SUBJECT_ID',
              how='left') 
median_dob_shift = 300 - 91.4 # For old patients (median age of 91.4), dob was shifted to be 300 yrs prior to first visit
df['AGE'] = (pd.to_datetime(df['ADMITTIME']).dt.date - pd.to_datetime(df['DOB']).dt.date)
df['AGE'] = [age.days/365 if age.days/365<300 else age.days/365-median_dob_shift for age in df['AGE']]

#### Extracting whether-they-died feature

In [None]:
df['DIED'] = df['DOD_HOSP'].apply(lambda x: not pd.isnull(x))


#### Trig transform for admit time

In [None]:
df['ADMITHOUR_trig_x'] = pd.to_datetime(df['ADMITTIME']).dt.hour.apply(math.cos)
df['ADMITHOUR_trig_y'] = pd.to_datetime(df['ADMITTIME']).dt.hour.apply(math.sin)

## Christine's features

#### Delete unnecessary features

In [10]:
df.drop(labels=['DOB', 'DOD_HOSP','ADMITTIME','SUBJECT_ID','HADM_ID','ROW_ID'],axis=1, inplace=True)

"DOB is the date of birth of the given patient. Patients who are older than 89 years old at any time in the database have had their date of birth shifted to obscure their age and comply with HIPAA. The shift process was as follows: the patient’s age at their first admission was determined. The date of birth was then set to exactly 300 years before their first admission"


In [11]:
df.columns

Index(['ADMISSION_TYPE', 'ADMISSION_LOCATION', 'INSURANCE', 'LANGUAGE',
       'RELIGION', 'MARITAL_STATUS', 'ETHNICITY', 'ICD9_CODE', 'LOS', 'GENDER',
       'AGE', 'DIED', 'ADMITHOUR_trig_x', 'ADMITHOUR_trig_y'],
      dtype='object')

Create dummy variables and drop those with less information

In [12]:
print('Shape before adding dummy variables:',df.shape)
df = pd.get_dummies(df, drop_first=True)
print('Shape after adding dummy variables:', df.shape)
#df.drop([col for col, val in df.sum().iteritems() if val < 10], axis=1, inplace=True)
#print('Shape after dropping columns with few observations:', df.shape)

# It turns out ADMITHOUR after trig transform is highly predictive of whether you die

Shape before adding dummy variables: (651047, 14)
Shape after adding dummy variables: (651047, 441)


In [14]:
df_pred = df.drop(['LOS'],axis=1)
X= df_pred.drop(['DIED',],axis=1)
y= df_pred['DIED']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)  

In [25]:
pipeline = Pipeline([
    ('scaler',preprocessing.StandardScaler()),
    ('clf', LogisticRegression())
])
print('Pipeline created')
pipeline.fit(X_train,y_train)
print('Pipeline fit')
pipeline.score(X_test,y_test)

Pipeline created
Pipeline fit


0.7073343061208817

In [27]:
0.7073343061208817 - (1-sum(y_test)/len(y_test))

0.022048997772828538

In [29]:
X_train.apply(lambda x: abs(x.corr(y_train))).sort_values(ascending=False).head(20)

AGE                                                                               0.252403
INSURANCE_Medicare                                                                0.195332
ADMISSION_TYPE_EMERGENCY                                                          0.184657
ADMISSION_TYPE_NEWBORN                                                            0.178849
ADMISSION_LOCATION_EMERGENCY ROOM ADMIT                                           0.149062
INSURANCE_Private                                                                 0.146854
ADMISSION_LOCATION_PHYS REFERRAL/NORMAL DELI                                      0.137990
RELIGION_JEWISH                                                                   0.095305
ICD9_CODE_224  Other perinatal conditions                                         0.083662
MARITAL_STATUS_WIDOWED                                                            0.082550
ICD9_CODE_42   Secondary malignancies                                             0.079806