In [1]:
import pandas as pd
from pandas import Series
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold
import random
import os
import sys
import shutil
import csv
import re

# Locations of the raw data, generated output and resource files.
# The raw-data location can be overridden with the MIMIC3_PATH environment
# variable so the notebook is not tied to one user's home directory; the
# original absolute path stays as the default.
mimic3_path=os.environ.get("MIMIC3_PATH", "/home/rafiparvez1706/mimic")
output_path ="../data/root"
variable_map_file='../resources/itemid_to_variable_map.csv'
variable_ranges_file='../resources/variable_ranges.csv'
# The per-stay folders are looked up under the output root.
icu_stays_root_path=output_path
channel_info_file ='../resources/channel_info.json'
import json

In [2]:
# Read the header-less id file into a single 'icustay_id' column,
# then show how many ids were loaded and the first few rows.
working_ids_csv = '../data/working_ids.csv'
icu_stay_ids = pd.read_csv(
    working_ids_csv,
    header=None,
    names=['icustay_id'],
)
print(len(icu_stay_ids))
icu_stay_ids.head()

49891


Unnamed: 0,icustay_id
0,262146
1,262147
2,262156
3,262158
4,262164


In [3]:
# Load the details of every stay and display the (rows, columns) shape
all_stays_csv = '../data/clean_readm_details.csv'
all_stays = pd.read_csv(all_stays_csv)
all_stays.shape

(50710, 11)

In [4]:
# Keep only the stays whose icustay_id appears in the working id list
is_working_stay = all_stays.icustay_id.isin(icu_stay_ids.icustay_id)
working_stays = all_stays[is_working_stay]
working_stays.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,last_wardid,first_careunit,last_careunit,age,gender,marital_status,insurance,IsReadmitted
0,3,145834,211552,12,MICU,MICU,76.526788,M,MARRIED,Medicare,0
1,4,185777,294638,52,MICU,MICU,47.845044,F,SINGLE,Private,0
2,6,107064,228232,33,SICU,SICU,65.94067,F,MARRIED,Medicare,0
3,9,150750,220597,15,MICU,MICU,41.790226,M,,Medicaid,0
4,11,194540,229441,57,SICU,SICU,50.148292,F,MARRIED,Private,0


In [5]:
# Percentage of working stays whose IsReadmitted flag equals 1
readmitted_count = sum(working_stays.IsReadmitted==1)
readmitted_count*100/len(working_stays)

11.667435008318133

In [6]:
# 75/25 split of the working ids; the fixed random_state keeps it repeatable
icustay_id_train, icustay_id_cv = train_test_split(
    icu_stay_ids,
    test_size=0.25,
    random_state=29,
)

# Stay details belonging to each side of the split
in_train_split = all_stays.icustay_id.isin(icustay_id_train.icustay_id)
in_cv_split = all_stays.icustay_id.isin(icustay_id_cv.icustay_id)
train_set = all_stays[in_train_split]
cv_set = all_stays[in_cv_split]

# Percentage of stays with IsReadmitted == 1 in each split
for split_label, split_frame in (('Train Set:', train_set), ('\nCV Set:', cv_set)):
    print(split_label)
    print(sum(split_frame.IsReadmitted==1)*100/len(split_frame))

Train Set:
11.71895878988722

CV Set:
11.512867794435982


In [7]:
# Entries of the data root whose names consist only of digits
# (presumably the per-stay folders named after their icustay_id -- confirm)
folders = [entry for entry in os.listdir(icu_stays_root_path) if entry.isdigit()]

In [8]:
# Folder names whose numeric id belongs to the training split.
# The ids are collected into a set once, so each membership test is a hash
# lookup instead of rebuilding and scanning a list for every folder.
train_id_lookup = set(icustay_id_train.icustay_id.tolist())
train_icu_ids = [x for x in folders if int(x) in train_id_lookup]

In [9]:
# Folder names whose numeric id belongs to the validation split.
# The ids are collected into a set once, so each membership test is a hash
# lookup instead of rebuilding and scanning a list for every folder.
# NOTE(review): the ids are taken from cv_set (rows of all_stays) rather than
# from icustay_id_cv directly -- confirm that both hold the same ids.
cv_id_lookup = set(cv_set.icustay_id.tolist())
cv_icu_ids = [x for x in folders if int(x) in cv_id_lookup]

In [10]:
def move_to_partition(icu_stay_ids, partition):
    """Move per-stay folders from the data root into a partition sub-folder.

    Parameters
    ----------
    icu_stay_ids : iterable of str
        Folder names, relative to ``icu_stays_root_path``, to be moved.
    partition : str
        Name of the sub-folder of ``icu_stays_root_path`` that receives the
        folders. It is created when it does not exist yet.

    Raises
    ------
    FileExistsError
        If a folder exists both at the source and at the destination. Moving
        in that situation would nest the source inside the existing
        destination folder.
    """
    partition_dir = os.path.join(icu_stays_root_path, partition)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(partition_dir, exist_ok=True)

    for icu_stay_id in icu_stay_ids:
        src = os.path.join(icu_stays_root_path, icu_stay_id)
        dest = os.path.join(partition_dir, icu_stay_id)

        if os.path.exists(dest):
            if not os.path.exists(src):
                # Already moved by an earlier run: nothing left to do.
                continue
            raise FileExistsError(
                'cannot move {0!r}: {1!r} already exists'.format(src, dest))

        shutil.move(src, dest)

In [12]:
# Default value for each channel, keyed by channel name (values kept as strings)
normal_values = {
    'Capillary refill rate': '0.0',
    'Diastolic blood pressure': '59.0',
    'Fraction inspired oxygen': '0.21',
    'Glascow coma scale eye opening': '4 Spontaneously',
    'Glascow coma scale motor response': '6 Obeys Commands',
    'Glascow coma scale total': '15',
    'Glascow coma scale verbal response': '5 Oriented',
    'Glucose': '128.0',
    'Heart Rate': '86',
    'Height': '170.0',
    'Mean blood pressure': '77.0',
    'Oxygen saturation': '98.0',
    'Respiratory rate': '19',
    'Systolic blood pressure': '118.0',
    'Temperature': '36.6',
    'Weight': '81.0',
    'pH': '7.4',
}

# Channels handled as numeric columns
num_cols = [
    'Diastolic blood pressure',
    'Fraction inspired oxygen',
    'Glucose',
    'Heart Rate',
    'Height',
    'Mean blood pressure',
    'Oxygen saturation',
    'Respiratory rate',
    'Systolic blood pressure',
    'Temperature',
    'Weight',
    'pH',
]

# Channels handled as categorical columns
# ('Capillary refill rate' is kept out of this list)
cat_cols = [
    'Glascow coma scale eye opening',
    'Glascow coma scale motor response',
    'Glascow coma scale total',
    'Glascow coma scale verbal response',
]

# Parse the channel description JSON into channel_info
with open(channel_info_file) as channel_info_handle:
    channel_info = json.load(channel_info_handle)

In [13]:
#Function to bin data timeseries
def bin_icu_id(timeseries,bin_width=0.8):
    """Average a time series into fixed-width bins of the 'Hours' column.

    Each row is assigned to bin ``int(Hours / bin_width)``, rows sharing a bin
    are averaged, bins missing between 0 and the last observed bin are
    inserted, and gaps are filled forward and then backward.

    Parameters
    ----------
    timeseries : pandas.DataFrame
        Frame with an 'Hours' column; the remaining columns are averaged per
        bin. The frame passed in is left unmodified.
    bin_width : float, default 0.8
        Width of one bin, in the unit of the 'Hours' column. Must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per bin, with 'Hours' holding the integer bin number
        0..last_bin. An empty input yields an empty copy.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive.
    """
    if bin_width <= 0:
        raise ValueError('bin_width must be positive, got {0!r}'.format(bin_width))
    if timeseries.empty:
        # Nothing to bin; the last-bin lookup below would fail with IndexError.
        return timeseries.copy()

    # assign() builds a new frame, so the caller's 'Hours' column is untouched.
    # NOTE(review): the int cast truncates toward zero and the reindex below
    # starts at bin 0, so bins of negative hours are not kept -- confirm that
    # this is intended.
    binned = timeseries.assign(Hours=(timeseries['Hours']/bin_width).astype('int'))
    binned = binned.groupby('Hours').mean().reset_index()

    # Insert the bins that had no observation between 0 and the last bin.
    last_bin = int(binned['Hours'].iloc[-1])
    all_bins = pd.RangeIndex(last_bin+1, name='Hours')
    binned = binned.set_index('Hours').reindex(all_bins).reset_index()

    #missing value imputations
    # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
    return binned.ffill().bfill()

In [14]:
#Cleaning, normalizing and binning the time series of one data partition.
#This run handles the 'valid' partition; the 'train' partition is processed with
#clean_partition('train', icustay_id_train.icustay_id).
variable_ranges=pd.read_csv(variable_ranges_file)

#NaN placeholders indexed by num_cols; nothing in this cell fills them. The dtype
#is explicit because the default for a data-less Series differs across pandas versions.
s_min = pd.Series(index=num_cols, dtype='float64')
s_max = pd.Series(index=num_cols, dtype='float64')

num_cv = len(icustay_id_cv.icustay_id)

#Episode column names that are spelled differently in variable_ranges.LEVEL2
range_name_overrides = {'Heart Rate': 'Heart rate'}


def _valid_bounds(ranges, columns):
    """Look up the (VALID LOW, VALID HIGH) pair of every column once.

    Parameters
    ----------
    ranges : pandas.DataFrame
        Table with the columns LEVEL2, 'VALID LOW' and 'VALID HIGH'.
    columns : iterable of str
        Episode column names to look up.

    Returns
    -------
    dict
        Maps each column name to its (low, high) tuple.

    Raises
    ------
    KeyError
        If a column has no matching LEVEL2 row in ``ranges``.
    """
    bounds = {}
    for column in columns:
        level2_name = range_name_overrides.get(column, column)
        matches = ranges[ranges.LEVEL2 == level2_name]
        if matches.empty:
            raise KeyError('no variable range found for {0!r}'.format(level2_name))
        bounds[column] = (matches['VALID LOW'].values[0],
                          matches['VALID HIGH'].values[0])
    return bounds


def clean_episode(episode, bounds, bin_width=0.8):
    """Impute, scale, one-hot encode and bin one episode time series.

    Parameters
    ----------
    episode : pandas.DataFrame
        Raw episode frame as read from 'episode_timeseries.csv'.
    bounds : dict
        Column name -> (low, high) as returned by ``_valid_bounds``.
    bin_width : float, default 0.8
        Bin width handed to ``bin_icu_id``.

    Returns
    -------
    pandas.DataFrame
        The cleaned and binned frame.
    """
    #handling missing values: carry forward, then backward, then fall back to
    #the per-channel defaults for channels without any measurement
    cleaned = episode.ffill().bfill().fillna(value=normal_values)

    cleaned[num_cols] = cleaned[num_cols].astype('float')
    cleaned['Glascow coma scale total'] = (
        cleaned['Glascow coma scale total'].astype(int).astype(str))
    cleaned['Capillary refill rate'] = (
        cleaned['Capillary refill rate'].astype(float).astype(int))

    #correcting outliers: min-max scale on the valid range, values below the
    #range become 0 and values above it become 1
    for column, (val_min, val_max) in bounds.items():
        scaled = (cleaned[column] - val_min) / (val_max - val_min)
        cleaned[column] = scaled.clip(lower=0, upper=1)

    #dummy columns for categorical variables: one column per possible value
    #listed in channel_info, all zeros where the value never occurs
    for cat_col in cat_cols:
        expected = [cat_col + '_' + s
                    for s in channel_info[cat_col]['possible_values']]
        dummies = pd.get_dummies(cleaned[cat_col], prefix=cat_col)
        dummies = dummies.reindex(columns=expected, fill_value=0).astype(float)
        cleaned = cleaned.drop([cat_col], axis=1).join(dummies)

    #Binning the Dataset
    return bin_icu_id(cleaned, bin_width=bin_width)


def clean_partition(partition, icu_ids, bin_width=0.8):
    """Clean every stay of a partition and write 'cleaned_timeseries.csv'.

    Parameters
    ----------
    partition : str
        Sub-folder of ``output_path`` holding the per-stay folders.
    icu_ids : iterable
        Stay ids; each one names a folder that contains
        'episode_timeseries.csv'.
    bin_width : float, default 0.8
        Bin width used for the final binning step.
    """
    #Range lookups do not depend on the stay, so they are done once up front.
    bounds = _valid_bounds(variable_ranges, num_cols)
    icu_ids = list(icu_ids)
    total = len(icu_ids)

    for c, icu_id in enumerate(icu_ids):
        sys.stdout.write('\ricustay_id={2} {0} of {1}...'.format(c+1, total, icu_id))

        stay_dir = os.path.join(output_path, partition, str(icu_id))
        df_id = pd.read_csv(os.path.join(stay_dir, 'episode_timeseries.csv'))
        df_id = clean_episode(df_id, bounds, bin_width=bin_width)
        df_id.to_csv(os.path.join(stay_dir, 'cleaned_timeseries.csv'), index=False)


clean_partition('valid', icustay_id_cv.icustay_id)

icustay_id=289460 12473 of 12473...