### Data preparation and preprocessing for LSTM

In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [2]:
# Directory holding the input CSV. Set the LSTM_MIMIC_DIR environment variable
# to run the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get("LSTM_MIMIC_DIR", "C:\\Users\\cssar\\890CA\\lstm-mimic")
raw_data = pd.read_csv(os.path.join(DATA_DIR, "lstmmimic.csv"))

In [3]:
# Parse the three timestamp columns so they can be compared against
# pandas datetime ranges later on.
for time_column in ('charttime', 'starttime', 'endtime'):
    raw_data[time_column] = pd.to_datetime(raw_data[time_column])

In [4]:
# Print the column names and dtypes of the loaded frame.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2423949 entries, 0 to 2423948
Data columns (total 24 columns):
 #   Column                Dtype         
---  ------                -----         
 0   hadm_id               int64         
 1   age                   float64       
 2   gender                int64         
 3   los                   float64       
 4   charttime             datetime64[ns]
 5   starttime             datetime64[ns]
 6   endtime               datetime64[ns]
 7   emergency_admission   int64         
 8   newborn_admission     int64         
 9   urgent_admission      int64         
 10  last_micu_unit        int64         
 11  last_csru_unit        int64         
 12  last_ccu_unit         int64         
 13  last_tsicu_unit       int64         
 14  heart_rate            float64       
 15  temperature           float64       
 16  bun                   float64       
 17  urinout24hrs          float64       
 18  sodium_citrate        float64       
 19  

The `charttime`, `starttime`, and `endtime` columns are converted to pandas datetime format.

In [5]:
# One entry per unique admission id, in order of first appearance.
distinct_hospital_admissions = pd.unique(raw_data['hadm_id'])

In [6]:
# Number of distinct admission ids found in the data.
distinct_hospital_admissions.shape

(6536,)

In [7]:
def fill_missing_chartevents(hadm_data):
    """Build placeholder feature rows for a period with no chart events.

    Every chart-event measurement column is set to 0 and the identifier,
    timestamp and label columns are removed; all other columns keep their
    values. The work is done on a copy, so the frame passed in by the caller
    is never modified.

    Parameters
    ----------
    hadm_data : pandas.DataFrame
        Rows of an admission. Must contain the columns ``charttime``,
        ``starttime``, ``endtime``, ``hospital_expire_flag`` and ``hadm_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, the measurement columns zeroed and
        the five non-feature columns dropped.

    Raises
    ------
    KeyError
        If any of the five columns to drop is missing from ``hadm_data``.
    """
    chartevent_columns = [
        'heart_rate', 'temperature', 'bun', 'urinout24hrs', 'sodium_citrate',
        'potasium', 'bilirubin', 'wbc', 'chronic_health',
    ]
    non_feature_columns = ['charttime', 'starttime', 'endtime', 'hospital_expire_flag', 'hadm_id']

    # assign() and drop() both return new frames, so the input stays intact.
    zeroed = hadm_data.assign(**dict.fromkeys(chartevent_columns, 0))
    return zeroed.drop(columns=non_feature_columns)


In [8]:
def time_dimension(hadm_id, data=None, hours=24):
    """Turn one admission into a fixed-length, hourly feature matrix.

    The rows for ``hadm_id`` are selected, missing values are replaced by 0,
    and ``hours`` consecutive one-hour windows are laid out starting at the
    admission's first ``starttime``. Window ``i`` covers
    ``(start + i h, start + (i + 1) h]``, so an event stamped exactly at
    ``starttime`` falls in no window. For each window the feature values of
    the last matching row are used; windows without any chart event get a
    placeholder row from ``fill_missing_chartevents``.

    Parameters
    ----------
    hadm_id : int
        Admission identifier to extract.
    data : pandas.DataFrame, optional
        Source frame. Defaults to the notebook-level ``raw_data``.
    hours : int, optional
        Number of one-hour windows (time steps) to produce. Defaults to 24.

    Returns
    -------
    subject_data : numpy.ndarray
        Array of shape ``(hours, n_features)`` where ``n_features`` is the
        number of columns left after removing the identifier, timestamp and
        label columns.
    hospital_expire_flag
        Value of ``hospital_expire_flag`` on the admission's first row.

    Raises
    ------
    IndexError
        If no row in ``data`` has the requested ``hadm_id``.
    """
    if data is None:
        data = raw_data
    non_feature_columns = ['charttime', 'starttime', 'endtime', 'hospital_expire_flag', 'hadm_id']

    targets = data.loc[data['hadm_id'] == hadm_id].fillna(0)

    # Look the label and start time up by column name instead of by position,
    # so the result does not depend on the column order of the CSV.
    hospital_expire_flag = targets['hospital_expire_flag'].iloc[0]
    window_edges = pd.date_range(
        start=targets['starttime'].iloc[0],
        periods=hours + 1,
        freq=pd.Timedelta(hours=1),  # explicit offset; avoids the deprecated 'H' alias
    )

    # Drop the non-feature columns once, up front, rather than per window.
    feature_values = targets.drop(columns=non_feature_columns).values
    charttime = targets['charttime']

    subject_data = np.empty((hours, feature_values.shape[1]))
    fallback_row = None  # identical for every empty window; computed lazily, once
    for i in range(hours):
        in_window = ((charttime > window_edges[i]) & (charttime <= window_edges[i + 1])).values
        hits = np.flatnonzero(in_window)
        if hits.size:
            # NOTE(review): "last" means last in frame order; this is only the
            # most recent event if the rows are sorted by charttime - confirm.
            subject_data[i] = feature_values[hits[-1]]
        else:
            if fallback_row is None:
                first_row = targets.iloc[[0]].copy()
                fallback_row = fill_missing_chartevents(first_row).values[-1]
            subject_data[i] = fallback_row
    return subject_data, hospital_expire_flag

In [9]:
sorted_hadmId = np.sort(distinct_hospital_admissions, axis=None)

# Collect the per-admission results in lists and stack them once at the end.
# Stacking inside the loop would re-copy everything gathered so far on every
# iteration, which grows quadratically with the number of admissions.
admission_sequences = []
admission_labels = []
for admission in sorted_hadmId:
    data, flag = time_dimension(admission)
    admission_sequences.append(data)
    admission_labels.append(1.0 if flag else 0.0)

# X_data: (n_admissions, time_steps, n_features); y_data: (n_admissions, 1).
X_data = np.stack(admission_sequences)
# Flat (n_admissions * time_steps, n_features) view, kept under its original name.
complete_data = X_data.reshape(-1, X_data.shape[-1])
hospital_expire_flag = np.array(admission_labels).reshape(-1, 1)
y_data = hospital_expire_flag

assert X_data.shape[0] == y_data.shape[0] == sorted_hadmId.shape[0]

In [10]:
# Sanity check: the first dimension of both arrays is the number of admissions.
print(f"X_data: {X_data.shape} y_data: {y_data.shape}")

X_data: (6536, 24, 19) y_data: (6536, 1)


### Save NumPy files

In [11]:
# Output directory for the saved arrays. Set the LSTM_MIMIC_DIR environment
# variable to write elsewhere; the default is the original location.
_output_dir = os.environ.get("LSTM_MIMIC_DIR", "C:\\Users\\cssar\\890CA\\lstm-mimic")
numpy_feature_file = os.path.join(_output_dir, "features.npy")
numpy_target_file = os.path.join(_output_dir, "targets.npy")


In [12]:
# Persist the feature tensor and the target vector, one .npy file each.
for destination, array in ((numpy_feature_file, X_data), (numpy_target_file, y_data)):
    with open(destination, 'wb') as handle:
        np.save(handle, array)
