In [1]:
import pymongo
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sb
import dask.dataframe as ddf

client = pymongo.MongoClient()
db = client.VentDB
breathData = db.breath_collection
RN = db.RN_collection

%matplotlib inline

In [2]:
def bin_samples(x):
    if x <= 0.05:
        return 0
    elif x == np.nan:
        return np.nan
    elif x > 0.05:
        return 1

In [3]:
breathData.find().count()

1964338

In [4]:
ds_types = 'ds'

In [5]:
patient_df = pd.read_csv('C:\Research_data\Demographic Data v2.csv', engine = 'c',
                             usecols = ['Study ID', 'Age', 'Gender', 'P/F Ratio', 'NMB', 'Start_End_NMB', 'Hospital Discharge Date/Time', 'Discharge Location'])
patient_df.set_index(['Study ID'], inplace = True, drop=False)
patient_df.head(15)
patient_df.to_hdf('c:\Research_data\Analysis\Lagged_DS\ds_patient_test.h5', key='patient', format='table', append=False)

In [6]:
data = breathData.find({},
                       {'patient_id': 1, 'date_time': 1, 'breath_num': 1, 'breath_settings.peep': 1,
                        'breath_settings.fio2': 1, 'breath_settings.set_VT': 1, 'breath_character.elapse_time':1, 
                        'breath_character.peak_vol':1, 'breath_settings.peak_paw': 1, 'label.' + ds_types: 1, '_id': 1})


rn = RN.find({'$or': [{'FiO2': {'$exists': 1}}, {'PEEP': {'$exists': 1}}]},
             {'patientID': 1, 'date_time': 1, 'FiO2': 1, 'PEEP': 1, 'SpO2': 1, 'Set Vt': 1, 'Plat':1,
              '_id': 0})

In [7]:
df = pd.io.json.json_normalize(data)
rn_df = pd.io.json.json_normalize(list(rn))

In [8]:
df['date_time'] = pd.to_datetime(df['date_time'], unit='s').dt.tz_localize('UTC').dt.tz_convert('US/Mountain').dt.tz_localize(None)
df.dropna(axis=0, how='all', subset=['breath_settings.fio2', 'breath_settings.peep'], inplace=True)
df.drop_duplicates(subset = 'date_time', keep = 'last', inplace = True)
df.set_index(['patient_id'], inplace = True, drop = False)
df.sort_index(inplace = True)
df['breath_settings.peep'] = df['breath_settings.peep'].astype(np.float64)
df['breath_num'] = df['breath_num'].astype(np.float64)
df['label.' + ds_types] = df['label.' + ds_types].astype(np.float64)

df.rename(columns={'breath_settings.fio2':'fio2', 'breath_settings.peep': 'peep',
                        'breath_settings.set_VT': 'set_vt', 'breath_character.elapse_time': 'elapse_time', 
                        'breath_character.peak_vol': 'peak_vol', 'breath_settings.peak_paw': 'peak_paw'}, inplace=True)

df = df[(df['elapse_time'] < 10000) & \
        (df['peak_vol'] < 1000) & (df['peak_vol'] > 100)]

In [9]:
rn_df['date_time'] = pd.to_datetime(rn_df['date_time'], format='%m/%d/%Y %H:%M:%S')
rn_df.rename(columns={'patientID':'patient_id', 'FiO2':'fio2', 'PEEP':'peep', 'Set Vt': 'set_vt', 'SpO2':'spo2', 'Plat':'plat'}, inplace=True)
rn_df.dropna(axis=0, how='all', subset=['fio2', 'peep'], inplace=True)
rn_df.drop_duplicates(subset = 'date_time', keep = 'last', inplace = True)
rn_df.set_index(['patient_id', 'date_time'], inplace = True, drop=False)

In [10]:
df.set_index(['patient_id', 'date_time'], inplace=True, drop=False)
df.sort_index(inplace=True)
df.set_index(['patient_id'], inplace=True, drop=False)

In [11]:
df.to_hdf('c:\Research_data\Analysis\Lagged_DS\ds_breath_data.h5', key='breath', format='table', append=False)
rn_df.reset_index(inplace=True, drop=True)
rn_df.to_hdf('c:\Research_data\Analysis\Lagged_DS\ds_rn_data.h5', key='rn', format='table', append=False)
rn_df.set_index(['patient_id', 'date_time'], inplace = True, drop=False)

In [12]:
#df = pd.read_hdf('c:\Research_data\Analysis\Lagged_DS\ds_breath_data.h5')
#rn_df = pd.read_hdf('c:\Research_data\Analysis\Lagged_DS\ds_rn_data.h5')

In [13]:
df.dtypes

_id                    object
elapse_time             int64
peak_vol              float64
breath_num            float64
fio2                  float64
peak_paw              float64
peep                  float64
set_vt                float64
date_time      datetime64[ns]
label.ds              float64
patient_id              int64
dtype: object

In [14]:
rn_df.dtypes

fio2                 float64
peep                 float64
plat                 float64
set_vt               float64
spo2                 float64
date_time     datetime64[ns]
patient_id             int64
dtype: object

In [15]:
df.count()

_id            1867910
elapse_time    1867910
peak_vol       1867910
breath_num     1867910
fio2           1867596
peak_paw       1867566
peep           1867910
set_vt         1494849
date_time      1867910
label.ds       1867605
patient_id     1867910
dtype: int64

In [16]:
df.date_time.min()

Timestamp('2014-04-21 13:01:33')

In [17]:
df.date_time.max()

Timestamp('2015-11-10 13:09:52')

In [18]:
rn_df.count()

fio2          2054
peep          1915
plat           518
set_vt        1335
spo2          1787
date_time     2204
patient_id    2204
dtype: int64

In [19]:
rn_df.date_time.min()

Timestamp('2014-04-21 03:26:00')

In [22]:
rn_df.date_time.max()

Timestamp('2015-11-12 16:00:00')

In [28]:
import numba

lag_list = 6        #needs to be 36/resmaple_freq
resample_freq = '6H'   #hours to resample to

numba.jit()
def link_groups(row, items):
    if np.isnan(row[items+'_x']):
        return row[items+'_y']
    else: 
        return row[items+'_x']

grouped_df = df.groupby('patient_id')
grouped_rn_df = rn_df.groupby('patient_id')

resampled_df = pd.DataFrame()

for name, group in grouped_df:  
    try:
        rn_group = grouped_rn_df.get_group(name)
        group = pd.merge(group, rn_group, left_on='date_time', right_on='date_time', how='outer', indicator=True)
        
        for items in ['fio2', 'peep', 'set_vt', 'patient_id']:
            group[items] = group.apply(link_groups, items=items, axis=1)
            group.drop([items+'_x', items+'_y'], inplace=True, axis=1)
        
        print(group['_merge'].value_counts())
        
    except KeyError:
        print(name, ' is missing')  
        
    print(name)
    group.drop_duplicates(subset='date_time', keep='first', inplace=True)
    group.set_index(['date_time'], inplace = True, drop = False, verify_integrity=True)
    
    group.sort_index(inplace=True)
    group = group.resample(resample_freq).agg({'label.ds':'sum', 'breath_num':'count', 'patient_id': 'max', 'elapse_time':'mean', 
                                      'fio2': 'mean', 'peep':'mean', 'peak_paw':'mean', 'set_vt':'mean', 'peak_vol':'mean'})
    group['NMB'] = 0
   
    start = group.index.min()
    end = group.index.max()
    
    periods = (end - start) / pd.to_timedelta(resample_freq)
    periods = periods + lag_list + 1
        
    group = group.reindex(pd.date_range(start, periods = periods, freq=resample_freq), copy=True)    
    group.reset_index(drop=False, inplace=True)
    group.rename(columns={'index':'date_time'}, inplace=True)
    group.set_index(['date_time'], drop=False, inplace=True, verify_integrity=True)
    
    for items in ['fio2', 'peep', 'set_vt', 'peak_paw']:
        group[items].interpolate(method='linear', limit=lag_list, limit_direction='forward', inplace=True)
        #group[['fio2', 'peep', 'set_vt', 'peak_paw']].fillna(method='ffill', limit=lag_list, inplace=True)
    
    group.patient_id = name
    
    try:
        patient_info = patient_df.ix[name]
        if patient_info['NMB'] == 'Yes':
            start_stop = patient_info['Start_End_NMB'].strip('[]').split(';')

            for items in start_stop:
                start, stop = items.strip('()').split(',')
                start = pd.to_datetime(start)
                stop = pd.to_datetime(stop)

                if start < stop:
                    group.loc[(group.index >= start) & (group.index <= stop), 'NMB'] = 1
                else:
                    rn_df.loc[(group.index >= stop) & (group.index <= start), 'NMB'] = 1
        group['NMB_sum'] = group['NMB'].rolling(window=lag_list, center=False).sum() 
    except KeyError:
        print(name, ' is missing from patient data')  

    try:
        group['ds_freq'] = group['label.' + ds_types] / group['breath_num']
    except ZeroDivisionError:
        print(group['label.' + ds_types, 'breath_num'])

    for lags in [4,6,8]:
        for items in ['ds', 'fio2', 'peep', 'peak_paw', 'set_vt']:
            if items == 'ds':
                group[items + '_lag_' + str(lags)] = np.nan
                group[items + '_lag_' + str(lags)] = group['ds_freq'].shift(lags)
                group[items + '_lag_' + str(lags)] = group[items + '_lag_' + str(lags)].astype(np.float64)
            else:
                group[items + '_lag_' + str(lags)] = group[items].shift(lags)                
                group[items + '_diff_lag_' + str(lags)] = group[items] - group[items + '_lag_' + str(lags)]

        try:
            group['NMB_lag'+str(lags)] = 0
            group['NMB_lag'+str(lags)] = group['NMB_sum'].shift(lags)
        except KeyError:
            pass

    resampled_df = pd.concat([resampled_df, group])

left_only     208
right_only     19
both            0
dtype: int64
100
left_only     44058
right_only      103
both              8
dtype: int64
101
left_only     836
right_only     61
both            1
dtype: int64
102
left_only     36288
right_only       16
both              6
dtype: int64
103
left_only     41255
right_only       95
both              4
dtype: int64
105
left_only     49755
right_only       17
both              7
dtype: int64
106
left_only     5841
right_only       8
both             1
dtype: int64
107
left_only     4594
right_only      41
both             1
dtype: int64
108
left_only     28153
right_only       12
both              9
dtype: int64
109
left_only     132865
right_only       120
both              29
dtype: int64
110
left_only     47945
both              7
right_only        7
dtype: int64
112
left_only     161676
right_only        57
both              22
dtype: int64
113
left_only     49009
right_only       43
both             11
dtype: int64
114
left_only  

In [29]:
resampled_df.describe()

Unnamed: 0,breath_num,label.ds,elapse_time,peep,patient_id,set_vt,peak_paw,fio2,peak_vol,NMB,...,ds_lag_8,fio2_lag_8,fio2_diff_lag_8,peep_lag_8,peep_diff_lag_8,peak_paw_lag_8,peak_paw_diff_lag_8,set_vt_lag_8,set_vt_diff_lag_8,NMB_lag8
count,729.0,319.0,319.0,888.0,909.0,781.0,546.0,897.0,319.0,729.0,...,288.0,657.0,647.0,653.0,636.0,406.0,307.0,604.0,524.0,530.0
mean,2562.290809,165.84953,2762.994912,7.605947,117.59956,426.804306,22.869072,52.472266,438.819626,0.078189,...,0.03333,51.124466,-2.280003,7.640314,-0.560834,23.133767,-1.357022,428.841361,0.01299,0.484906
std,3587.663474,263.369165,791.998355,3.012644,9.175734,71.809886,7.224132,18.947549,115.544632,0.268654,...,0.052077,16.291534,14.177748,2.765006,2.219853,6.168366,4.8744,75.824296,42.94687,1.464372
min,0.0,0.0,1528.110243,1.666667,100.0,270.0,10.666667,30.0,237.125064,0.0,...,0.0,30.0,-70.0,1.666667,-15.005917,10.666667,-14.701162,270.0,-165.0,0.0
25%,0.0,17.0,2130.600919,5.0,110.0,400.0,17.146631,40.0,387.716426,0.0,...,0.003199,40.0,-8.343605,5.0,-1.5,18.197592,-4.396171,400.0,0.0,0.0
50%,0.0,56.0,2659.770502,7.332418,119.0,440.0,22.877358,41.095274,421.327695,0.0,...,0.009213,43.333333,0.0,8.0,0.0,22.987097,-1.169389,440.0,0.0,0.0
75%,5621.0,197.0,3191.749467,10.0,125.0,480.0,28.865878,60.0,475.2117,0.0,...,0.040499,60.0,0.0,10.0,0.0,28.254411,1.388666,480.0,0.0,0.0
max,12243.0,1511.0,6978.331512,20.005917,132.0,630.0,40.143186,100.0,930.680628,1.0,...,0.361323,100.0,56.757455,20.005917,8.0,40.143186,14.824436,630.0,165.0,6.0


In [30]:
for lag_list in [4,6,8]:
    resampled_df['peep_diff_lag_'+str(lag_list)] = resampled_df['peep'] - resampled_df['peep_lag_'+str(lag_list)]
    resampled_df['fio2_diff_lag_'+str(lag_list)] = resampled_df['fio2'] - resampled_df['fio2_lag_'+str(lag_list)]
    resampled_df['set_vt_diff_lag_'+str(lag_list)] = resampled_df['set_vt'] - resampled_df['set_vt_lag_'+str(lag_list)]
    resampled_df['peak_paw_diff_lag_'+str(lag_list)] = resampled_df['peak_paw'] - resampled_df['peak_paw_lag_'+str(lag_list)]

In [31]:
#resampled_df.dropna(how = 'any', subset = ['fio2'], inplace = True)
#resampled_df.replace({0: np.nan}, inplace = True)

for times in ['6','12','18','24']:
    resampled_df['ds_lag_' + times + '_bin'] = resampled_df['ds_lag_' + times].apply(bin_samples)

resampled_df.to_csv('c:\Research_data\Analysis\Lagged_DS\lagged_analysis_' + ds_types + resample_freq + 'no_extrap.csv')

KeyError: 'ds_lag_12'

In [None]:
resampled_df.to_hdf('c:\Research_data\Analysis\Lagged_DS\ds_lagged_data_' + resample_freq +'no_extrap.h5', 'table', append=False)