In [1]:
import pymongo
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sb
import dask.dataframe as ddf

# Open a connection to the default local MongoDB server and keep handles to
# the database and the two collections queried further down.
client = pymongo.MongoClient()
db = client['VentDB']
breathData = db['breath_collection']
RN = db['RN_collection']

%matplotlib inline

In [2]:
def bin_samples(x):
    """Binarise a value at the 0.05 threshold.

    Parameters
    ----------
    x : scalar
        Value to classify; may be missing (NaN / None).

    Returns
    -------
    0 when ``x <= 0.05``, 1 when ``x > 0.05``, and ``np.nan`` when ``x`` is
    missing.
    """
    # NaN never compares equal to anything (not even itself), so a missing
    # value has to be detected explicitly *before* the threshold comparison;
    # `x == np.nan` is always False.
    if pd.isnull(x):
        return np.nan
    return 0 if x <= 0.05 else 1

In [3]:
# Sanity check: number of documents returned by an unfiltered query on the
# breath collection.
# NOTE(review): Cursor.count() is deprecated/removed in newer PyMongo releases
# (Collection.count_documents({}) is the replacement) — confirm against the
# installed driver version before upgrading.
breathData.find().count()

1964338

In [4]:
# Label name used throughout the notebook: selects the 'label.<ds_types>'
# field in the Mongo projection / breath frame and is embedded in the output
# CSV file name.
ds_types = 'ds'

In [5]:
# Load the per-patient demographic table, index it by study ID (keeping the
# column as well) and cache it to HDF5.
# Raw strings are used for the Windows paths: sequences such as `\R` or `\D`
# are invalid escapes in a normal string literal and only work by accident.
patient_df = pd.read_csv(
    r'C:\Research_data\Demographic Data v2.csv',
    engine='c',
    usecols=['Study ID', 'Age', 'Gender', 'P/F Ratio', 'NMB', 'Start_End_NMB',
             'Hospital Discharge Date/Time', 'Discharge Location'],
)
patient_df = patient_df.set_index(['Study ID'], drop=False)
patient_df.to_hdf(r'c:\Research_data\Analysis\Lagged_DS\ds_patient_test.h5',
                  key='patient', format='table', append=False)

# Preview goes last so the cell actually displays it (a bare expression in
# the middle of a cell is silently discarded).
patient_df.head(15)

In [6]:
# Fields pulled from every breath document (no filter is applied).
breath_projection = {
    'patient_id': 1,
    'date_time': 1,
    'breath_num': 1,
    'breath_settings.peep': 1,
    'breath_settings.fio2': 1,
    'breath_settings.set_VT': 1,
    'breath_character.elapse_time': 1,
    'breath_character.peak_vol': 1,
    'breath_settings.peak_paw': 1,
    'label.' + ds_types: 1,
    '_id': 1,
}
data = breathData.find({}, breath_projection)

# RN documents are only kept when they carry at least one of FiO2 / PEEP.
rn_filter = {'$or': [{'FiO2': {'$exists': 1}}, {'PEEP': {'$exists': 1}}]}
rn_projection = {
    'patientID': 1,
    'date_time': 1,
    'FiO2': 1,
    'PEEP': 1,
    'SpO2': 1,
    'Set Vt': 1,
    'Plat': 1,
    '_id': 0,
}
rn = RN.find(rn_filter, rn_projection)

In [7]:
# Flatten the nested Mongo documents into DataFrames (nested keys become
# dotted column names such as 'breath_settings.peep').
# `json_normalize` lives at the top level in newer pandas and under
# `pd.io.json` in older releases; resolve whichever one is available.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

# Both cursors are materialised into lists first so json_normalize always
# receives a plain sequence of dicts rather than a live database cursor.
df = _json_normalize(list(data))
rn_df = _json_normalize(list(rn))

KeyboardInterrupt: 

In [None]:
# Clean the breath frame: timestamps, de-duplication, dtypes, short column
# names and plausibility filtering.
label_col = 'label.' + ds_types

# date_time is parsed as epoch seconds, interpreted as UTC, converted to
# US/Mountain and then made timezone-naive again.
df['date_time'] = (
    pd.to_datetime(df['date_time'], unit='s')
    .dt.tz_localize('UTC')
    .dt.tz_convert('US/Mountain')
    .dt.tz_localize(None)
)

# Drop breaths that have neither FiO2 nor PEEP recorded.
df = df.dropna(axis=0, how='all', subset=['breath_settings.fio2', 'breath_settings.peep'])

# De-duplicate per patient: keying on date_time alone would discard one
# patient's breath whenever another patient happens to share the timestamp.
df = df.drop_duplicates(subset=['patient_id', 'date_time'], keep='last')

df = df.set_index(['patient_id'], drop=False).sort_index()
df = df.astype({'breath_settings.peep': np.float64,
                'breath_num': np.float64,
                label_col: np.float64})

df = df.rename(columns={'breath_settings.fio2': 'fio2',
                        'breath_settings.peep': 'peep',
                        'breath_settings.set_VT': 'set_vt',
                        'breath_character.elapse_time': 'elapse_time',
                        'breath_character.peak_vol': 'peak_vol',
                        'breath_settings.peak_paw': 'peak_paw'})

# Keep only breaths with elapse_time < 10000 and 100 < peak_vol < 1000.
# .copy() detaches the result from the unfiltered frame so later in-place
# operations do not act on a view.
plausible = (df['elapse_time'] < 10000) & (df['peak_vol'] < 1000) & (df['peak_vol'] > 100)
df = df[plausible].copy()

In [None]:
# Clean the RN frame: parse timestamps, harmonise column names with the
# breath frame, drop rows with neither FiO2 nor PEEP, and index by
# (patient_id, date_time) while keeping both as columns.
rn_df['date_time'] = pd.to_datetime(rn_df['date_time'], format='%m/%d/%Y %H:%M:%S')
rn_df = rn_df.rename(columns={'patientID': 'patient_id', 'FiO2': 'fio2', 'PEEP': 'peep',
                              'Set Vt': 'set_vt', 'SpO2': 'spo2', 'Plat': 'plat'})
rn_df = rn_df.dropna(axis=0, how='all', subset=['fio2', 'peep'])

# De-duplicate per patient: keying on date_time alone would throw away rows
# of every other patient that share the same timestamp.
rn_df = rn_df.drop_duplicates(subset=['patient_id', 'date_time'], keep='last')
rn_df = rn_df.set_index(['patient_id', 'date_time'], drop=False)

In [None]:
# Order the breaths by (patient_id, date_time) via a temporary two-level
# index, then fall back to a plain patient_id index. Both key columns are
# kept as regular columns throughout.
df = (
    df.set_index(['patient_id', 'date_time'], drop=False)
      .sort_index()
      .set_index(['patient_id'], drop=False)
)

In [None]:
# Cache both cleaned frames to HDF5 (table format, overwriting any existing
# key). Raw strings are used because sequences such as `\R`, `\A` or `\d`
# are invalid escapes in a normal string literal.
df.to_hdf(r'c:\Research_data\Analysis\Lagged_DS\ds_breath_data.h5',
          key='breath', format='table', append=False)

# The RN frame is written without its (patient_id, date_time) index; both
# remain available as ordinary columns. Writing from a temporary copy leaves
# rn_df itself (and its index) untouched.
rn_df.reset_index(drop=True).to_hdf(r'c:\Research_data\Analysis\Lagged_DS\ds_rn_data.h5',
                                    key='rn', format='table', append=False)

In [None]:
#df = pd.read_hdf('c:\Research_data\Analysis\Lagged_DS\ds_breath_data.h5')
#rn_df = pd.read_hdf('c:\Research_data\Analysis\Lagged_DS\ds_rn_data.h5')

In [None]:
# Inspect the column dtypes of the breath frame.
df.dtypes

In [None]:
# Inspect the column dtypes of the RN frame.
rn_df.dtypes

In [None]:
# Non-null value count for every column of the breath frame.
df.count()

In [None]:
# Earliest timestamp in the breath frame.
df.date_time.min()

In [None]:
# Latest timestamp in the breath frame.
df.date_time.max()

In [None]:
# Non-null value count for every column of the RN frame.
rn_df.count()

In [None]:
# Earliest timestamp in the RN frame.
rn_df.date_time.min()

In [None]:
# Latest timestamp in the RN frame.
rn_df.date_time.max()

In [None]:
import numba

# lag_list is the rolling-window length (in resampled rows) used for NMB_sum;
# resample_freq is the pandas offset alias the per-patient data is binned to
# and is also embedded in the output file names.
lag_list = 18        # needs to be 36/resample_freq (presumably a 36-hour window: 36 h / 2 h = 18 — confirm)
resample_freq = '2H'   # bin width to resample to (2 hours)

# NOTE: a bare `numba.jit()` call used to sit here. It only built a decorator
# and discarded it, so the function below was never compiled; the call has
# been dropped rather than turned into a decorator because the function
# operates on pandas rows, not on plain numeric arrays.
def link_groups(row, items):
    """Pick the value for column ``items`` from a merged row.

    After a merge, overlapping columns appear as ``<items>_x`` (left frame)
    and ``<items>_y`` (right frame).

    Parameters
    ----------
    row : mapping or pandas.Series
        Row that contains both ``items + '_x'`` and ``items + '_y'``.
    items : str
        Base column name.

    Returns
    -------
    The ``_x`` value, or the ``_y`` value when the ``_x`` value is missing.
    """
    left_value = row[items + '_x']
    # pd.isnull handles NaN, None and NaT and, unlike np.isnan, does not
    # raise on non-float values such as string identifiers.
    if pd.isnull(left_value):
        return row[items + '_y']
    return left_value

# Per-patient pipeline: merge breath data with RN charting, resample to
# `resample_freq`, flag NMB periods, and derive lagged features. The
# per-patient results are stacked into `resampled_df` at the end.
label_col = 'label.' + ds_types
merge_cols = ['fio2', 'peep', 'set_vt', 'patient_id']   # columns present in both frames
lag_feature_cols = ['fio2', 'peep', 'peak_paw', 'set_vt']
# NOTE(review): shift() counts resampled rows, so with resample_freq = '2H'
# a lag of 6 means 12 hours — confirm this is the intended unit.
lag_steps = [6, 12, 18, 24]

# Both frames carry their key columns as index levels *and* as columns;
# dropping the index keeps the groupby / merge keys unambiguous.
grouped_df = df.reset_index(drop=True).groupby('patient_id')
grouped_rn_df = rn_df.reset_index(drop=True).groupby('patient_id')

# Collect the per-patient frames and concatenate once at the end instead of
# growing a DataFrame inside the loop (which copies everything each time).
resampled_groups = []

for name, group in grouped_df:
    # Only the RN lookup is guarded, so a KeyError raised by the merge logic
    # itself is not mistaken for "patient has no RN data".
    try:
        rn_group = grouped_rn_df.get_group(name)
    except KeyError:
        print(name, ' is missing')
    else:
        group = pd.merge(group, rn_group, on='date_time', how='outer', indicator=True)

        # Overlapping columns come back as <col>_x (breath) / <col>_y (RN):
        # keep the breath value and fall back to the RN value when missing.
        for items in merge_cols:
            group[items] = group[items + '_x'].combine_first(group[items + '_y'])
            group = group.drop([items + '_x', items + '_y'], axis=1)

        print(group['_merge'].value_counts())

    print(name)
    group = group.drop_duplicates(subset='date_time', keep='first')
    group = group.set_index(['date_time'], drop=False, verify_integrity=True).sort_index()

    group = group.resample(resample_freq).agg({
        label_col: 'sum', 'breath_num': 'count', 'patient_id': 'max', 'elapse_time': 'mean',
        'fio2': 'mean', 'peep': 'mean', 'peak_paw': 'mean', 'set_vt': 'mean', 'peak_vol': 'mean'})
    group['NMB'] = 0

    # Expose the bin timestamp as a regular column as well as the index
    # (the rename only matters if the resampled index came back unnamed).
    group = group.reset_index(drop=False).rename(columns={'index': 'date_time'})
    group = group.set_index(['date_time'], drop=False, verify_integrity=True)

    group['patient_id'] = name

    # --- Neuromuscular-blockade (NMB) flag ---------------------------------
    try:
        patient_info = patient_df.loc[name]
    except KeyError:
        print(name, ' is missing from patient data')
    else:
        if patient_info['NMB'] == 'Yes':
            # Start_End_NMB is parsed as '[(start,stop);(start,stop);...]'.
            for interval in patient_info['Start_End_NMB'].strip('[]').split(';'):
                nmb_start, nmb_stop = interval.strip().strip('()').split(',')
                nmb_start = pd.to_datetime(nmb_start)
                nmb_stop = pd.to_datetime(nmb_stop)

                # Tolerate intervals that were recorded in reverse order.
                if nmb_stop < nmb_start:
                    nmb_start, nmb_stop = nmb_stop, nmb_start
                group.loc[(group.index >= nmb_start) & (group.index <= nmb_stop), 'NMB'] = 1

        group['NMB_sum'] = group['NMB'].rolling(window=lag_list, center=False).sum()

    # Fraction of breaths in each bin carrying the label. pandas does not
    # raise on division by zero: bins without breaths yield NaN/inf.
    group['ds_freq'] = group[label_col] / group['breath_num']

    # --- Lagged features ----------------------------------------------------
    has_nmb_sum = 'NMB_sum' in group.columns
    for lags in lag_steps:
        suffix = '_lag_' + str(lags)

        group['ds' + suffix] = group['ds_freq'].shift(lags).astype(np.float64)

        for items in lag_feature_cols:
            group[items + suffix] = group[items].shift(lags)
            group[items + '_diff' + suffix] = group[items] - group[items + suffix]

        # Patients absent from patient_df have no NMB_sum; their lag is 0.
        if has_nmb_sum:
            group['NMB_lag' + str(lags)] = group['NMB_sum'].shift(lags)
        else:
            group['NMB_lag' + str(lags)] = 0

    resampled_groups.append(group)

resampled_df = pd.concat(resampled_groups) if resampled_groups else pd.DataFrame()

In [None]:
# Summary statistics for the stacked per-patient frame.
resampled_df.describe()

In [None]:
# Difference between each ventilator setting and its lagged value, for every
# lag step. The loop variable is intentionally *not* named `lag_list`: that
# name holds the rolling-window length defined earlier in the notebook, and
# reusing it here would silently overwrite it for any cell run afterwards.
for lag_step in [6, 12, 18, 24]:
    for setting in ['peep', 'fio2', 'set_vt', 'peak_paw']:
        lag_col = setting + '_lag_' + str(lag_step)
        diff_col = setting + '_diff_lag_' + str(lag_step)
        resampled_df[diff_col] = resampled_df[setting] - resampled_df[lag_col]

In [None]:
#resampled_df.dropna(how = 'any', subset = ['fio2'], inplace = True)
#resampled_df.replace({0: np.nan}, inplace = True)

# Add a binarised copy of every lagged ds column by applying bin_samples
# element-wise.
for times in ['6', '12', '18', '24']:
    resampled_df['ds_lag_' + times + '_bin'] = resampled_df['ds_lag_' + times].apply(bin_samples)

# Raw string for the Windows path: sequences such as `\R`, `\A` or `\l` are
# invalid escapes in a normal string literal and only work by accident.
resampled_df.to_csv(r'c:\Research_data\Analysis\Lagged_DS\lagged_analysis_' + ds_types + resample_freq + 'no_extrap.csv')

In [None]:
# Persist the lagged frame to HDF5 under the key 'table', overwriting any
# existing data. Raw string for the Windows path (a normal literal contains
# invalid escapes such as `\R` and `\d`); `key` is passed by keyword to match
# the other to_hdf calls in this notebook.
resampled_df.to_hdf(r'c:\Research_data\Analysis\Lagged_DS\ds_lagged_data_' + resample_freq + 'no_extrap.h5',
                    key='table', append=False)