In [1]:
import pandas as pd

from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL
from sqlalchemy import text
import settings
import time
import numpy as np
import pickle
import json

def db_connect():
    """
    Performs database connection using database settings from settings.py.
    Returns sqlalchemy engine instance
    """
    return create_engine(URL(**settings.DATABASE))

db = db_connect()

In [2]:
command =  '''select icustay_id, first_careunit from icustays;'''
services = pd.read_sql(command, db)

In [3]:
services.to_pickle('../Pickles/services.p')

In [4]:
#build new SQL tables

#get list of all tables in db
from sqlalchemy import MetaData
start_time = time.time()
m = MetaData(bind=db)
m.reflect()
tables = list(m.tables.keys())
time.time()-start_time

# from list of tables, get all w_chartevent tables
import re
tab_re = re.compile(r'^(w_chartevents_.*)')
chart_tables = []
for table in tables:
    name=re.findall(tab_re, table)
    if name:
        chart_tables.append(name[0])
w_chart_tables = sorted(chart_tables)

## Choose your w_charts to work from now!

In [12]:
## START TIME!!!!!!!
start_time = time.time()
#pick w_charts here
print w_chart_tables

[u'w_chartevents_1', u'w_chartevents_10', u'w_chartevents_11', u'w_chartevents_12', u'w_chartevents_13', u'w_chartevents_14', u'w_chartevents_2', u'w_chartevents_3', u'w_chartevents_4', u'w_chartevents_5', u'w_chartevents_6', u'w_chartevents_7', u'w_chartevents_8', u'w_chartevents_9']


In [None]:
#combine pickles (w_chart_tabes = list of tables you want to join and work on )
df = combine_pickles(w_chart_tables)

In [None]:
print len(df)
df.head()

In [None]:
#get what items to cut
keep_items = pd.read_csv('../TextFiles/Keep_items.txt', names = ['itemsid'])
len(keep_items)

In [None]:
keep_list = list(keep_items.itemsid.values)
len(keep_list)

## Drop BAD Items 

In [None]:
#drop bad items
count = 0
total = len(df)
df = df[df.itemid.isin(keep_list)]
print 'done!'

In [None]:
print len(df)
df.head()

## Map old items to new items

In [None]:
def items_to_item(df):
    '''
    FUNCTION: map old item IDs to new item ID
    INPUT = data frame
    OUTPUT = data frame with new item IDs in place
    '''
    from collections import defaultdict
    import json
    items_dict = defaultdict(list)

    with open("../JSONs/items_to_item.json", 'r') as f:
        items_dict = json.load(f)
    items_dict = {int(k):int(v) for k,v in items_dict.items()}
    df['itemid'].replace(items_dict, inplace = True)
    return df

df = items_to_item(df)

In [None]:
len(df.itemid.unique())

## Get dictionary of OUT times from ICU

In [None]:
que = '''select * from icustay_detail;'''
icustays_df = pd.read_sql(que, db)
icustays_df = icustays_df[['icustay_id', 'outtime']]
print icustays_df.head()
icustays_df = icustays_df[icustays_df.icustay_id.isin(df.icustay_id.unique())]
print len(icustays_df.icustay_id.unique())

In [None]:
dt_dict = icustays_df.set_index('icustay_id')['outtime'].to_dict()
print "DONE!"

## Flatten chart

In [None]:
df.columns


In [None]:
# grouped_chart = df.groupby('icustay_id')
# pat_df = grouped_chart.get_group(grouped_chart.groups.keys()[1])
# group_item = pat_df.groupby('itemid')

In [None]:
# group_item.get_group(group_item.groups.keys()[1])[['charttime','value','valuenum','valueuom']].values   

In [None]:
def flatten_wchart(df, dt_dict, hours_before = 24):
    '''
    after getting all w_charts into pandas:
    build features!
    '''
    print 'starting up...', 
    #start_time = time.time()
    count = 0
    row_list = []
    keep_columns = ['subject_id', 'charttime', 'value', 'valuenum', 'valueuom', 'icustay_id', 'itemid']
    df = df[keep_columns]
    
    grouped_chart = df.groupby('icustay_id')
    total = len(grouped_chart.groups.keys())
    
    for patient in grouped_chart.groups.keys():
        row_dict={}
        pat_df = grouped_chart.get_group(patient)
        group_item = pat_df.groupby('itemid')
        row_dict['icustay_id'] = patient
        distime = dt_dict[patient]
        for itemid in group_item.groups.keys():
            item_df = group_item.get_group(itemid)
            if len(item_df) < 1:
                row_dict[itemid] = np.nan
            else:
                row_dict[itemid] = item_df[(distime - item_df.charttime) > pd.Timedelta(str(hours_before) + 'hours')].sort_values(by='charttime', 
                                        ascending=False)[['charttime','value','valuenum','valueuom']].values.tolist()
        row_list.append(row_dict)
        count += 1 #print progress
        perc = float(count)/total * 100
        print '\r{0} %done'.format(perc),
        
    features_df = pd.DataFrame(row_list)
    row_list = None
    row_dict = None
    grouped_chart = None
    
    return features_df

## Change hours before to chop out as desired!

In [None]:
hours_before = 48
final_df = flatten_wchart(df, dt_dict, hours_before)

In [None]:
final_df.shape

## Change from Item ID to Item Name

In [None]:
with open("../JSONs/item_to_name.json", 'r') as f:  # Get one item ID for item groups
    itemTname_key_dict = json.load(f)
itemTname_key_dict = {int(k):str(v) for k,v in itemTname_key_dict.items()}


In [None]:
final_df.rename(columns = itemTname_key_dict, inplace=True)

## Add target and demographics

In [None]:
patient_df = pd.read_csv('../TextFiles/FINAL_patient_list.csv')
patient_df = patient_df.drop(['Unnamed: 0', 'index', 'hadm_id'], axis = 1)

In [None]:
final_patient_df = patient_df.merge(final_df, on = 'icustay_id')

In [None]:
final_patient_df.shape

## Write FINAL df to pickles

In [None]:
name = '../Pickles/%d_final_patient_df.p' %(hours_before)
final_patient_df.to_pickle(name)
print "TIME ELAPSED: %f" %d(time.time()-start_time)

#### store final df to several pickle files as backup

In [6]:
def save_file_pickle(df, num_split, hours_before):
    length = len(df)
    for i in range(num_split):
        start_time = time.time()
        filename = '../Pickles/backup/' + str(hours_before) + "short_final_df_' + str(i) + '.p'
        end = (i+1)*length/num_split
        start = (i)*length/num_split
        df[start:end].to_pickle(filename)
        print '\r', time.time()-start_time, filename, 'done',

SyntaxError: EOL while scanning string literal (<ipython-input-6-6c5aa4644b97>, line 5)

In [79]:
save_file_pickle(final_patient_df, 4, hours_before)


71.5171160698 ../Pickles/backup/short_final_df_3.p done


### After flattening, split and send to pickle

In [7]:
def combine_pickles(tables):
    dflist = []
    count = 1
    for table in tables:
        print "\r","working on %s, %d/%d tables" % (table, count, len(tables)),
        filename = "../Pickles/" + table + '.p'
        dflist.append(pickle.load(open(filename, 'rb')))
        count +=1
    df = pd.concat(dflist)
    dflist=None
    return df

In [8]:
def get_wchart_data(tables, db):
    '''Gets all tables in list from sql server, loads them into memory and stores as .pickle'''
    for table in tables:
        filename = table + '.p'
        start_time = time.time()
        command =  '''select * from %s''' % table
        sql = text(command)
        temp_df = pd.read_sql(sql, db)
        print(table)
        print(time.time()-start_time)
        print "Writing %s to pickle... %s" % (table, filename)
        filename = table + '.p'
        start_time = time.time()
        pickle.dump(temp_df, open(filename , "wb" ) )
        del temp_df
        print "Done writing pickle", time.time()-start_time
    return 'Done'

In [9]:
def get_most_recent(entries):
    '''Sort by time, return most recent data '''
    sorted_entries = sorted(entries, key = lambda x: x[0])
    return sorted_entries[-1][1]
