In [1]:
import os, json, datetime, sys
from collections import defaultdict

In [2]:
from massageData import runPipeline, readData
from constants import PROCESSED_PATH, RAW_PATH, DATA_PATH
from rnn_utils import *

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [4]:
# Location of the pickled id split under RAW_PATH, handed to getSplitIds (from rnn_utils).
# presumably yields the per-split id lists that DataBatch consumes below — TODO confirm
# NOTE(review): the file is a pickle; only load it from a trusted source.
idsPath = os.path.join(RAW_PATH, 'd_ids_split.pickle')
split_ids = getSplitIds(idsPath)

In [5]:
# Batch loader over the files in DATA_PATH/bypt_old, partitioned by split_ids.
# NOTE(review): batchSize=75 is a magic number; consider moving it to the config cell.
datapath = os.path.join(DATA_PATH, 'bypt_old')
db = DataBatch(datapath, split_ids, batchSize=75)

# Sanity check: number of entries in each split known to the loader.
print([(k, len(v)) for k, v in db.files.items()])

# Training iterates the 'devel' split; evaluation uses the 'test' split.
trainBatches = db.getBatchIterator('devel')
testBatches = db.getBatchIterator('test')

[('devel', 318), ('test', 97), ('valid', 88)]


In [6]:
# Training hyper-parameters used by the graph-construction and training cells.
# Step size passed to tf.train.AdamOptimizer.
learning_rate = 0.0001
# Upper bound on `step`, which the training cell increments once per full pass over trainBatches.
max_train_steps = 10000
# Training continues only while the relative change between consecutive batch losses
# exceeds this value.
# NOTE(review): 10 means a >1000% change, so the loop likely stops after the first pass — confirm intent.
loss_thresh = 10
# Progress is printed every `display_step` steps (and at step 1).
display_step = 100
# Passed to the batch iterators' setSeed(); presumably fixes batch order — TODO confirm.
seed = 1

In [7]:
# Tensor dimensions shared by the placeholders and the LSTM stack:
#   H - length of the time axis (axis 1 of the xs / yt placeholders)
#   F - input feature width (last axis of xs; also the first LSTM layer's unit count)
#   S - output width / number of classes (last axis of yt; matches nclasses=4 given to prepareData)
H, F, S = 24, 200, 4

In [8]:
def RNN(xs, batch_size):
    """Run `xs` through a two-layer stacked LSTM and return the per-step outputs.

    The stack is built inside the "MyRNN" variable scope with layer widths
    F then S, so the returned tensor's last axis has S entries.

    Args:
        xs: float32 input tensor for tf.nn.dynamic_rnn (the caller feeds a
            [None, H, F] placeholder).
        batch_size: scalar int tensor (or int) used to size the all-zero
            initial state.

    Returns:
        The output tensor produced by tf.nn.dynamic_rnn; the final LSTM
        state is discarded.
    """
    with tf.variable_scope("MyRNN"):
        layer_widths = [F, S]
        stacked_cell = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.LSTMCell(width) for width in layer_widths]
        )
        zero_state = stacked_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        output, _ = tf.nn.dynamic_rnn(stacked_cell, xs, initial_state=zero_state)
    return output

In [9]:
# Build the TF1 graph: placeholders -> stacked LSTM -> loss / optimizer -> evaluation ops.
# Clear the default graph first so this cell starts from an empty graph when re-run.
tf.reset_default_graph()

# Inputs are [batch, H, F]; targets are [batch, H, S] (the class is read as the
# argmax over the last axis below).
xs = tf.placeholder(shape=[None, H, F], dtype=tf.float32)
yt = tf.placeholder(shape=[None, H, S], dtype=tf.float32)
# Fed explicitly on every run because RNN() uses it to size the zero initial state.
batch_size = tf.placeholder(tf.int32, shape=[], name='batch_size')
output = RNN(xs, batch_size)

# tf.nn.l2_loss already reduces to a scalar (sum(err ** 2) / 2), so the former
# tf.reduce_mean wrapper was a no-op and has been dropped; values are unchanged.
# NOTE(review): this is a sum, not a mean, so its magnitude scales with the
# number of rows in the batch — confirm that is intended.
loss = tf.nn.l2_loss(yt - output)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train = optimizer.minimize(loss)

# Evaluation ops: per-time-step predicted class vs. labelled class.
prediction = tf.nn.softmax(output)
precat = tf.argmax(prediction, 2)
labels = tf.argmax(yt, 2)
correct_pred = tf.equal(precat, labels)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Confusion matrix over all (sample, time-step) pairs flattened together;
# reuses `precat` instead of rebuilding the same argmax op.
confmat = tf.confusion_matrix(
    labels=tf.reshape(labels, [-1]),
    predictions=tf.reshape(precat, [-1])
)

In [10]:
# Train the network, then evaluate it on the test split.
# pro / pre / tru accumulate, across test batches, the raw network outputs,
# the predicted class indices and the labelled class indices; the metrics
# cells further down read them.
pro, pre, tru = None, None, None


def _batch_feed(X, Y):
    """Build the feed_dict for one batch; batch_size is the leading dimension of X."""
    return {xs: X, yt: Y, batch_size: X.shape[0]}


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter("output", sess.graph)
    try:
        headers = None

        prev_lss = 1
        lss_ratio = 1
        step = 0
        first = True
        # `first` forces at least one pass over the training data. After that
        # the loop continues only while the relative change between the last
        # two batch losses exceeds loss_thresh and the step budget is not spent.
        # NOTE(review): confirm the direction and magnitude of the loss_thresh
        # comparison; a large threshold ends training after a single pass.
        while first or (lss_ratio > loss_thresh and step < max_train_steps):
            for batch in trainBatches.setSeed(seed):
                if headers is None:
                    # First batch only: fetch the column headers and show the
                    # headers prepareData reports back.
                    headers = db.getHeaders()
                    _, trn_X, trn_Y, hdrscut = prepareData(batch, headers, nclasses=4, debug=True)
                    print(hdrscut)
                else:
                    _, trn_X, trn_Y, hdrscut = prepareData(batch, headers, nclasses=4)

                # Run optimization op (backprop)
                lss, _ = sess.run([loss, train], feed_dict=_batch_feed(trn_X, trn_Y))
                lss_ratio = abs(prev_lss - lss) / prev_lss
                prev_lss = lss

            if step % display_step == 0 or step == 1:
                # Accuracy is measured on the last training batch of this pass only.
                acc = sess.run(accuracy, feed_dict=_batch_feed(trn_X, trn_Y))
                print("Step %5d | L2 Loss = %.4f, Train Accuracy = %.3f" % (step, lss, acc))

            first = False
            step += 1

        print("Optimization Finished!")
        # Report the freshly computed final metrics (previously the stale
        # `lss` / `acc` from earlier in the loop were printed instead).
        trn_lss, trn_acc = sess.run([loss, accuracy], feed_dict=_batch_feed(trn_X, trn_Y))
        print("Step %5d | L2 Loss = %.4f, Train Accuracy = %.3f" % (step, trn_lss, trn_acc))

        for test in testBatches.setSeed(seed):
            _, tst_X, tst_Y, _ = prepareData(test, headers, nclasses=4)
            prob, preb, trub = sess.run([output, precat, labels],
                                        feed_dict=_batch_feed(tst_X, tst_Y))
            if pre is None:
                pro, pre, tru = prob, preb, trub
            else:
                pro = np.concatenate([pro, prob], axis=0)
                pre = np.concatenate([pre, preb], axis=0)
                tru = np.concatenate([tru, trub], axis=0)

            # Running accuracy over every test batch seen so far; the last
            # line printed is the accuracy over the whole test split.
            cor = (pre == tru).flatten()
            print('Test accuracy:', np.sum(cor) / len(cor))
    finally:
        # Always release the event-file writer, even if training or evaluation raised.
        writer.close()

  means = np.nanmean(X, axis=1)
  xmeans = np.nanmean(means, axis=0)


input:
 75 x  24 x 200
  v x   h x   f
output:
 75 x  24 x   4
  v x   h x   s

['AGE' 'GENDER' 'ETHNICITY' 'P WEIGHT' 'P HEIGHT' 'P SYSTOLIC BP'
 'P DIASTOLIC BP' 'P TEMPERATURE' 'P RESPIRATORY RATE' 'P HEART RATE'
 'P SPO2' 'P CREATININE' 'P UREA NITROGEN' 'P HEMATOCRIT'
 'P PLATELET COUNT' 'P WHITE BLOOD CELLS' 'P HEMOGLOBIN' 'P MCHC' 'P MCH'
 'P MCV' 'P RED BLOOD CELLS' 'P RDW' 'P POTASSIUM' 'P SODIUM' 'P CHLORIDE'
 'P BICARBONATE' 'P ANION GAP' 'P GLUCOSE' 'P MAGNESIUM' 'P PHOSPHATE'
 'P CALCIUM' 'P INR(PT)' 'P PT' 'P PTT' 'P PH' 'P PH' 'P SPECIFIC GRAVITY'
 'P LYMPHOCYTES' 'P MONOCYTES' 'P NEUTROPHILS' 'P BASOPHILS'
 'P EOSINOPHILS' 'P BASE EXCESS' 'P CALCULATED TOTAL CO2' 'P PO2' 'P PCO2'
 'P LACTATE' 'P ALANINE AMINOTRANSFERASE (ALT)'
 'P ASPARATE AMINOTRANSFERASE (AST)' 'P PROTEIN' 'P BILIRUBIN'
 'P ALKALINE PHOSPHATASE' 'P KETONE' 'P UROBILINOGEN' 'P GLUCOSE'
 'P ALBUMIN' 'P URINE COLOR' 'P URINE APPEARANCE' 'P BLOOD' 'P BILIRUBIN'
 'P NITRITE' 'P YEAST' 'P WBC' 'P RBC' 'P LE

In [11]:
# Output paths for this run's metrics and AUC tables. The timestamp is taken
# once so both files always share the same suffix (two separate now() calls
# could straddle a second boundary and produce mismatched names).
run_stamp = datetime.datetime.now().strftime('%m%d%y%H%M%S')
metricsfn = os.path.join(PROCESSED_PATH, 'rnn_metrics_%s.csv' % run_stamp)
aucsfn = os.path.join(PROCESSED_PATH, 'rnn_aucs_%s.csv' % run_stamp)

In [12]:
# Write the test-set results to the two CSV paths built above.
# tru / pre are the labelled and predicted class indices stacked over all test batches.
saveMetrics(tru, pre, metricsfn)
# NOTE(review): `pro` holds the raw `output` tensor values, not the softmax `prediction`;
# confirm saveAUCs expects unnormalised scores.
saveAUCs(tru, pro, aucsfn)

  ppv = rs[0]/(rs[0]+rs[1])
  spe = rs[2]/(rs[0]+rs[1])
