select patient_id, dispense_year, IF(SUM(IF(CAT_chronic_illness = "Diabetes",1,0))>0,1,0) as diab, MAX(pat_regional_category) as regional_cat, SUM(1) as trans_cnt
from dthon.transactions_enriched
where patient_id <= 279200
AND patient_id >= 558353
AND dispense_year < 2017
group by patient_id, dispense_year 

To do (basic):
- divide by patient_id into prediction set for 2016 (kaggle) and the rest into train + test (done)
- identify automatically number of features (done)
- train the model (done)
- send prediction to kaggle (done)

Advanced:
- save a model
- add batch normalization (done)
- run on ML engine
- set pipeline to load from google storage into model

Additional model:
- RNN - data sorted by patient and  years, chunked into patients and padded, so that everytime there is 2008 - 2015

Tensorflow 1.1 + Python 3.5/3.6

In [1]:
import sys
print(sys.version)

3.6.1 |Continuum Analytics, Inc.| (default, Mar 22 2017, 19:54:23) 
[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]


In [2]:
import tensorflow as tf
import google.datalab.bigquery as bq
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

project_id = "the-d-team-164512"
table_name = 'ml_model'
scheme = bq.Table('dthon.'+table_name).schema

In [18]:
pivot =''
for i in (np.arange(2, len(scheme))):
    for year in (np.arange(2008, 2017)):
        field = str(scheme[i]['name'])
        pivot = pivot + ",SUM(IF(dispense_year = " + str(year) + ", " + field + ", 0)) as " + field + "_" + str(year) + """
"""
query = """SELECT patient_id 
""" + pivot + "FROM dthon." + table_name + """
GROUP BY patient_id"""

In [19]:
print(query)

SELECT patient_id 
,SUM(IF(dispense_year = 2008, diab, 0)) as diab_2008
,SUM(IF(dispense_year = 2009, diab, 0)) as diab_2009
,SUM(IF(dispense_year = 2010, diab, 0)) as diab_2010
,SUM(IF(dispense_year = 2011, diab, 0)) as diab_2011
,SUM(IF(dispense_year = 2012, diab, 0)) as diab_2012
,SUM(IF(dispense_year = 2013, diab, 0)) as diab_2013
,SUM(IF(dispense_year = 2014, diab, 0)) as diab_2014
,SUM(IF(dispense_year = 2015, diab, 0)) as diab_2015
,SUM(IF(dispense_year = 2016, diab, 0)) as diab_2016
,SUM(IF(dispense_year = 2008, regional_cat, 0)) as regional_cat_2008
,SUM(IF(dispense_year = 2009, regional_cat, 0)) as regional_cat_2009
,SUM(IF(dispense_year = 2010, regional_cat, 0)) as regional_cat_2010
,SUM(IF(dispense_year = 2011, regional_cat, 0)) as regional_cat_2011
,SUM(IF(dispense_year = 2012, regional_cat, 0)) as regional_cat_2012
,SUM(IF(dispense_year = 2013, regional_cat, 0)) as regional_cat_2013
,SUM(IF(dispense_year = 2014, regional_cat, 0)) as regional_cat_2014
,SUM(IF(dispense_year

In [20]:
dataset = pd.read_gbq(query, project_id)

Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
  Got page: 1; 2% done. Elapsed 7.48 s.
  Got page: 2; 4% done. Elapsed 12.33 s.
  Got page: 3; 7% done. Elapsed 18.26 s.
  Got page: 4; 9% done. Elapsed 23.73 s.
  Got page: 5; 11% done. Elapsed 29.57 s.
  Got page: 6; 13% done. Elapsed 34.86 s.
  Got page: 7; 16% done. Elapsed 40.44 s.
  Got page: 8; 18% done. Elapsed 45.85 s.
  Got page: 9; 20% done. Elapsed 51.72 s.
  Got page: 10; 22% done. Elapsed 57.6 s.
  Got page: 11; 25% done. Elapsed 63.72 s.
  Got page: 12; 27% done. Elapsed 70.3 s.
  Got page: 13; 29% done. Elapsed 75.88 s.
  Got page: 14; 31% done. Elapsed 81.61 s.
  Got page: 15; 34% done. Elapsed 87.07 s.
  Got page: 16; 36% done. Elapsed 92.68 s.
  Got page: 17; 38% done. Elapsed 98.17 s.
  Got page: 18; 40% done. Elapsed 103.78 s.
  Got page: 19; 43% done. Elapsed 109.45 s.
  Got page: 20; 45% done. Elapsed 115.64 s.
  Got page: 21; 47% done. Elapsed 120.6 s.
  Got page: 22; 49% don

In [7]:
predict_set = dataset.loc[(dataset['patient_id'] > 279200) & (dataset['patient_id'] < 558353), :]
model_set = dataset.loc[(dataset['patient_id'] <= 279200) | (dataset['patient_id'] >= 558353), :]

In [8]:
chunk_size = len(scheme) - 2
chunks_no = len(np.arange(2008, 2017))
labels_no = len(np.unique(dataset['diab_2016']))

In [9]:
print(labels_no)
print(chunks_no)
print(chunk_size)

2
9
3


In [257]:
features = model_set.drop(['patient_id'], axis=1)
features['diab_2016'] = 0
labels = model_set['diab_2016']
features_pred = predict_set.drop(['patient_id'], axis=1)
features_pred['diab_2016'] = 0
labels_pred = predict_set[['patient_id','diab_2016']]

In [259]:
logs_path = './logs/tf_model'

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.2)

batch_size =1000
nodes_1 = 500
nodes_2 = 300

def init_weights(nodes):
    return 2 / np.sqrt(nodes)

graph = tf.Graph()

with graph.as_default():
    inp = tf.placeholder(tf.float32, shape=[None, chunk_size*chunks_no])
    y = tf.placeholder(tf.int32)
    keep_prob = tf.placeholder(tf.float32)
    is_training = tf.placeholder(tf.bool)
    labels = tf.one_hot(y, depth=2)
    
    with tf.name_scope('parameters'):
        #hidden_1 = {
        #    'weights': tf.Variable(tf.truncated_normal([features_no, nodes_1], stddev=init_weights(features_no))),
        #    'biases': tf.Variable(tf.zeros([nodes_1]))}
        hidden_2 = {'weights': tf.Variable(tf.truncated_normal([nodes_1, nodes_2], stddev=init_weights(nodes_1))),
                   'biases': tf.Variable(tf.zeros([nodes_2]))}
        output = {'weights': tf.Variable(tf.truncated_normal([nodes_2, labels_no], stddev=init_weights(nodes_2))),
                  'biases': tf.Variable(tf.zeros([labels_no]))}
    with tf.name_scope('model'):
        inp2 = tf.reshape(inp, [-1, chunk_size, chunks_no])
        inp2 = tf.transpose(inp2, perm =[0, 2, 1])
        lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units = 64, dropout_keep_prob = keep_prob)
        outputs, state = tf.nn.dynamic_rnn(cell=lstm_cell, inputs=inp2, sequence_length=chunks_no)
        layer_1 = outputs[:,:,(chunks_no-2):chunks_no]
        layer_2 = tf.matmul(layer_1, hidden_2['weights']) + hidden_2['biases']
        layer_2 = tf.contrib.layers.batch_norm(inputs=layer_2, is_training=is_training, updates_collections=None)
        layer_2 = tf.nn.relu(layer_2)
        layer_2 = tf.nn.dropout(layer_2, keep_prob=keep_prob)
        logits = tf.matmul(layer_2, output['weights']) + output['biases']
        prediction = tf.nn.softmax(logits)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.name_scope('loss'):
        cost = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    with tf.name_scope('optimization'):
        with tf.control_dependencies(update_ops):
            optimizer = tf.train.AdamOptimizer().minimize(cost)
    with tf.name_scope('metrics'):
        #auc, _ = tf.metrics.auc(labels=labels, predictions=prediction)
        accuracy= tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits,1),tf.argmax(labels, 1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)
    #tf.summary.scalar('auc', auc)
    merged_summary = tf.summary.merge_all()

In [260]:
with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()
    epoch_no = 50
    summary_writer = tf.summary.FileWriter(logs_path, graph=graph)
    train_batches = int(np.ceil(features_train.shape[0]/batch_size))
    test_batches = int(np.ceil(features_test.shape[0]/batch_size))
    train_size = features_train.shape[0]
    for epoch in np.arange(epoch_no):
        order = np.random.permutation(train_size)
        features_train = features_train.iloc[order,:]
        labels_train = labels_train.iloc[order]
        acc_t = []
        for step in np.arange(train_batches):
            _, summary, acc = sess.run([optimizer, merged_summary, accuracy], feed_dict = {
                inp:features_train.iloc[(step*batch_size):((step+1)*batch_size),:],
                y: labels_train.iloc[(step*batch_size):((step+1)*batch_size)],
                keep_prob: 0.7,
                is_training: True
            })
            summary_writer.add_summary(summary, epoch * train_batches + step)
        for step in np.arange(test_batches):
            predict = np.argmax(logits.eval(feed_dict = 
                                            {inp:features_test.iloc[(step*batch_size):((step+1)*batch_size),:],
                                            keep_prob: 1.0,
                                            is_training: False}), axis = 1)
            acc_t.append(np.mean(predict == labels_test.iloc[(step*batch_size):((step+1)*batch_size)]))
        print('Epoch: ' + str(epoch) + ' Training accuracy: ' + str(acc) + ' Test accuracy: ' + str(np.mean(acc_t)))
    full_predict = list()
    for step in np.arange(features_pred.shape[0]):
        predict = prediction.eval(feed_dict = {inp:[features_pred.iloc[step,:]], 
                                               keep_prob: 1.0, 
                                               is_training: False})
        full_predict.append(predict[0,1])

Epoch: 0 Training accuracy: 0.972222 Test accuracy: 0.938770408163
Epoch: 1 Training accuracy: 0.952778 Test accuracy: 0.950532312925
Epoch: 2 Training accuracy: 0.961111 Test accuracy: 0.960227040816
Epoch: 3 Training accuracy: 0.961111 Test accuracy: 0.961351190476
Epoch: 4 Training accuracy: 0.969444 Test accuracy: 0.964596938776
Epoch: 5 Training accuracy: 0.969444 Test accuracy: 0.964965136054
Epoch: 6 Training accuracy: 0.95 Test accuracy: 0.965307823129
Epoch: 7 Training accuracy: 0.958333 Test accuracy: 0.965825680272
Epoch: 8 Training accuracy: 0.972222 Test accuracy: 0.965039965986
Epoch: 9 Training accuracy: 0.977778 Test accuracy: 0.96412585034
Epoch: 10 Training accuracy: 0.975 Test accuracy: 0.965072278912
Epoch: 11 Training accuracy: 0.977778 Test accuracy: 0.966054421769
Epoch: 12 Training accuracy: 0.963889 Test accuracy: 0.966090136054
Epoch: 13 Training accuracy: 0.955556 Test accuracy: 0.96575085034
Epoch: 14 Training accuracy: 0.969444 Test accuracy: 0.966054421769

In [184]:
len(full_predict)

279152

In [261]:
upload = pd.DataFrame({'patient_id': predict_set['patient_id'], 'prediction': full_predict})

In [262]:
upload.to_csv('upload_file.csv')