## Loading Data

In [98]:
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pydot
import graphviz

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [2]:
ekg_denoised = pd.read_pickle('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised.pkl')
ekg_denoised = ekg_denoised.sample(frac=1)

In [3]:
ekg_denoised.head()

Unnamed: 0,ICD9_CODE,TEXT
815,4019,atrial fibrillation probable prior inferior my...
1661,4280,sinus rhythm first degree a-v delay prior ante...
2541,4280,atrial fibrillation with a rapid ventricular r...
563,4019,sinus bradycardia. leftward precordial r wave ...
1378,4019,baseline artifact sinus tachycardia low limb l...


### Converting to Tensorflow dataset

In [4]:
training_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(ekg_denoised['TEXT'].values, tf.string),
            tf.cast(ekg_denoised['ICD9_CODE'].values, tf.int32)
        )
    )
)

### Setting up train, dev, and test datasets 

In [48]:
AUTOTUNE = tf.data.AUTOTUNE
DATASET_SIZE = len(training_dataset)

train_size = int(0.8 * DATASET_SIZE)
val_size = int(0.1 * DATASET_SIZE)
test_size = int(0.1 * DATASET_SIZE)

full_dataset = training_dataset
train_dataset = full_dataset.take(train_size).cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.skip(test_size).cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.take(test_size).cache().prefetch(buffer_size=AUTOTUNE)


In [66]:
for text_batch, label_batch in train_dataset.take(3):
    print(f'EKG reading: {text_batch.numpy()}')
    print(f'label: {label_batch}\n')

EKG reading: b'atrial fibrillation probable prior inferior myocardial infarctionand consider also left anterior fascicular block prior anterior myocardial infarction nonspecific anterolateral st-t wave abnormalities consider left ventricular hypertrophy since previous tracing of same date, no significant change'
label: 4019

EKG reading: b'sinus rhythm first degree a-v delay prior anteroseptal myocardial infarction no previous tracing available for comparison'
label: 4280

EKG reading: b'atrial fibrillation with a rapid ventricular response at approximately 120. right bundle-branch block left anterior hemiblock. occasional ventricular premature beat. non-specific repolarization changes. compared to the previous tracing of ventricular ectopic activity is new. otherwise, no significant change.'
label: 4280



In [67]:
for text_batch, label_batch in val_dataset.take(3):
    print(f'EKG reading: {text_batch.numpy()}')
    print(f'label: {label_batch}\n')

EKG reading: b'sinus rhythm lateral t wave changes are nonspecific slight inferior st segment elevation, consider inferior myocardial infarction clinical correlation is advised'
label: 41401

EKG reading: b'atrial fibrillation. marked right axis deviation. left bundle-branch block. late precordial qrs transition. clinical correlation is suggested for possible prior anterolateral myocardial infarction, although not diagnostic. since the previous tracing of ventricular ectopy is not seen. tracing 2'
label: 4280

EKG reading: b'sinus rhythm. borderline low qrs voltage is non-specific and may be within normal limits. since the previous tracing of atrial fibrillation is now absent. tracing 2'
label: 4280



In [68]:
for text_batch, label_batch in test_dataset.take(3):
    print(f'EKG reading: {text_batch.numpy()}')
    print(f'label: {label_batch}\n')

EKG reading: b'sinus rhythm. prolonged a-v conduction. left bundle-branch block. compared to the previous tracing of transmural inferior wall myocardial infarction was previously present if left bundle-branch block patterning is absent, although the patterning barely resembles left bundle-branch block. the current tracing has a wider qrs interval.'
label: 4019

EKG reading: b'sinus rhythm lateral st-t changes may be due to myocardial ischemia since previous tracing, atrial fibrillation resolved'
label: 42731

EKG reading: b'sinus tachycardia. left atrial abnormality. diffuse non-diagnostic repolarization abnormalities. compared to the previous tracing of no definite change.'
label: 4280



## Modeling 

### Setting up BERT Pre-Process Model and SmallBERT

In [64]:
bert_preprocess_model = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

In [81]:
bert_model = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1')

### Running sample text

In [80]:
test_text = ['sinus rhythm. prolonged a-v conduction. left bundle-branch block. compared to the previous tracing of transmural inferior wall myocardial infarction was previously present if left bundle-branch block patterning is absent, although the patterning barely resembles left bundle-branch block. the current tracing has a wider qrs interval.']
text_preprocessed = bert_preprocess_model(test_text)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :128]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')


Keys       : ['input_mask', 'input_word_ids', 'input_type_ids']
Shape      : (1, 128)
Word Ids   : [  101  8254  2271  6348  1012 15330  1037  1011  1058  6204  3258  1012
  2187 14012  1011  3589  3796  1012  4102  2000  1996  3025 16907  1997
  9099 16069  2140 14092  2813  2026 24755 25070  1999 14971  7542  2001
  3130  2556  2065  2187 14012  1011  3589  3796  5418  2075  2003  9962
  1010  2348  1996  5418  2075  4510 12950  2187 14012  1011  3589  3796
  1012  1996  2783 16907  2038  1037  7289  1053  2869 13483  1012   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
Input Mask : [1 1 1 1 1 1 1 1 1 1 1 1]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


#### Notes:
- Increasing the number of sentences changes the shape of the input word IDs (2 sentences becomes (2,128)).
- The input is limited to 128 words
- Start and stop of sentences is encoded with 101 and 102

In [83]:
bert_results = bert_model(text_preprocessed)

print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Pooled Outputs Shape:(1, 512)
Pooled Outputs Values:[ 0.9887249  -0.7988822  -0.32717532  0.10474525 -0.29970306  0.99474674
  0.9869619  -0.9857107  -0.5750032  -0.32717896 -0.57857615 -0.974101  ]
Sequence Outputs Shape:(1, 128, 512)
Sequence Outputs Values:[[ 0.40818763  0.28475827 -0.72223073 ... -0.54963726 -0.17958531
   0.05423336]
 [ 0.21084103  0.48801625  0.15987696 ...  0.63800764  0.01293848
  -1.1388142 ]
 [ 0.5426493   0.87804466 -1.1415874  ...  0.86736923 -0.5243975
   0.10186958]
 ...
 [-0.20698741 -0.00587281 -1.0680927  ...  0.8720525   0.3271055
  -0.5672977 ]
 [ 0.02397707 -0.26795962 -0.38774925 ... -1.0148929   0.03317202
  -0.7092347 ]
 [ 0.07354864 -0.25090563 -0.44517663 ... -1.1513164  -0.33980662
   0.7860865 ]]


#### Notes:
- Bert model outputs a 512 valued vector

### Defining Model Pipeline

In [122]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3', name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1', trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model()


In [123]:
bert_raw_result = classifier_model(tf.constant(test_text))
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.64683807]], shape=(1, 1), dtype=float32)


### Loss Metrics

In [124]:
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [125]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5

In [126]:
classifier_model.compile(loss=loss,
                         metrics=metrics)

In [128]:
tf.constant(test_text)

<tf.Tensor: shape=(1,), dtype=string, numpy=
array([b'sinus rhythm. prolonged a-v conduction. left bundle-branch block. compared to the previous tracing of transmural inferior wall myocardial infarction was previously present if left bundle-branch block patterning is absent, although the patterning barely resembles left bundle-branch block. the current tracing has a wider qrs interval.'],
      dtype=object)>

In [129]:
history = classifier_model.fit(x=tf.constant(test_text),
                               validation_data=val_dataset,
                               epochs=epochs,
                               )

Epoch 1/5


TypeError: in user code:

    File "/home/sanjaycollege15/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "/home/sanjaycollege15/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/sanjaycollege15/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "/home/sanjaycollege15/anaconda3/lib/python3.8/site-packages/keras/engine/training.py", line 812, in train_step
        raise TypeError(

    TypeError: Target data is missing. Your model has `loss`: <keras.losses.SparseCategoricalCrossentropy object at 0x7f5b10df3430>, and therefore expects target data to be passed in `fit()`.
