In [1]:
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from transformers import BertTokenizer, TFBertModel, BertConfig
import tensorflow as tf
import numpy as np


## Import Data

In [2]:
# Load the denoised EKG report table and shuffle the rows.
# The train/test split further down is taken by position ([:1000] and
# [1000:1100]), so the shuffle must be seeded or the split changes on every
# run and no recorded metric can be reproduced.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source. The absolute path is machine-specific and
# should eventually come from a configurable data directory.
SHUFFLE_SEED = 42
ekg_denoised = pd.read_pickle(
    '/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_v2.pkl'
).sample(frac=1, random_state=SHUFFLE_SEED)

In [3]:
# Preview the first five shuffled rows (label column ICD9_CODE, report text TEXT).
ekg_denoised.head(n=5)

Unnamed: 0,ICD9_CODE,TEXT
38861,1,possible atrial flutter with an atrial rate of...
49525,1,sinus rhythm. normal tracing. compared to the ...
2591,3,sinus rhythm possible left ventricular hypertr...
39669,1,sinus rhythm. tall inferior p waves - possible...
21672,0,profound sinus bradycardia with intermittent v...


## Tokenize Data

In [4]:
# Load the TF Hub `bert_en_uncased_preprocess` module (version 3) as a Keras
# layer. It is called directly on the raw report text below, and its output is
# fed to the encoder.
tokenizer = hub.KerasLayer(handle='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')

INFO:absl:Using /tmp/tfhub_modules to cache modules.


In [5]:
# Load the small BERT encoder (L-4_H-512_A-8) from TF Hub as a Keras layer.
# It is only called outside the classifier model in this notebook, i.e. it is
# used as a fixed feature extractor and is never part of the trained graph.
encoder = hub.KerasLayer(handle='https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1')

In [8]:
# Training subset: the first 1000 rows (by position) of the shuffled table.
train_text = ekg_denoised['TEXT'].iloc[:1000]

# Preprocess the raw text, then run the encoder over the result in one pass.
tokens = tokenizer(train_text)
encoded_inputs = encoder(tokens)

In [30]:
# Inspect which outputs the encoder returns (a dict keyed by output name).
encoded_inputs.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'default', 'sequence_output'])

In [38]:
# Display the 'encoder_outputs' entry: a list of tensors, the first of which is
# shaped (1000, 128, 512) in the recorded output.
# NOTE(review): this dumps full tensor contents into the notebook; showing only
# the shapes would keep the output readable.
encoded_inputs['encoder_outputs']

[<tf.Tensor: shape=(1000, 128, 512), dtype=float32, numpy=
 array([[[-0.08538   ,  0.02170999, -0.23825505, ..., -0.7695437 ,
          -0.21822771, -0.01727127],
         [-0.0534118 ,  1.3537142 ,  0.2539853 , ..., -1.6795374 ,
          -0.47073588,  0.5243464 ],
         [-0.8059914 ,  0.06905432,  0.6797097 , ..., -0.74955696,
           0.6839643 , -0.20871504],
         ...,
         [ 0.84149045, -0.1282104 , -0.51381254, ...,  0.7839483 ,
          -0.1597345 ,  0.6535676 ],
         [ 0.7416849 , -0.08937413, -0.07411025, ...,  0.5033557 ,
           0.35541633,  0.523796  ],
         [ 0.47316024, -0.358746  ,  0.15270868, ...,  0.5151958 ,
           0.46780616,  0.40367538]],
 
        [[-0.08587334, -0.12070529, -0.14527205, ..., -0.8564461 ,
          -0.14008245, -0.11483904],
         [ 0.2044888 ,  0.7588219 ,  0.01169085, ..., -0.02754164,
          -0.24721523, -2.0268824 ],
         [-0.75083506,  0.3306901 , -0.7160338 , ...,  1.062837  ,
          -0.64282614,  0

In [7]:
# Integer class labels for the same first-1000-row training subset, as a NumPy array.
train_label_series = ekg_denoised['ICD9_CODE'].iloc[:1000]
labels = np.array(train_label_series)

In [13]:
# Re-list the encoder output names before picking 'pooled_output' below.
# NOTE(review): this repeats the earlier key inspection and could be removed.
encoded_inputs.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'default', 'sequence_output'])

In [14]:
# Pooled sentence embeddings for the TRAINING subset, shape (1000, 512).
# This used to be bound to `test_encodings`, a name the test-set cell reuses
# for the held-out encoder output. If that cell were skipped, evaluation would
# silently run on training features. Keep train and test names distinct.
train_encodings = encoded_inputs['pooled_output']

# A bare assignment displays nothing; end with the expression so the cell
# actually renders the tensor preview.
train_encodings

<tf.Tensor: shape=(1000, 512), dtype=float32, numpy=
array([[ 0.99166214,  0.1197574 , -0.06184475, ...,  0.5502014 ,
        -0.6442353 , -0.85817146],
       [ 0.9976368 ,  0.17481573, -0.19261858, ...,  0.5003145 ,
        -0.5349502 , -0.1832304 ],
       [ 0.64804137, -0.44555837,  0.02937316, ...,  0.56327635,
        -0.11237483, -0.92120934],
       ...,
       [ 0.9978899 , -0.42364314, -0.20901826, ...,  0.47847232,
        -0.6196047 , -0.04929641],
       [ 0.9972557 , -0.8440902 , -0.31961453, ...,  0.28505862,
        -0.2737933 , -0.04500105],
       [ 0.9970064 , -0.759793  ,  0.05473077, ...,  0.4079414 ,
        -0.3602471 , -0.35015243]], dtype=float32)>

### Test Set

In [23]:
# Held-out subset: rows 1000-1099 (by position) of the shuffled table, which
# do not overlap the first 1000 rows used for training.
test_text = ekg_denoised['TEXT'].iloc[1000:1100]

# Same preprocessing and encoder pipeline as the training subset; the result
# is the full encoder output dict, not only the pooled vector.
test_tokens = tokenizer(test_text)
test_encodings = encoder(test_tokens)

In [19]:
# Integer class labels aligned with the held-out rows 1000-1099.
test_label_series = ekg_denoised['ICD9_CODE'].iloc[1000:1100]
test_labels = np.array(test_label_series)

In [15]:
# Classifier head trained on the precomputed pooled BERT embeddings:
# Dense(512, relu) -> Dropout(0.2) -> Dense(200, relu) -> Dense(4, softmax).
# Both hidden layers carry the same L1/L2 kernel regularisation.
feature_dim = encoded_inputs['pooled_output'].shape[1]
inputs = layers.Input(shape=(feature_dim,))

hidden_1_out = layers.Dense(
    512,
    activation='relu',
    kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
)(inputs)
dropout = layers.Dropout(0.2)(hidden_1_out)
hidden_2_out = layers.Dense(
    200,
    activation='relu',
    kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4),
)(dropout)

# Four-way softmax; the sparse loss below expects integer class ids as targets.
classification_out = layers.Dense(4, activation='softmax')(hidden_2_out)

model_functional = keras.Model(inputs=inputs, outputs=classification_out)
model_functional.compile(
    optimizer='adam',
    loss='SparseCategoricalCrossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train the classifier head on the 1000 pooled training embeddings.
# NOTE(review): with batch_size=512 and 1000 samples each epoch is only two
# gradient steps (about 30 updates in total), and no validation data is
# monitored. Revisit these settings given the low test accuracy recorded below.
model_functional.fit(
    x=encoded_inputs['pooled_output'],
    y=labels,
    batch_size=512,
    epochs=15,
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fe34e6207f0>

In [26]:
# Score the trained head on the 100 held-out rows; returns [loss, accuracy].
# `test_encodings` must be the encoder output dict from the test-set cell.
model_functional.evaluate(
    x=test_encodings['pooled_output'],
    y=test_labels,
)




[1.5749602317810059, 0.25999999046325684]

## Saving Pooled Outputs

In [None]:
# Compute pooled embeddings for the whole corpus.
# A single forward pass over every row would materialise all encoder outputs
# (per-layer sequence tensors included) at once. The 1000-row subset alone
# gives (1000, 128, 512) per layer, so the full table is likely to exhaust
# memory. Encode in batches and keep only the pooled vector instead.
# The result gets its own name so the training-subset `tokens` and
# `encoded_inputs` used by the cells above are not overwritten.
ENCODE_BATCH_SIZE = 1000  # same size as the subset that was encoded successfully above

pooled_batches = []
for batch_start in range(0, len(ekg_denoised), ENCODE_BATCH_SIZE):
    batch_text = ekg_denoised.TEXT.iloc[batch_start:batch_start + ENCODE_BATCH_SIZE]
    batch_encoded = encoder(tokenizer(batch_text))
    # Copy to host memory right away so each batch's tensors can be released.
    pooled_batches.append(batch_encoded['pooled_output'].numpy())

# Rows stay in the same order as `ekg_denoised`.
pooled_outputs = np.concatenate(pooled_batches, axis=0)

# TODO(review): the section title says "Saving", but nothing is written to disk
# yet; persist `pooled_outputs` (e.g. with np.save) once a target path is agreed.