In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
import tensorflow_hub as hub

import pandas as pd
import numpy as np 
import pickle

## Read Data

In [2]:
# Load the denoised EKG notes (columns shown below: ICD9_CODE, TEXT).
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
ekg_denoised = pd.read_pickle('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_v2.pkl')

# Shuffle all rows. The seed is fixed because the train/test split further down
# is purely positional (rows [:5000] and [5000:5500]); without a seed each run
# would embed and save a different set of notes.
ekg_denoised = ekg_denoised.sample(frac=1, random_state=42)

In [3]:
# Preview the first five shuffled rows.
ekg_denoised.head(n=5)

Unnamed: 0,ICD9_CODE,TEXT
4714,3,sinus tachycardia. probable left anterior fasc...
49655,1,sinus rhythm - borderline first degree a-v blo...
44662,1,sinus bradycardia. a-v conduction delay. frequ...
51213,2,sinus rhythm borderline first degree a-v delay...
58068,2,sinus bradycardia left atrial abnormality infe...


## CORe

### Set up Model Pipeline

In [4]:
# Character count of every note (str.len counts characters, not tokens).
ekg_denoised['length'] = ekg_denoised['TEXT'].str.len()

In [5]:
# Average note length in characters.
ekg_denoised.loc[:, 'length'].mean()

209.00457612202993

In [6]:
# Summary statistics (count, mean, std, quartiles, extremes) of note length in characters.
ekg_denoised.loc[:, 'length'].describe()

count    68180.000000
mean       209.004576
std        101.165786
min          1.000000
25%        134.000000
50%        190.000000
75%        264.000000
max       1149.000000
Name: length, dtype: float64

In [7]:
# The tokenizer and the encoder weights come from the same Hugging Face
# checkpoint, so the identifier is spelled once and reused for both loads.
core_checkpoint = "bvanaken/CORe-clinical-outcome-biobert-v1"

# model_max_length=264 is the sequence length that 'max_length' padding (and
# truncation) resolve to when the pipeline is called further down.
# NOTE(review): 264 matches the 75th percentile of the *character* counts
# printed above, whereas the tokenizer measures length in tokens — confirm
# that this cap is the intended one.
tokenizer = AutoTokenizer.from_pretrained(core_checkpoint, model_max_length=264)
model = AutoModel.from_pretrained(core_checkpoint)

In [22]:
# Wrap the loaded model and tokenizer in a feature-extraction pipeline; calling
# it on a list of texts returns the model's hidden states for each text.
pipe = pipeline(
    task='feature-extraction',
    model=model,
    tokenizer=tokenizer,
)

### Embed Train and Test Sets

#### Train

In [11]:
# Training slice: the first 5000 rows of the shuffled frame.
# Every note is padded to the tokenizer's max length and longer notes are
# truncated, so all outputs share the same sequence length.
train_texts = ekg_denoised.TEXT[:5000].tolist()
features = pipe(train_texts, padding='max_length', truncation=True)

In [12]:
# Convert the nested Python lists returned by the pipeline into one ndarray and
# drop any singleton axes.
features_np = np.asarray(features).squeeze()

# Release the list-of-lists copy; only the ndarray is needed from here on.
del features

In [13]:
# Dimensions of the stacked training embeddings.
np.shape(features_np)

(5000, 264, 768)

In [14]:
# Labels for the training slice: ICD9_CODE of the same first 5000 shuffled rows
# whose TEXT was embedded, copied into a standalone NumPy array.
train_codes = ekg_denoised['ICD9_CODE'][:5000]
labels_np = np.array(train_codes)

#### Test

In [18]:
# Embed the held-out slice (shuffled rows 5000-5499) with the same tokenizer
# settings as the training slice.
# truncation=True is needed here exactly as in the training call: padding alone
# only lengthens short notes, so without truncation any note longer than the
# tokenizer's model_max_length keeps its full length, the per-note outputs stop
# sharing one shape, and they can no longer be stacked into a single array.
features_test = pipe(ekg_denoised.TEXT[5000:5500].tolist(),
                     padding='max_length',
                     truncation=True
                    )

In [19]:
# Convert the pipeline's nested lists for the test slice into one ndarray and
# drop any singleton axes.
features_np_test = np.asarray(features_test).squeeze()

# Release the list-of-lists copy; only the ndarray is needed from here on.
del features_test

In [20]:
# Labels for the test slice: ICD9_CODE of shuffled rows 5000-5499, the same rows
# whose TEXT was embedded, copied into a standalone NumPy array.
test_codes = ekg_denoised['ICD9_CODE'][5000:5500]
labels_np_test = np.array(test_codes)

### Save Outputs

In [26]:
# Write the training embeddings to disk (np.save appends the ".npy" extension).
np.save(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_embeddings',
    arr=features_np,
    allow_pickle=True,
)

# Write the matching training labels next to them.
np.save(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_labels',
    arr=labels_np,
    allow_pickle=True,
)

In [30]:
# Write the test embeddings to disk (np.save appends the ".npy" extension).
np.save(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_test_embeddings',
    arr=features_np_test,
    allow_pickle=True,
)

# Write the matching test labels next to them.
np.save(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_test_labels',
    arr=labels_np_test,
    allow_pickle=True,
)

In [27]:
# Read the saved training arrays back from their .npy files.
embeddings = np.load(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_embeddings.npy',
)
labels = np.load(
    file='/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_labels.npy',
)