In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
import tensorflow_hub as hub

import pandas as pd
import numpy as np 
import pickle

## Read Data

In [2]:
# Load the denoised EKG notes and shuffle every row in one expression, so the
# cell yields the same frame each time it is re-run (fixed random_state).
ekg_denoised = (
    pd.read_pickle('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_v2.pkl')
    .sample(frac=1, random_state=10)
)


In [4]:
# Preview the first five shuffled rows.
ekg_denoised.head(n=5)

Unnamed: 0,ICD9_CODE,TEXT
7423,3,sinus rhythm. the p-r interval is prolonged. t...
60592,2,sinus bradycardia. lead v3 was not obtained. c...
62820,2,sinus rhythm consider left atrial abnormality ...
38047,1,sinus rhythm with borderline first degree a-v ...
43024,1,sinus rhythm. non-specific lateral st-t wave a...


## ClinicalBERT

### Set up Model Pipeline

In [5]:
# Character count of each note (this is not a token count).
ekg_denoised['length'] = ekg_denoised['TEXT'].str.len()

In [6]:
# Average note length in characters.
ekg_denoised.loc[:, 'length'].mean()

209.00457612202993

In [7]:
# Summary statistics of the per-note character counts.
ekg_denoised.loc[:, 'length'].describe()

count    68180.000000
mean       209.004576
std        101.165786
min          1.000000
25%        134.000000
50%        190.000000
75%        264.000000
max       1149.000000
Name: length, dtype: float64

In [5]:
# The tokenizer and the encoder are loaded from the same Hugging Face checkpoint.
clinicalbert_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"

# NOTE(review): 264 equals the 75th-percentile *character* length reported by
# describe() above, whereas model_max_length is counted in tokens — confirm
# this limit is the intended one.
tokenizer = AutoTokenizer.from_pretrained(
    clinicalbert_checkpoint,
    model_max_length=264,
)
model = AutoModel.from_pretrained(clinicalbert_checkpoint)


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Wrap the tokenizer and encoder in a feature-extraction pipeline.
pipe = pipeline(
    task='feature-extraction',
    model=model,
    tokenizer=tokenizer,
)

### Embedding Train and Test 

#### Train

In [7]:
# Notes used for this training chunk: slice 15000:22500 of the shuffled frame.
train_texts = ekg_denoised['TEXT'][15000:22500].tolist()

# Pad every note to the tokenizer's max length and cut longer ones, so all
# outputs share one sequence length.
features = pipe(train_texts, padding='max_length', truncation=True)

In [8]:
# Convert the nested pipeline output to an ndarray and drop size-1 axes,
# then release the Python-list copy to free memory.
features_np = np.asarray(features).squeeze()
del features

In [9]:
# Sanity-check the dimensions of the stacked training embeddings.
np.shape(features_np)

(7500, 264, 768)

In [10]:
# Labels for the same 15000:22500 slice that was embedded for training.
train_codes = ekg_denoised['ICD9_CODE'][15000:22500]
labels_np = np.array(train_codes)

#### Test

In [18]:
# Embed the held-out slice (7500:15000) with the same tokenizer settings as the
# training cell. `truncation=True` is needed here as well: with
# padding='max_length' alone, a note longer than the tokenizer's
# model_max_length (264) keeps its full length, so the per-note outputs would
# not share one sequence length and could not be stacked into a regular array
# like the training embeddings.
features_test = pipe(ekg_denoised.TEXT[7500:15000].tolist(),
                padding='max_length',
                truncation=True
               )

In [19]:
# Convert the nested pipeline output to an ndarray and drop size-1 axes,
# then release the Python-list copy to free memory.
features_np_test = np.asarray(features_test).squeeze()
del features_test

In [20]:
# Labels for the same 7500:15000 slice that was embedded for testing.
test_codes = ekg_denoised['ICD9_CODE'][7500:15000]
labels_np_test = np.array(test_codes)

### Save off Outputs

In [11]:
# Persist the training embeddings and their labels (np.save appends ".npy").
for out_path, array in (
    ('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_embeddings_10k_train_pt2',
     features_np),
    ('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_labels_10k_train_pt2',
     labels_np),
):
    np.save(out_path, array, allow_pickle=True)

In [30]:
# Persist the test embeddings and their labels (np.save appends ".npy").
for out_path, array in (
    ('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_embeddings_10k_test',
     features_np_test),
    ('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_labels_10k_test',
     labels_np_test),
):
    np.save(out_path, array, allow_pickle=True)

In [27]:
# Load previously saved CORe embeddings and labels.
# NOTE(review): these files are not written anywhere in the visible cells —
# confirm they exist and are the intended inputs.
embeddings, labels = (
    np.load(npy_path)
    for npy_path in (
        '/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_embeddings.npy',
        '/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_CORe_labels.npy',
    )
)