In [1]:
from transformers import AutoTokenizer, AutoModel, pipeline
import tensorflow_hub as hub
import torch

import pandas as pd
import numpy as np 
import pickle

## Loading Data

In [2]:
ekg_denoised = pd.read_pickle('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_v2.pkl')
ekg_denoised = ekg_denoised.sample(frac=1, random_state=10)


## Initializing Model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", model_max_length=264)
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Looping through data to generate embeddings

In [4]:
for i in range(0,len(ekg_denoised)//250):
    min_value = 250*i
    max_value = min_value + 250
    
    input_tokens = tokenizer(ekg_denoised.TEXT[min_value:max_value].tolist(),
                             padding = 'max_length',
                             truncation=True,
                             return_tensors='pt'
                            )
    
    pooled_model_outputs = model(**input_tokens)[1]
    inputs = pooled_model_outputs.detach().numpy()
    
    np.save('/home/sanjaycollege15/PredictingDiagnoses/Data/ClinicalBERT_Pooled/pt' + str(i), 
            inputs,
            allow_pickle=True)
    
    del input_tokens
    del pooled_model_outputs
    del inputs
    

## Combine Embeddings, Create Labels

In [None]:
embeddings1 = np.load('/home/sanjaycollege15/PredictingDiagnoses/Data/ClinicalBERT_Pooled/pt0.npy')
embeddings2 = np.load('/home/sanjaycollege15/PredictingDiagnoses/Data/ClinicalBERT_Pooled/pt1.npy')

combined_embeddings = np.concatenate((embeddings1, embeddings2), axis=0)

for i in range (3,272):
    temp_embeddings = np.load('/home/sanjaycollege15/PredictingDiagnoses/Data/ClinicalBERT_Pooled/pt' + str(i) + '.npy')
    combined_embeddings = np.concatenate((combined_embeddings, temp_embeddings), axis = 0)


In [9]:
labels = np.array(ekg_denoised['ICD9_CODE'][:250*(len(ekg_denoised)//250-1)])


## Save off embeddings and labels

In [13]:
np.save('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_pooled_embeddings_68k',
        combined_embeddings,
        allow_pickle=True)

np.save('/home/sanjaycollege15/PredictingDiagnoses/Data/ekg_denoised_ClinicalBERT_pooled_labels_68k',
        labels,
        allow_pickle=True)


---

## PyTorch Implementation

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.optim import Adam
from tqdm import tqdm

In [14]:
input_tokens = tokenizer(ekg_denoised.TEXT[0:50].tolist(),
                             padding = 'max_length',
                             truncation=True,
                             return_tensors='pt'
                            )
pooled_model_outputs = model(**input_tokens)[1]

In [15]:
labels = torch.tensor(ekg_denoised.ICD9_CODE[0:50].values)

In [27]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(768, 4)
        self.dropout1 = nn.Dropout(0.25)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.dropout1(x)
        x = self.fc1(x)
        x = self.relu(x)
        
        return x

In [33]:
my_nn = Net()
result = my_nn(pooled_model_outputs)
print(result.argmax(dim=1))

tensor([0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0,
        3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
        0, 0])


In [None]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = labels
        self.texts = 

In [None]:
def train(model, train_data, learning_rate, epochs):
    train = 

---

In [4]:
ekg_denoised_limited = ekg_denoised[ekg_denoised.ICD9_CODE != 3]

In [5]:
input_tokens_limited = tokenizer(ekg_denoised_limited.TEXT[0:50].tolist(),
                                 padding = 'max_length',
                                 truncation=True,
                                 return_tensors='pt'
                                )
pooled_model_outputs_limited = model(**input_tokens_limited)[1]

In [6]:
labels_limited = torch.tensor(ekg_denoised_limited.ICD9_CODE[0:50].values)

In [11]:
class LimitedNet(nn.Module):
    def __init__(self):
        super(LimitedNet, self).__init__()
        self.fc1 = nn.Linear(768, 3)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        
        return x

In [34]:
my_limited_nn = LimitedNet()
result_limited = my_limited_nn(pooled_model_outputs_limited)
print(result_limited.argmax(dim=1))

tensor([2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
        1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2])


In [10]:
training = my_nn.train(pooled_model_outputs_limited, labels_limited)

TypeError: train() takes from 1 to 2 positional arguments but 3 were given