## Getting clinical word embeddings from BERT models


#### Clinical BERT
https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
#### Non-clinical BERT - for extracting word embeddings
https://colab.research.google.com/drive/1yFphU6PW9Uo6lmDly_ud9a6c4RCYlwdX

In [29]:
#!pip install transformers
!pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 471 kB/s eta 0:00:01
[?25hCollecting boto3
  Downloading boto3-1.21.4-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 969 kB/s eta 0:00:01
Collecting botocore<1.25.0,>=1.24.4
  Downloading botocore-1.24.5-py3-none-any.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 4.1 MB/s eta 0:00:01
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.1-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.4 MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3, pytorch-pretrained-bert
Successfully installed boto3-1.21.4 botocore-1.24.5 jmespath-0.10.0 pytorch-pretrained-bert-0.6.2 s3transfer-0.5.1


In [None]:
import os
import torch
import logging
import joblib
import matplotlib.pyplot as plt
%matplotlib inline

In [31]:
from transformers import AutoTokenizer, AutoModel
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME

In [None]:
# BERT (general-domain, uncased) tokenizer and sequence-classification model.
# The Bert* classes are not imported by any other cell, so bring them into scope here;
# without this import the cell fails with a NameError.
# Note: `tokenizer` and `model` are rebound by the Clinical BERT cell that follows.
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

In [32]:
# Clinical BERT: fetch the tokenizer and the encoder from the same Hugging Face checkpoint.
# output_hidden_states=True makes the model return the hidden states of all layers,
# which the embedding-extraction cells further down read.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    output_hidden_states=True,
)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME

In [34]:
# Directory the Clinical BERT weights and config are written to.
# Defaults to the original location; set the CLINICAL_BERT_OUTPUT_DIR environment
# variable to save somewhere else without editing the notebook.
output_dir = os.environ.get("CLINICAL_BERT_OUTPUT_DIR", "/Users/radix/MachineLearning/MLNLP/clinicalBERT")

In [35]:
# A model wrapped by PyTorch DataParallel / DistributedDataParallel keeps the real
# network on its `.module` attribute; save that one, otherwise save the model itself.
model_to_save = getattr(model, 'module', model)

In [38]:
# Full paths of the weights file and the config file inside output_dir.
output_model_file, output_config_file = (
    os.path.join(output_dir, file_name) for file_name in (WEIGHTS_NAME, CONFIG_NAME)
)

In [39]:
print(output_model_file)

/Users/radix/MachineLearning/MLNLP/clinicalBERT/pytorch_model.bin


In [40]:
# torch.save does not create missing parent directories, so make sure the target
# folder exists before writing into it.
os.makedirs(output_dir, exist_ok=True)

# Persist the weights (state dict only) and the model configuration side by side.
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)

In [43]:
state_dict = model_to_save.state_dict()

In [None]:
help(model)

In [None]:
# Two example clinical-note sentences; later cells encode them together as a sentence pair.
text = [
    'This is a 46-year-old female with a history of events concerning for seizures.',
    'The patient has a history of epilepsy and has also had non-epileptic events in the past.',
]

In [None]:
print(text)

In [None]:
# Tokenize both sentences together as one sentence pair, including the special tokens
# needed for the BERT model. `tokenizer.tokenize` is documented for a single string, not
# a list of sentences, so the tokens are read back from the explicit pair encoding
# instead. This also keeps them aligned one-to-one with the token_type_ids that a
# later cell takes from the same `tokenizer(text[0], text[1])` call.
tokenized_text = tokenizer.convert_ids_to_tokens(tokenizer(text[0], text[1])['input_ids'])

In [None]:
print(tokenized_text)

In [None]:
len(tokenized_text)

In [None]:
encoded_dict = tokenizer(text[0], text[1])

In [None]:
segment_ids = encoded_dict['token_type_ids']

In [None]:
decoded = tokenizer.decode(encoded_dict['input_ids'])

In [None]:
decoded

In [None]:
tokenizer.vocab_size

In [None]:
# Get indices for tokens
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [None]:
#for tup in zip(tokenized_text, indexed_tokens):
#    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

In [None]:
# The token ids and the segment ids come from two separate tokenizer calls; fail early
# with a clear message if they do not describe the same token sequence.
assert len(indexed_tokens) == len(segment_ids), (
    "indexed_tokens and segment_ids differ in length: %d vs %d"
    % (len(indexed_tokens), len(segment_ids))
)

# Wrapping each list in an outer list adds the batch dimension (a batch of one sequence).
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensor = torch.tensor([segment_ids])

In [None]:
tokens_tensor

In [None]:
segments_tensor

In [None]:
#with torch.no_grad():
#    outputs = model(tokens_tensor)
#hidden_states = outputs[2]

In [None]:
# Run the encoder without tracking gradients (inference only).
# The segment ids must be passed by keyword: the second positional parameter of the
# transformers BertModel.forward is `attention_mask`, so passing them positionally
# would use them as an attention mask (hiding every segment-0 token) instead of as
# sentence-membership ids.
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensor)

In [None]:
# Evaluating the model will return a different number of objects based on
# how it's configured in the `from_pretrained` call earlier. In this case,
# because we set `output_hidden_states = True`, the third item will be the
# hidden states from all layers. See the documentation for more details:
# https://huggingface.co/transformers/model_doc/bert.html#bertmodel

hidden_states = outputs[2]

In [None]:
# Walk down the nested structure one level at a time, always looking at the first
# element of each level: layers -> batches -> tokens -> hidden units.
layer_i = batch_i = token_i = 0
print("Number of layers:", len(hidden_states))
print("Number of batches (sentences)", len(hidden_states[layer_i]))
print("Number of tokens", len(hidden_states[layer_i][batch_i]))
print("Number of hidden units", len(hidden_states[layer_i][batch_i][token_i]))

In [None]:
# current dimensions
# layers, batches, tokens, features
# Desired dimensions
# tokens, layers, features

In [None]:
print("type of hidden state: ", type(hidden_states))
print("Shape of layer:", hidden_states[0].size())

In [None]:
# Stack layers
token_embeddings = torch.stack(hidden_states, dim=0)

In [None]:
token_embeddings.size()

In [None]:
# Remove batches dimension
token_embeddings = token_embeddings.squeeze(dim=1)
token_embeddings.size()

In [None]:
# Switch token, layer dimensions: (layers, tokens, features) -> (tokens, layers, features).
# The tensor is rebuilt from `hidden_states` (stack layers, drop the batch dimension,
# then permute) rather than permuting `token_embeddings` in place, so re-running this
# cell cannot swap the two axes back again.
token_embeddings = torch.stack(hidden_states, dim=0).squeeze(dim=1).permute(1, 0, 2)
token_embeddings.size()

### Word vectors: Concatenate and sum layers

In [None]:
# One concatenated vector per token: the token's vectors from the last four layers,
# joined end to end (4 x 768 = 3072 values when each layer vector has 768 values).
# Each item of token_embeddings is a (layers x features) tensor for one token --
# presumably 13 x 768 here (embedding output + 12 encoder layers); confirm against
# the "Number of layers" printout above.
token_vecs_cat = [
    torch.cat([layer_vecs[i] for i in (-4, -3, -2, -1)], dim=0)
    for layer_vecs in token_embeddings
]

In [None]:
print ('Concatenated token vectors %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

In [None]:
# For every token, add up its vectors from the last four layers element-wise,
# giving one vector per token with the same width as a single layer.
token_vecs_sum = [torch.sum(layer_vecs[-4:], dim=0) for layer_vecs in token_embeddings]

In [None]:
# Report the size of the summed vectors (number of tokens x vector length).
# The values are applied with `%`; a comma here would print the raw format string
# followed by the tuple instead of the formatted message.
print("Sum Token vectors %d x %d" % (len(token_vecs_sum), len(token_vecs_sum[0])))

In [None]:
# List every token with its position so the indices of interest can be looked up.
# The loop variable is deliberately not called `str`: that would shadow the builtin
# `str` for the rest of the kernel session.
for i, tok in enumerate(tokenized_text):
    print(i, tok)

In [None]:
# Summed-layer vectors for the words 'past' and 'history'.
#   'past'    -> index 42
#   'history' -> index 25
# NOTE(review): these positions are hard-coded from the token listing printed above;
# re-check them whenever the text or the tokenizer changes.
a, b = token_vecs_sum[42], token_vecs_sum[25]

In [None]:
from scipy.spatial.distance import cosine

In [None]:
1-cosine(a,b)

In [None]:
c = [0.9, 0.8]
d = [0.9, 0.9]

In [None]:
1- cosine(c,d)

In [None]:
# Last hidden state as word embeddings
last_hidden_state = outputs[0]
word_embed_1 = last_hidden_state

In [None]:
# Word embeddings as the element-wise sum over all hidden-state layers:
# stack the layers along a new leading axis, then sum that axis away.
word_embed_sum = torch.stack(hidden_states, dim=0).sum(dim=0)

In [None]:
# Word embeddings as the element-wise sum over the last four hidden-state layers only.
word_embed_sum_last4 = torch.stack(hidden_states[-4:], dim=0).sum(dim=0)