# BERT experiments visualisation notebook
This notebook is a walkthrough for learning how to use the BERT model.

## 0. Preparation

In [None]:
!pip install pytorch-pretrained-bert

In [None]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import matplotlib.pyplot as plt
%matplotlib inline

# Fetch the tokenizer that belongs to the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

## 1. Tokenization

Although this experiment focuses on individual words rather than whole texts, tokenizing a full sentence is a useful way to see how BERT works and how we can use it.

In [None]:
# Wrap the example sentence in the special [CLS] / [SEP] markers before tokenizing
input_text = "This is a exmpale text."
marked_text = "[CLS] " + input_text + " [SEP]"

# Split the marked sentence into tokens and show the result
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

# Look up the vocabulary index of every token and print the pairs as a small table
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
for token, token_id in zip(tokenized_text, indexed_tokens):
    print("{:<12} {:>6,}".format(token, token_id))

In [None]:
# Segment ids: one 0 per token, so the whole input is treated as a single segment
segments_ids = [0 for _ in tokenized_text]
print(segments_ids)

## 2. Extracting Embeddings

In [None]:
# Convert the id lists to tensors and add a leading batch axis of size 1
tokens_ts = torch.tensor(indexed_tokens).unsqueeze(0)
segments_ts = torch.tensor(segments_ids).unsqueeze(0)

# Fetch the pre-trained weights and switch the model to evaluation mode
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()

In [None]:
# Run the model without gradient tracking and keep the per-layer hidden states;
# the second returned value is not used in this notebook
with torch.no_grad():
    encoded_layers, _ = model(input_ids=tokens_ts, token_type_ids=segments_ts)

### 2.1. interpreting the output

In [None]:
# Report the length of every axis of the output: layers, then the axes of one layer tensor
print("Number of layers: {}".format(len(encoded_layers)))
print("Number of batches: {}".format(encoded_layers[0].size(0)))  # number of sentences
print("Number of tokens: {}".format(encoded_layers[0].size(1)))
print("Number of hidden units: {}".format(encoded_layers[0].size(2)))

In [None]:
# Example: inspect the hidden-state vector of one token at one layer.
# Both indices are 0-based, so index 5 selects the 6th token and the 6th layer.
t_i = 5
l_i = 5
vec = encoded_layers[l_i][0][t_i]  # indexing order: [layer][batch][token] -> feature vector

# Histogram of the feature values in that vector. The tensor is converted to a
# NumPy array explicitly instead of relying on matplotlib's implicit conversion.
fig, ax = plt.subplots(figsize=(10, 10))
ax.hist(vec.detach().cpu().numpy(), bins=200)
ax.set_title("Hidden-state values for token index {} at layer index {}".format(t_i, l_i))
ax.set_xlabel("Feature value")
ax.set_ylabel("Number of hidden units")
plt.show()  # show the figure without echoing the (counts, bins, patches) tuple

In [None]:
# Each layer output is a [batches, tokens, features] tensor; stack them along a
# new leading "layers" axis
token_embeddings = torch.stack(encoded_layers)
token_embeddings.shape

In [None]:
# The batch axis (axis 1) is not needed here, so squeeze it out
token_embeddings = token_embeddings.squeeze(dim=1)
token_embeddings.shape

In [None]:
# Reorder the axes from [layers, tokens, features] to [tokens, layers, features].
# The tensor is rebuilt from `encoded_layers` rather than permuting the current
# `token_embeddings`: permute(1, 0, 2) swaps the first two axes, so re-running
# this cell on its own result would silently swap tokens and layers back.
token_embeddings = torch.stack(encoded_layers, dim=0).squeeze(dim=1).permute(1, 0, 2)
token_embeddings.size()

### 2.2. creating word_vec from hidden states

In [None]:
# Example: one vector per token, built by joining its last four layers end to end
# (4 layers x 768 features = 3072 values per token)
token_vecs_cat = [
    torch.cat([layers[-1], layers[-2], layers[-3], layers[-4]], dim=0)
    for layers in token_embeddings
]

print("the size of word_vec dimension: {}".format(len(token_vecs_cat[0])))

In [None]:
# Example: one vector per token, built by adding its last four layers element-wise
# (the result keeps the 768-feature size of a single layer)
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]

print("the size of word_vec dimension: {}".format(len(token_vecs_sum[0])))