<a href="https://colab.research.google.com/github/smf-9000/nlp-in-general/blob/main/Word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



**Links:**
* Chris McCormick and Nick Ryan (2019, May 14). *BERT Word Embeddings Tutorial*. https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/




### Info:
* Consider “The man was accused of robbing a bank.” and “The man went fishing by the bank of the river.” Word2Vec assigns one static vector to each word, so it would produce the same embedding for “bank” in both sentences. BERT's embeddings are contextual, so the vector for “bank” would be different in each sentence.




In [None]:
!pip install transformers

In [2]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint. The model
# is loaded from the same checkpoint name later in the notebook, so the ids this
# tokenizer produces index into that model's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [23]:
# Alternative inputs to experiment with:
# example = 'A tokenizer is in charge of preparing the inputs for a model.'
# example = 'What meaning word "embeddings" has?'
example = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

# Vocabulary ids for the sentence, with the [CLS] / [SEP] special tokens added.
indexed_tokens = tokenizer.encode(example, add_special_tokens=True)

# Map every id back to its token string. `convert_ids_to_tokens` is the direct
# id -> token lookup, so no decode-then-strip-spaces workaround is needed.
tokenized_text = tokenizer.convert_ids_to_tokens(indexed_tokens)

# Show each token next to its vocabulary id.
for token, token_id in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:8,}'.format(token, token_id))

# One segment id per token; the whole input is a single sentence.
# NOTE(review): Hugging Face's convention for a single sentence is all 0s. Check
# how this list is consumed by the model call before changing the value.
segments_ids = [1] * len(tokenized_text)
print(segments_ids)

[CLS]             101
after           2,044
stealing       11,065
money           2,769
from            2,013
the             1,996
bank            2,924
vault          11,632
,               1,010
the             1,996
bank            2,924
robber         27,307
was             2,001
seen            2,464
fishing         5,645
on              2,006
the             1,996
mississippi     5,900
river           2,314
bank            2,924
.               1,012
[SEP]             102
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Turn the Python lists into tensors and prepend a batch dimension of size 1,
# because the model consumes batches of sequences.
tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
segments_tensors = torch.tensor(segments_ids).unsqueeze(0)

# Load the pretrained weights and ask the model to return the hidden states
# of every layer, not only the final one.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

# Switch to evaluation mode (disables dropout) for deterministic inference.
model.eval()

In [36]:
# Inference only: no gradients are needed, which saves memory and time.
with torch.no_grad():

  # Call the model by keyword. BertModel.forward is declared as
  # (input_ids, attention_mask, token_type_ids, ...), so a second positional
  # argument is interpreted as the attention mask, not as segment ids.
  # `segments_tensors` is deliberately not passed: for one unpadded sentence the
  # defaults (attend to every token, all tokens in segment 0) are the right
  # inputs, and they are exactly what the positional call resolved to.
  outputs = model(input_ids=tokens_tensor)

  # With output_hidden_states=True the third output is a tuple holding the
  # embedding-layer output followed by the output of each encoder layer.
  hidden_states = outputs[2]

  # Stack the per-layer tensors -> [layers, batch, tokens, hidden].
  token_embeddings = torch.stack(hidden_states, dim=0)
  # Drop the batch dimension (a single sentence) -> [layers, tokens, hidden].
  token_embeddings = torch.squeeze(token_embeddings, dim=1)
  # Put tokens first so that iterating yields one [layers, hidden] tensor per token.
  token_embeddings = token_embeddings.permute(1, 0, 2)
  print(token_embeddings.size())

torch.Size([22, 13, 768])


In [47]:
# Word Vectors
# ------------
# Three ways of collapsing a token's per-layer hidden states into one vector.
# Iterating over `token_embeddings` yields one [layers x hidden] tensor per token
# (13 x 768 for the run recorded in this notebook).

# Ex1: concatenate the last four layers end to end, last layer first.
token_vecs_cat = [
    torch.cat((layers[-1], layers[-2], layers[-3], layers[-4]), dim=0)
    for layers in token_embeddings
]
print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))

# Ex2: element-wise sum of the last four layers.
token_vecs_sum = [layers[-4:].sum(dim=0) for layers in token_embeddings]
print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

# Ex3: keep only the last layer's vector for each token.
token_vecs_last = [layers[-1] for layers in token_embeddings]
print ('Shape is: %d x %d' % (len(token_vecs_last), len(token_vecs_last[0])))


Shape is: 22 x 3072
Shape is: 22 x 768
Shape is: 22 x 768


In [55]:
from scipy.spatial.distance import cosine

# Find every position of the token "bank" instead of hard-coding indices, so the
# comparison cannot silently point at the wrong tokens if `example` is edited.
bank_positions = [i for i, tok in enumerate(tokenized_text) if tok == 'bank']
assert len(bank_positions) == 3, (
    'expected exactly three "bank" tokens (vault, robber, river), found %d'
    % len(bank_positions)
)
vault_bank, robber_bank, river_bank = bank_positions

# scipy's `cosine` returns the cosine *distance*; 1 - distance is the similarity.
# "bank robber" vs "bank vault": both refer to the financial institution.
same_bank = 1 - cosine(token_vecs_last[robber_bank], token_vecs_last[vault_bank])
# "bank robber" vs "river bank": only the first refers to the financial institution.
diff_bank = 1 - cosine(token_vecs_last[robber_bank], token_vecs_last[river_bank])

print('same bank', same_bank)
print('diff_bank', diff_bank)


same bank 0.9527329206466675
diff_bank 0.6978818774223328
