<a href="https://colab.research.google.com/github/raymondwcs/learning_bert/blob/master/Sentence_Embedding_in_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [99]:
!pip install --quiet transformers
from transformers import AutoModelForSequenceClassification,AutoTokenizer,BertModel,pipeline
import torch
from torch.nn.functional import cosine_similarity, pairwise_distance
import numpy as np
from scipy.spatial.distance import cosine

CHECKPOINT = 'uer/roberta-base-finetuned-jd-full-chinese'  # https://huggingface.co/uer/roberta-base-finetuned-jd-full-chinese
CHECKPOINT = 'bert-base-chinese'

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = BertModel.from_pretrained(CHECKPOINT)

model.eval()

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [103]:
sentences = [
  '這個服務生很不親切',         # this is `query`
  '这个服务生很不亲切', 
  '今天下大雨',  
  '今天陽光充沛!',
  '今天是晴天'   
]

###############################################################################
# Is `query` more similiar to sentences[1], sentences[2], ...?
###############################################################################

tokens = tokenizer(text=sentences, add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')

sentence_embeddings = []
for i in range(len(sentences)):
  input_ids = tokens.input_ids[i].unsqueeze(0)
  attention_mask = tokens.attention_mask[i].unsqueeze(0)
  output = model(input_ids=input_ids, attention_mask=attention_mask)
  sentence_embeddings.append(output.pooler_output)

for sent_embedding in sentence_embeddings:
  sim = 1 - cosine(sentence_embeddings[4].detach().numpy(),sent_embedding.detach().numpy())
  print('Cosine similarity: {:.2}'.format(sim))


Cosine similarity: 0.91
Cosine similarity: 0.91
Cosine similarity: 0.89
Cosine similarity: 0.93
Cosine similarity: 1.0


In [104]:
output = model(input_ids=tokens.input_ids, output_hidden_states=True)

# https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/
# output[0]   # last_hidden_state
# output[1]   # pooler_output
# output[2]   # hidden layers (13 = input embeddings + 12 BERT layers)

hidden_states = output[2]

# print(hidden_states)
print(len(hidden_states))             # 13 = input embeddings + 12 BERT layers
print(hidden_states[-1].shape)        # last hidden layer [1, <input length>, 768]
print(len(hidden_states[-1]))         # 1
print(len(hidden_states[-1][0]))      # <input length>
print(len(hidden_states[-1][0][0]))   # 768
print(hidden_states[-1])

# `hidden_states` has shape [13 x 1 x <input length> x 768]
# `token_vecs` is a tensor with shape [<input length> x 768]
token_vecs = hidden_states[1:13][0]
print(len(token_vecs))

# Calculate the average of all token vectors.
sentence_embedding = torch.mean(token_vecs, dim=0)

print ("Our final sentence embedding vector of shape:", sentence_embedding.size())

13
torch.Size([5, 11, 768])
5
11
768
tensor([[[ 0.6990,  0.2976,  0.0780,  ..., -0.2679,  0.4348, -0.4079],
         [ 0.4393,  0.4274,  0.3795,  ..., -0.2055, -0.2701, -0.1704],
         [ 1.2645,  0.2934, -0.3274,  ...,  0.4628,  1.5048, -0.3984],
         ...,
         [ 0.1623,  0.5196, -0.6810,  ...,  0.9575, -0.0201,  0.4393],
         [ 0.8598,  0.1185, -0.8425,  ..., -0.4070, -0.1333, -0.1838],
         [ 0.2993,  0.5014, -0.5098,  ..., -0.5224,  0.0896, -0.0483]],

        [[ 0.9797,  0.2807, -0.0285,  ..., -0.2808,  0.6529, -0.2834],
         [ 0.3758,  0.3738,  0.4548,  ..., -0.5361,  0.0271, -0.2273],
         [ 1.1849,  0.2157, -0.3603,  ...,  0.2832,  1.6439, -0.5681],
         ...,
         [ 0.0380,  0.4457, -0.4412,  ...,  0.9437,  0.2141,  0.1889],
         [ 0.8946,  0.0085, -0.8180,  ..., -0.3052,  0.0762, -0.2281],
         [ 0.3241,  0.7086, -0.3072,  ..., -0.2152,  0.3009, -0.0885]],

        [[-0.6302,  0.3261, -0.0465,  ...,  0.2729,  0.5397, -0.3083],
        

In [105]:
tokens = tokenizer(text=sentences, add_special_tokens=True, padding=True, truncation=True, return_tensors='pt')

sentence_embeddings = []
for i in range(len(sentences)):
  input_ids = tokens.input_ids[i].unsqueeze(0)
  attention_mask = tokens.attention_mask[i].unsqueeze(0)
  output = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
  # `hidden_states` has shape [13 x 1 x <sentence length> x 768]
  hidden_states = output[2]
  # print(len(hidden_states))   # 13, a list

  # Select the embeddings from the second to last layer.
  # `token_vecs` is a tensor with shape [<sent length> x 768]
  token_vecs = hidden_states[1:13][0]     # take 2nd layer to last layer
  token_vecs = token_vecs.squeeze(0)

  # Calculate the average of all token vectors.
  sen_emb = torch.mean(token_vecs, dim=0)

  # Convert to numpy array.
  sen_emb  = sen_emb.detach().numpy()

  sentence_embeddings.append(sen_emb)

for x in sentence_embeddings:
  sim = 1 - cosine(sentence_embeddings[4],x)
  print('Cosine similarity: {:.2}'.format(sim))

Cosine similarity: 0.64
Cosine similarity: 0.64
Cosine similarity: 0.94
Cosine similarity: 0.87
Cosine similarity: 1.0
