In [2]:
# !pip install transformers torch torchvision torchaudio

import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
import numpy as np

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

# The tokenizer (vocabulary) and the encoder weights both come from the same
# pre-trained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,  # make the model return the hidden states of all layers
)

# The notebook only runs forward passes, so switch the model to evaluation mode.
# Kept as the last expression of the cell so the model summary is displayed.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [8]:
def get_embeddings(text):
  """Tokenize one sentence and run it through the module-level BERT `model`.

  Args:
    text: Raw sentence without special tokens; `[CLS]` and `[SEP]` are added
      here before tokenization.

  Returns:
    A `(tokenized_text, hidden_states)` tuple. `tokenized_text` is the list of
    token strings (special tokens included). `hidden_states` is the collection
    of per-layer hidden states that the model returns because it was loaded
    with `output_hidden_states=True`; each entry has a leading batch dimension
    of size 1.
  """
  marked_text = '[CLS] ' + text + ' [SEP]'

  # Split the sentence into tokens and map them to their vocabulary indices.
  tokenized_text = tokenizer.tokenize(marked_text)
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  # Batch of one sentence.
  tokens_tensor = torch.tensor([indexed_tokens])

  # A single, un-padded sentence: every position is a real token (mask = 1)
  # and belongs to the first segment (segment id = 0).
  # Passing these by keyword matters: the second positional argument of the
  # model's forward pass is the attention mask, not the segment ids, so a
  # positional "segment ids" tensor would silently be used as the mask.
  attention_mask = torch.ones_like(tokens_tensor)
  token_type_ids = torch.zeros_like(tokens_tensor)

  # Inference only, so no gradients are needed.
  with torch.no_grad():
    outputs = model(
        input_ids=tokens_tensor,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

  # Read the hidden states by name rather than by tuple position, which
  # depends on which optional outputs the model happens to return.
  hidden_states = outputs.hidden_states

  return (tokenized_text, hidden_states)

def get_token_embeddings(hidden_states):
  """Rearrange per-layer hidden states into one tensor indexed by token first.

  Args:
    hidden_states: Sequence of tensors, one per layer, each shaped
      [1, tokens, features].

  Returns:
    A tensor shaped [tokens, layers, features].
  """
  # Stack along a new leading "layer" axis: [layers, 1, tokens, features].
  layers_first = torch.stack(hidden_states, dim=0)

  # Drop the size-1 batch axis, then move tokens in front of layers.
  return layers_first.squeeze(dim=1).permute(1, 0, 2)

def vectorize_tokens(embeddings, method='cat'):
  """Build one vector per token by combining several of its layer vectors.

  Args:
    embeddings: Tensor (or iterable) indexed as [token, layer, feature], e.g.
      the result of `get_token_embeddings`.
    method: How to combine the layers of each token:
      'cat'       - concatenate the last four layers (last layer first),
      'catMiddle' - concatenate layers -2 down to -10 (nine layers),
      'sum'       - element-wise sum of the last four layers.

  Returns:
    A list with one 1-D tensor per token. For 'cat' the length is
    4 x features, for 'catMiddle' 9 x features, for 'sum' it is features.

  Raises:
    ValueError: If `method` is not one of the supported names. The check runs
      before any token is processed, so it also triggers for empty input.
  """
  if method == 'cat':
    layers_to_cat = (-1, -2, -3, -4)
  elif method == 'catMiddle':
    layers_to_cat = tuple(range(-2, -11, -1))  # -2, -3, ..., -10
  elif method == 'sum':
    layers_to_cat = None  # summed instead of concatenated
  else:
    raise ValueError(
        "Unknown method: {!r} (expected 'cat', 'catMiddle' or 'sum')".format(method))

  token_vecs = []
  for token in embeddings:
    # `token` is a [layers x features] tensor.
    if layers_to_cat is None:
      vec = torch.sum(token[-4:], dim=0)
    else:
      vec = torch.cat(tuple(token[i] for i in layers_to_cat), dim=0)
    token_vecs.append(vec)
  return token_vecs

def vectorize_sentence(hidden_states):
  """Collapse a sentence into a single vector.

  Takes the second-to-last entry of `hidden_states`, selects the first (only)
  batch item, and averages over the token axis.

  Args:
    hidden_states: Sequence of per-layer tensors, each shaped
      [batch, tokens, features].

  Returns:
    A 1-D tensor of length `features`.
  """
  second_to_last_layer = hidden_states[-2]
  # [tokens x features] for the first sentence in the batch.
  per_token = second_to_last_layer[0]
  return per_token.mean(dim=0)

In [12]:
# Reference sentence: a fully specified request.
tokenized_text, hidden_states = get_embeddings(
    "a web server on port 3000 with parameters username of type string and password of type string"
)
token_embeddings = get_token_embeddings(hidden_states)
token_vec = vectorize_tokens(token_embeddings, method='cat')
print(tokenized_text)

# Template sentence: the same request shape with placeholders.
test_tokenized_text, test_hidden_states = get_embeddings(
    'web server on port xxxx with parameters xxxx of type xxxx'
)
test_token_embeddings = get_token_embeddings(test_hidden_states)
test = vectorize_tokens(test_token_embeddings, method='cat')
print(test_tokenized_text)

# Token-level similarity: index 2 of the reference vs index 1 of the template
# (the template has no leading 'a', so its tokens are shifted by one).
token_similarity = 1 - cosine(token_vec[2], test[1])
print(token_similarity)

# Sentence-level similarity between the two averaged sentence vectors.
reference_sentence_vec = vectorize_sentence(hidden_states)
template_sentence_vec = vectorize_sentence(test_hidden_states)
print(1 - cosine(reference_sentence_vec, template_sentence_vec))

['[CLS]', 'a', 'web', 'server', 'on', 'port', '3000', 'with', 'parameters', 'user', '##name', 'of', 'type', 'string', 'and', 'password', 'of', 'type', 'string', '[SEP]']
['[CLS]', 'web', 'server', 'on', 'port', 'xx', '##xx', 'with', 'parameters', 'xx', '##xx', 'of', 'type', 'xx', '##xx', '[SEP]']
0.7960519790649414
0.8924217224121094


In [5]:
def find_similar_words(*sentences):
  """Print the sentences' tokens side by side with a per-position similarity.

  Each output row holds the i-th token of every sentence (blank once a
  sentence has run out of tokens). When both of the first two sentences have
  a token at that position, the row ends with the cosine similarity of those
  two tokens' 'cat' vectors. Only the first two sentences are compared;
  further sentences are just displayed.

  Args:
    *sentences: Raw sentences to tokenize and embed. With fewer than two
      sentences there is nothing to compare, so only the tokens are printed
      (and nothing at all is printed for zero sentences).

  Returns:
    None. The table is written to stdout.
  """
  tokenizeds = []
  token_vecs = []
  for sentence in sentences:
    tokenized, hidden_states = get_embeddings(sentence)
    tokenizeds.append(tokenized)
    token_vecs.append(vectorize_tokens(get_token_embeddings(hidden_states), method='cat'))

  # A similarity column only makes sense with at least two sentences.
  can_compare = len(tokenizeds) >= 2

  # `default=0` keeps the zero-sentence call from raising inside max().
  max_length = max(map(len, tokenizeds), default=0)
  for i in range(max_length):
    for tokens in tokenizeds:
      print('{:<12} '.format(tokens[i] if len(tokens) > i else ''), end='')

    if can_compare and len(tokenizeds[0]) > i and len(tokenizeds[1]) > i:
      print('{:<12}'.format(1 - cosine(token_vecs[0][i], token_vecs[1][i])))
    else:
      print()

# Compare two phrasings of the same request, token position by token position.
find_similar_words(
    'a http server on port 3000 with parameters email of type text',
    'a web server on port 3000 with parameters user of type string and password of type string',
)

[CLS]        [CLS]        0.9652470946311951
a            a            0.9581173062324524
http         web          0.7633567452430725
server       server       0.9401279091835022
on           on           0.9609048366546631
port         port         0.9547920227050781
3000         3000         0.9673320055007935
with         with         0.9559084177017212
parameters   parameters   0.9227150082588196
email        user         0.6826009750366211
of           of           0.8307011127471924
type         type         0.8420049548149109
text         string       0.6760694980621338
[SEP]        and          -0.010823934338986874
             password     
             of           
             type         
             string       
             [SEP]        


In [6]:
# from scipy.spatial.distance import cosine

# # Calculate the cosine similarity between the word bank 
# # in "bank robber" vs "river bank" (different meanings).
# diff_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[19])

# # Calculate the cosine similarity between the word bank
# # in "bank robber" vs "bank vault" (same meaning).
# same_bank = 1 - cosine(token_vecs_sum[10], token_vecs_sum[6])

# print('Vector similarity for  *similar*  meanings:  %.2f' % same_bank)
# print('Vector similarity for *different* meanings:  %.2f' % diff_bank)