In [None]:
import transformers
from transformers import AutoTokenizer, AutoModel 

# Checkpoint identifier; "cased" means upper- and lower-case letters are kept distinct.
model_name = "bert-base-cased"

# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Example sentence used to inspect the tokenizer output in the next cells.
text = "Hello the time now is 1020 in the morning"

In [None]:
# Run the tokenizer on the example sentence; the result is indexed by 'input_ids' below.
tokens = tokenizer(text)

In [None]:
# Show the integer ids produced for the sentence.
# Id 101 is the [CLS] token and id 102 is the [SEP] token.
print(tokens["input_ids"])

In [None]:
# Map the ids back to their token strings so the segmentation is visible.
tokens3 = tokenizer.convert_ids_to_tokens(tokens["input_ids"])
print(f"Tokens: {tokens3}")

In [None]:
# Load the pretrained BERT encoder for the checkpoint named in `model_name`.
# Reusing the constant (instead of repeating the string literal) keeps the model
# in sync with the tokenizer, which is loaded from the same name.
model = transformers.BertModel.from_pretrained(model_name)

In [None]:
# Keep a handle on the model's embeddings sub-module for the static lookups below.
embedding_layer = model.embeddings

In [None]:
# The embeddings module bundles word, position and token-type embeddings.
# Only the word-embedding table is wanted here, so pull out that sub-layer
# and display its weight matrix.
word_embeddings = embedding_layer.word_embeddings
print(word_embeddings.weight)

In [None]:
# First word of the pair to compare.
word1 = "pig"
# The full tokenizer call wraps the word in [CLS] ... [SEP], hence the extra ids.
print(tokenizer(word1))

In [None]:
# Second word of the pair to compare.
word2 = "little"
print(tokenizer(word2))

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Encode both words to plain id lists, leaving out the special tokens so that
# only the word's own token ids remain.
token_ids1, token_ids2 = (
    tokenizer.encode(word, add_special_tokens=False) for word in (word1, word2)
)

In [None]:
# Look up the static (context-free) word-embedding rows for each word's token ids
# (the ids were encoded without the special tokens).
# A word may be split into several word-piece tokens, so average over the token
# axis (dim=0): a single-token word yields its row unchanged, a multi-token word
# yields one pooled vector. Either way the result is 1-D, which the dot-product
# and norm computations in the following cells require.
embedding1 = embedding_layer.word_embeddings.weight[token_ids1].mean(dim=0)
embedding2 = embedding_layer.word_embeddings.weight[token_ids2].mean(dim=0)

# Detach from the autograd graph before converting the tensors to NumPy arrays.
embedding1 = embedding1.detach().numpy()
embedding2 = embedding2.detach().numpy()

In [None]:
# Cosine similarity computed by hand: dot product divided by the product of
# the two vectors' Euclidean norms.
magnitude1 = np.linalg.norm(embedding1)
magnitude2 = np.linalg.norm(embedding2)
cosine_sim = np.dot(embedding1, embedding2) / (magnitude1 * magnitude2)

In [None]:
# Report the hand-computed similarity for the word pair.
print(f"Cosine sim of '{word1}' and '{word2}': {cosine_sim}")

In [None]:
# Same quantity via scikit-learn. Each vector is wrapped in a list so it is
# treated as a one-row matrix; the result is therefore a 1x1 array rather than
# a bare scalar.
cosine_sim = cosine_similarity(X=[embedding1], Y=[embedding2])
print(f"Cosine sim of '{word1}' and '{word2}': {cosine_sim}")

In [None]:
import torch 

# (Re)load the encoder through the Auto* API for the same checkpoint name.
model = AutoModel.from_pretrained(model_name)

# A forward pass needs the complete tokenizer output as tensors (not just the
# id list from `encode`), so call the tokenizer itself. Special tokens are
# left out, as in the static comparison above.
tokenids1 = tokenizer(word1, add_special_tokens=False, return_tensors="pt")
tokenids2 = tokenizer(word2, add_special_tokens=False, return_tensors="pt")

# Inference only: no backward pass follows, so disable gradient tracking.
with torch.no_grad():
    outputs1 = model(**tokenids1)
    outputs2 = model(**tokenids2)

# last_hidden_state has shape (batch_size, sequence_length, hidden_dim).
# Average over the sequence dimension (dim=1) so that a word made of several
# tokens still gives one fixed-size vector, then drop the batch dimension.
embedding1 = torch.mean(outputs1.last_hidden_state, dim=1).squeeze().numpy()
embedding2 = torch.mean(outputs2.last_hidden_state, dim=1).squeeze().numpy()

# These vectors come from the full model rather than the static embedding
# table, so the similarity differs from the one computed earlier.
cosine_sim = cosine_similarity(X=[embedding1], Y=[embedding2])
print(f"cosine similarity of {word1} and {word2} is {cosine_sim}")

In [None]:
# Sentence pair compared in the cells below.
# NOTE(review): both strings are identical, so every similarity computed from
# them should come out as ~1.0 — presumably one of them was meant to differ;
# confirm and edit before drawing conclusions from the comparison.
sent1 = "hello i don't want homework"
sent2 = "hello i don't want homework"

In [None]:
# Encode each sentence to a list of token ids using the tokenizer's defaults.
tokens_sent1, tokens_sent2 = (tokenizer.encode(sentence) for sentence in (sent1, sent2))

In [None]:
# Static sentence embedding: average the word-embedding rows of all token ids
# in the sentence (dim=0 is the token axis), giving one 1-D vector per sentence.
# Convert to NumPy explicitly, like the other embeddings in this notebook,
# instead of relying on scikit-learn to coerce a list of torch tensors.
s_embedding1 = embedding_layer.word_embeddings.weight[tokens_sent1].mean(dim=0).detach().numpy()
s_embedding2 = embedding_layer.word_embeddings.weight[tokens_sent2].mean(dim=0).detach().numpy()

In [None]:
# Similarity of the two static sentence vectors (returned as a 1x1 array).
cosine_sim_static = cosine_similarity(X=[s_embedding1], Y=[s_embedding2])
print(f"cosine similarity of '{sent1}' and '{sent2}' is {cosine_sim_static}")

In [None]:
# Tokenize each sentence on its own, returning PyTorch tensors for the forward
# pass. Each call holds a single sequence, so no padding is requested: padding
# only has an effect when several sequences are tokenized together as one batch
# (and in that case the mean pooling below would also need to skip the pad
# positions using the attention mask).
sent_ids1 = tokenizer(sent1, return_tensors="pt")
sent_ids2 = tokenizer(sent2, return_tensors="pt")

# Forward pass only; no_grad avoids storing the values needed for gradient computation.
with torch.no_grad():
    outputs1 = model(**sent_ids1)
    outputs2 = model(**sent_ids2)

# Mean-pool the last hidden state over the sequence dimension (dim=1) to get one
# fixed-size vector per sentence, then drop the batch dimension.
s_embedding3 = outputs1.last_hidden_state.mean(dim=1).squeeze().numpy()
s_embedding4 = outputs2.last_hidden_state.mean(dim=1).squeeze().numpy()

cosine_sim_mean = cosine_similarity([s_embedding3], [s_embedding4])

# Compare the contextual result with the static-embedding result computed earlier.
print(f"Cosine sim with BERT forward is {cosine_sim_mean} \n"
      f"With STATIC embeddings it was {cosine_sim_static}")

In [None]:
# Use the [CLS] position as the sentence representation instead of the mean.
# last_hidden_state has shape (batch_size, sequence_length, hidden_dim), so
# [0, 0, :] selects the first batch item and its first token.
embedding5 = outputs1.last_hidden_state[0, 0, :].numpy()
embedding6 = outputs2.last_hidden_state[0, 0, :].numpy()

# Cosine similarity of the two [CLS] vectors, computed as before.
cosine_sim_CLS = cosine_similarity(X=[embedding5], Y=[embedding6])

# Show all three sentence-level similarities side by side.
print(
    f"Cosine sim of [CLS] tokens is  {cosine_sim_CLS} \n"
    f"Cosine sim with mean pooling is {cosine_sim_mean} \n"
    f"and STATIC embeddings was {cosine_sim_static}"
)