In [2]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# One seed shared by every random number generator used in this notebook
random_seed = 42

# Python's built-in generator
random.seed(random_seed)

# PyTorch's default generator, plus all CUDA devices when a GPU is present
torch.manual_seed(random_seed)
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.manual_seed_all(random_seed)


In [4]:
# Pretrained checkpoint shared by the tokenizer and the encoder
checkpoint = 'bert-base-uncased'

# Tokenizer first, then the BERT encoder for the same checkpoint
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [33]:
# Input text
text = "molecule"

# Tokenize and encode the text by calling the tokenizer directly.
# `batch_encode_plus` is deprecated in favor of `tokenizer(...)`; the call
# returns the same dictionary-like object holding the token IDs and attention masks.
encoding = tokenizer(
    [text],                   # List of input texts (a batch of one)
    padding=True,             # Pad to the longest sequence in the batch
    truncation=True,          # Truncate to the model's maximum length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True   # Add the special tokens [CLS] and [SEP]
)

# Token IDs, one row per input text
input_ids = encoding['input_ids']
print(f"Input ID: {input_ids}")

# Attention mask: 1 marks a real token, 0 marks padding
attention_mask = encoding['attention_mask']
print(f"Attention mask: {attention_mask}")

Input ID: tensor([[  101, 13922,   102]])
Attention mask: tensor([[1, 1, 1]])


In [34]:
# Run the BERT encoder without tracking gradients (inference only)
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Final-layer hidden states: one vector per input token
word_embeddings = outputs.last_hidden_state

# Report the shape of the embedding tensor
print(f"Shape of Word Embeddings: {word_embeddings.shape}")

Shape of Word Embeddings: torch.Size([1, 3, 768])


In [35]:
# Turn every row of token IDs back into a string (special tokens dropped)
# and concatenate the rows into one piece of text
batch_size = word_embeddings.shape[0]
decoded_text = ''.join(
    tokenizer.decode(input_ids[row], skip_special_tokens=True)
    for row in range(batch_size)
)
print(f"Decoded Text: {decoded_text}")

# Split the decoded string into tokens again, for reference
tokenized_text = tokenizer.tokenize(decoded_text)
print(f"tokenized Text: {tokenized_text}")

# Encode the original input text; the result is a PyTorch tensor
encoded_text = tokenizer.encode(text, return_tensors='pt')
print(f"Encoded Text: {encoded_text}")

Decoded Text: molecule
tokenized Text: ['molecule']
Encoded Text: tensor([[  101, 13922,   102]])


In [36]:
# Print the embedding of every position in the encoded sequence.
# `word_embeddings[0]` holds one row per input ID, including the special
# [CLS]/[SEP] tokens, so the labels are taken from `input_ids` itself.
# Zipping with `tokenized_text` (which has no special tokens) would pair each
# real token with the wrong row and silently drop the trailing rows.
sequence_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
for token, embedding in zip(sequence_tokens, word_embeddings[0]):
    print(f"Token: {token}")
    print(f"Embedding: {embedding}")
    print("\n")

Embedding: tensor([-3.8815e-01,  9.1515e-02, -4.9704e-02, -2.4158e-01,  2.9488e-02,
        -1.4817e-01,  2.3101e-01,  1.3127e-01, -2.1529e-01, -2.0963e-01,
        -1.4402e-01,  5.5526e-02, -1.3274e-01,  1.8071e-01,  4.1589e-02,
         8.0162e-02, -3.0528e-01,  1.9390e-01,  3.9717e-01, -3.6532e-01,
        -1.8966e-01, -7.6527e-02, -5.3644e-02, -1.4733e-01,  8.5097e-02,
        -1.9786e-01, -2.7967e-02, -7.2341e-02,  8.9798e-02,  3.4190e-03,
        -1.7328e-01,  3.0543e-01, -4.2409e-02,  1.7373e-01, -6.0360e-03,
         1.0509e-01, -4.7162e-03, -3.0410e-02, -2.2214e-02,  6.1597e-02,
         1.5326e-02, -1.2219e-01,  2.1234e-01,  2.9837e-02,  2.3820e-02,
        -2.4601e-01, -1.7790e+00,  5.9220e-02, -2.1918e-01,  2.0160e-01,
         1.9865e-01,  2.3943e-01,  4.2698e-01,  4.2763e-02, -1.9973e-02,
         3.2727e-01, -3.3323e-02,  4.6128e-01,  1.9502e-01, -9.4449e-02,
         4.1662e-02,  9.3733e-02, -1.4246e-02,  1.5594e-01,  4.2204e-02,
         3.0800e-01,  4.4613e-02,  9.550

In [37]:
# Average-pool the token vectors over the sequence axis (dim=1) to obtain
# a single vector per input sentence
sentence_embedding = torch.mean(word_embeddings, dim=1)

# Show the pooled vector under its heading, then its shape
print("Sentence Embedding:", sentence_embedding, sep="\n")
print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")

Sentence Embedding:
tensor([[ 2.8544e-02, -2.0548e-01, -1.9693e-01, -8.1720e-02,  8.4199e-02,
         -1.4104e-01,  2.0989e-01, -1.5014e-01,  2.5140e-02, -2.9520e-01,
         -3.6389e-02,  1.3348e-01, -1.1201e-01,  1.2146e-01, -3.7848e-01,
         -7.6871e-03, -4.8082e-03, -2.3726e-02,  2.5783e-01,  3.5012e-02,
          3.2202e-02, -1.8874e-02,  2.5819e-01, -1.8189e-02,  2.3397e-01,
          1.2778e-01, -2.8521e-01, -6.8162e-03, -2.5505e-01,  1.8377e-01,
         -3.1492e-01, -1.5163e-01,  1.2726e-01,  4.0938e-01, -5.1039e-02,
         -7.7385e-02,  2.4710e-01, -6.9834e-02, -5.7428e-01, -8.5280e-02,
         -1.8767e-01, -1.7277e-01, -6.1638e-02, -2.4280e-02,  1.0078e-01,
         -2.4518e-02, -3.0651e-01,  2.7773e-01, -1.0411e-01,  4.7686e-01,
         -1.9336e-01,  3.6241e-01,  8.5183e-02,  2.3377e-01,  1.4612e-01,
         -7.1649e-02,  3.8144e-01,  1.9353e-02, -2.9088e-02, -2.7097e-01,
          1.3297e-01,  1.9338e-01, -4.1512e-02,  1.2120e-01,  3.6655e-01,
          9.7766e-

In [38]:
# Example sentence for similarity comparison
example_sentence = "Pushkin"

# Tokenize and encode the example sentence.
# This reuses the `tokenizer` already loaded alongside the model instead of
# reloading it from the Hub, and calls it directly because
# `batch_encode_plus` is deprecated in favor of `tokenizer(...)`.
example_encoding = tokenizer(
    [example_sentence],       # A batch of one text
    padding=True,             # Pad to the longest sequence in the batch
    truncation=True,          # Truncate to the model's maximum length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True   # Add the special tokens [CLS] and [SEP]
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate the example sentence embedding with the same recipe as the
# original sentence: final hidden states averaged over the sequence axis
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)

# Cosine similarity between the original and example sentence embeddings;
# the result is a 2-D array with one entry per (original, example) pair
similarity_score = cosine_similarity(sentence_embedding, example_sentence_embedding)

# Print the single similarity value
print("Cosine Similarity Score:", similarity_score[0][0])

Cosine Similarity Score: 0.50752324
