In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.2/224.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.0 tokenizers-0.13.3 transformers-4.28.1


In [None]:
# Import Pytorch, AutoModel and AutoTokenizer
import torch
from transformers import AutoModel, AutoTokenizer

# Choose a pre-trained model
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Define the sentences to be tokenized
sentences = ["I love my family", "I love my dog", "My dog is a lab"]

In [None]:
# Create tokens from the tokenizer instance.
tokens = [tokenizer.tokenize(sentence) for sentence in sentences]

# Print out the tokens.
print(tokens)

[['I', 'love', 'my', 'family'], ['I', 'love', 'my', 'dog'], ['My', 'dog', 'is', 'a', 'lab']]


In [None]:
# Convert the tokens to IDs.
tokenized_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
input_ids = tokenized_sentences['input_ids']
print(input_ids)

tensor([[ 101,  146, 1567, 1139, 1266,  102,    0],
        [ 101,  146, 1567, 1139, 3676,  102,    0],
        [ 101, 1422, 3676, 1110,  170, 8074,  102]])


In [None]:
# Generate embeddings using PyTorch
with torch.no_grad():
    outputs = model(tokenized_sentences['input_ids'])
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()

# Print the embeddings
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Embedding {i+1}: {embeddings[i][0:10]}")
    print()

Sentence 1: I love my family
Embedding 1: [-0.19375661  0.32130736  0.07548827 -0.40688723 -0.4763685   0.07858068
  0.8525571   0.05237346 -0.06373273 -0.1726824 ]

Sentence 2: I love my dog
Embedding 2: [-0.18957779  0.6149509   0.42388856 -0.31033877 -0.55150765 -0.04185519
  0.68279636  0.19014312 -0.2128999  -0.1065003 ]

Sentence 3: My dog is a lab
Embedding 3: [-0.37911138  0.05389749 -0.32136977 -0.34749442 -0.16758227 -0.08009627
  0.09228063  0.4515096  -0.44420296 -0.25733727]



In [None]:
# Get the length of each embedding.
[print(len(embedding)) for embedding in embeddings]

768
768
768


[None, None, None]

In [None]:
# Import numpy and the cosine function from scipy.spatial.distance 
import numpy as np
from scipy.spatial.distance import cosine


# Create a 3x3 matrix with zeros. 
similarity_matrix = np.zeros((len(embeddings), len(embeddings)))
# First iteration through each embedding 
for i in range(len(embeddings)):
  # Second iteration through each embedding.
  for j in range(len(embeddings)):
    # Calculate the pairwise cosine similarities between the embeddings.
    similarity_matrix[i, j] = 1 - cosine(embeddings[i], embeddings[j])

# Print the similarity matrix
print("Similarity matrix:")
print(similarity_matrix)

Similarity matrix:
[[1.         0.96456212 0.82049882]
 [0.96456212 1.         0.8291617 ]
 [0.82049882 0.8291617  1.        ]]


In [None]:
# Import Pandas
import pandas as pd

# Convert the similarity matrix to a Pandas DataFrame. 
similarity_df = pd.DataFrame(similarity_matrix, columns=['Sentence 1', 'Sentence 2', 'Sentence 3'], index=['Sentence 1', 'Sentence 2', 'Sentence 3'])
similarity_df

Unnamed: 0,Sentence 1,Sentence 2,Sentence 3
Sentence 1,1.0,0.964562,0.820499
Sentence 2,0.964562,1.0,0.829162
Sentence 3,0.820499,0.829162,1.0
