In [None]:
!pip install transformers

In [None]:
!pip3 install faiss-cpu

In [None]:
!pip3 install -U scikit-learn scipy matplotlib

In [None]:
!pip3 install networkx==3.1
!pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

In [2]:
import transformers
import torch

# Record the library versions this notebook was executed with.
for library in (transformers, torch):
    print(library.__version__)


4.38.2
2.2.0+cpu


In [3]:
# You can change the model here. The tokenizer and the model must be loaded from the
# same checkpoint, so the name is defined once and reused for both.
MODEL_NAME = 'BAAI/bge-small-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# Small demo corpus: the four sentences that are embedded and searched below.
# NOTE(review): sentences 0 and 2 read like paraphrases of each other — presumably
# intentional, to exercise semantic (rather than lexical) similarity; confirm.
sentences = [
    "Three years later, the coffin was still full of Jello.",
    "The fish dreamed of escaping the fishbowl and into the toilet where he saw his friend go.",
    "The person box was packed with jelly many dozens of months later.",
    "He found a leprechaun in his walnut shell."
]

In [9]:
# Tokenize all sentences as one batch. `tokens` was not defined anywhere in the
# notebook, so this cell failed on a fresh kernel. Every sequence is padded or
# truncated to 512 tokens so the batch forms one rectangular tensor.
tokens = tokenizer(sentences, max_length=512, truncation=True,
                   padding='max_length', return_tensors='pt')

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    outputs = model(**tokens)
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output'])

In [10]:
# Per-token embeddings taken from the model output's `last_hidden_state`;
# the shape displayed below is (sentences, tokens, embedding width).
embeddings = outputs.last_hidden_state
embeddings.shape

torch.Size([4, 512, 384])

In [11]:
embeddings

tensor([[[-0.3095,  0.6000,  0.1314,  ..., -0.4734,  0.0277,  0.0955],
         [-0.0089,  0.1133, -0.0657,  ..., -0.3988,  0.3408,  0.4774],
         [-0.1598,  0.9341,  0.3608,  ..., -0.4125,  0.3438,  0.2293],
         ...,
         [-0.3095,  0.6000,  0.1314,  ..., -0.4735,  0.0277,  0.0955],
         [-0.3095,  0.6000,  0.1314,  ..., -0.4735,  0.0277,  0.0955],
         [-0.3095,  0.6000,  0.1314,  ..., -0.4735,  0.0277,  0.0955]],

        [[-0.2455, -0.3795,  0.4676,  ...,  0.6978,  0.1150,  0.0131],
         [-0.4006, -0.0357,  0.3751,  ...,  0.4017,  0.5057,  0.4307],
         [-0.4496, -0.3925,  0.3116,  ...,  0.4118,  0.8414, -0.0073],
         ...,
         [-0.2455, -0.3795,  0.4676,  ...,  0.6977,  0.1150,  0.0131],
         [-0.2455, -0.3795,  0.4676,  ...,  0.6978,  0.1150,  0.0131],
         [-0.2454, -0.3796,  0.4677,  ...,  0.6977,  0.1149,  0.0130]],

        [[-0.1163,  0.4797,  0.0555,  ..., -0.2075, -0.3212, -0.1198],
         [-0.1758,  0.6830, -0.0600,  ..., -0

After producing the dense token embeddings, we need to perform a mean pooling operation to create a single vector encoding per sentence (the sentence embedding). To do this, we multiply each value in our embeddings tensor by its respective attention_mask value, so that non-real (padding) tokens are ignored.

In [12]:
# Fetch the attention mask produced by the tokenizer (one value per token position);
# it is expanded to the shape of `embeddings` in the next cell.
attention_mask = tokens['attention_mask']
attention_mask.shape

torch.Size([4, 512])

In [13]:
# Add a trailing axis and expand the mask to the shape of `embeddings`, so every
# embedding value has a matching mask value; cast to float for the multiplication.
mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
mask.shape

torch.Size([4, 512, 384])

Each vector above represents a single token's attention mask: each token now has a vector of size 384 representing its attention_mask status. Then we multiply the two tensors to apply the attention mask:

In [14]:
# Element-wise product: embeddings at positions whose mask value is 0 become 0,
# so they contribute nothing to the sum taken in the next step.
masked_embeddings = embeddings * mask
masked_embeddings.shape

torch.Size([4, 512, 384])

"Mean Pooling" starts

In [15]:
# Sum the masked embeddings along axis 1 (the token axis) to collapse the 512 token
# positions into a single vector per sentence.
summed = torch.sum(masked_embeddings, 1)
summed.shape

torch.Size([4, 384])

Next, we count how many token positions in each sentence actually receive attention (the sum of the mask along the token axis).
Dividing the summed embeddings by this count then gives the mean.

In [16]:
# mask.sum(1) counts the attended token positions per sentence (repeated across the
# embedding axis). torch.clamp with min=1e-9 raises any zero count to a tiny positive
# value so that the division in the next step never divides by zero.
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
summed_mask.shape

torch.Size([4, 384])

Finally, we calculate the mean as the summed embedding activations (`summed`) divided by the number of values that should be given attention in each position (`summed_mask`):

In [17]:
# Element-wise division: summed embeddings / attended-token count = mean over attended tokens.
mean_pooled = summed / summed_mask

`mean_pooled` is the final "dense representation" of the sentences, note that mean_pooled contains all representations for all sentences together

In [18]:
mean_pooled

tensor([[-0.2187,  0.5695,  0.2394,  ..., -0.4227,  0.2383,  0.1496],
        [-0.3022, -0.2839,  0.4135,  ...,  0.5210,  0.2369,  0.0987],
        [-0.1402,  0.4530,  0.0868,  ..., -0.2211, -0.2011, -0.1606],
        [-0.2279, -0.2567, -0.0771,  ..., -0.3134,  0.2999,  0.3894]])

## Cosine Similarity

In [19]:
def convert_to_embedding(query):
    """Embed a single sentence as one mean-pooled vector.

    The query is tokenized (padded/truncated to 512 tokens), run through the
    notebook-level `model`, and the `last_hidden_state` token embeddings are
    averaged over the positions selected by the attention mask. This is the
    same pooling applied to the sentence corpus earlier in the notebook.

    Args:
        query (str): The sentence to embed.

    Returns:
        torch.Tensor: 1-D tensor whose length is the last dimension of the
        model's `last_hidden_state`.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    # With return_tensors='pt' the result already carries a leading batch
    # axis, so no manual list-building / torch.stack is required.
    encoded = tokenizer(query, max_length=512,
                        truncation=True, padding='max_length',
                        return_tensors='pt')
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']

    # Inference only: no gradients needed. Pass exactly the two inputs the
    # original implementation supplied.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state

    # Mean pooling: zero out masked positions, sum over the token axis and
    # divide by the number of attended positions (clamped to avoid a zero
    # denominator).
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask

    return mean_pooled[0]  # assuming query is a single sentence
    

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Embed the search query with convert_to_embedding so it can be compared with `mean_pooled`.
query = "Nemo is a fish"
query_embedding = convert_to_embedding(query)

In [22]:
mean_pooled.shape

torch.Size([4, 384])

In [23]:
# Cosine similarity of the query against every sentence embedding.
# The query gets a leading batch axis so both inputs are 2-D, and the
# comparison runs explicitly along dim=1 (the embedding axis).
cos = torch.nn.CosineSimilarity(dim=1)
sim = cos(query_embedding[None, :], mean_pooled)
sim

tensor([0.4140, 0.5127, 0.3851, 0.4938])

# FAISS

In [24]:
import faiss                   # make faiss available

# FAISS works on contiguous float32 NumPy arrays, so convert the torch tensor explicitly.
doc_vectors = mean_pooled.detach().cpu().contiguous().numpy().astype('float32')

# Size the index from the data instead of hard-coding 384, so the cell keeps working
# when a model with a different embedding width is loaded.
# NOTE: IndexFlatIP ranks by raw inner product. The vectors are not L2-normalized here,
# so the returned scores are dot products, not cosine similarities.
index = faiss.IndexFlatIP(doc_vectors.shape[1])   # build the index
print(index.is_trained)
index.add(doc_vectors)                  # add vectors to the index
print(index.ntotal)

ModuleNotFoundError: No module named 'faiss'

In [31]:
mean_pooled.shape

torch.Size([4, 768])

In [32]:
query_embedding.shape

torch.Size([768])

In [35]:
# FAISS expects a 2-D float32 NumPy array of queries: add a leading batch axis
# (we only have one query against 4 documents) and convert the torch tensor.
query_vectors = query_embedding[None, :].detach().cpu().numpy().astype('float32')
# k=1: return only the single best match. D holds the scores, I the document indices.
D, I = index.search(query_vectors, 1)

In [36]:
D

array([[35.04268]], dtype=float32)

In [37]:
I

array([[1]])

In [41]:
# Save the index to the file "sample_code.index" (relative to the working directory).
faiss.write_index(index,"sample_code.index")

In [42]:
# Load the index back from "sample_code.index" into a separate variable.
index_loaded = faiss.read_index("sample_code.index")

In [43]:
# FAISS expects a 2-D float32 NumPy array of queries: add a leading batch axis
# and convert the torch tensor.
query_vectors = query_embedding[None, :].detach().cpu().numpy().astype('float32')
# k=4: rank all four documents against the query using the reloaded index.
D, I = index_loaded.search(query_vectors, 4)

In [44]:
D

array([[35.04268 , 26.346306, 17.326878, 14.138208]], dtype=float32)

In [45]:
I

array([[1, 3, 0, 2]])