# Explicando embeddings

O que são embeddings? Como eles se formam? O que fazer com eles?

In [2]:
pip install Transformers torch torchvision -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
from transformers import BertTokenizer

# Everything starts with tokenizing a sentence: the text is split into the
# token strings of the pretrained "bert-base-uncased" tokenizer.
sentence = "The world is full of kings and queens Who blind your eyes and steal your dreams It is Heaven and Hell"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_output = tokenizer.tokenize(sentence)
print(tokenizer_output)

  from .autonotebook import tqdm as notebook_tqdm


['the', 'world', 'is', 'full', 'of', 'kings', 'and', 'queens', 'who', 'blind', 'your', 'eyes', 'and', 'steal', 'your', 'dreams', 'it', 'is', 'heaven', 'and', 'hell']


In [4]:
# Map each token string to its integer ID in the tokenizer vocabulary.
# NOTE(review): despite the variable name, this list holds token IDs, not embeddings.
tokens_embedding = tokenizer.convert_tokens_to_ids(tokenizer_output)
print(tokens_embedding)

[1996, 2088, 2003, 2440, 1997, 5465, 1998, 8603, 2040, 6397, 2115, 2159, 1998, 8954, 2115, 5544, 2009, 2003, 6014, 1998, 3109]


In [5]:
# Decode the list of token IDs back into a single text string.
decoded_content = tokenizer.decode(tokens_embedding)
print(decoded_content)

the world is full of kings and queens who blind your eyes and steal your dreams it is heaven and hell



## Tokens não são os embeddings

Mas isto ainda não é o embedding. Simplificando, é um "de-para" de tokens para um ID numérico.

Os modelos partem dos tokens para iniciar o processamento através dos transformers. Cada um tem seu processo, e é aqui que os modelos se diferenciam.

# Gerando embeddings

A resposta é um embedding?

(Código baseado no sample do modelo: https://huggingface.co/intfloat/e5-small-v2)

In [7]:
# Everything starts with generating the tokens, this time with the tokenizer
# published alongside the embedding model.
import json
from transformers import AutoModel, AutoTokenizer

# NOTE: this rebinds `tokenizer`, replacing the BertTokenizer created earlier.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-small-v2')
tokens = tokenizer(sentence)
print(f"Tokens: {len(tokens['input_ids'])}")
tokens

Tokens: 23


{'input_ids': [101, 1996, 2088, 2003, 2440, 1997, 5465, 1998, 8603, 2040, 6397, 2115, 2159, 1998, 8954, 2115, 5544, 2009, 2003, 6014, 1998, 3109, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
# Use the model to generate the embedding.
import torch

model = AutoModel.from_pretrained('intfloat/e5-small-v2')
# Tokenize as a batch of one sentence and return PyTorch tensors.
batch_dict = tokenizer([sentence], max_length=512, padding=True, truncation=True, return_tensors='pt')
# Inference only: disable autograd so no gradient graph is built or kept alive.
with torch.no_grad():
    outputs = model(**batch_dict)
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.4479, -0.0115,  0.1771,  ..., -0.2042, -0.1184,  0.1113],
         [-0.4541, -0.1457,  0.3784,  ..., -0.2429, -0.5251, -0.2153],
         [-0.1444,  0.2768,  0.1218,  ..., -0.8014, -0.5903, -0.3979],
         ...,
         [-0.5328,  0.3846,  0.1268,  ..., -0.6043, -0.1965, -0.5521],
         [-0.2418,  0.3299,  0.1877,  ..., -0.5818, -0.5166, -0.4228],
         [-0.4479, -0.0115,  0.1771,  ..., -0.2042, -0.1184,  0.1113]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.0139,  0.0415, -0.0178,  0.0873,  0.0021, -0.0255, -0.0400,  0.1028,
         -0.0874,  0.0009,  0.0891,  0.0599, -0.0562,  0.0321, -0.0597,  0.0932,
         -0.0117,  0.0076, -0.0127, -0.0247, -0.0755,  0.0670,  0.0625, -0.0348,
         -0.0093,  0.0975, -0.0021, -0.0222,  0.0278,  0.0815,  0.1085, -0.0818,
         -0.0205,  0.0673, -0.0257, -0.0115, -0.0107,  0.0078, -0.0597, -0.0123,
         -0.0231,  0.0250,  0.03

In [14]:
# Hidden-state vector at the first token position of the first sequence in the batch.
outputs.last_hidden_state[0, 0]

tensor([-4.4785e-01, -1.1496e-02,  1.7711e-01,  1.2870e-01,  6.4215e-02,
         9.2883e-02,  4.3558e-01, -4.2201e-01,  6.6183e-02,  3.6002e-01,
         4.2325e-01,  2.2437e-02, -6.0449e-02,  8.9353e-02, -2.5898e-02,
        -2.0006e-01, -1.4480e-01,  2.7423e-01, -6.1061e-01,  1.9614e-01,
         4.7051e-01, -3.1828e-01,  2.6008e-01, -2.6943e-01, -1.7900e-01,
        -3.1307e-02,  2.7328e-01,  2.4214e-01, -3.8911e-01, -1.0437e+00,
        -3.1749e-01, -9.1094e-02,  1.0508e-01, -1.5807e-01,  2.4470e-01,
        -3.2338e-01, -7.8153e-02,  2.8226e-01,  2.2783e-01,  3.2338e-01,
        -1.0024e-01, -2.3141e-01,  1.8116e-01, -3.5374e-01, -6.9689e-02,
        -2.9036e-01, -2.2994e-01, -3.4706e-01,  5.4663e-01,  1.6744e-01,
        -2.7020e-01,  1.4854e-01,  3.0951e-01,  1.4862e-01,  1.5782e-01,
         1.3603e-01,  1.5238e-01,  3.0501e-01,  3.2070e-01,  6.4362e-01,
         4.1900e-02,  3.6655e-01, -7.5831e-01,  8.0596e-01,  5.1270e-01,
         1.1508e-01, -1.2763e-01, -1.5136e-01, -2.8

In [15]:
# Pooling do resultado
import torch.nn.functional as F
from torch import Tensor
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Average the token vectors along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: Per-token hidden states; presumably shaped
            (batch, seq_len, hidden) -- confirm against the caller.
        attention_mask: Mask aligned with the first two dims of
            ``last_hidden_states``; zero entries mark positions to ignore.

    Returns:
        The sum over dim 1 of the non-masked vectors, divided by the sum of
        the mask over dim 1 (one pooled vector per batch item).
    """
    # Positions where the mask is zero, with a trailing axis so it broadcasts
    # over the hidden dimension.
    ignored_positions = ~attention_mask[..., None].bool()
    kept_states = last_hidden_states.masked_fill(ignored_positions, 0.0)
    mask_totals = attention_mask.sum(dim=1)[..., None]
    return kept_states.sum(dim=1) / mask_totals

In [16]:
# Pool the per-token hidden states into one vector per input sentence.
embeddings = average_pool(
    last_hidden_states=outputs.last_hidden_state,
    attention_mask=batch_dict["attention_mask"],
)
embeddings

tensor([[-4.5350e-01,  3.3551e-03,  1.8387e-01,  2.6981e-01,  6.1939e-02,
          8.0977e-02,  7.0391e-01, -5.5273e-01,  2.5565e-01,  5.9972e-01,
          6.2136e-01, -1.3732e-01, -7.4902e-02,  8.3543e-02,  6.1113e-02,
         -1.8892e-01, -3.3909e-02,  5.1318e-02, -1.0601e+00,  1.2858e-01,
          7.4060e-01, -3.2295e-01,  2.2983e-01, -3.6419e-01, -4.3660e-01,
         -7.6555e-02,  2.4227e-01,  3.0801e-01, -4.5143e-01, -7.9612e-01,
         -6.5129e-01, -3.0351e-01, -4.7533e-02, -2.1919e-01,  4.2382e-01,
         -2.2801e-01, -1.4044e-03,  4.9123e-01,  3.5330e-01,  3.4610e-01,
         -2.0652e-01, -2.0422e-01,  4.3568e-01, -4.0872e-01, -7.0568e-02,
         -3.3678e-01, -2.9633e-01, -4.8066e-01,  8.2286e-01,  1.4736e-01,
         -3.7630e-01,  1.9381e-01,  3.9850e-01,  4.3981e-01,  2.8996e-01,
          1.8135e-02,  1.4675e-01,  4.3428e-01,  5.5339e-01,  8.3792e-01,
          6.2576e-03,  5.4782e-01, -8.5534e-01,  1.2178e+00,  6.5769e-01,
          3.4802e-01, -2.7304e-01,  1.

In [17]:
# L2-normalize each row (dim=1) of the pooled embeddings.
embeddings_norm = F.normalize(embeddings, p=2, dim=1)
embeddings_norm

tensor([[-5.5003e-02,  4.0692e-04,  2.2301e-02,  3.2724e-02,  7.5123e-03,
          9.8214e-03,  8.5374e-02, -6.7039e-02,  3.1006e-02,  7.2737e-02,
          7.5362e-02, -1.6655e-02, -9.0846e-03,  1.0133e-02,  7.4121e-03,
         -2.2913e-02, -4.1127e-03,  6.2241e-03, -1.2857e-01,  1.5595e-02,
          8.9824e-02, -3.9169e-02,  2.7875e-02, -4.4170e-02, -5.2953e-02,
         -9.2850e-03,  2.9384e-02,  3.7357e-02, -5.4752e-02, -9.6557e-02,
         -7.8992e-02, -3.6811e-02, -5.7650e-03, -2.6585e-02,  5.1404e-02,
         -2.7655e-02, -1.7033e-04,  5.9580e-02,  4.2850e-02,  4.1976e-02,
         -2.5048e-02, -2.4769e-02,  5.2842e-02, -4.9572e-02, -8.5589e-03,
         -4.0847e-02, -3.5940e-02, -5.8297e-02,  9.9801e-02,  1.7873e-02,
         -4.5640e-02,  2.3506e-02,  4.8332e-02,  5.3342e-02,  3.5168e-02,
          2.1995e-03,  1.7798e-02,  5.2672e-02,  6.7118e-02,  1.0163e-01,
          7.5895e-04,  6.6442e-02, -1.0374e-01,  1.4771e-01,  7.9768e-02,
          4.2210e-02, -3.3116e-02,  2.

In [23]:
# Normalized embedding of the first (here, the only) sentence in the batch.
embeddings_norm[0]

tensor([-5.5003e-02,  4.0692e-04,  2.2301e-02,  3.2724e-02,  7.5123e-03,
         9.8214e-03,  8.5374e-02, -6.7039e-02,  3.1006e-02,  7.2737e-02,
         7.5362e-02, -1.6655e-02, -9.0846e-03,  1.0133e-02,  7.4121e-03,
        -2.2913e-02, -4.1127e-03,  6.2241e-03, -1.2857e-01,  1.5595e-02,
         8.9824e-02, -3.9169e-02,  2.7875e-02, -4.4170e-02, -5.2953e-02,
        -9.2850e-03,  2.9384e-02,  3.7357e-02, -5.4752e-02, -9.6557e-02,
        -7.8992e-02, -3.6811e-02, -5.7650e-03, -2.6585e-02,  5.1404e-02,
        -2.7655e-02, -1.7033e-04,  5.9580e-02,  4.2850e-02,  4.1976e-02,
        -2.5048e-02, -2.4769e-02,  5.2842e-02, -4.9572e-02, -8.5589e-03,
        -4.0847e-02, -3.5940e-02, -5.8297e-02,  9.9801e-02,  1.7873e-02,
        -4.5640e-02,  2.3506e-02,  4.8332e-02,  5.3342e-02,  3.5168e-02,
         2.1995e-03,  1.7798e-02,  5.2672e-02,  6.7118e-02,  1.0163e-01,
         7.5895e-04,  6.6442e-02, -1.0374e-01,  1.4771e-01,  7.9768e-02,
         4.2210e-02, -3.3116e-02,  2.3929e-04, -4.7

In [20]:
from sentence_transformers import SentenceTransformer

# Same checkpoint loaded through sentence-transformers: a single encode() call
# with normalization requested.
# NOTE(review): this rebinds `model`, replacing the AutoModel instance created earlier.
model = SentenceTransformer('intfloat/e5-small-v2')
input_texts = [sentence]
embedding_st = model.encode(input_texts, normalize_embeddings=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
# Display the embedding array returned by sentence-transformers.
embedding_st

array([[-5.50028048e-02,  4.06924548e-04,  2.23011617e-02,
         3.27239670e-02,  7.51231192e-03,  9.82137769e-03,
         8.53741691e-02, -6.70386478e-02,  3.10061667e-02,
         7.27371424e-02,  7.53616616e-02, -1.66554824e-02,
        -9.08455532e-03,  1.01325316e-02,  7.41213001e-03,
        -2.29131803e-02, -4.11272421e-03,  6.22413447e-03,
        -1.28571197e-01,  1.55947404e-02,  8.98244902e-02,
        -3.91689278e-02,  2.78747417e-02, -4.41704728e-02,
        -5.29532507e-02, -9.28498060e-03,  2.93835327e-02,
         3.73574458e-02, -5.47523461e-02, -9.65574160e-02,
        -7.89923221e-02, -3.68110277e-02, -5.76502504e-03,
        -2.65847743e-02,  5.14036492e-02, -2.76546981e-02,
        -1.70333748e-04,  5.95797002e-02,  4.28500213e-02,
         4.19764034e-02, -2.50484254e-02, -2.47685499e-02,
         5.28421029e-02, -4.95721996e-02, -8.55890010e-03,
        -4.08470444e-02, -3.59404273e-02, -5.82969822e-02,
         9.98011827e-02,  1.78730804e-02, -4.56400886e-0

# O que fazer com os Embeddings?

Os embeddings são então utilizados para encontrar conteúdo pelo seu significado (e não pelas palavras ou termos utilizados).

Vamos ver na prática como gerar, armazenar e encontrar similaridades

In [24]:
# Let's import the data: read the news CSV into a DataFrame.
import pandas as pd
from tqdm import tqdm
df = pd.read_csv("./data/90minFootballTransferNewsNLP.csv")

In [25]:
# Preview the first rows of the loaded DataFrame.
df.head()

Unnamed: 0,Title,Date,Link,Content
0,Football transfer rumours: Why Maguire's Man U...,"Aug 15, 2023",https://www.90min.com/posts/football-transfer-...,"90minÂ rounds up the latestÂ transfer news, ru..."
1,Chelsea agree Romeo Lavia fee with Southampton,"Aug 15, 2023",https://www.90min.com/posts/chelsea-agree-rome...,Chelsea have finalised an agreement with South...
2,Harry Maguire's proposed West Ham transfer col...,"Aug 15, 2023",https://www.90min.com/posts/harry-maguire-prop...,Harry Maguire's proposed transfer to West Ham ...
3,Southampton director breaks down Chelsea & Liv...,"Aug 15, 2023",https://www.90min.com/posts/southampton-direct...,Southampton director Jason Wilcox has revealed...
4,Neymar completes move from PSG to Al Hilal,"Aug 15, 2023",https://www.90min.com/posts/neymar-completes-m...,Saudi Pro League side Al Hilal have confirmed ...


In [26]:
from sentence_transformers import SentenceTransformer

# Load the embedding model used for the dataset, then print one article body
# and display the vector the model returns for it.
model_emb = SentenceTransformer('intfloat/e5-small-v2')
print(df["Content"].iloc[1])
model_emb.encode(df["Content"].iloc[1])

Chelsea have finalised an agreement with Southampton to sign midfielder Romeo Lavia, 90min understands. After striking a deal over a fee with the Saints, Chelsea set about finalising the terms of a move which sources have confirmed is worth around Â£58m including add-ons. Southampton director Jason Wilcox confirmed earlier on Tuesday that the two clubs were still in negotiations at that point, but just a few hours later, the Saints have shook hands with Chelsea and are bidding farewell to Lavia. Lavia is due to undergo a medical with Chelsea in the coming days ahead of putting pen to paper on another long-term contract at Stamford Bridge. The Belgian had the opportunity to join Liverpool this summer but, like Moises Caicedo, has opted to join Chelsea instead. feed Should all go to plan with his medical, Lavia will become the latest in a long list of new signings at Chelsea. Alongside Caicedo, the Blues have already recruited Axel Disasi, Christopher Nkunku, Nicolas Jackson, Lesley Ugoc

array([-5.76095395e-02,  3.38781178e-02, -2.69276090e-02, -3.79669853e-02,
       -3.37238796e-02,  5.25908284e-02,  2.47094631e-02, -6.07529990e-02,
        1.87662095e-02,  5.68616353e-02,  3.69013287e-02, -3.58773097e-02,
       -2.86247302e-02,  7.25682899e-02,  1.89046748e-02, -4.77037430e-02,
       -3.52913737e-02,  2.63741929e-02, -8.76376554e-02,  4.58207317e-02,
        2.14688461e-02, -8.09595808e-02, -1.63774677e-02, -1.02591664e-01,
        3.48840021e-02,  7.74936005e-03,  7.95766115e-02,  3.47386976e-03,
       -6.00943230e-02, -1.80301383e-01, -1.98317859e-02,  2.15552840e-02,
       -2.51364429e-02, -9.65009108e-02,  4.53840382e-02, -4.86114472e-02,
        1.97075773e-02,  5.80615178e-02,  7.67342979e-03,  7.48858154e-02,
       -3.32354754e-02, -8.55442733e-02, -1.22771757e-02, -9.79826152e-02,
        4.76597250e-02, -1.96836554e-02, -6.41095713e-02, -2.69780736e-02,
        7.07179755e-02,  7.47454092e-02, -8.36836454e-03, -3.19483504e-02,
        2.44122483e-02,  

In [28]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv(); override=True
# lets them replace values already present in the process environment.
load_dotenv(find_dotenv(), override=True)

# creating connection with Astra
# NOTE(review): `cqlsession` is not defined in this notebook; presumably a
# project-local helper that reads the settings loaded above -- keep it after
# load_dotenv() until confirmed.
import cassio
from cqlsession import getCQLSession, getCQLKeyspace
cqlMode = "astra_db" # "astra_db"/"local"
session = getCQLSession(mode=cqlMode)
keyspace = getCQLKeyspace(mode=cqlMode)

In [29]:
from cassio.table.tables import MetadataVectorCassandraTable

table_name = "football_news_emb"
# Read the vector size from the embedding model instead of hard-coding it, so
# the table definition stays in sync with the model (previously a literal 384).
embedding_dimension = model_emb.get_sentence_embedding_dimension()

# Vector-enabled table with text primary keys, used to store the embeddings.
v_table = MetadataVectorCassandraTable(
    session=session,
    keyspace=keyspace,
    table=table_name,
    vector_dimension=embedding_dimension,
    primary_key_type="TEXT",
)

In [30]:
# Number of leading rows of `df` to embed and store.
rows_to_load = 100
rows_subset = df.head(rows_to_load)

# Encode every article body in one batched call instead of one encode() call
# per row; the result is one vector per row, in the same order.
# NOTE(review): the e5 model card reportedly recommends "passage: "/"query: "
# input prefixes; they are not applied here -- verify before relying on scores.
content_vectors = model_emb.encode(rows_subset["Content"].tolist())

for (index, row), vector in tqdm(
    zip(rows_subset.iterrows(), content_vectors), total=len(rows_subset)
):
    v_table.put(
        # "<date>|<title>" is used as the row's primary key.
        row_id=f"""{row["Date"]}|{row["Title"]}""",
        body_blob=row["Content"],
        vector=vector,
        metadata={"date": row["Date"], "link": row["Link"]},
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:30<00:00,  3.30it/s]


In [32]:
# Embed the free-text query with the same model used for the stored articles.
query = "brazillian players"
query_emb = model_emb.encode(query)

# Ask the vector table for 5 matches using the 'dot' metric and print each
# match's reported "distance" value followed by its stored text.
res = list(v_table.metric_ann_search(query_emb, metric='dot', n=5))
for hit in res:
    print("--------------------")
    print(f"Distance: {hit['distance']}")
    print(hit["body_blob"])


--------------------
Distance: 0.8013982042657043
Saudi Pro League director of football Michael Emenalo claims the competition can position itself as one of the world's best within the next two years by targeting 'exceptional players only' The former technical director of Chelsea assumed his role in July and is continuing the Pro League's significant Â£400m investment initiative. This strategy has enticed players such as Karim Benzema, Jordan Henderson, Riyad Mahrez, and Roberto Firmino to join Saudi Arabian clubs this summer. So far, the majority of top stars heading to Saudi Arabia, including Benzema, N'Golo Kante and Cristiano Ronaldo, have been in the latter stages of their respective careers. At 26, Ruben Neves is one of the exceptions to the trend and arguably not in the same bracket of star anyway. Richard Masters, chief executive of the Premier League, recently shared that he isn't overly worried about Saudi Arabia's unrestricted spending. However, Emenalo cautioned that there 