In [5]:
import torch
import time
from transformers import pipeline
import torch.nn.functional as F

In [6]:
# "sentence-transformers/all-mpnet-base-v2"
# Hugging Face Hub checkpoint used for the sentence embeddings
# (the commented-out id just above is another candidate).
model_id = "sentence-transformers/all-MiniLM-L6-v2"

# Pick an accelerator in order of preference: CUDA, then Intel XPU, then CPU.
# `torch.xpu` does not exist on every PyTorch build, so the attribute is
# checked first instead of letting the lookup raise AttributeError.
if torch.cuda.is_available():
  device="cuda"
elif hasattr(torch, "xpu") and torch.xpu.is_available():
  device="xpu"
else:
  device="cpu"

print("Using device:", device)

# Half precision only pays off on an accelerator. On CPU, float16 kernels are
# missing in some PyTorch versions and slow in others, so use float32 there.
model_dtype = torch.float32 if device == "cpu" else torch.float16

# Feature-extraction pipeline: bundles the tokenizer and the model and yields
# per-token hidden states, which are pooled into sentence embeddings later on.
extractor = pipeline("feature-extraction", model=model_id, torch_dtype=model_dtype, device=device, return_dict=False)

Using device: xpu


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use xpu


In [7]:
# The two sentences whose embeddings are compared below.
sentences = [
    "This is an example sentence",
    "Each sentence is converted",
]
# Alternative input using the same sentence twice (kept commented out):
# sentences = ["This is an example sentence", "This is an example sentence"]

# Mean pooling: average the token embeddings, counting only unmasked tokens.
def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over dimension 1, weighted by the attention mask.

    Args:
        token_embeddings: Tensor whose dimension 1 is the one being averaged
            over (the call sites in this notebook pass per-token model
            outputs; presumably (batch, tokens, hidden) -- confirm).
        attention_mask: Tensor that, after a trailing dimension is appended,
            can be expanded to ``token_embeddings.size()``. Entries equal to 0
            exclude the corresponding position from the average.

    Returns:
        The masked sum over dimension 1 divided by the mask count. The count
        is clamped to at least 1e-9 so a fully masked row yields zeros instead
        of a division by zero.
    """
    # Broadcast the mask across the embedding dimension and make it float so
    # it can be used as a multiplicative weight.
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * weights).sum(dim=1)
    # Number of contributing positions; clamped to avoid dividing by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_count

In [8]:
# Embed both sentences and compute the similarity of their pooled embeddings.
with torch.inference_mode():
    # NOTE(review): nothing visible in this notebook reads `forward_time`;
    # kept as-is in case instrumentation elsewhere depends on it -- confirm.
    extractor.forward_time = 0

    # Tokenize separately only to obtain the attention masks needed for
    # mean pooling; the pipeline call below does not return them.
    encoded_input = extractor.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

    # Run every sentence in ONE batch and truncate like the tokenizer call
    # above, so the pipeline pads/truncates to the same length as
    # `encoded_input`. A hard-coded batch size only lines up when there are
    # exactly that many sentences.
    model_output = extractor(
        sentences,
        return_tensors=True,
        batch_size=len(sentences),
        tokenize_kwargs={"truncation": True},
    )

    pooled_embeddings = []
    for token_embeddings, mask in zip(model_output, encoded_input["attention_mask"]):
        # The mask is only valid if it covers exactly the token positions the
        # pipeline produced; fail with a clear message instead of an opaque
        # expand() error inside mean_pooling.
        if token_embeddings.shape[-2] != mask.shape[-1]:
            raise ValueError(
                "attention mask length does not match pipeline output length: "
                f"{mask.shape[-1]} vs {token_embeddings.shape[-2]}"
            )
        # L2-normalize so the inner product below is a cosine similarity.
        pooled_embeddings.append(F.normalize(mean_pooling(token_embeddings, mask)))

    sentence_embeddings_1 = pooled_embeddings[0]
    sentence_embeddings_2 = pooled_embeddings[1]
    score = torch.inner(sentence_embeddings_1, sentence_embeddings_2)

In [9]:
# Show the inner product of the two normalized sentence embeddings.
print(score)

tensor([[0.4046]])
