In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("cointegrated/rubert-tiny2")
model = SentenceTransformer("DeepPavlov/rubert-base-cased-sentence")  # may require manual pooling


sentences = [
    "Привет, как дела?",
    "Это тестовое предложение.",
    "Я люблю машинное обучение."
]

embeddings = model.encode(sentences)

print(embeddings.shape)  # e.g., (3, 768)

No sentence-transformers model found with name DeepPavlov/rubert-base-cased-sentence. Creating a new one with mean pooling.


(3, 768)


In [9]:
vec1 = model.encode("в прошлом году я купил дом")
vec2 = model.encode("мои апартаменты куплены в прошлом году")

# ✅ Cosine similarity (0 to 1)
cos_sim = cosine_similarity([vec1], [vec2])[0][0]
print("Cosine similarity:", cos_sim)

# ✅ Cosine distance (0 = identical)
cos_dist = 1 - cos_sim
print("Cosine distance:", cos_dist)

# ✅ Euclidean distance
euclid_dist = np.linalg.norm(vec1 - vec2)
print("Euclidean distance:", euclid_dist)

Cosine similarity: 0.82871413
Cosine distance: 0.17128587
Euclidean distance: 0.5852964


In [26]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Load model + tokenizer
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Input sentence(s)
sentences = ["в прошлом году я купил дом", "с недавних пор у меня есть недвижимость"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)

# Mean pooling over tokens (ignore padding)
attention_mask = inputs['attention_mask']
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
sentence_embeddings = sum_embeddings / sum_mask

# Now: sentence_embeddings is a torch.Tensor of shape (batch_size, 768)
print(sentence_embeddings.shape)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


torch.Size([2, 768])


In [27]:
import torch
import torch.nn.functional as F

# Normalize vectors to unit length
normed = F.normalize(sentence_embeddings, p=2, dim=1)

# Compute cosine similarity matrix: (N x N)
cos_sim_matrix = torch.matmul(normed, normed.T)

In [28]:
1- cos_sim_matrix

tensor([[5.9605e-08, 2.4579e-01],
        [2.4579e-01, 2.3842e-07]])

In [29]:
# Expand dimensions to broadcast: (N, 1, D) and (1, N, D)
a = sentence_embeddings.unsqueeze(1)  # shape: (N, 1, D)
b = sentence_embeddings.unsqueeze(0)  # shape: (1, N, D)

# Compute pairwise Euclidean distances
euclidean_dist_matrix = torch.sqrt(torch.sum((a - b) ** 2, dim=2))
euclidean_dist_matrix

tensor([[ 0.0000, 13.4417],
        [13.4417,  0.0000]])

In [30]:
manhattan_dist_matrix = torch.sum(torch.abs(a - b), dim=2)
manhattan_dist_matrix

tensor([[  0.0000, 291.7520],
        [291.7520,   0.0000]])