## Загружаем модель ##

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch



In [2]:
def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings, using the attention mask for correct averaging.

    Args:
        model_output: indexable model result whose first element holds the
            embeddings of all tokens.
        attention_mask: mask that is broadcast over the embedding axis;
            positions with value 0 contribute nothing to the average.
            Presumably shaped (batch, seq_len) -- confirm against the tokenizer.

    Returns:
        Tensor with the token axis (dim 1) averaged out: the masked sum of the
        token embeddings divided by the number of unmasked tokens.
    """
    token_embeddings = model_output[0]
    # Give the mask a trailing axis and stretch it to the embedding shape so it
    # can be multiplied element-wise with the token embeddings.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * mask).sum(1)
    # The lower bound keeps the division finite when a row is fully masked.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

# Load the tokenizer and the encoder from the Hugging Face model repository.
# Both are built from the same checkpoint id so they cannot drift apart.
# Alternative checkpoint tried earlier: "DeepPavlov/rubert-base-cased-sentence"
# NOTE(review): the saved outputs in this notebook show 1024-dim embeddings in
# the first check but 768-dim embeddings for the train/test matrices, so they
# appear to come from runs with different checkpoints -- confirm which one is
# intended before relying on the stored outputs.
MODEL_NAME = "ai-forever/sbert_large_nlu_ru"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

## Проверка ##

In [3]:
# Two short sentences used as a quick end-to-end check of the pipeline
sentences = [
    'Габа чай улучшает нейронное состояние мозга',
    'Габа чай был изобретен в Японии в 1980-е годы.',
]

# Tokenize into a padded batch of PyTorch tensors, truncated to 24 tokens
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=24,
    return_tensors='pt',
)

# Forward pass with gradient tracking disabled (inference only)
with torch.no_grad():
    model_output = model(**encoded_input)

# Collapse the token embeddings into one vector per sentence (masked mean)
sentence_embeddings = mean_pooling(
    model_output,
    encoded_input['attention_mask'],
)

In [4]:
# Shape of the pooled embeddings: one row per input sentence
sentence_embeddings.shape

torch.Size([2, 1024])

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Reshape each sentence vector to a single-row matrix before comparing
X, Y = (sentence_embeddings[i].reshape(1, -1) for i in (0, 1))
# First line: similarity of a sentence with itself; second: between the two sentences
for other in (X, Y):
    print(cosine_similarity(X, other)[0][0])

0.99999994
0.5947609


## Данные ##

In [7]:
import requests, zipfile, io

# download utility functions
url = 'https://raw.githubusercontent.com/rossyaykin/RuOpinionNE-2024/refs/heads/main/utils/src.zip'
# Bound the wait and fail on HTTP errors here, so that an error page is never
# handed to ZipFile (which would only report a confusing "bad zip file").
r = requests.get(url, timeout=60)
r.raise_for_status()
# The context manager closes the in-memory archive once extraction is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall('')

# NOTE(review): this imports code that was just downloaded from a remote branch;
# consider pinning a commit so the notebook keeps running the same helpers.
from src.utils import load_jsonl, dict2tuple, extract_tuple # str2list, save_jsonl

In [8]:
# Fetch the train and test splits.
# load_jsonl receives the remote URL and a local path; presumably it downloads
# the file to that path and returns the parsed records -- confirm in src.utils.
train_path, test_path = "data/train.jsonl", "data/test.jsonl"

url = 'https://raw.githubusercontent.com/dialogue-evaluation/RuOpinionNE-2024/refs/heads/master/train.jsonl'
train = load_jsonl(url, train_path)

url = 'https://raw.githubusercontent.com/dialogue-evaluation/RuOpinionNE-2024/refs/heads/master/test.jsonl'
test = load_jsonl(url, test_path)

# Number of records in each split
print(f"{len(train)} {len(test)}")

2556 803


## Test set x Train set ##

In [29]:
%%time

sentences = [x['text'] for x in train]

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=1024, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
Y = mean_pooling(model_output, encoded_input['attention_mask'])

CPU times: user 33min 42s, sys: 3min 10s, total: 36min 52s
Wall time: 3min 27s


In [30]:
# Shape of the train-text embedding matrix: one row per train text
Y.shape

torch.Size([2556, 768])

In [31]:
%%time

sentences = [x['text'] for x in test]

#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=1024, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
X = mean_pooling(model_output, encoded_input['attention_mask'])

CPU times: user 6min 6s, sys: 35 s, total: 6min 41s
Wall time: 36.3 s


In [32]:
# Shape of the test-text embedding matrix: one row per test text
X.shape

torch.Size([803, 768])

In [33]:
%%time

from sklearn.metrics.pairwise import pairwise_distances

# Cosine distance between every test embedding (rows of X) and every train
# embedding (rows of Y): result[i, j] compares test text i with train text j.
# Smaller values mean more similar texts.
result = pairwise_distances(X, Y, metric='cosine')

CPU times: user 181 ms, sys: 189 ms, total: 369 ms
Wall time: 18.9 ms


## Проверка ##

In [34]:
# Cosine distances from the first test text to every train text
result[0]

array([0.37094367, 0.42755193, 0.26106262, ..., 0.34202486, 0.38876927,
       0.24002558], dtype=float32)

In [35]:
# For every test text, order the train indices by increasing cosine distance
# and keep the ranking as one space-separated string.
final = [' '.join(map(str, row.argsort())) for row in result]

In [36]:
# Train indices for test text 1, ordered from smallest to largest distance
final[1]

'2236 2259 161 2227 1473 1628 2198 1556 996 1959 205 737 1828 402 2197 1123 1877 1652 906 797 369 1560 1138 997 1324 907 2263 235 186 1023 1961 239 927 473 1940 117 154 2262 2228 1615 1725 45 799 465 714 1472 1822 1712 544 1796 2266 2229 913 1171 1184 2238 1139 791 905 449 2260 469 478 490 1629 538 1874 853 1949 912 1573 581 365 476 1627 915 916 1056 559 1412 540 1024 1870 2231 803 1798 1572 940 2235 1369 918 121 1241 1693 470 373 174 1730 1534 914 1832 290 1960 1726 911 1724 2169 1604 159 696 1477 573 2246 1559 116 800 724 617 1884 1277 1618 1739 2196 2133 1941 1601 1529 703 1852 477 452 11 606 2316 1368 1544 908 398 2176 2072 1260 1787 1571 1224 1575 705 2109 1790 1892 155 481 2310 416 1888 1636 550 1915 1742 2126 667 1222 2237 1137 1682 2182 2031 1753 1895 1202 157 1430 509 1680 859 2527 1689 2510 936 541 1 796 1528 2043 619 1755 2074 1480 657 1827 1166 1063 1083 2461 716 2376 1854 80 5 1801 1954 1499 580 126 1694 966 1244 693 575 1409 185 722 1457 1188 2554 547 184 119 1049 1323 24

In [38]:
# Derive the nearest / farthest train texts from the distance matrix instead of
# hard-coding indices copied from an earlier run's output, which go stale as
# soon as the model or the data changes.
ranking = result[1].argsort()
# test text
print(test[1]['text'])
# the most similar train text (smallest cosine distance)
print(train[ranking[0]]['text'])
# the least similar train text (largest cosine distance)
print(train[ranking[-1]]['text'])

Известного российского певца Бориса Моисеева, который недавно перенес инсульт, отключили от аппарата вентиляции легких.
11 июля в одной из онкологических клиник Германии на 53-м году жизни скончался артист Ленкома Павел Смеян.
Очевидцы описывают его как молодого человека с бородой в чёрных спортивных штанах.


In [39]:
# Another example: ranked train indices for the last test text
final[-1]

'1459 2074 923 1330 2231 413 2167 1572 1763 254 2110 1736 1731 1483 1291 501 21 1316 790 1245 216 2503 1974 213 798 1003 617 1133 1961 2 1541 1508 229 502 2514 9 609 1606 667 914 905 1639 1611 1249 331 528 291 1016 924 2235 546 437 1165 1892 1435 1933 2238 2168 1742 1310 481 1614 840 2142 2076 2122 2327 1701 1460 1559 1239 1320 642 1294 350 1578 1546 1302 645 1290 1730 1818 214 370 652 24 1027 1882 2398 1339 1758 2021 2083 1726 271 1540 801 2513 1438 319 2230 1613 2232 2535 332 313 1573 1346 439 1084 1144 1732 1749 1493 612 1828 2329 918 236 880 1377 1138 2414 1309 1953 2409 1859 551 724 1930 509 2374 1557 260 1351 723 2179 2527 1482 808 2140 340 168 1852 1433 498 1024 769 1850 2248 1692 920 1975 1921 1443 1018 12 588 180 210 1716 733 928 1870 1167 407 1756 1619 42 1013 1425 1448 1839 2031 108 127 1770 911 14 198 763 2413 330 1306 1480 1226 1728 800 735 334 2195 15 799 368 465 1575 1452 341 170 1180 1318 997 2554 1812 120 1712 499 151 1518 143 1472 137 1884 1587 554 17 1887 1804 1202 2

In [40]:
# Derive the nearest / farthest train texts from the distance matrix instead of
# hard-coding indices copied from an earlier run's output, which go stale as
# soon as the model or the data changes.
ranking = result[-1].argsort()
# test text
print(test[-1]['text'])
# the most similar train text (smallest cosine distance)
print(train[ranking[0]]['text'])
# the least similar train text (largest cosine distance)
print(train[ranking[-1]]['text'])

Как Уилера оказался в Ньюарке, пока остается загадкой.
Кирилл Здоровенин, местонахождение которого на данный момент неизвестно, находится в розыске.
В регионе помимо местных подворий есть крупные свиноводческие комплексы, где выращивают до 200 тысяч свиней в год.


## Сохраняем ##

In [41]:
# Write one line per test text: its train indices, space-separated and ordered
# by increasing cosine distance.
with open('test_distances.txt', 'w') as out_file:
    for ranking_line in final:
        out_file.write(ranking_line + '\n')