# Задание 1: решите задачу классификации с помощью BERT-подобной модели и KNN на данных Russian Intents Dataset с Kaggle.

## Грузим и готовим данные

In [3]:
import kagglehub

In [4]:
# Download the dataset through kagglehub; the returned value is the local
# directory that holds the downloaded files (it is echoed in the next cell).
path = kagglehub.dataset_download("constantinwerner/qa-intents-dataset-university-domain")

In [5]:
path

'/home/alexander/.cache/kagglehub/datasets/constantinwerner/qa-intents-dataset-university-domain/versions/3'

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the training split. The TSV is read with explicit column names:
# 'sec' is the utterance text and 'class' is the intent label.
train_file = path + "/dataset_train.tsv"
df = pd.read_csv(train_file, sep='\t', names=['sec', 'class'])
df.head()

Unnamed: 0,sec,class
0,мне нужна справка,statement_general
1,оформить справку,statement_general
2,взять справку,statement_general
3,справку как получить,statement_general
4,справку ммф где получаться,statement_general


## Маппинг меток в числовые идентификаторы

In [9]:
# Build the label <-> integer id mappings used for the targets `y`.
#
# The unique labels are sorted before ids are assigned. Iterating a plain set
# of strings depends on per-process hash randomization, so the same label
# could receive a different id in every interpreter session, which makes any
# saved ids or indexes impossible to reuse. `key=str` keeps the sort safe even
# if a non-string value (e.g. a missing label) sneaks into the column.
labels = sorted(set(df['class'].values.tolist()), key=str)

# Both maps come from a single enumeration so they are exact inverses of each
# other and the ids are contiguous (0 .. len(labels) - 1).
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Integer target for every row of the dataframe, in row order.
y = [label2id[label] for label in df['class'].values.tolist()]

## Получаем эмбеддинги

In [10]:
# Checkpoint used for both the tokenizer and the encoder weights.
MODEL_NAME = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): AutoModel loads the bare encoder (the model repr printed further
# down shows BertModel), so no classification head is created here. The label
# arguments are presumably only stored on the model config — confirm whether
# they are needed for the embedding + KNN approach.
model = AutoModel.from_pretrained(MODEL_NAME,
                                  num_labels=len(labels),
                                  id2label=id2label,
                                  label2id=label2id)

In [11]:
# Plain Python list of the utterance texts, in dataframe row order.
sentences = df['sec'].tolist()

In [12]:
# Tokenize all sentences in one call: pad the batch, truncate over-long
# inputs, and return PyTorch tensors ('pt').
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [13]:
 # Switch the model to evaluation mode before extracting embeddings
 # (the printed module tree below lists Dropout layers, which eval() turns off).
 model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [14]:
# Run the encoder over the whole tokenized dataset in a single forward pass.
# Gradient tracking is disabled because the outputs are only used as features.
with torch.no_grad():
    model_output = model(**encoded_input)

In [15]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Model output whose first element (``model_output[0]``)
            holds the per-token embeddings.
        attention_mask: Mask with a trailing dimension added and expanded to
            the embeddings' size; positions with 0 do not contribute.

    Returns:
        Tensor with the token dimension (dim 1) reduced by a masked mean.
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight tokens.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(dim=1)
    # Clamp the token count to avoid dividing by zero for fully masked rows.
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_count

In [16]:
# Sentence embeddings: masked mean of the token embeddings for every input row.
X = mean_pooling(model_output, encoded_input['attention_mask'])

In [103]:
import nmslib

In [104]:
# Create an (empty) HNSW nearest-neighbour index over the 'cosinesimil' space.
index = nmslib.init(method='hnsw', space='cosinesimil')

In [105]:
# Add every embedding row to the index. Ids are the row positions, so an id
# returned by a query can be used directly to look up `sentences` and `y`.
# NOTE(review): X is the torch tensor produced by mean_pooling — presumably
# nmslib converts it via the array protocol; consider passing X.numpy() explicitly.
index.addDataPointBatch(X, ids=list(range(len(X))))

13230

In [106]:
# Build the HNSW graph over the added points, printing a progress bar.
# 'post' is an index-time parameter (presumably the post-processing level —
# check the nmslib documentation for its exact effect).
index.createIndex({'post': 2}, print_progress=True)


0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
****************************************************

In [110]:
# Query the index for the 5 nearest neighbours of the first query embedding.
# NOTE(review): X_test is not defined in the visible cells — presumably it holds
# embeddings built the same way as X; confirm before re-running the notebook.
neighbors = index.knnQuery(X_test[0], k=5)

In [122]:
neighbors

(array([5394, 5544, 5509, 7456, 7470], dtype=int32),
 array([2.3841858e-07, 1.2212336e-02, 3.0445635e-02, 4.2435706e-02,
        5.7461619e-02], dtype=float32))

In [112]:
# knnQuery returns a pair (ids, distances). In the 'cosinesimil' space the
# second array holds cosine *distances* (1 - cosine similarity): in the
# recorded output it starts at ~0 for the closest neighbour and grows.
indices = neighbors[0].astype(int)
cosine_distances = neighbors[1]
cosine_similarities = 1.0 - cosine_distances

# Backward-compatible aliases. These two historical names are swapped relative
# to what they actually hold (`similarities` is a distance and `distances` is a
# similarity); they are kept only so cells that still read them keep working.
# Prefer `cosine_distances` / `cosine_similarities` in new code.
similarities = cosine_distances
distances = cosine_similarities

In [126]:
# Print each neighbour's sentence, its intent label and its cosine similarity
# to the query. knnQuery yields cosine distances, so the similarity is
# computed here as 1 - distance straight from `neighbors`, rather than being
# read from a variable whose name says "distance".
neighbor_ids, neighbor_dists = neighbors
for idx, sim in zip(neighbor_ids, 1.0 - neighbor_dists):
    print(sentences[idx], id2label[y[idx]], '\t', sim)

найти магазин. loc_shop 	 0.99999976
отыскать магазин. loc_shop 	 0.98778766
обнаружить магазин. loc_shop 	 0.96955436
найти магазин loc_shop 	 0.9575643
отыскать магазин loc_shop 	 0.9425384
