# Задание 1: реализуйте задачу классификации на основе BERT-like модели и KNN на данных Russian Intents Dataset с Kaggle.

## Грузим и готовим данные

In [3]:
import kagglehub

In [4]:
path = kagglehub.dataset_download("constantinwerner/qa-intents-dataset-university-domain")

In [5]:
path

'/home/alexander/.cache/kagglehub/datasets/constantinwerner/qa-intents-dataset-university-domain/versions/3'

In [6]:
from transformers import AutoTokenizer, AutoModel
import torch

In [7]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv(path + "/dataset_train.tsv", sep='\t', names=['sec', 'class'])
df.head()

Unnamed: 0,sec,class
0,мне нужна справка,statement_general
1,оформить справку,statement_general
2,взять справку,statement_general
3,справку как получить,statement_general
4,справку ммф где получаться,statement_general


## Маппинг меток в числовые идентификаторы

In [9]:
labels=set(df['class'].values.tolist())
id2label = {i:label for i,label in enumerate(labels) if label != None}
label2id = {label:i for i,label in enumerate(labels) if label != None}
y = [label2id[label] for label in df['class'].values.tolist()]

## Получаем эмбединги

In [10]:
MODEL_NAME = 'cointegrated/rubert-tiny2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME,
                                  num_labels=len(labels),
                                  id2label=id2label,
                                  label2id=label2id)

In [11]:
sentences = df['sec'].values.tolist()

In [12]:
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

In [13]:
 model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [14]:
with torch.no_grad():
    model_output = model(**encoded_input)

In [15]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [16]:
X = mean_pooling(model_output, encoded_input['attention_mask'])

## Делим на тестовую и обучающую выборку

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

In [25]:
knn = KNeighborsClassifier(n_neighbors=10, metric='cosine')
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,10
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'cosine'
,metric_params,
,n_jobs,


## Смотрим метрики

In [22]:
from sklearn.metrics import classification_report

In [26]:
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred, target_names=labels))

                                     precision    recall  f1-score   support

                            loc_atm       0.89      0.97      0.93        34
                        loc_dean_hi       0.56      0.67      0.61        15
         student_union_event_guests       0.95      1.00      0.98        41
                        dorm_living       0.92      1.00      0.96        11
                       status_leave       0.57      0.57      0.57         7
                      grade_improve       1.00      1.00      1.00        11
                          loc_pharm       0.70      0.89      0.78        18
                      sched_teacher       1.00      1.00      1.00       222
                       loc_dean_med       1.00      1.00      1.00         9
          student_trade_union_enter       1.00      1.00      1.00         4
                        cvvr_define       1.00      1.00      1.00         5
                     stdscholarship       0.74      0.89      0.81        1