### 1. Подготовка данных

In [None]:
import pandas as pd
import re
import torch

In [None]:
# Both inputs are tab-separated and have no header row, so the frames get
# positional integer column labels (0, 1, 2, ...).
# Reviews: column 0 is later matched against aspect ids, column 1 is the text.
train_text = pd.read_csv(
    '/kaggle/input/ab-project/ab_train_reviews.txt',
    sep='\t',
    header=None,
)
# Aspects: columns 0, 2 and 5 are the ones consumed further down.
train_asp = pd.read_csv(
    '/kaggle/input/ab-project/train_aspects.txt',
    sep='\t',
    header=None,
)

In [None]:
from tqdm import tqdm

In [None]:
# Build one classifier input per aspect row: the value from column 2 of the
# aspects file wrapped in '@q@' markers, followed by the text of the review
# whose id (column 0) matches.

# Index review texts by id once. Filtering the whole `train_text` frame for
# every aspect row costs O(rows * reviews); a dict lookup is O(1).
# `setdefault` keeps the first text seen for an id, i.e. the same row a
# "first match" filter would pick.
text_by_id = {}
for review_id, review_text in zip(train_text[0], train_text[1]):
    text_by_id.setdefault(review_id, review_text)

res = []
asp_vals = train_asp.values
for row in tqdm(asp_vals):
    id_ = row[0]
    # Raises KeyError if an aspect row references an id absent from train_text.
    text = text_by_id[id_]

    w = row[2]  # presumably the aspect term; verify against the data file
    qw = '@q@ ' + w + ' @q@'

    res.append(qw + ' ' + text)


In [None]:
# One-hot encode column 5 of the aspects file: one float column per distinct
# value, one row per aspect row.
ls = pd.get_dummies(train_asp[5], dtype='float')

### 2. BERT fine-tuning

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the ruBert-base tokenizer and a sequence-classification model
# configured with num_labels=4.
tokenizer = AutoTokenizer.from_pretrained('sberbank-ai/ruBert-base')
model = AutoModelForSequenceClassification.from_pretrained('sberbank-ai/ruBert-base', num_labels=4)
# Place the model on the GPU when one is present, but fall back to the CPU
# instead of crashing on a machine without CUDA.
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
pip install evaluate --quiet

In [None]:
import torch
from transformers import TrainingArguments, Trainer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import evaluate

In [None]:
# Convert the one-hot label frame to a tensor, then hold out 25% of the
# (text, label) pairs for evaluation.
vals = ls.values
labels = torch.tensor(vals)
X_train, X_test, y_train, y_test = train_test_split(
    res,
    labels,
    test_size=0.25,
    random_state=666,  # fixed seed keeps the split reproducible
    shuffle=True,
)

In [None]:
def prep(text, tokenizer=tokenizer, max_length=128):
    """Tokenize ``text`` into fixed-length sequences.

    Args:
        text: A string or a list of strings to encode.
        tokenizer: Callable tokenizer; defaults to the module-level
            ``tokenizer`` bound when this function is defined.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128.

    Returns:
        The tokenizer's output for the given text, requested as PyTorch
        tensors (``return_tensors='pt'``).
    """
    return tokenizer(
        text,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )

In [None]:
# Encode both splits; the names are rebound from lists of strings to the
# tokenizer outputs returned by prep().
X_train, X_test = prep(X_train), prep(X_test)

In [None]:
class TextDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from field name to an indexable collection with
            one entry per sample (a tensor or a nested list).
        labels: Per-sample labels, given as a tensor or as anything
            ``torch.tensor`` accepts.

    Each item is a dict holding every encoding field for that sample plus a
    ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = self._as_new_tensor(labels)

    @staticmethod
    def _as_new_tensor(value):
        """Return ``value`` as an independent tensor.

        Calling ``torch.tensor`` on an existing tensor emits a UserWarning
        on every call; ``clone().detach()`` is the recommended way to copy
        a tensor. Non-tensor inputs are still converted with
        ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Wrap each encoded split together with its labels.
train_dataset, val_dataset = (
    TextDataset(X_train, y_train),
    TextDataset(X_test, y_test),
)


In [None]:
# Load the "accuracy" metric via the `evaluate` library; compute_metrics
# calls its compute() method.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. Logits are reduced to
    class indices with an argmax over the last axis; each label row is
    reduced with its own argmax before the two are compared.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    reference_ids = list(map(np.argmax, labels))
    return metric.compute(predictions=predicted_ids, references=reference_ids)

In [None]:
# Training configuration: 5 epochs, batches of 8 per device, with both
# evaluation and checkpoint saving set to the "epoch" strategy.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./ab_bert_asp",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy='epoch',
)

In [None]:
# Bind the model, the training configuration, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Start training with the Trainer configured above.
trainer.train()