# 0. Импорты


In [1]:
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

# 1. Подготовка данных


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir("/content/drive/MyDrive/bmstu/semester_3/NLP/hw_2")
# print("Текущая папка:", os.getcwd())

In [3]:
# !mkdir data

# import gdown

# def download_dataset(file_id, output_path):
#     url = f"https://drive.google.com/uc?id={file_id}"
#     print("Скачивание данных...")
#     gdown.download(url, output_path, quiet=False)


# download_dataset(
#     file_id="1ECxjo60HQwYqn1MdP5cukP1iYNw-GXxZ",
#     output_path="./data/geo_reviews_2023.csv",
# )
# download_dataset(
#     file_id="153iQ_jMqFsdHhwvhj4lgN8LeOkLUjhkl",
#     output_path="./data/test.csv",
# )

In [4]:
import re


def clean_text(text):
    # Убираем лишние обратные слеши и пробелы
    text = text.replace("\\n", " ")

    # Убираем лишние пробелы
    text = re.sub(r"\s+", " ", text)

    # Убираем пробелы перед точками и запятыми
    text = re.sub(r"\s+\.", ".", text)
    text = re.sub(r"\s+,", ",", text)

    return text.strip()

In [10]:
# 1. Подготовка данных
from sklearn.model_selection import train_test_split
import pandas as pd

# Загрузка данных из CSV
print("Загрузка датасета...")
nrows = None  # Чтобы прочитать все данные: nrows = None
dataset_df = pd.read_csv(
    filepath_or_buffer="./data/train.csv",
    nrows=nrows,
)

dataset_df = dataset_df[["text", "rating"]]
dataset_df = dataset_df.rename(columns={"rating": "labels"})
print("Очистка текстов...")
dataset_df["text"] = dataset_df["text"].map(clean_text)

# Выкинуть оценки, равные 0
dataset_df = dataset_df[dataset_df["labels"] != 0]

# Оставить только 50_000 оценок, равных 5, для балансировки распределения оценок
# dataset_df_5 = dataset_df[dataset_df["labels"] == 5].sample(n=50_000, random_state=42)
# dataset_df_other = dataset_df[dataset_df["labels"] != 5]
# dataset_df_balanced = pd.concat([dataset_df_other, dataset_df_5]).reset_index(drop=True)

print("Разделение на train/val...")
train_df, val_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
    stratify=dataset_df["labels"],
)

train_df = train_df.astype({"labels": "float"})
val_df = val_df.astype({"labels": "float"})

train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)

print("Готово")

Загрузка датасета...
Очистка текстов...
Разделение на train/val...
Готово


In [11]:
test_df = pd.read_csv("./data/test.csv", nrows=None)
test_df = test_df[["review_text"]]
test_df = test_df.rename(columns={"review_text": "text"})
test_df["text"] = test_df["text"].map(clean_text)

test_data = Dataset.from_pandas(test_df)

# 2. Загрузка токенизатора


In [12]:
hf_model_name = "seara/rubert-tiny2-russian-sentiment"

In [13]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=hf_model_name)


def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )


train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/399840 [00:00<?, ? examples/s]

Map:   0%|          | 0/99960 [00:00<?, ? examples/s]

Map:   0%|          | 0/5069 [00:00<?, ? examples/s]

In [14]:
train_data, val_data

(Dataset({
     features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 399840
 }),
 Dataset({
     features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 99960
 }))

In [15]:
train_df["labels"].astype(int).value_counts().to_dict()

{5: 312412, 4: 32928, 1: 27481, 3: 17349, 2: 9670}

In [10]:
tmp = train_df["labels"].value_counts()
tmp.to_dict()

{5.0: 8, 4.0: 3, 1.0: 3, 3.0: 1, 2.0: 1}

# 2. Кастомная модель: BERT + MLP


In [43]:
import torch
import collections


class WeightedMSELoss(nn.Module):
    def __init__(self, class_counts):
        super().__init__()
        class_counts = collections.OrderedDict(sorted(class_counts.items()))

        self.class_counts = class_counts
        total = sum(class_counts.values())

        self.weights = {}
        for class_id, count in class_counts.items():
            self.weights[class_id] = total / count

        # print("Веса:", self.weights)

    def forward(self, predictions, targets):
        squared_errors = (predictions - targets) ** 2

        batch_weights = torch.ones_like(targets)
        for cls, weight in self.weights.items():
            mask = targets == cls
            batch_weights[mask] = weight

        return (squared_errors * batch_weights).mean()


# Использование
class_counts = train_df["labels"].astype(int).value_counts().to_dict()

loss_fn = WeightedMSELoss(class_counts)

In [27]:
from typing import Dict, Callable


def create_loss_fn(
    loss_type: str,
    class_counts: Dict[int, int],
) -> Callable:
    match loss_type:
        case "MSELoss":
            return nn.MSELoss()
        case "WeightedMSELoss":
            return WeightedMSELoss(class_counts)

In [36]:
from transformers import (
    AutoConfig,
    PreTrainedModel,
    PretrainedConfig,
)


# Создаем конфигурацию для нашей модели
class BertMLPRegressorConfig(PretrainedConfig):
    model_type = "BertMLPRegressor"

    def __init__(
        self,
        hf_model_name="seara/rubert-tiny2-russian-sentiment",
        loss_type="WeightedMSELoss",
        classes_counts={},
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.hf_model_name = hf_model_name
        self.loss_type = loss_type
        self.classes_counts = classes_counts
        # Загружаем конфиг базовой BERT модели
        self.bert_config = AutoConfig.from_pretrained(hf_model_name)


# Наследуемся от PreTrainedModel для совместимости с HF
class BertMLPRegressor(PreTrainedModel):
    config_class = BertMLPRegressorConfig

    def __init__(self, config):
        super().__init__(config)
        self.bert = AutoModel.from_pretrained(
            config.hf_model_name,
            config=config.bert_config,
        )
        self.regressor = nn.Sequential(
            nn.Linear(config.bert_config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        )
        # self.loss_fn = nn.MSELoss()
        # self.loss_fn = WeightedMSELoss(class_counts)
        self.loss_fn = create_loss_fn(config.loss_type, config.classes_counts)

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_tokens = outputs.last_hidden_state[:, 0, :]
        logits = self.regressor(cls_tokens).squeeze(-1)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)

        return {"loss": loss, "logits": logits}


# Загрузка конфигурации
classes_counts = train_df["labels"].astype(int).value_counts().to_dict()
# loss_fn = WeightedMSELoss(classes_counts)
# loss_fn = nn.MSELoss()

config = BertMLPRegressorConfig(
    hf_model_name=hf_model_name,
    loss_type="WeightedMSELoss",
    classes_counts=classes_counts,
)

model = BertMLPRegressor(config)

Веса: {1: 5.333333333333333, 2: 16.0, 3: 16.0, 4: 5.333333333333333, 5: 2.0}


In [37]:
# 4. Аргументы обучения
training_args = TrainingArguments(
    # Мониторинг
    output_dir="./checkpoints",
    overwrite_output_dir=True,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_steps=None,
    eval_steps=None,
    logging_steps=None,
    save_total_limit=2,
    report_to="none",
    # Обучение
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    # Оптимизация
    dataloader_pin_memory=torch.cuda.is_available(),
    # Выбрать лучшую модель в конце обучения
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [38]:
# 5. Кастомная функция подсчёта метрик
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    root_mean_squared_error,
    r2_score,
)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    rmse = root_mean_squared_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    return {
        "mse": mse,
        "mae": mae,
        "rmse": rmse,
        "r2": r2,
    }

In [39]:
from transformers import TrainerCallback
import os


class KeepBestModelCallback(TrainerCallback):
    def __init__(
        self,
        best_model_dir="./best_model",
        metric_name="eval_loss",
        greater_is_better=False,
    ):
        self.best_model_dir = best_model_dir
        self.metric_name = metric_name
        self.greater_is_better = greater_is_better
        self.best_metric = float("inf") if not greater_is_better else float("-inf")
        self.best_epoch = None

        # Создаем папку для лучшей модели
        os.makedirs(best_model_dir, exist_ok=True)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics is None or self.metric_name not in metrics:
            return

        current_metric = metrics[self.metric_name]
        current_epoch = state.epoch

        is_better = (
            current_metric > self.best_metric
            if self.greater_is_better
            else current_metric < self.best_metric
        )

        if is_better:
            self.best_metric = current_metric
            self.best_epoch = current_epoch

            # Сохраняем лучшую модель
            model = kwargs.get("model")
            tokenizer = kwargs.get("tokenizer")

            if model is not None:
                # Сохраняем модель
                model.save_pretrained(
                    self.best_model_dir,
                )

                # Сохраняем токенайзер если есть
                if tokenizer is not None:
                    tokenizer.save_pretrained(self.best_model_dir)

                # Сохраняем информацию о метрике
                with open(
                    os.path.join(self.best_model_dir, "best_metric.txt"), "w"
                ) as f:
                    f.write(f"{self.metric_name}: {current_metric}\n")
                    f.write(f"epoch: {state.epoch}\n")
                    f.write(f"step: {state.global_step}\n")

    def on_train_end(self, args, state, control, **kwargs):
        print("------ Обучение завершено ------")
        print(f"Лучшая эпоха: {self.best_epoch}")
        print(f"Лучший {self.metric_name}: {self.best_metric:.4f}")

In [40]:
# 6. Создание Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    callbacks=[
        KeepBestModelCallback(
            best_model_dir="./checkpoints/best_model",
            metric_name="eval_loss",
            greater_is_better=False,
        )
    ],
)

In [41]:
# 7. Обучение
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Rmse,R2
1,54.5402,70.585648,16.580902,3.984868,4.071965,-23.117676
2,51.8818,67.627693,16.043938,3.915844,4.005488,-22.336637
3,49.1417,65.521675,15.654323,3.865083,3.956554,-21.769924
4,47.1405,64.21666,15.394722,3.831524,3.923611,-21.392323
5,48.7704,63.667515,15.285935,3.817353,3.909723,-21.234087


------ Обучение завершено ------
Лучшая эпоха: 5.0
Лучший eval_loss: 63.6675


TrainOutput(global_step=10, training_loss=50.29491424560547, metrics={'train_runtime': 19.8133, 'train_samples_per_second': 4.038, 'train_steps_per_second': 0.505, 'total_flos': 599660544000.0, 'train_loss': 50.29491424560547, 'epoch': 5.0})

In [42]:
loaded_model = BertMLPRegressor.from_pretrained("./checkpoints/best_model")

Веса: {'1': 5.333333333333333, '2': 16.0, '3': 16.0, '4': 5.333333333333333, '5': 2.0}


In [51]:
test_data

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5069
})

In [47]:
from torch.utils.data import DataLoader
import torch

from tqdm.auto import tqdm
import numpy as np


def manual_predict_from_dataset(model_path, dataset):
    """Ручные предсказания из готового датасета"""

    model = BertMLPRegressor.from_pretrained(model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Устанавливаем формат
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    # Создаем DataLoader
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)

    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Переносим на устройство
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            predictions = outputs["logits"].cpu().numpy()
            all_predictions.append(predictions)

    # Объединяем предсказания
    predictions = np.concatenate(all_predictions, axis=0)

    return predictions


# Использование
predictions = manual_predict_from_dataset("./checkpoints/best_model", test_data)

  0%|          | 0/634 [00:00<?, ?it/s]

In [None]:
predictions_df = pd.DataFrame(
    data={
        "id": np.arange(len(predictions)),
        "predicted_rating": np.clip(predictions, 1, 5),
    }
)

display(predictions_df)

Unnamed: 0,id,predicted_rating
0,0,0.105735
1,1,0.478765
2,2,0.214496
3,3,0.518317
4,4,0.437218
...,...,...
5064,5064,0.291326
5065,5065,0.185495
5066,5066,0.300469
5067,5067,0.498086


In [None]:
os.makedirs(name="./predictions", exist_ok=True)
predictions_df.to_csv(
    "./predictions/rubert_mlp.tsv",
    sep="\t",
    header=False,
    index=False,
)