# 0. Импорты


In [11]:
# Optional one-time setup (kept commented out): clone the course repository,
# copy its hw/hw_2/src/ package to ./src/ so that the `from src...` imports
# in the next cell resolve, then delete the clone.
# Uncomment when ./src/ is not already present next to the notebook.
# !git clone https://github.com/ruslann19/bmstu_fs12_nlp.git
# !cp -r ./bmstu_fs12_nlp/hw/hw_2/src/ ./src/
# !rm -rf ./bmstu_fs12_nlp/

In [None]:
from src.preprocessing import clean_text
from sklearn.model_selection import train_test_split
from src.bert_mlp_regressor import BertMLPRegressor, BertMLPRegressorConfig
from src import compute_metrics
from src import KeepBestModelCallback
from src.predict_from_dataset import predict_from_dataset

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import os

from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

# 1. Подготовка данных


In [None]:
# Optional Google Colab setup (kept commented out): mount Google Drive and
# change the working directory to the homework folder, so that the relative
# paths used by this notebook are resolved inside that folder.
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir("/content/drive/MyDrive/bmstu/semester_3/NLP/hw_2")
# print("Текущая папка:", os.getcwd())

In [None]:
# Optional data download (kept commented out): creates ./data and fetches two
# CSV files from Google Drive by file id using gdown.
# NOTE(review): the first file is saved as ./data/geo_reviews_2023.csv, while
# the loading cell below reads ./data/train.csv - confirm which file name is
# expected before relying on this cell.
# !mkdir data

# import gdown

# def download_dataset(file_id, output_path):
#     url = f"https://drive.google.com/uc?id={file_id}"
#     print("Скачивание данных...")
#     gdown.download(url, output_path, quiet=False)


# download_dataset(
#     file_id="1ECxjo60HQwYqn1MdP5cukP1iYNw-GXxZ",
#     output_path="./data/geo_reviews_2023.csv",
# )
# download_dataset(
#     file_id="153iQ_jMqFsdHhwvhj4lgN8LeOkLUjhkl",
#     output_path="./data/test.csv",
# )

In [None]:
# Row cap passed to pd.read_csv. A small value gives a quick smoke run of the
# whole pipeline; set it to None to read the complete training file.
# Note: the stratified split below needs at least two rows per rating, so a
# very small cap can make train_test_split raise a ValueError.
N_TRAIN_ROWS = 10

# Load the raw reviews from CSV.
print("Загрузка датасета...")
dataset_df = pd.read_csv(
    filepath_or_buffer="./data/train.csv",
    nrows=N_TRAIN_ROWS,
)

# Keep only the review text and its rating; the rating column is renamed to
# "labels".
dataset_df = dataset_df[["text", "rating"]].rename(columns={"rating": "labels"})

print("Очистка текстов...")
dataset_df["text"] = dataset_df["text"].map(clean_text)

# Drop reviews whose rating equals 0.
dataset_df = dataset_df[dataset_df["labels"] != 0]

# Optional balancing (disabled): keep only 50_000 reviews rated 5 to even out
# the rating distribution.
# dataset_df_5 = dataset_df[dataset_df["labels"] == 5].sample(n=50_000, random_state=42)
# dataset_df_other = dataset_df[dataset_df["labels"] != 5]
# dataset_df_balanced = pd.concat([dataset_df_other, dataset_df_5]).reset_index(drop=True)

# 80/20 split, stratified by rating, with a fixed seed for reproducibility.
print("Разделение на train/val...")
train_df, val_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
    stratify=dataset_df["labels"],
)

# Labels are cast to float in both splits.
train_df = train_df.astype({"labels": "float"})
val_df = val_df.astype({"labels": "float"})

# preserve_index=False: after filtering and shuffling the frames carry a
# non-contiguous pandas index, which from_pandas would otherwise store as an
# extra "__index_level_0__" column in the dataset.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
val_data = Dataset.from_pandas(val_df, preserve_index=False)

print("Готово")

Загрузка датасета...
Очистка текстов...
Разделение на train/val...
Готово


In [None]:
# Read the test reviews, keep only the review text under the common "text"
# column name, and clean it with the same function as the training texts.
test_df = (
    pd.read_csv("./data/test.csv")
    .loc[:, ["review_text"]]
    .rename(columns={"review_text": "text"})
)
test_df["text"] = test_df["text"].map(clean_text)

test_data = Dataset.from_pandas(test_df)

## 1.1. Загрузка токенизатора и токенизация данных


In [None]:
# Hugging Face model identifier; it is passed both to the tokenizer loader and
# to the regressor config in the cells below.
hf_model_name = "seara/rubert-tiny2-russian-sentiment"

In [None]:
# Load the tokenizer published under the identifier stored in hf_model_name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=hf_model_name)


def tokenize_function(examples):
    """Tokenize a batch of examples; meant for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose "text" entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch. Every sequence is truncated or
        padded to exactly 512 tokens, so all rows have the same length.
    """
    # No return_tensors here: Dataset.map stores the result in Arrow format
    # anyway, so building torch tensors for every batch is wasted work.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Tokenize every split with the same batched mapping function.
train_data, val_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data, test_data)
)

Map:   0%|          | 0/399840 [00:00<?, ? examples/s]

Map:   0%|          | 0/99960 [00:00<?, ? examples/s]

Map:   0%|          | 0/5069 [00:00<?, ? examples/s]

# 2. Кастомная модель: BERT + MLP


In [None]:
# Count how many training samples fall into each rating (labels cast to int);
# the counts are handed to the config together with the loss selection.
train_label_ids = train_df["labels"].astype(int)
classes_counts = train_label_ids.value_counts().to_dict()

# Disabled alternatives that built the loss object directly:
# loss_fn = WeightedMSELoss(classes_counts)
# loss_fn = nn.MSELoss()

# Model configuration: backbone identifier, loss type and per-class counts.
config = BertMLPRegressorConfig(
    classes_counts=classes_counts,
    loss_type="WeightedMSELoss",
    hf_model_name=hf_model_name,
)

model = BertMLPRegressor(config)

Веса: {1: 5.333333333333333, 2: 16.0, 3: 16.0, 4: 5.333333333333333, 5: 2.0}


# 3. Обучение


In [None]:
# All TrainingArguments options collected in one mapping, grouped by purpose.
training_options = {
    # Monitoring: checkpoints, evaluation and logging all happen once per
    # epoch; the step-based intervals are explicitly left unset.
    "output_dir": "./checkpoints",
    "overwrite_output_dir": True,
    "save_strategy": "epoch",
    "eval_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_steps": None,
    "eval_steps": None,
    "logging_steps": None,
    "save_total_limit": 2,
    "report_to": "none",
    # Training hyperparameters.
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
    # Optimization: pin dataloader memory only when CUDA is available.
    "dataloader_pin_memory": torch.cuda.is_available(),
    # Reload the best checkpoint (lowest eval_loss) when training finishes.
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

training_args = TrainingArguments(**training_options)

In [None]:
# Callback configured to track eval_loss (lower is better) with
# ./checkpoints/best_model as its target directory - the same directory the
# prediction cell reads the model from.
best_model_callback = KeepBestModelCallback(
    best_model_dir="./checkpoints/best_model",
    metric_name="eval_loss",
    greater_is_better=False,
)

# Wire the model, the training arguments, both tokenized splits, the metric
# function and the callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    callbacks=[best_model_callback],
)

In [None]:
# Run the training loop; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Rmse,R2
1,54.5402,70.585648,16.580902,3.984868,4.071965,-23.117676
2,51.8818,67.627693,16.043938,3.915844,4.005488,-22.336637
3,49.1417,65.521675,15.654323,3.865083,3.956554,-21.769924
4,47.1405,64.21666,15.394722,3.831524,3.923611,-21.392323
5,48.7704,63.667515,15.285935,3.817353,3.909723,-21.234087


------ Обучение завершено ------
Лучшая эпоха: 5.0
Лучший eval_loss: 63.6675


TrainOutput(global_step=10, training_loss=50.29491424560547, metrics={'train_runtime': 19.8133, 'train_samples_per_second': 4.038, 'train_steps_per_second': 0.505, 'total_flos': 599660544000.0, 'train_loss': 50.29491424560547, 'epoch': 5.0})

# 4. Предсказание


In [None]:
# Predict ratings for the tokenized test set.
# NOTE(review): presumably predict_from_dataset loads the model saved under
# ./checkpoints/best_model - confirm in src.predict_from_dataset.
predictions = predict_from_dataset("./checkpoints/best_model", test_data)

  0%|          | 0/634 [00:00<?, ?it/s]

In [None]:
# Sequential ids for the predictions, and the predictions themselves clamped
# into the [1, 5] interval.
row_ids = np.arange(len(predictions))
clipped_ratings = np.clip(predictions, 1, 5)

predictions_df = pd.DataFrame(
    {"id": row_ids, "predicted_rating": clipped_ratings}
)

display(predictions_df)

Unnamed: 0,id,predicted_rating
0,0,0.105735
1,1,0.478765
2,2,0.214496
3,3,0.518317
4,4,0.437218
...,...,...
5064,5064,0.291326
5065,5065,0.185495
5066,5066,0.300469
5067,5067,0.498086


In [None]:
# Write the predictions as a tab-separated file without header and index
# columns; the target directory is created first if it does not exist yet.
predictions_dir = "./predictions"
predictions_path = "./predictions/rubert_mlp.tsv"

os.makedirs(predictions_dir, exist_ok=True)
predictions_df.to_csv(predictions_path, sep="\t", index=False, header=False)