# 0. Импорты


In [1]:
# !git clone https://github.com/ruslann19/bmstu_fs12_nlp.git
# !cp -r ./bmstu_fs12_nlp/hw/hw_2/src/ ./src/
# !rm -rf ./bmstu_fs12_nlp/

In [2]:
from src.preprocessing import clean_text
from sklearn.model_selection import train_test_split
from src.bert_regressor import BertRegressor, BertRegressorConfig
from src import compute_metrics
from src import KeepBestModelCallback
from src.predict_from_dataset import predict_from_dataset

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import os
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

# 1. Подготовка данных


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir("/content/drive/MyDrive/bmstu/semester_3/NLP/hw_2")
# print("Текущая папка:", os.getcwd())

In [4]:
# !mkdir data

# import gdown

# def download_dataset(file_id, output_path):
#     url = f"https://drive.google.com/uc?id={file_id}"
#     print("Скачивание данных...")
#     gdown.download(url, output_path, quiet=False)


# download_dataset(
#     file_id="1ECxjo60HQwYqn1MdP5cukP1iYNw-GXxZ",
#     output_path="./data/geo_reviews_2023.csv",
# )
# download_dataset(
#     file_id="153iQ_jMqFsdHhwvhj4lgN8LeOkLUjhkl",
#     output_path="./data/test.csv",
# )

In [5]:
# Load the labelled reviews from CSV.
# NOTE(review): only the first 10 rows are read (nrows=10) — this looks like a
# debugging limit; confirm and raise/remove it before a real training run.
print("Загрузка датасета...")
dataset_df = pd.read_csv(
    filepath_or_buffer="./data/train.csv",
    nrows=10,
)

# Keep only the review text and its rating; the rating column is renamed to
# "labels" for the training pipeline.
dataset_df = dataset_df[["text", "rating"]]
dataset_df = dataset_df.rename(columns={"rating": "labels"})
print("Очистка текстов...")
dataset_df["text"] = dataset_df["text"].map(clean_text)

# Drop ratings equal to 0
dataset_df = dataset_df[dataset_df["labels"] != 0]

# Keep only 50_000 ratings equal to 5 to balance the rating distribution
# dataset_df_5 = dataset_df[dataset_df["labels"] == 5].sample(n=50_000, random_state=42)
# dataset_df_other = dataset_df[dataset_df["labels"] != 5]
# dataset_df_balanced = pd.concat([dataset_df_other, dataset_df_5]).reset_index(drop=True)

print("Разделение на train/val...")
train_df, val_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
    # stratify=dataset_df["labels"],
)

# Regression targets must be floating point.
train_df = train_df.astype({"labels": "float"})
val_df = val_df.astype({"labels": "float"})

# After filtering and the shuffled split the frames no longer have a plain
# RangeIndex, so without preserve_index=False the pandas index would be stored
# as an extra "__index_level_0__" column in each Dataset.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
val_data = Dataset.from_pandas(val_df, preserve_index=False)

print("Готово")

Загрузка датасета...
Очистка текстов...
Разделение на train/val...
Готово


In [6]:
# Build the test frame in one chain: read the first 10 rows, keep the review
# column, rename it to "text" (the name used for the training data) and clean it.
test_df = (
    pd.read_csv("./data/test.csv", nrows=10)
    .loc[:, ["review_text"]]
    .rename(columns={"review_text": "text"})
    .assign(text=lambda frame: frame["text"].map(clean_text))
)

test_data = Dataset.from_pandas(test_df)

# 2. Загрузка токенизатора


In [7]:
# Hugging Face Hub checkpoint id; passed to both the tokenizer loader and the
# regressor config.
hf_model_name = "seara/rubert-tiny2-russian-sentiment"

In [8]:
# Load the tokenizer published with the checkpoint named by hf_model_name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=hf_model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch; meant for ``Dataset.map``.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(..., batched=True)``.
            Only ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch, with every example truncated
        and padded to exactly 512 tokens.
    """
    # No return_tensors here: Dataset.map writes the result to its own
    # (Arrow) storage, so building torch tensors per batch is wasted work.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Tokenize every split with the same function, in the order train, val, test.
train_data, val_data, test_data = [
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data, test_data)
]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

# 2. Кастомная модель: BERT + MLP


In [9]:
# Number of training samples per integer rating.
# NOTE(review): presumably consumed by WeightedMSELoss to derive per-class
# weights — confirm in src.bert_regressor.
labels_as_int = train_df["labels"].astype(int)
classes_counts = labels_as_int.value_counts().to_dict()

config = BertRegressorConfig(
    classes_counts=classes_counts,
    loss_type="WeightedMSELoss",
    hf_model_name=hf_model_name,
)
model = BertRegressor(config)

# 3. Обучение


In [10]:
# Monitoring: checkpoint, evaluate and log once per epoch; the step-based
# intervals are explicitly unset and at most two checkpoints are kept.
monitoring_options = {
    "output_dir": "./checkpoints",
    "overwrite_output_dir": True,
    "save_strategy": "epoch",
    "eval_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_steps": None,
    "eval_steps": None,
    "logging_steps": None,
    "save_total_limit": 2,
    "report_to": "none",
}

# Training hyper-parameters.
optimization_options = {
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
}

# Pick the best model at the end of training: lowest eval_loss wins.
best_model_options = {
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
}

training_args = TrainingArguments(
    **monitoring_options,
    **optimization_options,
    # Performance: pin dataloader memory only when CUDA is available.
    dataloader_pin_memory=torch.cuda.is_available(),
    **best_model_options,
)

In [11]:
# Callback configured to track eval_loss (lower is better) with
# ./checkpoints/best_model as its target directory.
best_model_keeper = KeepBestModelCallback(
    metric_name="eval_loss",
    greater_is_better=False,
    best_model_dir="./checkpoints/best_model",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
    callbacks=[best_model_keeper],
)

In [12]:
# Run fine-tuning; being the cell's last expression, the returned TrainOutput
# is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Mse,Mae,Rmse,R2
1,35.9332,44.506222,16.689833,4.084612,4.085319,0.0
2,33.2863,41.493958,15.560233,3.944268,3.944646,0.0
3,28.6694,39.30397,14.738989,3.838938,3.839139,0.0
4,30.8507,37.888409,14.208153,3.769255,3.76937,0.0
5,26.5967,37.19342,13.947533,3.734558,3.73464,0.0


------ Обучение завершено ------
Лучшая эпоха: 5.0
Лучший eval_loss: 37.1934


TrainOutput(global_step=5, training_loss=31.067252731323244, metrics={'train_runtime': 15.2892, 'train_samples_per_second': 2.616, 'train_steps_per_second': 0.327, 'total_flos': 294929817600.0, 'train_loss': 31.067252731323244, 'epoch': 5.0})

# 4. Предсказание


In [13]:
# Usage: predict ratings for the tokenized test set.
# NOTE(review): presumably loads the model that KeepBestModelCallback stored
# under ./checkpoints/best_model — confirm in src.predict_from_dataset.
predictions = predict_from_dataset("./checkpoints/best_model", test_data)

  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Clamp the raw model outputs into the 1..5 range and pair each one with a
# positional id (0, 1, 2, ...).
rating_column = np.clip(predictions, 1, 5)
id_column = np.arange(len(predictions))

predictions_df = pd.DataFrame(
    {"id": id_column, "predicted_rating": rating_column}
)

display(predictions_df)

Unnamed: 0,id,predicted_rating
0,0,1.0
1,1,1.215467
2,2,1.0
3,3,1.325769
4,4,1.0
5,5,1.639527
6,6,1.022444
7,7,1.0
8,8,1.063907
9,9,1.42294


In [15]:
# os.makedirs(name="./predictions", exist_ok=True)
# predictions_df.to_csv(
#     "./predictions/rubert_mlp.tsv",
#     sep="\t",
#     header=False,
#     index=False,
# )