In [2]:
from src.preprocessing import clean_text
from sklearn.model_selection import train_test_split
from src.bert_regressor import BertRegressor, BertRegressorConfig
from src import compute_metrics
from src import KeepBestModelCallback
from src.predict_from_dataset import predict_from_dataset

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import os
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

# 1. Подготовка данных


In [3]:
# Number of CSV rows to read; None reads the whole file (set a small number for quick runs).
nrows = None

# Load the dataset from CSV
print("Загрузка датасета...")
dataset_df = pd.read_csv(
    filepath_or_buffer="./data/train.csv",
    nrows=nrows,
)

# Keep only the review text and its rating; the rating becomes the "labels" column.
dataset_df = dataset_df[["text", "rating"]]
dataset_df = dataset_df.rename(columns={"rating": "labels"})

# Drop ratings equal to 0
dataset_df = dataset_df[dataset_df["labels"] != 0]

if nrows is None:
    # Keep at most 50_000 ratings equal to 5 to balance the rating distribution.
    # The cap on n matters: DataFrame.sample raises ValueError when asked for more rows
    # than exist, so with fewer than 50_000 five-star rows all of them are kept.
    dataset_df_5 = dataset_df[dataset_df["labels"] == 5]
    dataset_df_5 = dataset_df_5.sample(
        n=min(50_000, len(dataset_df_5)),
        random_state=42,
    )
    dataset_df_other = dataset_df[dataset_df["labels"] != 5]
    dataset_df = pd.concat([dataset_df_5, dataset_df_other]).reset_index(drop=True)

# Normalize every text with the project's clean_text helper.
print("Очистка текстов...")
dataset_df["text"] = dataset_df["text"].map(clean_text)


print("Разделение на train/val...")
# Stratify by label only when the whole dataset is loaded: the full dataset has enough
# objects of each rating to be split into train/val. With a row limit the split is plain random.
train_df, val_df = train_test_split(
    dataset_df,
    test_size=0.2,
    random_state=42,
    stratify=dataset_df["labels"] if nrows is None else None,
)

# The labels are cast to float on both frames.
train_df = train_df.astype({"labels": "float"})
val_df = val_df.astype({"labels": "float"})

# train_test_split leaves a shuffled, non-range index on both frames; without
# preserve_index=False, Dataset.from_pandas would store that index as an extra
# "__index_level_0__" column in the resulting datasets.
train_data = Dataset.from_pandas(train_df, preserve_index=False)
val_data = Dataset.from_pandas(val_df, preserve_index=False)

print("Готово")

Загрузка датасета...
Очистка текстов...
Разделение на train/val...
Готово


In [4]:
# Load the test reviews, keep only the review text under the column name "text",
# and clean it with the same clean_text helper that is applied to the training texts.
test_df = (
    pd.read_csv("./data/test.csv", nrows=None)
    .loc[:, ["review_text"]]
    .rename(columns={"review_text": "text"})
    .assign(text=lambda frame: frame["text"].map(clean_text))
)

# Wrap the cleaned frame into a Hugging Face Dataset.
test_data = Dataset.from_pandas(test_df)

# 2. Загрузка токенизатора


In [5]:
# Hugging Face model identifier; the tokenizer is loaded from it.
hf_model_name = "seara/rubert-tiny2-russian-sentiment"

In [6]:
# Tokenizer that belongs to the pretrained checkpoint named by hf_model_name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=hf_model_name)


def tokenize_function(examples):
    """Tokenize the "text" field of the given examples.

    Every sequence is truncated or padded to exactly 512 tokens, and the
    encoding is returned as PyTorch tensors.
    """
    encoding_options = dict(
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    return tokenizer(examples["text"], **encoding_options)


# Tokenize all three splits in batched mode, in the order train -> val -> test.
train_data, val_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_data, val_data, test_data)
)

Map:   0%|          | 0/127428 [00:00<?, ? examples/s]

Map:   0%|          | 0/31857 [00:00<?, ? examples/s]

Map:   0%|          | 0/5069 [00:00<?, ? examples/s]

# 3. Получение эмбеддингов


In [7]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np


def get_embeddings_from_dataset(
    model_path: str,
    dataset: Dataset,
    batch_size: int = 8,
) -> np.ndarray:
    """Compute one embedding vector per example with a saved BertRegressor.

    The model is loaded from ``model_path``, moved to CUDA when available
    (CPU otherwise), put into eval mode and run without gradient tracking.
    For every example the hidden state at sequence position 0 of
    ``model.bert``'s ``last_hidden_state`` is used as the embedding
    (presumably the [CLS] token -- confirm against the tokenizer).

    Args:
        model_path: Location passed to ``BertRegressor.from_pretrained``.
        dataset: Tokenized dataset providing "input_ids" and "attention_mask"
            columns. It is not modified: a torch-formatted view is created
            internally instead of changing the caller's dataset format.
        batch_size: Number of examples per forward pass.

    Returns:
        A 2-D array with one row per example, in dataset order.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    # Fail fast, before the model is loaded: with no batches there is nothing to concatenate.
    if len(dataset) == 0:
        raise ValueError("dataset is empty: no embeddings to compute")

    model = BertRegressor.from_pretrained(model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # with_format returns a new dataset object, so the format of the dataset owned by the
    # caller stays untouched (set_format would change it in place).
    torch_dataset = dataset.with_format(
        type="torch",
        columns=["input_ids", "attention_mask"],
    )

    # shuffle=False keeps the embeddings aligned with the dataset rows.
    dataloader = DataLoader(torch_dataset, batch_size=batch_size, shuffle=False)

    all_embeddings = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Move the batch to the model's device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            # Forward pass through the encoder only
            outputs = model.bert(input_ids=input_ids, attention_mask=attention_mask)
            # Hidden state of the first sequence position for every example in the batch
            batch_embeddings = outputs.last_hidden_state[:, 0, :]

            all_embeddings.append(batch_embeddings.cpu().numpy())

    # Stack the per-batch arrays into a single array
    return np.concatenate(all_embeddings, axis=0)

In [None]:
import os

# Directory that receives one CSV file of embeddings per split.
os.makedirs("./data/embeddings/", exist_ok=True)

# Split name (used in the output file name) -> tokenized dataset.
datasets = {"train": train_data, "val": val_data, "test": test_data}

for name in datasets:
    dataset = datasets[name]
    embeddings = get_embeddings_from_dataset("./checkpoints/rubert_linear/", dataset)
    embeddings_df = pd.DataFrame(embeddings)
    # Written without an index column and without a header row.
    embeddings_df.to_csv(f"./data/embeddings/{name}_embeddings.csv", index=False, header=False)

  0%|          | 0/15929 [00:00<?, ?it/s]