In [2]:
from src.preprocessing import clean_text
from sklearn.model_selection import train_test_split
from src.bert_regressor import BertRegressor, BertRegressorConfig
from src import compute_metrics
from src import KeepBestModelCallback
from src.predict_from_dataset import predict_from_dataset

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
import os
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)

# 1. Подготовка данных


In [3]:
nrows = None

# Загрузка данных из CSV
print("Загрузка датасета...")
dataset_df = pd.read_csv(
    filepath_or_buffer="./data/train.csv",
    nrows=nrows,
)

dataset_df = dataset_df[["text", "rating"]]
dataset_df = dataset_df.rename(columns={"rating": "labels"})

# Выкинуть оценки, равные 0
dataset_df = dataset_df[dataset_df["labels"] != 0]

if nrows is None:
    # Оставить только 50_000 оценок, равных 5, для балансировки распределения оценок
    dataset_df_5 = dataset_df[dataset_df["labels"] == 5].sample(
        n=50_000,
        random_state=42,
    )
    dataset_df_other = dataset_df[dataset_df["labels"] != 5]
    dataset_df = pd.concat([dataset_df_5, dataset_df_other]).reset_index(drop=True)

# print("Очистка текстов...")
# dataset_df["text"] = dataset_df["text"].map(clean_text)


print("Разделение на train/val...")
if nrows is None:
    train_df, val_df = train_test_split(
        dataset_df,
        test_size=0.2,
        random_state=42,
        # Во всём датасете найдутся объекты, чтобы разделить их на train/val
        stratify=dataset_df["labels"],
    )
else:
    train_df, val_df = train_test_split(
        dataset_df,
        test_size=0.2,
        random_state=42,
    )

train_df = train_df.astype({"labels": "float"})
val_df = val_df.astype({"labels": "float"})

train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)

print("Готово")

Загрузка датасета...
Разделение на train/val...
Готово


In [4]:
test_df = pd.read_csv("./data/test.csv", nrows=None)
test_df = test_df[["review_text"]]
test_df = test_df.rename(columns={"review_text": "text"})
test_df["text"] = test_df["text"].map(clean_text)

test_data = Dataset.from_pandas(test_df)

In [69]:
train_rating = train_df["labels"].values
train_rating.shape

(127428,)

In [70]:
val_rating = val_df["labels"].values
val_rating.shape

(31857,)

In [71]:
train_embeddings_df = pd.read_csv("./data/embeddings/train_embeddings.csv", header=None)

In [72]:
val_embeddings_df = pd.read_csv("./data/embeddings/val_embeddings.csv", header=None)

In [73]:
train_embeddings = train_embeddings_df.values
val_embeddings = val_embeddings_df.values

In [74]:
noises = np.random.randn(*(train_embeddings.shape)) * 1.0
train_augmented = train_embeddings + noises
train_embeddings = np.concatenate([train_embeddings, train_augmented])
train_rating = np.concatenate([train_rating, train_rating])

In [75]:
from tqdm.auto import tqdm
from sklearn.metrics import root_mean_squared_error


def normalize_embeddings(embeddings):
    result = []

    for embedding in tqdm(embeddings):
        norm = np.linalg.norm(embedding)
        embedding_normalized = embedding / norm
        result.append(embedding_normalized)

    result = np.array(result)
    return result

In [76]:
train_embeddings_normalized = normalize_embeddings(train_embeddings)
val_embeddings_normalized = normalize_embeddings(val_embeddings)

  0%|          | 0/254856 [00:00<?, ?it/s]

  0%|          | 0/31857 [00:00<?, ?it/s]

In [82]:
from catboost import CatBoostRegressor

catboost_regressor = CatBoostRegressor(
    iterations=500,
    eval_metric="RMSE",
    random_seed=42,
    verbose=True,
)

catboost_regressor.fit(
    X=train_embeddings_normalized,
    y=train_rating,
    eval_set=(val_embeddings_normalized, val_rating),
    logging_level="Verbose",
)

Learning rate set to 0.185487
0:	learn: 1.3187040	test: 1.3095842	best: 1.3095842 (0)	total: 123ms	remaining: 1m 1s
1:	learn: 1.1603103	test: 1.1488870	best: 1.1488870 (1)	total: 160ms	remaining: 39.8s
2:	learn: 1.0401440	test: 1.0275868	best: 1.0275868 (2)	total: 199ms	remaining: 32.9s
3:	learn: 0.9459542	test: 0.9334306	best: 0.9334306 (3)	total: 242ms	remaining: 30s
4:	learn: 0.8755287	test: 0.8673147	best: 0.8673147 (4)	total: 287ms	remaining: 28.4s
5:	learn: 0.8221891	test: 0.8185933	best: 0.8185933 (5)	total: 329ms	remaining: 27.1s
6:	learn: 0.7829723	test: 0.7843954	best: 0.7843954 (6)	total: 369ms	remaining: 26s
7:	learn: 0.7540172	test: 0.7612363	best: 0.7612363 (7)	total: 406ms	remaining: 25s
8:	learn: 0.7329576	test: 0.7459931	best: 0.7459931 (8)	total: 451ms	remaining: 24.6s
9:	learn: 0.7173206	test: 0.7355833	best: 0.7355833 (9)	total: 512ms	remaining: 25.1s
10:	learn: 0.7055528	test: 0.7288273	best: 0.7288273 (10)	total: 555ms	remaining: 24.7s
11:	learn: 0.6973505	test: 0

<catboost.core.CatBoostRegressor at 0x7c6dc68def30>

In [None]:
y_pred = catboost_regressor.predict(val_embeddings_normalized)
root_mean_squared_error(val_rating, y_pred)

In [79]:
emb_dim = train_embeddings_normalized.shape[1]
print("emb_dim:", emb_dim)

emb_dim: 312


In [80]:
from sklearn.neural_network import MLPRegressor

mlp_regressor = MLPRegressor(
    hidden_layer_sizes=(16, 8),
    activation="relu",
    solver="adam",
    max_iter=10,
    verbose=True,
)

mlp_regressor.fit(train_embeddings_normalized, train_rating)

Iteration 1, loss = 0.47451025
Iteration 2, loss = 0.21625784
Iteration 3, loss = 0.21129057
Iteration 4, loss = 0.20968884
Iteration 5, loss = 0.20899643
Iteration 6, loss = 0.20815886
Iteration 7, loss = 0.20782174
Iteration 8, loss = 0.20735975
Iteration 9, loss = 0.20689514
Iteration 10, loss = 0.20669764




0,1,2
,loss,'squared_error'
,hidden_layer_sizes,"(16, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,10


In [81]:
y_pred_reg = mlp_regressor.predict(val_embeddings_normalized)
root_mean_squared_error(val_rating, y_pred_reg)

0.708387735071745

In [None]:
test_embeddings_df = pd.read_csv("./data/embeddings/test_embeddings.csv", header=None)
test_embeddings = test_embeddings_df.values
test_embeddings_normalized = normalize_embeddings(test_embeddings)

predictions = mlp_regressor.predict(test_embeddings_normalized)

predictions_df = pd.DataFrame(
    data={
        "id": np.arange(len(predictions)),
        "predicted_rating": np.clip(predictions, 1, 5),
    }
)
display(predictions_df)

  0%|          | 0/5069 [00:00<?, ?it/s]

Unnamed: 0,id,predicted_rating
0,0,2.452761
1,1,4.620105
2,2,1.000000
3,3,1.287900
4,4,4.125743
...,...,...
5064,5064,1.717100
5065,5065,3.749171
5066,5066,1.997983
5067,5067,1.180375


In [140]:
os.makedirs(name="./predictions", exist_ok=True)
predictions_df.to_csv(
    "./predictions/embeddings_mlp.tsv",
    sep="\t",
    header=False,
    index=False,
)