In [None]:
import joblib
import polars as pl
from clearml import Dataset, Task

# Start a ClearML task for this run.
task = Task.init(
    project_name="Amazon reviews",
    task_name="Bert",
    output_uri=True,
)

# Fetch a local copy of the pinned dataset version.
reviews_dataset = Dataset.get(
    dataset_project="Amazon reviews",
    dataset_name="Amazon reviews dataset",
    dataset_version="1.2.1",
)
frame_path = reviews_dataset.get_local_copy()

# Both splits live side by side in the dataset directory.
train, test = (
    pl.read_csv(frame_path + f"/raw_{split}.csv") for split in ("train", "test")
)

Загружаем модель BERT и её токенизатор:

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "bert-base-uncased"

# Use CUDA when torch reports it as available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device)

In [None]:
# Display the value of `device` as this cell's output.
device

Будем подавать данные в модель батчами, так как весь датасет целиком не поместится в память:

In [None]:
from torch.utils.data import DataLoader

fixed_batch_size = 32

# shuffle=False keeps the reviews in the same order as the source frames.
loader_options = {"batch_size": fixed_batch_size, "shuffle": False}

train_reviews = train["Review"].to_list()
test_reviews = test["Review"].to_list()

train_dataloader = DataLoader(train_reviews, **loader_options)
test_dataloader = DataLoader(test_reviews, **loader_options)

В качестве эмбеддинга отзыва берём выход последнего слоя BERT для первого токена ([CLS]):

In [None]:
from tqdm.notebook import tqdm


def batch_inference(batch):
    """Embed a batch of texts and return their first-token vectors on CPU.

    Relies on the module-level ``tokenizer``, ``bert_model`` and ``device``.

    Args:
        batch: Texts to embed, passed straight to the tokenizer (the review
            DataLoaders yield them as a list of strings).

    Returns:
        A CPU tensor with one row per input text, taken from position 0 of the
        model's ``last_hidden_state``. The tensor owns its memory and does not
        alias the model output.
    """
    tokenized_batch = tokenizer(
        batch, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        hidden_batch = bert_model(**tokenized_batch)
        # The slice is a view sharing storage with the whole last_hidden_state,
        # and `.to("cpu")` returns the same tensor when it is already on CPU.
        # Without the clone, every collected batch would keep its full
        # activation alive on CPU-only runs; cloning retains just the slice.
        first_token_states = hidden_batch.last_hidden_state[:, 0, :]
        batch_embeddings = first_token_states.detach().clone().to("cpu")
    return batch_embeddings


# Embed each split batch by batch (with a progress bar), then join the
# per-batch tensors along the first dimension.
train_embeddings = torch.concat(
    list(map(batch_inference, tqdm(train_dataloader)))
)
test_embeddings = torch.concat(
    list(map(batch_inference, tqdm(test_dataloader)))
)

In [None]:
# Attach both embedding tensors to the task, train first and then test.
for artifact_name, embeddings in (
    ("train_embeddings", train_embeddings),
    ("test_embeddings", test_embeddings),
):
    task.upload_artifact(name=artifact_name, artifact_object=embeddings)

Попробуем обучить на этих эмбеддингах логистическую регрессию:

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

random_state = 42

# Hyperparameters are registered with the task before the model is built.
model_params = dict(
    multi_class="multinomial",
    solver="saga",
    random_state=random_state,
)
task.connect(model_params)

# Fit on the train embeddings and persist the fitted estimator.
model_lr = LogisticRegression(**model_params)
model_lr.fit(train_embeddings, train["Polarity"])
joblib.dump(model_lr, "model.pkl", compress=True)

# Evaluate on the held-out split.
predicts = model_lr.predict(test_embeddings)
report = classification_report(test["Polarity"], predicts, output_dict=True)
confusion = confusion_matrix(test["Polarity"], predicts)

logger = task.get_logger()

# "accuracy" is a plain number while every other entry is a dict of metrics,
# so it is popped and reported separately before the per-entry loop.
logger.report_single_value("accuracy", report.pop("accuracy"))
flat_scores = (
    (f"{label}_{score_name}", score)
    for label, scores in report.items()
    for score_name, score in scores.items()
)
for series_name, score in flat_scores:
    logger.report_single_value(series_name, score)

# NOTE(review): the title/series strings below are misspelled; they are kept
# verbatim because they identify the plots in the experiment tracker.
report_frame = pd.DataFrame(report).transpose()
logger.report_table(
    title="Classifiacation Report",
    series="Metrics",
    table_plot=report_frame,
)
logger.report_confusion_matrix(
    title="Classifiacation Report",
    series="ConflusionMatrix",
    matrix=confusion,
)

In [None]:
# Close the task so it is flushed and marked completed. `Task.close()` is used
# instead of `Task.mark_completed()`: per the ClearML docs the latter also
# terminates the process that created the task, which here is the notebook
# kernel itself.
task.close()