### Импортируем библиотечки

In [1]:
%%capture
!pip install transformers focal_loss_torch optuna pymorphy3
!gdown 13X6J0enOo1m-MCbUvMt-lueWrdGXNnF0
!gdown 1HJqqXsw9OnM3nSRgAcqNy5ktqnqWPtfq
!gdown 1fkx3Hq0j0ItDz0z-OX36pAqFJ3k21NGm
!unzip /content/image_data.zip -d /content/data

In [2]:
%cd

/root


In [14]:
import torch
import pandas as pd
import numpy as np
import os
import cv2
import torch.nn as nn
import nltk
import re
import math
import pymorphy3
import optuna
nltk.download('stopwords')

from nltk.corpus import stopwords
from tqdm import tqdm
from collections import OrderedDict, defaultdict
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoTokenizer, AutoModel
from focal_loss.focal_loss import FocalLoss
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Фунция для предобработки текста

In [4]:
morph = pymorphy3.MorphAnalyzer()

stopwords_ru = stopwords.words("russian")
def preprocess_text(string):
  try:
    math.isnan(string)
    return ""
  except:
    string = string.lower()
    points_re = re.compile('[".,;/+]')
    backslash = re.compile(r"\\")
    double_wh = re.compile("  ")
    string = points_re.sub(repl="", string=string)
    string = double_wh.sub(repl=" ", string=string)
    string = backslash.sub(repl="", string=string)
    string = " ".join([morph.parse(word)[0].normal_form for word in string.split(" ") if word not in stopwords_ru])
    return string

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

### Определяем датасет

In [5]:
label2id = ["Развлечения и юмор", "Кулинария", "Торговля и объявления", "СМИ", "Философия и религия", "Животные", "Творчество и дизайн", "Путешествия"]
class make_dataset_(Dataset):
  def __init__(self, data_df, mode="train"):
    self.data = data_df
    self.mode = mode
  def __getitem__(self, id):
    text = preprocess_text(self.data.iloc[id, 1])
    if self.mode == "train":
      label = self.data.iloc[id, 2]
      return text, label2id.index(label)
    else:
      return text
  def __len__(self):
    return len(self.data)

### Создаём датасет, сплитим, смотрим на распределение классов

In [6]:
data = pd.read_csv("/content/image_train.csv", sep=";")
data = data[data.label.isin(label2id)]
data.index = list(range(0, data.shape[0]))

train_df, valid_df = train_test_split(data, test_size=0.15, train_size=0.85)
valid_df.index = [i for i in range(len(valid_df))]
train_dset = make_dataset_(train_df)
valid_dset = make_dataset_(valid_df)

### Определяем класс для подсчёта метрик

In [7]:
def calculate_accuracy(output, target):
    train_accuracy = torch.sum(target == output) / len(target)
    return train_accuracy

def calculate_f1(output, target):
    return f1_score(target.cpu(), output.cpu(), average="macro")

class MetricMonitor:
    def __init__(self, float_precision=3):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        self.metrics = defaultdict(lambda: {"val": 0, "count": 0, "avg": 0})

    def update(self, metric_name, val):
        metric = self.metrics[metric_name]

        metric["val"] += val
        metric["count"] += 1
        metric["avg"] = metric["val"] / metric["count"]

    def __str__(self):
        return " | ".join(
            [
                "{metric_name}: {avg:.{float_precision}f}".format(
                    metric_name=metric_name, avg=metric["avg"], float_precision=self.float_precision
                )
                for (metric_name, metric) in self.metrics.items()
            ]
        )

### Фунцкия для создания модели

In [8]:
best_model = None
model_bert = None

In [9]:
def create_model(trial):
    global model_bert

    model_bert = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

    ln_middle_feat = trial.suggest_int("linear_middle_feat", 128, 2056)
    model_bert.pooler = nn.Sequential(OrderedDict([
      ("dense1", nn.Linear(312, ln_middle_feat, bias=True)),
      ("act1", nn.ReLU()),
      ("dense2", nn.Linear(ln_middle_feat, 8, bias=True)),
      ("act2", nn.ReLU())
    ]))

    return model_bert

### Функции трейна и валидации

In [16]:
# from sklearn.metrics import f1_score
def train(train_loader, model, criterion, optimizer, epoch, device):
    metric_monitor = MetricMonitor(float_precision=4)
    model.train()
    stream = tqdm(train_loader)
    for i, (text, target) in enumerate(stream, start=1):
        tokenized = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        target = target.to(device, non_blocking=True)
        output = model(**{k: v.to(device, non_blocking=True) for k, v in tokenized.items()}).pooler_output[:, -1]
        output = torch.nn.functional.softmax(output, dim=1)
        loss = criterion(output, target)
        output = torch.argmax(output, dim=1).to(torch.int64)
        accuracy = calculate_accuracy(output, target)
        f1 = calculate_f1(output, target)
        metric_monitor.update("Loss", loss.item())
        metric_monitor.update("Accuracy", accuracy)
        metric_monitor.update("F1", f1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        stream.set_description(
            "Epoch: {epoch}. Train.      {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor)
        )

In [17]:
def validate(val_loader, model, criterion, epoch, device):
    metric_monitor = MetricMonitor(float_precision=4)
    model.eval()
    stream = tqdm(val_loader)
    with torch.no_grad():
        for i, (text, target) in enumerate(stream, start=1):
            tokenized = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
            target = target.to(device, non_blocking=True)
            output = model(**{k: v.to(device, non_blocking=True) for k, v in tokenized.items()}).pooler_output[:, -1]
            output = torch.nn.functional.softmax(output, dim=1)
            loss = criterion(output, target)
            output = torch.argmax(output, dim=1).to(torch.int64)
            accuracy = calculate_accuracy(output, target)
            f1 = calculate_f1(output, target)
            metric_monitor.update("Loss", loss.item())
            metric_monitor.update("Accuracy", accuracy)
            metric_monitor.update("F1", f1)
            stream.set_description(
                "Epoch: {epoch}. Validation. {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor)
            )
    return metric_monitor.metrics["F1"]["avg"], metric_monitor.metrics["Accuracy"]["avg"]

### Objective Функция

In [26]:
def objective(trial):
    lr_base = trial.suggest_categorical("lr_base", [2e-5, 3e-5])
    optimizer = trial.suggest_categorical("optimizer", ["Adam", "Adagrad", "RMSprop"])
    batch_size = trial.suggest_categorical("batch_size", [32, 64])
    gamma_loss = trial.suggest_int("gamma_loss", 0, 4)

    device = torch.device("cuda")
    model = create_model(trial).to(device)
    criterion = FocalLoss(gamma=gamma_loss).to(device)
    optimizer = getattr(torch.optim, optimizer)(model.parameters(), lr=lr_base)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    train_loader = DataLoader(
        train_dset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True,
    )
    val_loader = DataLoader(
        valid_dset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True,
    )

    for epoch in range(1, 21):
        train(train_loader, model, criterion, optimizer, epoch, device)
        f1, acc = validate(val_loader, model, criterion, epoch, device)

        scheduler.step(acc)

        trial.report(f1, epoch)

        if trial.should_prune():
            raise optuna.TrialPruned()

    return f1

### Оптимизируем модель

In [28]:
import time
from IPython.display import clear_output

num_trials = 1

def callback(study, trial):
    global best_model
    try:
      if study.best_trial == trial:
          best_model = model_bert
    except: pass

def clean_stream(study, trial):
    global num_trials
    clear_output(wait=True)
    num_trials += 1
    print(f"Trial {num_trials}")
    print()

start_time = time.time()

study = optuna.create_study(
        direction="maximize",
        pruner=optuna.pruners.ThresholdPruner(
          n_warmup_steps=5, lower=0.4
        )
      )
study.optimize(objective, n_trials=30, callbacks=[callback, clean_stream])

end_time = time.time()
took_time =  end_time - start_time

Trial 31



# Результаты

In [29]:
print(f"This took {took_time // 60} minutes and {took_time - (took_time // 60)} seconds")
print()
print("best trial: ", study.best_trial)
print()
print("best_score: ", study.best_value)
print()
print("best_params: ", study.best_params)

This took 55.0 minutes and 3272.105474948883 seconds

best trial:  FrozenTrial(number=29, state=TrialState.COMPLETE, values=[0.6287364655562004], datetime_start=datetime.datetime(2023, 11, 12, 13, 14, 41, 301941), datetime_complete=datetime.datetime(2023, 11, 12, 13, 16, 54, 930365), params={'lr_base': 2e-05, 'optimizer': 'Adam', 'batch_size': 64, 'gamma_loss': 1, 'linear_middle_feat': 1022}, user_attrs={}, system_attrs={}, intermediate_values={1: 0.28490958025112684, 2: 0.34797856507101255, 3: 0.41544251379013947, 4: 0.4971760244355219, 5: 0.5195412537336105, 6: 0.5340789007075297, 7: 0.5609870348697863, 8: 0.604070688469929, 9: 0.6172238264096449, 10: 0.6337397870973878, 11: 0.6262180078049626, 12: 0.6275447202279417, 13: 0.6331384815440345, 14: 0.6293149103031839, 15: 0.6369871622219835, 16: 0.6325127461247604, 17: 0.6336351769346912, 18: 0.6379690892455773, 19: 0.6294143756091105, 20: 0.6287364655562004}, distributions={'lr_base': CategoricalDistribution(choices=(0.0002, 0.0003, 2e

In [30]:
fig = optuna.visualization.plot_param_importances(study)
fig.show()