In [1]:
%%capture
# Install the third-party packages behind the `pymorphy3` and `focal_loss` imports below.
# NOTE(review): versions are not pinned, so a fresh environment may resolve different releases.
%pip install pymorphy3 focal-loss-torch

In [2]:

import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
import pymorphy3
import warnings

from dateutil import parser
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.metrics import f1_score, roc_auc_score
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from collections import defaultdict, OrderedDict
from focal_loss.focal_loss import FocalLoss

def seed_everything(seed: int):
    """Seed every RNG used by the notebook and return a seeded ``torch.Generator``.

    Seeds Python's ``random``, ``PYTHONHASHSEED``, NumPy, and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.

    Returns:
        A fresh CPU ``torch.Generator`` seeded with ``seed``.
    """
    import random, os

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also covers multi-GPU machines; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False
    return torch.Generator().manual_seed(seed)

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Fetch notebook inputs from Google Drive by file id.
# Per the recorded output, the first id is saved as /content/model_f1m_0.530.pt.
!gdown 1QWQuBUa7I69FmBvgtIrnoya4ebvjuNlI
# NOTE(review): presumably the reviews_categories.parquet read below — no output was recorded for it; confirm.
!gdown 1AgftFgsgDfnFoLL-G9mj_-ShfSPQi0lI

Downloading...
From (original): https://drive.google.com/uc?id=1QWQuBUa7I69FmBvgtIrnoya4ebvjuNlI
From (redirected): https://drive.google.com/uc?id=1QWQuBUa7I69FmBvgtIrnoya4ebvjuNlI&confirm=t&uuid=ab8f07cf-d95c-4740-a82c-7bd5e12992f9
To: /content/model_f1m_0.530.pt
100% 714M/714M [00:15<00:00, 44.7MB/s]


In [17]:
# Kaggle location of the same file: "/kaggle/input/rew-cat/reviews_categories.parquet"
# Read the labelled reviews, keep one row per distinct text, order the rows by
# ascending label and drop the first 10000 of them.
# NOTE(review): the original note said 1000 observations of class 0 are dropped,
# but the slice removes 10000 rows — confirm which is intended, and that the
# lowest label actually has that many rows.
data = (
    pd.read_parquet("/content/reviews_categories.parquet")
    .drop_duplicates("text")
    .sort_values("label", ascending=True)
)[10000:]

# Size of the rarest class, kept for the optional balanced down-sampling below
# (currently disabled).
label_counts = data["label"].value_counts()
min_count = label_counts.min()
# data = data.groupby("label").sample(n=min_count)

In [18]:
class catDataset(Dataset):
  """Map-style dataset that yields ``(token_ids, label)`` pairs.

  ``data`` must expose "text" and "label" entries that support positional
  ``.iloc`` access; ``tokenizer`` must provide ``encode(text, truncation=...,
  max_length=...)``.
  """

  def __init__(self, data, tokenizer):
    self.tokenizer = tokenizer
    self.text, self.label = data["text"], data["label"]

  def __len__(self):
    return len(self.text)

  def __getitem__(self, x):
    # Tokenize lazily, one sample at a time, truncating to at most 512 tokens.
    raw_text = self.text.iloc[x]
    token_ids = self.tokenizer.encode(
      raw_text,
      truncation="longest_first",
      max_length=512,
    )
    return token_ids, self.label.iloc[x]

In [28]:
class BertClassifierModel(nn.Module):
    """Encoder plus a two-layer classification head on its pooled output.

    Args:
        hidden_dim: width of the hidden layer of the head.
        bert_model: encoder module; calling it with ``input_ids`` and
            ``attention_mask`` must return an object with ``pooler_output``.
        n_classes: number of output logits.
    """

    def __init__(self, hidden_dim, bert_model, n_classes):
        super().__init__()
        self.bert = bert_model
        # Take the head's input width from the encoder config when it is
        # available, so encoders other than 768-wide ones work; fall back to
        # the previous hard-coded 768 otherwise.
        encoder_dim = getattr(getattr(bert_model, "config", None), "hidden_size", 768)
        self.linear = nn.Sequential(OrderedDict([
            ("ln1", nn.Linear(encoder_dim, hidden_dim)),
            ("act", nn.LeakyReLU()),
            ("ln2", nn.Linear(hidden_dim, n_classes)),
        ]))

    def forward(self, input_ids, attention_mask):
        """Return raw logits computed from the encoder's ``pooler_output``."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        return self.linear(pooled)


class BertClassifier:
  """Training / evaluation harness around ``BertClassifierModel``.

  Call order used in this notebook: construct, ``init_data``, ``init_model``,
  optionally ``load_model``, then ``train``.
  """

  def __init__(self, seed, device, model_path):
    """Seed the RNGs and remember the device and the pretrained model id.

    Args:
      seed: integer passed to ``seed_everything``; the generator it returns
        is kept in ``self.g`` and used for the train/validation split.
      device: a ``torch.device`` or a device string such as "cuda".
      model_path: name or path given to ``AutoTokenizer`` / ``AutoModel``.
    """
    self.g = seed_everything(seed)
    self.device = torch.device(device) if isinstance(device, str) else device
    self.model_path = model_path

  @staticmethod
  def _pad_batch(batch, pad_id=0):
    """Right-pad a list of ``(token_ids, label)`` pairs to a common length.

    Returns:
      ``(input_ids, attention_mask, target)`` as plain Python lists; the mask
      holds 1 for real tokens and 0 for padding.
    """
    max_len = max((len(ids) for ids, _ in batch), default=0)
    input_ids = [list(ids) + [pad_id] * (max_len - len(ids)) for ids, _ in batch]
    attention_mask = [[1] * len(ids) + [0] * (max_len - len(ids)) for ids, _ in batch]
    target = [label for _, label in batch]
    return input_ids, attention_mask, target

  def init_data(self, data, dataset_class, batch_size, splits=(0.9, 0.1)):
    """Tokenize ``data`` lazily and build the train / validation loaders.

    Args:
      data: object handed unchanged to ``dataset_class``.
      dataset_class: callable ``(data, tokenizer) -> Dataset`` whose items are
        ``(token_ids, label)`` pairs.
      batch_size: batch size of both loaders.
      splits: train / validation lengths or fractions for ``random_split``.
    """
    tokenizer = AutoTokenizer.from_pretrained(self.model_path)
    # Pad with the tokenizer's own pad id; 0 was previously hard-coded and is
    # kept as the fallback for tokenizers that do not define one.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
      pad_id = 0

    def collate(batch):
      return BertClassifier._pad_batch(batch, pad_id)

    dataset = dataset_class(data, tokenizer)

    # Use the seeded generator so the split is reproducible for a given seed,
    # independent of how much global RNG state was consumed beforehand.
    train, test = random_split(dataset, lengths=list(splits), generator=self.g)

    self.train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn=collate, num_workers=2)
    self.val_loader = DataLoader(test, batch_size=batch_size, shuffle=False, collate_fn=collate, num_workers=2)

  def _freeze_encoder(self):
    """Freeze everything except the BERT pooler and the head, then move to the device."""
    self.model.eval()
    for p in self.model.parameters():
      p.requires_grad = False
    for p in self.model.bert.pooler.parameters():
      p.requires_grad = True
    for p in self.model.linear.parameters():
      p.requires_grad = True
    self.model = self.model.to(self.device)

  def init_model(self, hidden_dim=128, n_classes=10):
    """Load the pretrained encoder, attach a new head and freeze the encoder body."""
    encoder = AutoModel.from_pretrained(self.model_path)
    self.model = BertClassifierModel(hidden_dim=hidden_dim, bert_model=encoder, n_classes=n_classes)
    self._freeze_encoder()

  def load_model(self, model_path, map_location=None):
    """Load a ``state_dict`` checkpoint into the model built by ``init_model``.

    Args:
      model_path: path to a file written with ``torch.save(state_dict, ...)``.
      map_location: forwarded to ``torch.load``; defaults to ``self.device``
        so the checkpoint also loads on machines without CUDA.
    """
    if map_location is None:
      map_location = self.device
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    state = torch.load(model_path, map_location=map_location)
    self.model.load_state_dict(state)
    self._freeze_encoder()

  def _run_batch(self, batch, loss_fn, task):
    """Forward one collated batch and compute its loss and metrics.

    Returns:
      ``(loss, f1, roc_auc)``. ``loss`` is a tensor still attached to the
      graph. ``roc_auc`` is 0 for non-binary tasks and for binary batches
      that contain a single class.
    """
    input_ids, attention_mask, y_batch = batch
    input_ids = torch.tensor(input_ids).to(self.device, non_blocking=True)
    attention_mask = torch.tensor(attention_mask).to(self.device, non_blocking=True)
    y_batch = torch.tensor(y_batch).to(self.device, non_blocking=True)
    logits = self.model(input_ids, attention_mask)

    if task == "binary":
      # BCELoss needs float targets of the same shape as its input, so the
      # single logit column is squeezed to shape (batch,). Hard predictions
      # come from thresholding; argmax over one column would always give 0.
      probs = torch.sigmoid(logits).squeeze(-1)
      loss = loss_fn(probs, y_batch.float())
      preds = (probs >= 0.5).long()
    else:
      probs = torch.nn.functional.softmax(logits, dim=1)
      loss = loss_fn(probs, y_batch)
      preds = probs.argmax(dim=1)

    # Move to the host once instead of iterating over a device tensor.
    y_true = y_batch.detach().cpu().numpy()
    y_pred = preds.detach().cpu().numpy()
    f1 = f1_score(y_true, y_pred, average="binary" if task == "binary" else "micro")

    if task == "binary" and len(set(y_true.tolist())) > 1:
      # ROC AUC is a ranking metric, so it is computed on the scores.
      roc_auc = roc_auc_score(y_true, probs.detach().cpu().numpy())
    else:
      roc_auc = 0
    return loss, f1, roc_auc

  def train_one_epoch(self, loss_fn, optimizer, task):
    """Run one optimisation pass over ``self.train_loader``.

    The progress bar shows the running mean of the per-batch loss and metrics.
    """
    lossi = []
    f1_i = []
    roc_auc_i = []

    stream = tqdm(self.train_loader)

    self.model.train()
    for batch in stream:
      loss, f1, roc_auc = self._run_batch(batch, loss_fn, task)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      lossi.append(loss.item())
      f1_i.append(f1)
      roc_auc_i.append(roc_auc)
      stream.set_description(f"train: loss: {np.mean(lossi)}, f1: {np.mean(f1_i)}, roc_auc: {np.mean(roc_auc_i)}")

  def validate_one_epoch(self, loss_fn, task):
    """Evaluate on ``self.val_loader`` without gradients.

    Returns:
      Mean of the per-batch F1 scores.
    """
    lossi = []
    f1_i = []
    roc_auc_i = []

    stream = tqdm(self.val_loader)

    self.model.eval()

    with torch.no_grad():
      for batch in stream:
        loss, f1, roc_auc = self._run_batch(batch, loss_fn, task)
        lossi.append(loss.item())
        f1_i.append(f1)
        roc_auc_i.append(roc_auc)
        stream.set_description(f"validation: loss: {np.mean(lossi)}, f1: {np.mean(f1_i)}, roc_auc: {np.mean(roc_auc_i)}")
    return np.mean(f1_i)

  def train(self, num_epochs, lr, gamma, task="binary", best_f1=0, checkpoint=None):
    """Train for ``num_epochs`` epochs, checkpointing whenever validation F1 improves.

    Args:
      num_epochs: number of train + validation passes.
      lr: AdamW learning rate.
      gamma: focusing parameter of ``FocalLoss`` (used when ``task`` is not "binary").
      task: "binary" uses ``BCELoss`` on sigmoid outputs of a single-logit
        model; any other value uses ``FocalLoss`` on softmax probabilities.
      best_f1: validation F1 that must be exceeded before anything is saved.
      checkpoint: directory for ``model_f1m_<f1>.pt`` files, or ``None`` to
        skip saving. The directory is created when missing.
    """
    import os  # local import: `os` is not imported at notebook level

    loss_fn = nn.BCELoss() if task == "binary" else FocalLoss(gamma=gamma)
    # Only the unfrozen parameters ever receive gradients.
    trainable = [p for p in self.model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=lr)
    if checkpoint is not None:
      os.makedirs(checkpoint, exist_ok=True)

    for epoch in range(num_epochs):
      print(f"epoch {epoch} started")
      self.train_one_epoch(loss_fn, optimizer, task)
      val_f1 = self.validate_one_epoch(loss_fn, task)
      if best_f1 < val_f1:
        best_f1 = val_f1
        # `best_model` is kept for compatibility but is only an alias of the
        # live model. The CPU copy below is the real snapshot of the best weights.
        self.best_model = self.model
        self.best_state_dict = {
          name: tensor.detach().cpu().clone()
          for name, tensor in self.model.state_dict().items()
        }
        if checkpoint is not None:
          torch.save(self.model.state_dict(), os.path.join(checkpoint, f"model_f1m_{val_f1:.3f}.pt"))
    print(f"train finished with best f1 micro={best_f1}")

In [14]:
# %mkdir /kaggle/working/checkpoint
%mkdir /content/checkpoint

In [29]:
# Build the training harness: seeds the RNGs with 42 and targets the GPU.
# The tokenizer and encoder are later loaded from the "ai-forever/ruBert-base" checkpoint name.
worker = BertClassifier(seed=42, device="cuda", model_path="ai-forever/ruBert-base")

In [30]:
# Wrap `data` in catDataset and create the train/validation loaders with batch size 16
# (the split uses init_data's default `splits`).
worker.init_data(data, catDataset, 16)

In [None]:
# Ask PyTorch to release cached GPU memory it is no longer using before the model is built.
torch.cuda.empty_cache()

In [31]:
# Build an 8-class model with a 128-unit head, then overwrite its weights with the
# downloaded checkpoint. load_model requires init_model to have run first, since it
# loads into `worker.model`.
worker.init_model(n_classes=8, hidden_dim=128)
worker.load_model("/content/model_f1m_0.530.pt")

In [None]:
# torch.cuda.empty_cache()
# Continue fine-tuning for 5 epochs on the multiclass task (focal loss, gamma=0.5).
# best_f1=0.53 is the starting threshold: a checkpoint is written only when the
# validation F1 exceeds it. In the recorded run validation F1 peaked at about 0.517,
# so the reported best stayed at 0.53.
worker.train(5, gamma=0.5, lr=2e-5 / 3, task="multiclass", checkpoint="/content/checkpoint", best_f1=0.53)

epoch 0 started


train: loss: 1.2089808293091995, f1: 0.48759266886326197, roc_auc: 0.0: 100%|██████████| 2428/2428 [14:35<00:00,  2.77it/s]
validation: loss: 1.183378301947205, f1: 0.5018728956228956, roc_auc: 0.0: 100%|██████████| 270/270 [01:31<00:00,  2.96it/s]


epoch 1 started


train: loss: 1.1864038924065217, f1: 0.4984709637561779, roc_auc: 0.0: 100%|██████████| 2428/2428 [14:45<00:00,  2.74it/s]
validation: loss: 1.1692565792136722, f1: 0.506060606060606, roc_auc: 0.0: 100%|██████████| 270/270 [01:31<00:00,  2.96it/s]


epoch 2 started


train: loss: 1.1778337048885261, f1: 0.5034287479406919, roc_auc: 0.0: 100%|██████████| 2428/2428 [14:42<00:00,  2.75it/s]
validation: loss: 1.1615303860770332, f1: 0.5112584175084175, roc_auc: 0.0: 100%|██████████| 270/270 [01:31<00:00,  2.95it/s]


epoch 3 started


train: loss: 1.1681110966686754, f1: 0.50667215815486, roc_auc: 0.0: 100%|██████████| 2428/2428 [15:13<00:00,  2.66it/s]
validation: loss: 1.155024489208504, f1: 0.511026936026936, roc_auc: 0.0: 100%|██████████| 270/270 [01:35<00:00,  2.82it/s]


epoch 4 started


train: loss: 1.1600566268261812, f1: 0.5080673393739704, roc_auc: 0.0: 100%|██████████| 2428/2428 [14:49<00:00,  2.73it/s]
validation: loss: 1.1477923179114307, f1: 0.517276936026936, roc_auc: 0.0: 100%|██████████| 270/270 [01:31<00:00,  2.94it/s]

train finished with best f1 micro=0.53





In [None]:
# NOTE(review): Kaggle-only path; the training cell above writes checkpoints to
# /content/checkpoint — confirm which environment this cell is meant for.
%cd /kaggle/working/checkpoint

In [None]:
from IPython.display import FileLink
from glob import glob

# Show a download link for the lexicographically last file in the checkpoint directory;
# the directory prefix is stripped so the link is relative to the current working directory.
# NOTE(review): raises IndexError when the directory is empty, and the path is Kaggle-specific
# while the training cell saves to /content/checkpoint — confirm.
FileLink(sorted(glob("/kaggle/working/checkpoint/*"))[-1].replace("/kaggle/working/checkpoint/", ""))

In [None]:
# Debug helper: decode one training batch back to text to eyeball the tokenization.
# `tokenizer` was never defined at notebook level (it only exists inside the dataset),
# so it is rebuilt here from the same checkpoint name the worker uses.
tokenizer = AutoTokenizer.from_pretrained(worker.model_path)
for input_ids, attention_mask, y_batch in worker.train_loader:
    for in_id in input_ids:
        print(tokenizer.decode(in_id))
    # One batch is enough for inspection; remove the break to dump the whole training set.
    break

In [None]:
# NOTE(review): assumes the frame has an "info" column; only "text" and "label" are used
# elsewhere in this notebook — confirm the column exists.
data["info"].unique()

In [None]:
# NOTE(review): `y_` is not defined anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel — looks like an unfinished expression; finish or delete it.
y_