In [2]:
%%capture
%pip install pymorphy3 gdown

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import torch
import torch.nn as nn
# import pymorphy3
import warnings

from dateutil import parser
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.metrics import f1_score, roc_auc_score
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
from collections import defaultdict, OrderedDict

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    return torch.Generator().manual_seed(seed)

warnings.filterwarnings("ignore")

In [3]:
!gdown 11SQswlJWdxCiWzO0rG1ttNT4CePLRYIM

Downloading...
From: https://drive.google.com/uc?id=11SQswlJWdxCiWzO0rG1ttNT4CePLRYIM
To: /kaggle/working/results_sravni (2).csv
100%|███████████████████████████████████████| 38.6M/38.6M [00:00<00:00, 188MB/s]


In [None]:
def preprocess_triplets(res_df, date_df, triplets_columns="triplets", date_column="timestamp", morph=True):
  if morph: morph_parser = pymorphy3.MorphAnalyzer()
  triplets = res_df[triplets_columns].values.tolist()

  dates = date_df[date_column].values.tolist()
  all_triplets = []
  for i, (text, label_ix, label) in enumerate(triplets):
      all_triplets.extend([[text] + list(item) + [list(item_ix)[0]] + [dates[i]] for item_ix, item in zip(eval(label_ix), eval(label))])

  data = []
  if not morph:
     for triplet in tqdm(all_triplets):
        txt, at, ot, sp, (at_ix), ts = triplet
        at = at.lower()
        ot = ot.lower()
        data.append([ts, txt, at, at_ix, ot, sp, (at, ot, sp), (at, ot)])
  else:
    for triplet in tqdm(all_triplets):
        txt, at, ot, sp, (at_ix), ts = triplet
        at = morph_parser.parse(at.lower())[0].normal_form
        ot = ot.lower()
        data.append([ts, txt, at, at_ix, ot, sp, (at, ot, sp), (at, ot)])
  data = pd.DataFrame(data)
  data.columns = ['date', "text", 'aspect', "aspect_ix", 'opinion', 'sentiment', 'triplet', 'aspect_opinion']

  data["timestamp"] = data["date"].apply(lambda x: parser.isoparse(x).timestamp())

  return data

In [None]:
df = pd.read_csv("/kaggle/working/results_sravni (2).csv")
data = preprocess_triplets(df, df, ["text", "pred", "pred_text"], "time")

In [241]:
df = pd.read_csv("/kaggle/input/test-50-informative/test_selling_pandas.csv")
data = data[:50]
data["target"] = df["json_response"].map({"Информативные" : 1, "Информативное" : 1, "Информативно" : 1, "Неинформативно":0})

In [2]:
data = pd.read_csv("/kaggle/input/1k-all/1k_all.csv")

In [71]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
df = BertContextTokenizer(data, tokenizer, target_column="info")

In [3]:
data["aspect_ix"] = data["aspect_ix"].apply(lambda x: eval(x))

In [4]:
data = data.drop(data[data["cat"].isna()].index).reset_index(drop=True)

In [5]:
def map_(x): return x - 1 if x > 3 else x
data["cat"] = data["cat"].apply(lambda x: map_(x))

In [7]:
def mapper_input_ids(tokenizer, text):
  words = text.split(" ")
  result = defaultdict(int)
  input_ids = []
  current = 1
  for i, word in enumerate(words):
    word_input = tokenizer.encode(word, add_special_tokens=False)
#     print(i, word, word_input)
    input_ids.extend(word_input)

    result[i] = (current, current + len(word_input))
    current += len(word_input)
  return input_ids, result

class BertContextTokenizer(Dataset):

  def __init__(self, data, tokenizer, pos_column="aspect_ix", text_column="text", target_column="target", context_size=100):
    super().__init__()
    self.ixs = data[pos_column]
    self.text = data[text_column]
    self.y = data[target_column]
    self.context_size = context_size
    self.tokenizer = tokenizer

  def __getitem__(self, x):

    input = [self.tokenizer.encode(".")[0]]
    text = self.text[x]
    y = self.y[x]
    at_old_start = text.split(" ")[self.ixs.iloc[x][0]]
    at_old_end = text.split(" ")[self.ixs.iloc[x][-1]]
    at_ix_start = text.index(at_old_start); at_ix_end = text.index(at_old_end) + len(at_old_end)
    left_part = text[:at_ix_start]; aspect = text[at_ix_start:at_ix_end]; right_part = text[at_ix_end:]
    text = left_part + "<ASPECT> " + aspect + " <ASPECT>" + right_part
    ids, mapper_ids  = mapper_input_ids(self.tokenizer, text)
#     print(mapper_ids)
    left_at = mapper_ids[self.ixs.iloc[x][0] + 1][0] #прибавляем 1, т.к слово аспект появилось спереди
    right_at = mapper_ids[self.ixs.iloc[x][-1] + 1][-1]
    left_gl = max(0, left_at - self.context_size)
    right_gl = min(len(ids) - 1, right_at + self.context_size)
#     print(left_at, right_at, left_gl, right_gl, ids)
    input.extend(ids[left_gl:(right_gl + 1)])
    input.extend([self.tokenizer.encode(".")[-1]])
    return input, y

  def __len__(self):
    return len(self.text)

In [8]:
def pad_sequence_right(input_ids, length):
  attention_mask_pad = 0
  if len(input_ids) - 2 < length:
    input_ids.extend((length - (len(input_ids)- 2)) * [0])
  return input_ids

def attention_pad(input_ids, length):
  attention_mask_pad = 0
  if len(input_ids) - 2 < length:
    attention_mask_pad += (length - (len(input_ids)- 2))
  return attention_mask_pad

def custom_collate(data):
  max_length = len(max(data, key = lambda x: len(x[0]))[0])

  inputs_ids = [pad_sequence_right(list(input_id), max_length) for (input_id, _) in data]
  attention_mask = [[1]*len(input_id) + [0]*attention_pad(list(input_id), max_length) for (input_id, _) in data]

  y_s = [y for (_, y) in data ]
  return inputs_ids, attention_mask, y_s

In [8]:
data["info"].isna().sum()

1

In [64]:
class BertClassifierModel(nn.Module):
    def __init__(self, hidden_dim, bert_model, n_classes):
        super().__init__()
        self.bert = bert_model
#         self.linear = nn.Sequential(OrderedDict([
#             ("ln1", nn.Linear(768, hidden_dim)),
#             ("act", nn.ReLU()),
#             ("ln2", nn.Linear(hidden_dim, n_classes))
#         ]))
        self.linear = nn.Linear(768, n_classes)

    def forward(self, input_ids, attention_mask):
      x = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
      x = self.linear(x)
      return x


class BertClassifier:

  def __init__(self, seed, device, model_path):
    self.g = seed_everything(seed)
    self.device = device
    if isinstance(device, str):
      self.device = torch.device(device)
    self.model_path = model_path

  def init_data(self, data, dataset_class, collate_fn, batch_size,target_column, splits=[0.9, 0.1]):

    tokenizer = AutoTokenizer.from_pretrained(self.model_path)

    dataset = dataset_class(data, tokenizer, target_column=target_column)
    print(splits)
    train, test = random_split(dataset, lengths=splits, generator=self.g)

    self.train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, collate_fn = collate_fn, generator=self.g)
    self.val_loader = DataLoader(test, batch_size=batch_size, shuffle=False, collate_fn = collate_fn, generator=self.g)

  def init_model(self, hidden_dim=128, n_classes=10):
    model = AutoModel.from_pretrained(self.model_path)
    model = BertClassifierModel(hidden_dim=hidden_dim, bert_model=model, n_classes=n_classes)
    model.eval()
#     for p in model.parameters(): p.requires_grad = False
    for p in model.bert.pooler.parameters(): p.requires_grad = True
    for p in model.linear.parameters(): p.requires_grad = True
    model = model.to(self.device)
    self.model = model

  def train_one_epoch(self, loss_fn, optimizer, task):
    lossi = []
    f1_i = []
    roc_auc_i = []

    stream = tqdm(self.train_loader)

    self.model.train()
    for input_ids, attention_mask, y_batch in stream:
        input_ids = torch.tensor(input_ids).to(self.device)
        attention_mask = torch.tensor(attention_mask).to(self.device)
        y_batch = torch.tensor(y_batch).to(self.device)
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask)
        if task == "binary": probs = torch.sigmoid(logits)
        else: probs = torch.nn.functional.softmax(logits, dim=1)
        loss = loss_fn(probs, y_batch)
        lossi.append(loss.item())
#         print(y_batch, all(y_batch), any(y_batch))
        f1_i.append(f1_score(y_batch.cpu(), probs.argmax(axis=1).cpu(), average="binary" if task == "binary" else "micro"))
        if all(y_batch) or (not any(y_batch)) or task != "binary":
            roc_auc_i.append(0)
        else:
            roc_auc_i.append(roc_auc_score(y_batch.cpu(), probs.argmax(axis=1).cpu()))

        stream.set_description(f"train: loss: {np.mean(lossi)}, f1: {np.mean(f1_i)}, roc_auc: {np.mean(roc_auc_i)}")
#         stream.set_description(f"train: loss: {loss.item()}, f1: {f1_i[-1]}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

  def validate_one_epoch(self, loss_fn, task):
    lossi = []
    f1_i = []
    roc_auc_i = []

    stream = tqdm(self.val_loader)

    self.model.eval()

    with torch.no_grad():
      for input_ids, attention_mask, y_batch in stream:
        input_ids = torch.tensor(input_ids).to(self.device)
        attention_mask = torch.tensor(attention_mask).to(self.device)
        y_batch = torch.tensor(y_batch).to(self.device)
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask)
        if task == "binary": probs = torch.sigmoid(logits)
        else: probs = torch.nn.functional.softmax(logits, dim=1)
#         prit
        loss = loss_fn(probs, y_batch)
        lossi.append(loss.item())

        f1_i.append(f1_score(y_batch.cpu(), probs.argmax(axis=1).cpu(), average="binary" if task == "binary" else "micro"))
#         print(y_batch, all(y_batch), any(y_batch))
        if all(y_batch) or (not any(y_batch)) or task != "binary":
            roc_auc_i.append(0)
        else:
            roc_auc_i.append(roc_auc_score(y_batch.cpu(), probs.argmax(axis=1).cpu()))

        stream.set_description(f"validation: loss: {np.mean(lossi)}, f1: {np.mean(f1_i)}, roc_auc: {np.mean(roc_auc_i)}")
#         stream.set_description(f"validation: loss: {loss.item()}, f1: {f1_i[-1]}")

  def train(self, num_epochs, task="binary"):
    
      loss_fn = nn.BCELoss() if task == "binary" else nn.CrossEntropyLoss()
      lr = 2e-4
      optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
      # scheduler = torch.optim.lr_scheduler(optimizer)

      for epoch in range(num_epochs):
        print(f"epoch {epoch} started")
        self.train_one_epoch(loss_fn, optimizer, task)
        self.validate_one_epoch(loss_fn, task)

In [65]:
worker = BertClassifier(seed=42, device="cuda", model_path="DeepPavlov/rubert-base-cased")

In [66]:
worker.init_data(data, BertContextTokenizer, custom_collate, 16, "cat")

[0.9, 0.1]


In [67]:
torch.cuda.empty_cache()

In [68]:
worker.init_model(n_classes=7)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
worker.train(5, task="multiclass")

epoch 0 started


train: loss: 1.5535838311178642, f1: 0.6228070175438597, roc_auc: 0.0: 100%|██████████| 57/57 [00:16<00:00,  3.45it/s]
validation: loss: 1.5791318927492415, f1: 0.5863095238095238, roc_auc: 0.0: 100%|██████████| 7/7 [00:01<00:00,  5.98it/s]


epoch 1 started


train: loss: 1.531674247038992, f1: 0.6337719298245614, roc_auc: 0.0: 100%|██████████| 57/57 [00:16<00:00,  3.46it/s] 
validation: loss: 1.5791280780519759, f1: 0.5863095238095238, roc_auc: 0.0: 100%|██████████| 7/7 [00:01<00:00,  5.90it/s]


epoch 2 started


train: loss: 1.5332306623458862, f1: 0.6322115384615384, roc_auc: 0.0:  91%|█████████ | 52/57 [00:15<00:01,  3.44it/s]


KeyboardInterrupt: 

In [73]:
# loss = nn.CrossEntropyLoss()
for input_ids, attention_mask, y_batch in worker.train_loader:
    for in_id in input_ids:
        print(tokenizer.decode(in_id))
#     input_ids = torch.tensor(input_ids).to(worker.device)
# #     print(input_ids)
#     attention_mask = torch.tensor(attention_mask).to(worker.device)
# #     print(attention_mask)
#     y_batch = torch.tensor(y_batch).to(worker.device)
#     print(y_batch)
#     print(worker.model(input_ids=input_ids, attention_mask=attention_mask).shape)
#     probs = torch.nn.functional.softmax(worker.model(input_ids=input_ids, attention_mask=attention_mask), dim=1)
#     print(y_batch)
# #     break

[CLS] Быстрая доставка карт Кредитует с 18 лет Очень грамотная поддержка специалистов Мой самый любимый < ASPECT > банк < ASPECT >, оперативно обрабатывают заявки, высокий процент одобрения. Теперь открыла у них расчетный счет как предприниматель, самое лучшее, что карту вам привезут куда угодно, в удобное для вас время. Приложения Тинькофф и Тинькофф бизнес с очень удобным интерфейсом, интуитивно понимаешь что и где находится. Однозначно рекомендую! [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] Давно являю

KeyboardInterrupt: 

In [None]:
data["info"].unique()

In [42]:
for i in range(len(df)): df[i]

KeyError: 942

In [None]:
y_