In [1]:
import re
from gensim.models.fasttext import FastText
import pandas as pd
import pytorch_lightning as pl
from seqeval.metrics.sequence_labeling import get_entities
from sklearn.model_selection import train_test_split
import numpy as np
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split

torch.set_float32_matmul_precision("high")

2023-06-28 12:03:42.141099: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-28 12:03:42.192391: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from tqdm import tqdm

In [2]:
torch.cuda.is_available()

True

In [54]:
from torch.utils.data import Dataset,DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from transformers import(
    AdamW,
    T5Model,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
from utils import F1Score, f1score
from addict import Dict

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [5]:
class ReceiptsDataset(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_length: int = 256,
        target_max_token_length: int = 32
    ):
        super().__init__()
        self.is_predict = "target_text" not in df.columns
        self.data = df[["input_text", "target_text"]] if not self.is_predict else df[["input_text"]]
        self.data = self.data.values
        self.tokenizer = tokenizer
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length
        
    def __getitem__(self, index):
        input_text = self.data[index][0]
        target_text = self.data[index][1] if not self.is_predict else ""
        
        source_encoding = self.tokenizer(
          input_text,
          max_length=self.source_max_token_length,
          padding="max_length",
          truncation="only_second",
          return_attention_mask=True,
          add_special_tokens=True,
          return_tensors="pt"
        )
        if target_text != '':
            target_encoding = self.tokenizer(
              target_text,
              max_length=self.target_max_token_length,
              padding="max_length",
              truncation=True,
              return_attention_mask=True,
              add_special_tokens=True,
              return_tensors="pt"
            )

            labels = target_encoding['input_ids']
            labels[labels == 0] = -100
        else:
            target_encoding = torch.zeros(1)
            labels = torch.zeros(1)

        return dict(
            input_text=input_text,
            target_text=target_text,
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
        )

    def __len__(self):
        return len(self.data)

In [6]:
class ReceiptsDataModule(pl.LightningDataModule):
    def __init__(self, hparam):
        super().__init__()
        self.hparam = hparam
        self.tokenizer = T5Tokenizer.from_pretrained(self.hparam.tokenizer_name)

    def prepare_data(self):
        self.train_df = pd.read_csv(self.hparam.train_dataset_path).fillna("")
        self.test_df = pd.read_csv(self.hparam.test_dataset_path)
    
    def setup(self, stage: str):
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.hparam.val_split_size, random_state=42)

        self.train_dataset = ReceiptsDataset(self.train_df, self.tokenizer)
        self.val_dataset = ReceiptsDataset(self.val_df, self.tokenizer)
        self.predict_dataset = ReceiptsDataset(self.test_df, self.tokenizer)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.hparam.batch_size,
                          num_workers=self.hparam.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.hparam.batch_size,
                          num_workers=self.hparam.num_workers)

    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                                           batch_size=self.hparam.batch_size,
                                           num_workers=self.hparam.num_workers)

In [71]:
# MODEL_NAME = "cointegrated/rut5-small"
MODEL_NAME = "cointegrated/rut5-base"
# MODEL_NAME = "alenusch/mt5small-ruparaphraser"
# MODEL_NAME = "cointegrated/rut5-base"
# MODEL_NAME = "sberbank-ai/ruT5-base "

In [72]:
root = 'data'
TRAIN_DATASET_PATH = f"{root}/train_ner.csv"
TEST_DATASET_PATH = f"{root}/test_ner.csv"
VAL_SPLIT_SIZE = 0.1
BATCH_SIZE = 256
NUM_WORKERS = 4

In [73]:
data_module_args = Dict(**dict(
    tokenizer_name=MODEL_NAME,
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,    
))

In [10]:
dm = ReceiptsDataModule(data_module_args)

In [11]:
dm.prepare_data()
dm.setup('train')

In [12]:
dm.train_df.head(3)

Unnamed: 0,input_text,target_text
4484,блуза acoola rep12445011,good: блуза; brand: acoola <\s>
11558,vladi toys. магнитный театр. (россия),good: театр; brand: vladi toys <\s>
22322,машина на радиоуправлении 1:24 арт в1112555 pl,good: машина; brand: <\s>


In [13]:
for batch in dm.train_dataloader():
    break
batch.keys()

dict_keys(['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'])

In [14]:
def get_entities(label):
    label = label.replace('<\s>', '')
    good_regex = r"good:\s(.*?)(?:;|\s<|$)"
    brand_regex = r"brand:\s(.*?)(?:;|\s<|$)"
    good, brand = '', ''
    good_match = re.search(good_regex, label)
    brand_match = re.search(brand_regex, label)
    if good_match:
        good = good_match.group(1).strip()
    if brand_match:
        brand = brand_match.group(1).strip()
    return good, brand

In [15]:
class T5FineTuner(pl.LightningModule):
    def __init__(self, hparam):
        super(T5FineTuner, self).__init__()
        self.hparam = hparam

        self.dm = ReceiptsDataModule(hparam)
        self.dm.prepare_data()
        self.dm.setup('train')
        
        
        self.model = T5ForConditionalGeneration.from_pretrained(
            hparam.model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(
            hparam.model_name
        )
        
        
#         self.save_hyperparameters()
    
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits
    
    
    def shared_step(self, batch, stage='train'):
        input_ids = batch["input_ids"]
        attention_mask = batch['attention_mask']
        labels = batch["labels"]
        loss, outputs = self(input_ids,attention_mask,labels)
        self.log(f"loss/{stage}", loss, prog_bar=True, logger=True)

        return loss
    
    def training_step(self, batch, _):
        loss = self.shared_step(batch, 'train')
        return loss

    def validation_step(self, batch, _):
        loss = self.shared_step(batch, 'val')
        return loss
    
    def predict_step(self, batch, _):
        
        generated_ids = trainer.model.model.generate(
          input_ids=batch["input_ids"],
          attention_mask=batch["attention_mask"],
          num_beams=3,
          max_length=80,
          repetition_penalty=1.0,
          early_stopping=True,
          use_cache=True
        )
        preds = [
           self.tokenizer.decode(generated_id,
                                 skip_special_tokens=True, 
                                 clean_up_tokenization_spaces=True)
           for generated_id in generated_ids
        ]
        
        return preds
    
    
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparam.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparam.learning_rate, eps=self.hparam.adam_epsilon)
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):

        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
        tqdm_dict = {"loss": "{:.3f}".format(
            self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

        return tqdm_dict
    
    def train_dataloader(self):        
        dataloader = self.dm.train_dataloader()
        
        t_total = (
            (len(dataloader.dataset) //
             (self.hparam.batch_size))
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )
        scheduler = get_linear_schedule_with_warmup(
            self.opt, num_warmup_steps=self.hparam.warmup_steps, num_training_steps=t_total
        )
        self.lr_scheduler = scheduler
        return dataloader

    def val_dataloader(self):
        return self.dm.val_dataloader()

    def predict_dataloader(self):
        return self.dm.predict_dataloader()

In [16]:
model_args = Dict(**dict(
    model_name=MODEL_NAME,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    num_train_epochs=30,
    gradient_accumulation_steps=16,
    early_stop_callback=False,
    seed=42,
    output_dir='t5models',
))

In [17]:
args = model_args | data_module_args

In [18]:
model = T5FineTuner(args)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [19]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filename=args.output_dir+"/checkpoint.pth", monitor="loss/val", mode="min", save_top_k=1
)

In [20]:
trainer = pl.Trainer(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    max_epochs=args.num_train_epochs,
    precision= 32,
    devices=[0],
    callbacks=[checkpoint_callback],
    log_every_n_steps=1,
    logger=pl.loggers.TensorBoardLogger("tb_logs", name="rut5-small"),
    accelerator="gpu"
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Обучение модели

In [21]:
# best: tb_logs/ner_crf_lstm_2_512_pre_renamed_20e/version_0

In [22]:
trainer.fit(model)

  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 64.6 M
-----------------------------------------------------
64.6 M    Trainable params
0         Non-trainable params
64.6 M    Total params
258.578   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


In [25]:
val_predictions = trainer.predict(model, dataloaders=model.dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Predicting: 88it [00:00, ?it/s]

In [39]:
val_df, test_df = model.dm.val_df.copy(), model.dm.test_df.copy()

In [42]:
val_df['predict'] = sum(val_predictions, start=[])

In [77]:
trainer.save_checkpoint('t5models/rut5-small')

In [43]:
val_df[['good_gt', 'brand_gt']] = np.column_stack(val_df['target_text'].apply(lambda x: get_entities(x))).T
val_df[['good_pred', 'brand_pred']] = np.column_stack(val_df['predict'].apply(lambda x: get_entities(x))).T

In [59]:
f1_good = f1score(val_df['good_pred'], val_df['good_gt'])
f1_good

0.8348773841961853

In [60]:
f1_brand = f1score(val_df['brand_pred'], val_df['brand_gt'])
f1_brand

0.6172175249807839

In [61]:
(f1_good + 2 * f1_brand) / 3

0.6897708113859177

In [50]:
val_df[val_df['brand_pred'] != val_df['brand_gt']][['input_text', 'target_text', 'predict']].sample(100)

Unnamed: 0,input_text,target_text,predict
10981,462709958184 болт din933 с шестигранной головк...,good: болт; brand: tech krep <\s>,good: болт; brand: tech-krep <\s>
960,чай 7 трав травяной чай 45 г р,good: чай; brand: 7 трав <\s>,good: чай; brand: травяной чай <\s>
21176,"подклад стрейч, черный-79",good: подклад; brand: <\s>,good: подклад; brand: стрейч <\s>
15537,ванна морская вода,good: вода; brand: <\s>,good: вода; brand: ванна <\s>
21511,"мармелад бебето жевательный 0,35",good: мармелад; brand: bebeto <\s>,good: мармелад; brand: бебето <\s>
...,...,...,...
24954,зефир глаз. маша и медведь банан-клубника 1шт,good: зефир; brand: маша и медведь <\s>,good: зефир; brand: маша <\s>
1972,аппетитка рест.обломов томаты/перец пепперони ...,good: аппетитка; brand: ресторация обломов <\s>,good: аппетитка; brand: <\s>
3070,1. лактобаланс №28 капс.,good: капсулы; brand: лактобаланс <\s>,good: лактобаланс; brand: <\s>
1393,"сувенир заяц (дерево), н30см, натур. (шт)",good: сувенир; brand: <\s>,good: сувенир; brand: заяц <\s>


In [51]:
test_predictions = trainer.predict(model, dataloaders=model.dm.predict_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Predicting: 88it [00:00, ?it/s]

In [62]:
test_df['predict'] = sum(test_predictions, start=[])

In [64]:
test_df[['good', 'brand']] = np.column_stack(test_df['predict'].apply(lambda x: get_entities(x))).T

In [65]:
MODEL_NAME

'cointegrated/rut5-small'

In [66]:
test_df[['id', 'good', 'brand']]

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,calorie
3,3,лимон,бар
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,риттерспорт


In [67]:
test_df[['id', 'good', 'input_text', 'good', 'brand']]

Unnamed: 0,id,good,input_text,good.1,brand
0,0,клей,"469-210 ермак клей универсальный, 15мл, блистер",клей,ермак
1,1,торт,торт сладушка зимняя вишня 700г,торт,сладушка
2,2,смеситель,"смеситель ""calorie"" 1023 а06 д/кухни",смеситель,calorie
3,3,лимон,лимон 50гр бар,лимон,бар
4,4,коньяк,"коньяк сараджишвили 5 лет 0,5л грузия",коньяк,сараджишвили
...,...,...,...,...,...
4995,4995,рамка,"774352 рамка 2п., сл. кость",рамка,
4996,4996,напиток,энерг. напиток red bull 0.25л,напиток,red bull
4997,4997,наконечники,36/025 наконечники (т. никель) шт,наконечники,
4998,4998,шоколад,шоколад риттерспорт мол.с цел.миндалем 100г,шоколад,риттерспорт


In [70]:
test_df[['id', 'good', 'brand']].to_csv(f'submissions/submision-rut5-small.csv', index=False)