In [1]:
import re
import pandas as pd
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import numpy as np
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
# Select the "high" float32 matmul precision mode globally for this kernel.
# NOTE(review): per the PyTorch docs this trades some precision for speed on supporting GPUs - confirm it is acceptable here.
torch.set_float32_matmul_precision("high")

2023-06-29 08:36:27.355610: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-29 08:36:27.401886: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Sanity check that a CUDA device is visible; the Trainer created later is configured with accelerator="gpu".
torch.cuda.is_available()

True

In [4]:
from transformers.optimization import AdafactorSchedule, Adafactor, AdamW, get_adafactor_schedule, get_linear_schedule_with_warmup

from transformers import(
    T5Model,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

In [5]:
from torch.utils.data import Dataset,DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from utils import f1score, get_entities, set_seed
from addict import Dict
from dataset import ReceiptsDataModule, ReceiptsDataset

In [6]:
# Fix the random seed through the project helper from utils.
# NOTE(review): presumably seeds python/numpy/torch - confirm in utils.set_seed.
set_seed(42)

In [67]:
# Pretrained checkpoint id; used below both as the tokenizer name and as the model name.
MODEL_NAME = "cointegrated/rut5-small"
# Alternative checkpoint, kept commented out for quick switching.
# MODEL_NAME = "cointegrated/rut5-base"

In [68]:
# Short run name: the part of MODEL_NAME after the last '/'.
name = MODEL_NAME.rsplit('/', 1)[-1]

In [69]:
# Dataset locations (relative to `root`) and loader settings fed into the data-module config.
root = 'data'
TRAIN_DATASET_PATH = root + "/train_ner.csv"
TEST_DATASET_PATH = root + "/test_ner.csv"

VAL_SPLIT_SIZE = 0.1  # presumably the fraction of training rows held out for validation - confirm in ReceiptsDataModule
BATCH_SIZE = 256
NUM_WORKERS = 6

In [70]:
# Attribute-style config consumed by ReceiptsDataModule.
data_module_args = Dict(
    tokenizer_name=MODEL_NAME,
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [71]:
# Stand-alone data module instance used only for the inspection cells that follow.
dm = ReceiptsDataModule(data_module_args)

In [72]:
# Call the data module's preparation and setup hooks by hand so its frames and loaders can be inspected.
# NOTE(review): 'train' is forwarded as the stage argument; how ReceiptsDataModule interprets it is not visible here.
dm.prepare_data()
dm.setup('train')

In [73]:
# Peek at the first three rows of the training frame.
dm.train_df.head(3)

Unnamed: 0,input_text,target_text
4484,блуза acoola rep12445011,good: блуза; brand: acoola <\s>
11558,vladi toys. магнитный театр. (россия),good: театр; brand: vladi toys <\s>
22322,машина на радиоуправлении 1:24 арт в1112555 pl,good: машина; brand: <\s>


In [74]:
# Pull a single batch from the training loader to inspect which fields it carries.
batch = next(iter(dm.train_dataloader()))
batch.keys()

dict_keys(['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'])

In [75]:
class T5FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a T5 conditional-generation model.

    The module builds its own ``ReceiptsDataModule`` from the same config object
    and serves its loaders through the ``*_dataloader`` hooks, so the trainer can
    be run without passing data explicitly.

    Args:
        hparam: attribute-style config. Fields read by this class:
            ``model_name``, ``learning_rate``, ``weight_decay``, ``batch_size``,
            ``gradient_accumulation_steps``, ``num_train_epochs`` and
            ``warmup_steps``. The whole object is also forwarded to
            ``ReceiptsDataModule``.
    """

    def __init__(self, hparam):
        super().__init__()
        self.learning_rate = hparam.learning_rate
        self.hparam = hparam

        # The data module is created and set up eagerly so the dataloader hooks
        # below can simply delegate to it.
        self.dm = ReceiptsDataModule(hparam)
        self.dm.prepare_data()
        self.dm.setup('train')

        self.model = T5ForConditionalGeneration.from_pretrained(hparam.model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(hparam.model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``labels`` is passed straight through to the model; the returned loss is
        whatever the model output exposes as ``.loss``.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def shared_step(self, batch, stage='train'):
        """Compute the loss for one batch and log it under ``loss/<stage>``."""
        loss, _ = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["labels"],
        )
        self.log(f"loss/{stage}", loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, _):
        """Training hook: loss of the batch, logged as ``loss/train``."""
        return self.shared_step(batch, 'train')

    def validation_step(self, batch, _):
        """Validation hook: loss of the batch, logged as ``loss/val``."""
        return self.shared_step(batch, 'val')

    def predict_step(self, batch, _=None):
        """Generate and decode predictions for one batch.

        Returns:
            A list of decoded strings, one per row of ``batch["input_ids"]``.
        """
        # Use this module's own model rather than reaching for a notebook-level
        # ``trainer`` global, so prediction works with any Trainer instance.
        generated_ids = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            num_beams=3,
            max_length=80,
            repetition_penalty=1.0,
            early_stopping=True,
            use_cache=True,
        )
        return self.tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    def configure_optimizers(self):
        """Build the Adafactor optimizer with a fixed learning rate.

        Parameters are split into a weight-decay group and a no-decay group.
        No learning-rate scheduler is returned, so the rate stays at
        ``self.learning_rate`` for the whole run.
        """
        # Name fragments of parameters that must not be decayed.
        # "layer_norm.weight" is included because the Hugging Face T5 modules
        # name their norm parameters "...layer_norm.weight"; the BERT-style
        # "LayerNorm.weight" alone would not match them.
        # NOTE(review): verify against model.named_parameters() for this checkpoint.
        no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]

        decay_params, no_decay_params = [], []
        for param_name, param in self.model.named_parameters():
            if any(fragment in param_name for fragment in no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_grouped_parameters = [
            {"params": decay_params, "weight_decay": self.hparam.weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]
        # An explicit lr requires Adafactor's relative-step mode to be off.
        optimizer = Adafactor(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            scale_parameter=False,
            relative_step=False,
        )
        self.opt = optimizer
        return [optimizer]

    def train_dataloader(self):
        """Return the training loader and refresh ``self.lr_scheduler``.

        NOTE(review): the scheduler built here is attached to a throw-away
        optimizer and is not returned from ``configure_optimizers``, so it does
        not influence training. It is kept only so the ``lr_scheduler``
        attribute remains available.
        """
        dataloader = self.dm.train_dataloader()

        # Optimizer steps over the whole run: full batches per epoch, divided by
        # the accumulation factor, times the number of epochs.
        t_total = (
            len(dataloader.dataset)
            // self.hparam.batch_size
            // self.hparam.gradient_accumulation_steps
            * float(self.hparam.num_train_epochs)
        )

        # A single dummy parameter is enough to host the schedule; torch's AdamW
        # is used so this does not depend on the deprecated transformers one.
        dummy_param = torch.nn.Parameter(torch.tensor([0.], dtype=torch.float64))
        dummy_optimizer = torch.optim.AdamW([dummy_param])
        self.lr_scheduler = get_linear_schedule_with_warmup(
            dummy_optimizer,
            num_warmup_steps=self.hparam.warmup_steps,
            num_training_steps=t_total,
        )
        return dataloader

    def val_dataloader(self):
        """Validation loader from the owned data module."""
        return self.dm.val_dataloader()

    def predict_dataloader(self):
        """Prediction loader from the owned data module."""
        return self.dm.predict_dataloader()

In [76]:
# Model and optimisation settings for T5FineTuner and the Trainer.
# Of these, `adam_epsilon` is referenced only by commented-out code, and
# `early_stop_callback` and `seed` are not read anywhere in this notebook.
model_args = Dict(
    model_name=MODEL_NAME,
    learning_rate=1e-3,
    weight_decay=1e-4,
    adam_epsilon=1e-8,
    warmup_steps=3,
    num_train_epochs=80,
    gradient_accumulation_steps=32,
    early_stop_callback=False,
    seed=42,
    output_dir='t5models',
)

In [77]:
# Scratch cell: displays the literal used for learning_rate in model_args.
1e-3

0.001

In [78]:
# Merge both configs into the single object T5FineTuner takes; it reads the model fields itself
# and forwards the same object to ReceiptsDataModule. The two configs defined above share no keys.
# NOTE(review): with `|`, right-hand values presumably win on a duplicate key - confirm for addict's Dict.
args = model_args | data_module_args

In [79]:
# Instantiate the Lightning module; its constructor also builds the data module and loads the pretrained weights and tokenizer.
model = T5FineTuner(args)

In [80]:
# Display the run name; it is used below for the TensorBoard logger and the manual checkpoint path.
name

'rut5-base'

In [81]:
# Keep only the single best checkpoint, judged by the validation loss logged as "loss/val".
# The output directory goes in `dirpath` and only the base name in `filename`: Lightning adds the
# ".ckpt" extension itself, so folding the directory and a ".pth" suffix into `filename` would
# create a nested folder and a "checkpoint.pth.ckpt" file instead of writing into output_dir.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir,
    filename="checkpoint",
    monitor="loss/val",
    mode="min",
    save_top_k=1,
)

In [82]:
# Single-GPU, full-precision trainer with gradient accumulation and TensorBoard logging.
# (The automatic learning-rate finder option, auto_lr_find, is intentionally left disabled.)
tb_logger = pl.loggers.TensorBoardLogger("tb_logs", name=name)
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    precision=32,
    max_epochs=args.num_train_epochs,
    accumulate_grad_batches=args.gradient_accumulation_steps,
    callbacks=[checkpoint_callback],
    logger=tb_logger,
    log_every_n_steps=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [44]:
# lr_finder = trainer.tuner.lr_find(model)

# # Plot with
# fig = lr_finder.plot(suggest=True)
# fig.show()

In [45]:
# Pick point based on plot, or get suggestion
# new_lr = lr_finder.suggestion()

In [46]:
# new_lr

In [47]:
# model.learning_rate = new_lr

In [None]:
# Train the model; no loaders are passed because the module supplies them through its own *_dataloader hooks.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 244 M 
-----------------------------------------------------
244 M     Trainable params
0         Non-trainable params
244 M     Total params
977.237   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
# Manually save the trainer's current state under the run name.
# NOTE(review): the 't5models' prefix is hard-coded and mirrors output_dir in model_args; the path has no file extension.
trainer.save_checkpoint(f't5models/{name}-adafactor')

In [None]:
# Run predict_step over the validation loader; each batch contributes one list of decoded strings.
val_predictions = trainer.predict(model, dataloaders=model.dm.val_dataloader())

In [None]:
# Take copies so the columns added in later cells do not modify the data module's own frames.
val_df = model.dm.val_df.copy()
test_df = model.dm.test_df.copy()

In [None]:
# Flatten the per-batch prediction lists into one list in batch order (linear time,
# unlike repeated list concatenation via sum()).
val_df['predict'] = [text for batch_preds in val_predictions for text in batch_preds]

In [None]:
# Parse the target strings and the predicted strings with get_entities and
# store the two resulting fields as separate columns.
gt_entities = val_df['target_text'].apply(get_entities)
val_df[['good_gt', 'brand_gt']] = np.column_stack(gt_entities).T

pred_entities = val_df['predict'].apply(get_entities)
val_df[['good_pred', 'brand_pred']] = np.column_stack(pred_entities).T

In [None]:
# Score the predicted 'good' field against the ground truth with the project's f1score helper.
f1_good = f1score(val_df['good_pred'], val_df['good_gt'])
f1_good

In [None]:
# Score the predicted 'brand' field against the ground truth with the project's f1score helper.
f1_brand = f1score(val_df['brand_pred'], val_df['brand_gt'])
f1_brand

In [None]:
# Combined score: weighted mean of the two F1 values, with the brand score counted twice.
(f1_good + 2 * f1_brand) / 3

In [57]:
# Inspect a random sample of validation rows whose predicted brand differs from the ground truth.
brand_mismatches = val_df.loc[
    val_df['brand_pred'] != val_df['brand_gt'],
    ['input_text', 'target_text', 'predict'],
]
# Cap the sample size so the cell does not raise when fewer than 100 rows mismatch.
brand_mismatches.sample(min(100, len(brand_mismatches)))

Unnamed: 0,input_text,target_text,predict
23315,872530-xl196-пижама из джерси eco-conception,good: пижама; brand: eco-conception <\s>,good: пижама; brand: eco conception <\s>
11178,"чизкейк ""new-york"" 200гр/2куска",good: чизкейк; brand: <\s>,good: чизкейк; brand: new-york <\s>
199,капитан сильвер батончик в шоколадной глазури ...,good: батончик; brand: капитан сильвер <\s>,good: батончик; brand: капитан <\s>
9161,9 2456567063299 джинсы муж/ltb/5055,good: джинсы; brand: ltb <\s>,good: джинсы; brand: <\s>
10321,суп дачный грибной с вермишелью,good: суп; brand: дачный <\s>,good: суп; brand: <\s>
...,...,...,...
18675,реамберин р-р д/инф 1.5% 250мл конт.пл. n32,good: раствор; brand: <\s>,good: раствор; brand: реамберин <\s>
4246,"kpl00000146 хит аида 307 канекалон, дл.1,3м ...",good: канекалон; brand: <\s>,good: хит; brand: аида <\s>
2842,2 2157348581475 топ жен/pull/9238,good: топ; brand: pull&bear <\s>,good: топ; brand: <\s>
3282,делитель digital 1-2,good: делитель; brand: digital <\s>,good: делитель; brand: <\s>


In [58]:
# Run predict_step over the data module's prediction loader; each batch contributes one list of decoded strings.
test_predictions = trainer.predict(model, dataloaders=model.dm.predict_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]


Predicting: 39it [00:00, ?it/s]

In [59]:
# Flatten the per-batch prediction lists into one list in batch order (linear time,
# unlike repeated list concatenation via sum()).
test_df['predict'] = [text for batch_preds in test_predictions for text in batch_preds]

In [60]:
# Parse each predicted string with get_entities and store the two fields as 'good' and 'brand'.
test_entities = test_df['predict'].apply(get_entities)
test_df[['good', 'brand']] = np.column_stack(test_entities).T

In [61]:
# Display which checkpoint id is currently configured.
MODEL_NAME

'cointegrated/rut5-small'

In [62]:
# Preview the columns that are written to the submission file.
test_df[['id', 'good', 'brand']]

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,calorie
3,3,лимон,
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,rиттерспорт


In [63]:
# Same preview with the raw input text alongside, to eyeball the extracted fields against their source.
test_df[['id', 'input_text', 'good', 'brand']]

Unnamed: 0,id,input_text,good,brand
0,0,"469-210 ермак клей универсальный, 15мл, блистер",клей,ермак
1,1,торт сладушка зимняя вишня 700г,торт,сладушка
2,2,"смеситель ""calorie"" 1023 а06 д/кухни",смеситель,calorie
3,3,лимон 50гр бар,лимон,
4,4,"коньяк сараджишвили 5 лет 0,5л грузия",коньяк,сараджишвили
...,...,...,...,...
4995,4995,"774352 рамка 2п., сл. кость",рамка,
4996,4996,энерг. напиток red bull 0.25л,напиток,red bull
4997,4997,36/025 наконечники (т. никель) шт,наконечники,
4998,4998,шоколад риттерспорт мол.с цел.миндалем 100г,шоколад,rиттерспорт


In [65]:
# Write the id and the two extracted fields to the submission CSV, without the index column.
submission = test_df[['id', 'good', 'brand']]
submission.to_csv('submissions/submision-adafactor.csv', index=False)