In [1]:
import re
import pandas as pd
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import numpy as np
import random
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
# Set PyTorch's float32 matmul precision mode to "high" for the whole session.
torch.set_float32_matmul_precision("high")

2023-07-01 20:20:06.249688: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-01 20:20:06.294877: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Sanity check: the Trainer further down is configured with accelerator="gpu".
torch.cuda.is_available()

True

In [3]:
from transformers.optimization import AdafactorSchedule, Adafactor, AdamW, get_adafactor_schedule, get_linear_schedule_with_warmup

from transformers import(
    T5Model,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

In [4]:
from torch.utils.data import Dataset,DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from utils import f1score, get_entities, set_seed
from addict import Dict
from dataset import ReceiptsDataModule, ReceiptsDataset

In [5]:
# Seed the run via the project helper imported from utils.
set_seed(42)

In [46]:
def extract_set(values):
    """Collect the distinct words occurring in an iterable of strings.

    Every item is split on commas, hyphens and single spaces. Each piece is
    whitespace-stripped, and pieces that end up empty (from empty items or
    from adjacent delimiters such as ", ") are discarded so the result only
    contains real words.

    Args:
        values: iterable of ``str`` items.

    Returns:
        set of non-empty word strings.
    """
    words = set()
    for item in values:
        for piece in re.split(',|-| ', item):
            word = piece.strip()
            if word:  # skip the '' produced by empty items / doubled delimiters
                words.add(word)
    return words

In [48]:
# Load the training receipts and split every target string into the two
# entity columns returned by get_entities.
df = pd.read_csv('data/train_ner.csv')
df[['good', 'brand']] = np.column_stack(
    df['target_text'].apply(get_entities)
).T
df.head(3)

Unnamed: 0,input_text,target_text,good,brand
0,petmax бантик леопард с красн розой 2шт,good: бантик; brand: petmax <\s>,бантик,petmax
1,87191 бусы для елки шарики_87191,good: бусы; brand: <\s>,бусы,
2,футболка piazza italia wr011446881,good: футболка; brand: piazza italia <\s>,футболка,piazza italia


In [49]:
# Word vocabularies of the two entity columns of the training frame.
goods, brands = (
    extract_set(df[column].values) for column in ('good', 'brand')
)

In [55]:
# Words seen exclusively on one side: used later to blank out predictions
# that landed in the wrong entity slot.
only_goods = goods.difference(brands)
only_brands = brands.difference(goods)

In [50]:
# Checkpoint identifier passed to from_pretrained() for both the model and the
# tokenizer; the "-base" variant is kept as a commented-out alternative.
MODEL_NAME = "cointegrated/rut5-small"
# MODEL_NAME = "cointegrated/rut5-base"

In [7]:
# Short model name (text after the last '/'), used for log and checkpoint naming.
name = MODEL_NAME.rsplit('/', 1)[-1]

In [8]:
# Data-loading configuration; all of these are forwarded to the data module
# through data_module_args below.
root = 'data'
TRAIN_DATASET_PATH = f"{root}/train_ner.csv"
TEST_DATASET_PATH = f"{root}/test_ner.csv"
# presumably the fraction of the training data held out for validation —
# TODO confirm against ReceiptsDataModule
VAL_SPLIT_SIZE = 0.1
BATCH_SIZE = 256
NUM_WORKERS = 6

In [9]:
# Attribute-style settings consumed by the data module.
data_module_args = Dict(
    tokenizer_name=MODEL_NAME,
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [20]:
class T5FineTuner(pl.LightningModule):
    """Lightning wrapper that fine-tunes a T5 conditional-generation model.

    The module builds its own ``ReceiptsDataModule`` from ``hparam`` and serves
    the train/val/predict dataloaders from it, so ``Trainer.fit(model)`` needs
    no explicit dataloaders.

    Args:
        hparam: attribute-style configuration object. This class reads
            ``model_name``, ``learning_rate``, ``weight_decay``,
            ``warmup_steps``, ``num_train_epochs`` and
            ``gradient_accumulation_steps``; the whole object is also handed
            to ``ReceiptsDataModule``.

    Note:
        ``save_hyperparameters()`` is not called, so ``hparam`` has to be
        supplied again when restoring with ``load_from_checkpoint``.
    """

    def __init__(self, hparam):
        super().__init__()
        self.learning_rate = hparam.learning_rate
        self.hparam = hparam

        # The data module is created and set up eagerly so that the dataloader
        # hooks at the bottom of the class can simply delegate to it.
        self.dm = ReceiptsDataModule(hparam)
        self.dm.prepare_data()
        self.dm.setup('train')

        self.model = T5ForConditionalGeneration.from_pretrained(hparam.model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(hparam.model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``labels`` is forwarded unchanged to the underlying model call.
        """
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        return output.loss, output.logits

    def shared_step(self, batch, stage='train'):
        """Compute the loss for one batch and log it as ``loss/<stage>``.

        Args:
            batch: mapping with ``input_ids``, ``attention_mask`` and ``labels``.
            stage: suffix used in the logged metric name.

        Returns:
            The loss produced by ``forward``.
        """
        loss, _ = self(
            batch["input_ids"],
            batch['attention_mask'],
            batch["labels"],
        )
        self.log(f"loss/{stage}", loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, _):
        """Training loss for one batch (logged as ``loss/train``)."""
        return self.shared_step(batch, 'train')

    def validation_step(self, batch, _):
        """Validation loss for one batch (logged as ``loss/val``)."""
        return self.shared_step(batch, 'val')

    def predict_step(self, batch, _=None):
        """Generate with beam search and decode each sequence to text.

        Returns:
            list[str]: one decoded prediction per row of ``batch``.
        """
        # Use this module's own network. The previous implementation reached
        # for the notebook-global ``trainer`` (``trainer.model.model``), which
        # fails on a fresh kernel and silently uses whatever model that
        # trainer happens to hold.
        generated_ids = self.model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            num_beams=15,
            max_length=200,
            repetition_penalty=1.0,
            early_stopping=True,
            use_cache=True
        )
        return [
            self.tokenizer.decode(
                sequence_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for sequence_ids in generated_ids
        ]

    def configure_optimizers(self):
        """Build the Adafactor optimizer with two weight-decay groups.

        The linear warmup/decay schedule is not returned from here; it is
        created in ``train_dataloader`` and stepped in ``optimizer_step``.
        The optimizer is also stored on ``self.opt`` for that purpose.
        """
        model = self.model
        # NOTE(review): T5 layer-norm parameters appear to be named
        # "layer_norm.weight" rather than "LayerNorm.weight", so the no-decay
        # group may match nothing for this model — confirm before relying on it.
        no_decay = ["bias", "LayerNorm.weight"]
        decayed, not_decayed = [], []
        for param_name, param in model.named_parameters():
            if any(marker in param_name for marker in no_decay):
                not_decayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {"params": decayed, "weight_decay": self.hparam.weight_decay},
            {"params": not_decayed, "weight_decay": 0.0},
        ]
        # Fixed external learning rate: relative_step/scale_parameter are off,
        # the schedule is applied by the LR scheduler instead.
        optimizer = Adafactor(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            scale_parameter=False,
            relative_step=False,
        )
        self.opt = optimizer
        return [optimizer]

    def optimizer_step(self,
                       epoch=None,
                       batch_idx=None,
                       optimizer=None,
                       optimizer_idx=None,
                       optimizer_closure=None,
                       on_tpu=None,
                       using_native_amp=None,
                       using_lbfgs=None
                       ):
        """Step the optimizer, then advance and log the learning-rate schedule.

        Requires ``self.lr_scheduler``, which is created by
        ``train_dataloader``.
        """
        optimizer.step(closure=optimizer_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()
        self.log('lr', self.lr_scheduler.get_last_lr()[-1])

    def train_dataloader(self):
        """Return the training dataloader and (re)build the LR scheduler.

        The scheduler is attached to ``self.opt``, so ``configure_optimizers``
        must have run before this hook is called.
        """
        dataloader = self.dm.train_dataloader()

        # One optimizer step is taken per accumulation window, and the last
        # (possibly partial) window of an epoch still steps, so round up.
        # Flooring here made the schedule shorter than the real run and drove
        # the learning rate to zero before training had finished.
        accumulation = max(1, int(self.hparam.gradient_accumulation_steps))
        steps_per_epoch = -(-len(dataloader) // accumulation)  # ceil division
        t_total = max(1, steps_per_epoch * int(self.hparam.num_train_epochs))

        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.opt,
            num_warmup_steps=self.hparam.warmup_steps,
            num_training_steps=t_total,
        )
        return dataloader

    def val_dataloader(self):
        """Validation dataloader from the owned data module."""
        return self.dm.val_dataloader()

    def predict_dataloader(self):
        """Prediction dataloader from the owned data module."""
        return self.dm.predict_dataloader()

In [21]:
# Attribute-style hyperparameters for the fine-tuning module and the Trainer.
model_args = Dict(
    model_name=MODEL_NAME,
    learning_rate=1e-3,
    weight_decay=1e-3,
    adam_epsilon=1e-8,
    warmup_steps=0,
    num_train_epochs=80,
    gradient_accumulation_steps=32,
    early_stop_callback=False,
    seed=42,
    output_dir='t5models',
)

In [22]:
# Merge model and data settings into the single config object that
# T5FineTuner receives (and forwards to ReceiptsDataModule).
args = model_args | data_module_args

In [23]:
# Build the Lightning module; its constructor also creates and sets up the data module.
model = T5FineTuner(args)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [24]:
# Display the short model name used for the TensorBoard run and checkpoint path.
name

'rut5-small'

In [25]:
# Single-GPU trainer with gradient accumulation; metrics are written to
# TensorBoard under tb_logs/<name>.
# NOTE(review): auto_scale_batch_size presumably has no effect unless
# trainer.tune() is called, which this notebook never does — confirm and
# consider removing.
trainer = pl.Trainer(
    accumulate_grad_batches=args.gradient_accumulation_steps,
#     auto_lr_find=True,
    auto_scale_batch_size=True,
    max_epochs=args.num_train_epochs,
    precision=32,
    devices=[0],
    log_every_n_steps=1,
    logger=pl.loggers.TensorBoardLogger("tb_logs", name=name),
    accelerator="gpu",
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [53]:
# Train; dataloaders come from the module's own train/val dataloader hooks.
trainer.fit(model)

  rank_zero_warn(
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [4]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 64.6 M
-----------------------------------------------------
64.6 M    Trainable params
0         Non-trainable params
64.6 M    Total params
258.578   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [26]:
# Rebind `model` to a module restored from a saved checkpoint. `hparam` is
# passed explicitly because T5FineTuner.__init__ requires it.
# NOTE(review): the path has no file extension — confirm it points at a
# checkpoint file.
model = T5FineTuner.load_from_checkpoint(f't5models/{name}-adafactor', hparam=args)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [27]:
# Run predict_step over the validation dataloader; the result is flattened
# into one list in a later cell.
val_predictions = trainer.predict(model, dataloaders=model.dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]


Predicting: 0it [00:00, ?it/s]

In [28]:
# Work on copies so the data module's own frames are not modified below.
val_df = model.dm.val_df.copy()
test_df = model.dm.test_df.copy()

In [29]:
# Flatten the per-batch prediction lists into one list in linear time
# (sum(..., start=[]) re-copies the accumulated list for every batch).
# Assumes the dataloader kept val_df's row order — TODO confirm no shuffling.
val_df['predict'] = [pred for batch in val_predictions for pred in batch]

In [30]:
# Parse the ground-truth and the predicted strings into good/brand columns.
val_df[['good_gt', 'brand_gt']] = np.column_stack(
    val_df['target_text'].apply(get_entities)
).T
val_df[['good_pred', 'brand_pred']] = np.column_stack(
    val_df['predict'].apply(get_entities)
).T

In [31]:
# F1 of the raw "good" predictions against the ground truth.
f1_good = f1score(val_df['good_pred'], val_df['good_gt'])
f1_good

0.8993212669683258

In [59]:
# Same metric after blanking "good" predictions that are known only as brand
# words. This overwrites f1_good, so the combined score uses this value.
f1_good = f1score(
    val_df['good_pred'].map(lambda pred: "" if pred in only_brands else pred),
    val_df['good_gt'],
)
f1_good

0.9054669703872438

In [60]:
# F1 of the raw "brand" predictions against the ground truth.
f1_brand = f1score(val_df['brand_pred'], val_df['brand_gt'])
f1_brand

0.737094837935174

In [61]:
# Same metric after blanking "brand" predictions that are known only as good
# words. This overwrites f1_brand, so the combined score uses this value.
f1_brand = f1score(
    val_df['brand_pred'].map(lambda pred: "" if pred in only_goods else pred),
    val_df['brand_gt'],
)
f1_brand

0.7400562474889514

In [62]:
# best: 0.7920
# 't5models/rut5-small-adafactor' with beams 10


In [63]:
# Combined validation score: brand F1 is weighted twice as much as good F1.
(f1_good + 2 * f1_brand) / 3

0.7951931551217154

In [37]:
# Re-load the training receipts and rebuild the good/brand entity columns
# (repeats the earlier load cell and rebinds `df`).
df = pd.read_csv('data/train_ner.csv')
df[['good', 'brand']] = np.column_stack(
    df['target_text'].apply(get_entities)
).T
df.head(3)

Unnamed: 0,input_text,target_text,good,brand
0,petmax бантик леопард с красн розой 2шт,good: бантик; brand: petmax <\s>,бантик,petmax
1,87191 бусы для елки шарики_87191,good: бусы; brand: <\s>,бусы,
2,футболка piazza italia wr011446881,good: футболка; brand: piazza italia <\s>,футболка,piazza italia


In [148]:
# from fuzzywuzzy import process, fuzz
# from gensim.models.fasttext import FastText
# from sklearn.cluster import AffinityPropagation, DBSCAN, AgglomerativeClustering

In [149]:
# fasttext = FastText.load('fasttext_models/fasttext_512.model')

In [204]:
# def replace_with_nearest(values, fasttext, replacement_set: pd.Series):
#     new_values = []
#     replacement_vecs = fasttext.wv[replacement_set]
#     for label in values:        
#         if label in replacement_set or label == '':
#             new_values.append(label)
#             continue
#         label_vec = fasttext.wv[label]
#         dists = cdist(label_vec.reshape(1, -1), replacement_vecs, metric='cosine')[0]
#         nearest_inds = np.argsort(dists)[:250]
#         nearest_labels = replacement_set[nearest_inds]
#         best_match, score = process.extractOne(label, nearest_labels,  scorer=fuzz.WRatio)
#         if score > 90:
#             new_values.append(best_match[0])
#         else:
#             new_values.append(label)
#     return new_values

In [193]:
# lat_to_cyr = str.maketrans("aekmhopctyx", "аекмнорстух")
# cyr_to_lat = str.maketrans("аекмнорстух", "aekmhopctyx")
# def change_lang(w):
#     if w == '':
#         return w
#     num_eng_chars = len(re.findall(r'[a-z]', w))
#     num_ru_chars = len(re.findall(r'[а-я]', w))
#     if num_eng_chars and num_ru_chars:
#         if num_eng_chars > num_ru_chars:
#             w = w.translate(cyr_to_lat)
#         else:
#             w = w.translate(lat_to_cyr)
#     return w

In [64]:
# Run predict_step over the data module's predict dataloader (the test set
# used for the submission below).
test_predictions = trainer.predict(model, dataloaders=model.dm.predict_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [3]


Predicting: 0it [00:00, ?it/s]

In [65]:
# Flatten the per-batch prediction lists into one list in linear time
# (sum(..., start=[]) re-copies the accumulated list for every batch).
# Assumes the dataloader kept test_df's row order — TODO confirm no shuffling.
test_df['predict'] = [pred for batch in test_predictions for pred in batch]

In [66]:
# Parse each predicted string into the good/brand submission columns.
test_df[['good', 'brand']] = np.column_stack(
    test_df['predict'].apply(get_entities)
).T

In [67]:
# Blank "good" predictions that are known only as brand words.
test_df['good'] = test_df['good'].map(
    lambda pred: "" if pred in only_brands else pred
)

In [68]:
# Blank "brand" predictions that are known only as good words.
test_df['brand'] = test_df['brand'].map(
    lambda pred: "" if pred in only_goods else pred
)

In [69]:
# Preview the columns that will be written to the submission file.
test_df[['id', 'good', 'brand']]

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,calorie
3,3,лимон,
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,rиттерспорт


In [70]:
# Same preview next to the raw receipt text, for eyeballing the predictions.
test_df[['id', 'input_text', 'good', 'brand']]

Unnamed: 0,id,input_text,good,brand
0,0,"469-210 ермак клей универсальный, 15мл, блистер",клей,ермак
1,1,торт сладушка зимняя вишня 700г,торт,сладушка
2,2,"смеситель ""calorie"" 1023 а06 д/кухни",смеситель,calorie
3,3,лимон 50гр бар,лимон,
4,4,"коньяк сараджишвили 5 лет 0,5л грузия",коньяк,сараджишвили
...,...,...,...,...
4995,4995,"774352 рамка 2п., сл. кость",рамка,
4996,4996,энерг. напиток red bull 0.25л,напиток,red bull
4997,4997,36/025 наконечники (т. никель) шт,наконечники,
4998,4998,шоколад риттерспорт мол.с цел.миндалем 100г,шоколад,rиттерспорт


In [71]:
# Write the submission file (id, good, brand) without the index column.
test_df[['id', 'good', 'brand']].to_csv(
    'submissions/submision_beam15_only_fans.csv',
    index=False,
)