# Baseline

В качестве бейзлайна используется модель NER, состоящая из двунаправленной LSTM и CRF-слоя поверх эмбеддингов FastText (для получения эмбеддингов нужно запустить ноутбук `train_fasttext.ipynb`).

Нормализация брендов и товаров не производится

Бейзлайн реализован на библиотеке PyTorch с использованием PyTorch-Lightning для упрощения кода

In [1]:
from gensim.models.fasttext import FastText
import pandas as pd
import pytorch_lightning as pl
from seqeval.metrics.sequence_labeling import get_entities
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

torch.set_float32_matmul_precision("high")

2023-06-16 16:55:34.231613: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-16 16:55:34.288460: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Russian stop-word set used by preprocess_text to drop tokens.
stop_words = set(stopwords.words('russian'))
# NOTE(review): WordNetLemmatizer is backed by the (English) WordNet lexicon, so it
# presumably leaves Russian tokens unchanged — confirm whether a Russian
# lemmatizer was intended here.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw receipt line into a list of cleaned, lower-cased tokens.

    Steps, in order: drop standalone digit runs, collapse whitespace, strip
    every character that is neither a word character nor whitespace,
    lower-case, split on whitespace, discard stop words and lemmatize the
    remaining tokens.

    Args:
        text: Raw text of one receipt position.

    Returns:
        List of token strings.
    """
    without_numbers = re.sub(r'\b\d+\b', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_numbers)
    without_punctuation = re.sub(r'[^\w\s]', '', single_spaced)

    tokens = []
    for word in without_punctuation.lower().split():
        if word in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(word))
    return tokens

In [4]:
# !pip install seqeval==1.2.2
# !pip install allennlp==2.10.1
# !pip install pytorch-crf

# Utils

Полезные функции для работы с BIO-тегами

In [5]:
def apply_bio_tagging(row):
    """
    Build BIO tags for a receipt from its tokens and its annotation.

    Only the first comma-separated good and the first comma-separated brand
    are used. Every position where the whitespace-split entity matches a
    token window is tagged ``B-<TYPE>`` followed by ``I-<TYPE>``. At each
    position GOOD is tried before BRAND, and later writes overwrite earlier
    ones, so overlapping matches are resolved by that order.

    Args:
        row: Mapping with ``"tokens"`` (list of str), ``"good"`` and
            ``"brand"`` (comma-separated strings).

    Returns:
        List of tag strings, one per token.
    """
    tokens = row["tokens"]
    # Order matters: at a given position GOOD is written first, BRAND second.
    labelled_entities = (
        ("GOOD", row["good"].split(',')[0].split()),
        ("BRAND", row["brand"].split(',')[0].split()),
    )
    tags = ['O'] * len(tokens)
    for start in range(len(tokens)):
        for label, words in labelled_entities:
            width = len(words)
            if width == 0 or tokens[start:start + width] != words:
                continue
            # A full-width match guarantees the slice stays inside `tags`.
            tags[start:start + width] = ["B-" + label] + ["I-" + label] * (width - 1)
    return tags

Прямое и обратное преобразование тегов в индексы

In [6]:
# Tag vocabulary: a tag's position in the list is its integer index.
# "PAD" is the extra label used to pad target sequences inside a batch.
index_to_tag = ["O", "B-GOOD", "I-GOOD", "B-BRAND", "I-BRAND", "PAD"]
# Reverse lookup from tag string to its index.
tag_to_index = dict(zip(index_to_tag, range(len(index_to_tag))))

# Datamodule

Подготовим данные для модели. Для этого определим наследника `torch.utils.data.Dataset` — `ReceiptsDataset`.

In [7]:
class ReceiptsDataset(Dataset):
    """Dataset of tokenized receipt lines with their FastText embeddings.

    The dataset works in two modes, chosen by the presence of a ``tags``
    column in ``df``:

    * labelled: rows come from the columns ``tokens, good, brand, tags``;
    * predict: rows come from the columns ``tokens, id``.

    Args:
        df: DataFrame holding the columns listed above.
        fasttext: Object whose ``wv`` attribute is indexed with a list of
            tokens to obtain their embeddings.
    """

    def __init__(self, df, fasttext):
        super().__init__()
        self.is_predict = "tags" not in df.columns
        self.data = df[["tokens", "good", "brand", "tags"]] if not self.is_predict else df[["tokens", "id"]]
        self.data = self.data.values
        self.fasttext = fasttext

    @staticmethod
    def _split_entities(raw):
        """Split a comma-separated annotation into its non-empty entities.

        ``"".split(',')`` yields ``[""]``; without the filter, a row with no
        annotated entity would carry a phantom empty entity that no
        prediction can ever match.
        """
        return [entity for entity in raw.split(',') if entity]

    def __getitem__(self, index):
        """Return ``(identifier, tokens, embeddings, goods, brands, target)``.

        In predict mode ``identifier`` is the row id, ``goods``/``brands`` are
        empty lists and ``target`` is all ``"O"``. In labelled mode
        ``identifier`` is ``0`` and the rest comes from the annotation.
        """
        row = self.data[index]
        tokens = row[0]
        # NOTE(review): an empty token list is passed to the embedding lookup
        # as is — confirm the FastText wrapper accepts it.
        embeddings = self.fasttext.wv[tokens]
        if self.is_predict:
            identifier = row[1]
            goods = []
            brands = []
            tags = ["O"] * len(tokens)
        else:
            identifier = 0
            goods = self._split_entities(row[1])
            brands = self._split_entities(row[2])
            tags = row[3]
        target = [tag_to_index[tag] for tag in tags]
        return identifier, tokens, embeddings, goods, brands, target

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

Для объединения примеров в батчи нужна специальная `collate_fn`, в которой происходит паддинг

In [8]:
def collate_fn(batch):
    """Combine dataset samples into one batch, padding variable-length parts.

    Embedding sequences are padded with zeros and target sequences with the
    index of the ``"PAD"`` tag; both are stacked batch-first. Ids, tokens,
    goods and brands are passed through as tuples.

    Args:
        batch: Iterable of ``(id, tokens, embeddings, goods, brands, target)``
            samples.

    Returns:
        Tuple ``(ids, tokens_sequence, embeddings, goods, brands, targets)``.
    """
    ids, tokens_sequence, embeddings_sequence, goods, brands, targets = zip(*batch)

    pad_index = tag_to_index["PAD"]
    embedding_tensors = [torch.FloatTensor(item) for item in embeddings_sequence]
    target_tensors = [torch.LongTensor(item) for item in targets]

    padded_embeddings = pad_sequence(embedding_tensors, batch_first=True)
    padded_targets = pad_sequence(target_tensors, batch_first=True, padding_value=pad_index)
    return ids, tokens_sequence, padded_embeddings, goods, brands, padded_targets

Используем LightningDataModule для задания пайплайна

1. prepare_data
    1. Токенизируем текст
    2. Выделяем BIO-теги в размеченной части
2. setup
    1. Разделяем размеченную выборку на обучающую и валидационную
    2. Создаем `ReceiptsDataset` под каждую выборку

In [9]:
class ReceiptsDataModule(pl.LightningDataModule):
    """Data pipeline for the receipt NER model.

    Loads the FastText model and the labelled/unlabelled CSV files, tokenizes
    the ``name`` column, derives BIO tags for the labelled part, splits it
    into train/validation and exposes the three dataloaders.

    Loading and splitting each happen at most once per instance. Lightning
    invokes ``setup`` for every stage (fit, predict, ...), so splitting on
    each call would reshuffle the validation set between stages.

    Args:
        train_dataset_path: CSV with the labelled data.
        test_dataset_path: CSV with the data to predict on.
        fasttext_path: Path of the saved gensim FastText model.
        val_split_size: ``test_size`` passed to ``train_test_split``.
        batch_size: Batch size of every dataloader.
        num_workers: Worker count of every dataloader.
    """

    def __init__(self,
                 train_dataset_path,
                 test_dataset_path,
                 fasttext_path,
                 val_split_size,
                 batch_size,
                 num_workers):
        super().__init__()
        self.train_dataset_path = train_dataset_path
        self.test_dataset_path = test_dataset_path
        self.fasttext_path = fasttext_path
        self.val_split_size = val_split_size
        self.batch_size = batch_size
        self.num_workers = num_workers
        # One-shot guards: keep repeated hook calls from reloading/re-splitting.
        self._data_loaded = False
        self._split_done = False

    def _load_data(self):
        """Load embeddings and both CSVs, tokenize, and tag the train part (once)."""
        if self._data_loaded:
            return
        self.fasttext = FastText.load(self.fasttext_path)
        self.train_df = pd.read_csv(self.train_dataset_path).fillna("")
        self.test_df = pd.read_csv(self.test_dataset_path)
        self.train_df["tokens"] = self.train_df["name"].apply(preprocess_text)
        self.test_df["tokens"] = self.test_df["name"].apply(preprocess_text)

        self.train_df["tags"] = self.train_df.apply(apply_bio_tagging, axis=1)
        self._data_loaded = True

    def prepare_data(self):
        """Load and preprocess the data (kept for callers that invoke it directly)."""
        self._load_data()

    def setup(self, stage: str):
        """Split the labelled data and build the datasets; repeated calls are no-ops.

        Args:
            stage: Stage name supplied by Lightning; the same datasets serve
                every stage, so it is not inspected.
        """
        # prepare_data is not guaranteed to run in every process, so make sure
        # the state it creates exists here as well.
        self._load_data()
        if self._split_done:
            return
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.val_split_size)

        self.train_dataset = ReceiptsDataset(self.train_df, self.fasttext)
        self.val_dataset = ReceiptsDataset(self.val_df, self.fasttext)
        self.predict_dataset = ReceiptsDataset(self.test_df, self.fasttext)
        self._split_done = True

    def _make_loader(self, dataset):
        """Build a DataLoader with the shared batch/worker/collate settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn)

    def train_dataloader(self):
        """Dataloader over the training split."""
        return self._make_loader(self.train_dataset)

    def val_dataloader(self):
        """Dataloader over the validation split."""
        return self._make_loader(self.val_dataset)

    def predict_dataloader(self):
        """Dataloader over the unlabelled test data."""
        return self._make_loader(self.predict_dataset)

In [10]:
# Labelled CSV (train/validation) and the CSV to run prediction on.
TRAIN_DATASET_PATH = "data/train_supervised_dataset.csv"
TEST_DATASET_PATH = "data/test_dataset.csv"
# Saved gensim FastText model that supplies the token embeddings.
FASTTEXT_PATH = "fasttext_models/fasttext_window.model"
# Share of the labelled data held out for validation.
VAL_SPLIT_SIZE = 0.1
# DataLoader settings shared by the train, validation and predict loaders.
BATCH_SIZE = 512
NUM_WORKERS = 4

In [11]:
# Data module wired to the paths and loader settings defined above.
dm = ReceiptsDataModule(
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    fasttext_path=FASTTEXT_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# Model

Сначала определим метрику `F1` для задачи NER

In [12]:
class F1Score:
    """Micro-averaged F1 accumulated over sets of predicted/expected items.

    Each ``update`` call compares one collection of predictions with one
    collection of targets as sets (duplicates collapse) and adds the resulting
    true-positive, false-positive and false-negative counts to the totals.
    """

    def __init__(self):
        self.reset()

    def update(self, pred, target):
        """Add the counts for one (prediction, target) pair of collections."""
        predicted = frozenset(pred)
        expected = frozenset(target)
        self.tp += len(predicted.intersection(expected))
        self.fp += len(predicted.difference(expected))
        self.fn += len(expected.difference(predicted))

    def reset(self):
        """Zero all accumulated counts."""
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def get(self):
        """Return the F1 of the accumulated counts; ``0.0`` without true positives."""
        if not self.tp:
            return 0.0
        precision = self.tp / (self.tp + self.fp)
        recall = self.tp / (self.tp + self.fn)
        # Harmonic mean of precision and recall.
        return 2 / (1 / precision + 1 / recall)

Зададим саму модель, ее шаги на обучении, валидации и инференсе, а также способ обучения

In [19]:
class ReceiptsModule(pl.LightningModule):
    """LSTM + MLP emission network with a CRF layer for receipt NER.

    Token embeddings pass through an LSTM and an MLP that yields per-token
    scores over ``index_to_tag``. The CRF layer turns these scores into a
    negative log-likelihood loss and into decoded tag paths, from which
    GOOD/BRAND entity strings are extracted for the F1 metrics and for
    prediction.

    Args:
        rnn_input_size: Size of one input token embedding.
        rnn_hidden_size: Hidden size of each LSTM direction.
        rnn_num_layers: Number of stacked LSTM layers.
        rnn_dropout: Dropout applied between LSTM layers.
        mlp_hidden_size: Width of the hidden MLP layer.
        learning_rate: Learning rate handed to Adam.
        rnn_bidir: Whether the LSTM is bidirectional.
        num_tags: Number of CRF tags. A non-positive value (including the
            default ``-1``) selects ``len(index_to_tag)``.
    """

    def __init__(self,
                 rnn_input_size,
                 rnn_hidden_size,
                 rnn_num_layers,
                 rnn_dropout,
                 mlp_hidden_size,
                 learning_rate,
                 rnn_bidir=True,
                 num_tags=-1,
                ):
        super().__init__()
        # The CRF layer rejects non-positive tag counts, so the sentinel
        # default is resolved to the size of the tag vocabulary.
        if num_tags <= 0:
            num_tags = len(index_to_tag)
        self.learning_rate = learning_rate
        self.lstm = nn.LSTM(input_size=rnn_input_size,
                           hidden_size=rnn_hidden_size,
                           num_layers=rnn_num_layers,
                           batch_first=True,
                           dropout=rnn_dropout,
                           bidirectional=rnn_bidir,
                           )
        self.mlp = nn.Sequential(
            # A bidirectional LSTM concatenates both directions.
            nn.Linear(rnn_hidden_size + rnn_hidden_size * int(rnn_bidir), mlp_hidden_size),
            nn.GELU(),
            nn.Dropout(p=0.2),
            nn.Linear(mlp_hidden_size, len(index_to_tag)),
        )
        self.crf = CRF(num_tags, batch_first=True)
        # Not used by any step below; kept so code referencing the attribute keeps working.
        self.criterion = nn.CrossEntropyLoss(ignore_index=tag_to_index["PAD"], reduction="mean")
        self.f1_good_train = F1Score()
        self.f1_brand_train = F1Score()
        self.f1_good_val = F1Score()
        self.f1_brand_val = F1Score()

    def forward(self, sequences):
        """Map a batch of embedding sequences to per-token tag scores."""
        sequences, _ = self.lstm(sequences)
        logits = self.mlp(sequences)
        return logits

    def _decode_entities(self, logits, tokens_sequence):
        """Decode tag paths and return ``(goods, brands)`` string lists per sequence."""
        # Decoding only feeds metrics/predictions, so no autograd graph is needed.
        with torch.no_grad():
            tags_indices_sequence = self.crf.decode(logits)
        decoded = []
        for tokens, tags_indices in zip(tokens_sequence, tags_indices_sequence):
            # Only the first len(tokens) positions belong to real tokens; the rest is padding.
            tags = [index_to_tag[index] for index in tags_indices[:len(tokens)]]
            goods_pred, brands_pred = [], []
            for entity_type, start, finish in get_entities(tags):
                text = ' '.join(tokens[start:finish + 1])
                if entity_type == "GOOD":
                    goods_pred.append(text)
                elif entity_type == "BRAND":
                    brands_pred.append(text)
            decoded.append((goods_pred, brands_pred))
        return decoded

    def _loss_and_metrics(self, batch, f1_good, f1_brand):
        """Compute the CRF loss for a batch and update the given F1 trackers.

        Returns:
            Tuple ``(loss, batch_size)``.
        """
        _, tokens_sequence, embeddings_sequence, goods, brands, targets = batch
        logits = self(embeddings_sequence)
        # NOTE(review): no mask is passed to the CRF, so padded positions (PAD
        # targets) take part in the loss and in decoding — confirm this is intended.
        loss = -self.crf(logits, targets)
        decoded = self._decode_entities(logits, tokens_sequence)
        for (goods_pred, brands_pred), goods_true, brands_true in zip(decoded, goods, brands):
            f1_good.update(goods_pred, goods_true)
            f1_brand.update(brands_pred, brands_true)
        return loss, len(decoded)

    def training_step(self, batch, _):
        """Return the CRF loss for one training batch and track train F1."""
        loss, batch_size = self._loss_and_metrics(batch, self.f1_good_train, self.f1_brand_train)
        self.log("loss/train", loss, on_epoch=True, batch_size=batch_size)
        return loss

    def on_train_epoch_end(self):
        """Log and reset the train F1 trackers."""
        self.log("metric/f1_good_train", self.f1_good_train.get())
        self.log("metric/f1_brand_train", self.f1_brand_train.get())
        self.f1_good_train.reset()
        self.f1_brand_train.reset()

    def validation_step(self, batch, _):
        """Log the CRF loss for one validation batch and track validation F1."""
        loss, batch_size = self._loss_and_metrics(batch, self.f1_good_val, self.f1_brand_val)
        self.log("loss/val", loss, batch_size=batch_size)

    def on_validation_epoch_end(self):
        """Log and reset the validation F1 trackers."""
        self.log("metric/f1_good_val", self.f1_good_val.get())
        self.log("metric/f1_brand_val", self.f1_brand_val.get())
        self.f1_good_val.reset()
        self.f1_brand_val.reset()

    def predict_step(self, batch, _):
        """Return ``[id, goods, brands]`` rows with entities joined by commas."""
        ids, tokens_sequence, embeddings_sequence, _, _, _ = batch
        logits = self(embeddings_sequence)
        decoded = self._decode_entities(logits, tokens_sequence)
        return [
            [identifier, ','.join(goods_pred), ','.join(brands_pred)]
            for identifier, (goods_pred, brands_pred) in zip(ids, decoded)
        ]

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), self.learning_rate)

In [20]:
# Model hyper-parameters.
# NOTE(review): RNN_INPUT_SIZE presumably has to equal the FastText vector size — confirm.
RNN_INPUT_SIZE = 300
RNN_HIDDEN_SIZE = 300
RNN_NUM_LAYERS = 3
RNN_DROPOUT = 0.3
MLP_HIDDEN_SIZE = 200
LEARNING_RATE = 1e-4
model = ReceiptsModule(
    rnn_input_size=RNN_INPUT_SIZE,
    rnn_hidden_size=RNN_HIDDEN_SIZE,
    rnn_num_layers=RNN_NUM_LAYERS,
    rnn_dropout=RNN_DROPOUT,
    mlp_hidden_size=MLP_HIDDEN_SIZE,
    learning_rate=LEARNING_RATE,
    # The CRF covers every entry of the tag vocabulary, PAD included.
    num_tags=len(tag_to_index),
)

In [21]:
# Trainer on GPU device 0, logging to TensorBoard under "tb_logs";
# metrics are logged on every training step (log_every_n_steps=1).
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    logger=pl.loggers.TensorBoardLogger("tb_logs", name="ner_crf_lstm_preprocc_baseline"),
    max_epochs=70,
    log_every_n_steps=1
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Обучение модели

In [22]:
# Train the model; the data module provides the train and validation loaders.
trainer.fit(model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | lstm      | LSTM             | 5.8 M 
1 | mlp       | Sequential       | 121 K 
2 | crf       | CRF              | 48    
3 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
5.9 M     Trainable params
0         Non-trainable params
5.9 M     Total params
23.583    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Exception ignored in: <function _releaseLock at 0x7f6c9f61b0d0>
Traceback (most recent call last):
  File "/home/worker/anaconda3/envs/py/lib/python3.9/logging/__init__.py", line 227, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Получение итоговых сущностей для тестового датасета

In [23]:
# Run predict_step over the predict dataloader; `pred` is flattened into rows below,
# so it is presumably one list of [id, good, brand] rows per batch — confirm.
pred = trainer.predict(model, datamodule=dm)

IOStream.flush timed out
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]



In [24]:
# Flatten the per-batch row lists in a single pass; sum(pred, list()) rebuilds
# the accumulated list for every batch.
submission = pd.DataFrame(
    [row for batch_rows in pred for row in batch_rows],
    columns=["id", "good", "brand"],
)
submission

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,
2,2,смеситель,calorie
3,3,лимон,бар
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,риттерспорт


In [26]:
# Write the submission without the index column.
# NOTE(review): the directory is spelled "sumbissions" — presumably it matches an
# existing folder on disk; confirm before renaming.
submission.to_csv("sumbissions/submission_crf_lstm_pre.csv", index=False)

In [24]:
# del trainer
# del model
# del dm