In [1]:
from pathlib import Path

from gensim.models.fasttext import FastText
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from seqeval.metrics.sequence_labeling import get_entities
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

torch.set_float32_matmul_precision("high")

2023-06-24 10:57:03.242409: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-24 10:57:03.294722: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from preprocc import preprocess_text

In [3]:
# !pip install seqeval==1.2.2
# !pip install allennlp==2.10.1
# !pip install pytorch-crf

In [4]:
from utils import apply_bio_tagging, F1Score, index_to_tag, tag_to_index

# Datamodule

Подготовим данные для модели. Для этого определим наследника `torch.utils.data.Dataset` — `ReceiptsDataset`.

In [5]:
class ReceiptsDataset(Dataset):
    """Row-wise view over a receipts frame.

    A frame that has a ``tags`` column is treated as labelled and is read
    through its ``tokens``, ``good``, ``brand`` and ``tags`` columns. Any other
    frame is treated as prediction input and is read through ``tokens`` and
    ``id``.
    """

    def __init__(self, df):
        super().__init__()
        self.is_predict = "tags" not in df.columns
        if self.is_predict:
            columns = ["tokens", "id"]
        else:
            columns = ["tokens", "good", "brand", "tags"]
        # Plain 2-D array: each row is addressed positionally in __getitem__.
        self.data = df[columns].values

    def __getitem__(self, index):
        """Return ``(id, length, tokens, goods, brands, target)`` for one row.

        For prediction rows the id comes from the frame, the label lists are
        empty and every position is tagged ``"O"``. For labelled rows the id
        is the constant 0 and the comma-separated label strings are split
        into lists.
        """
        row = self.data[index]
        tokens = row[0]
        if self.is_predict:
            identifier = row[1]
            goods, brands = list(), list()
            tags = ["O"] * len(tokens)
        else:
            identifier = 0
            goods = row[1].split(',')
            brands = row[2].split(',')
            tags = row[3]
        target = [tag_to_index[tag] for tag in tags]
        return identifier, len(tokens), tokens, goods, brands, target

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)

Для объединения примеров в батчи нужна специальная `collate_fn`, в которой происходит паддинг

In [6]:
def collate_fn(batch):
    """Merge dataset items into one batch, padding the variable-length parts.

    ``batch`` is a sequence of 6-tuples as produced by
    ``ReceiptsDataset.__getitem__``. Token sequences are right-padded with 0
    and targets with ``tag_to_index["PAD"]``; ids, lengths, goods and brands
    are passed through as tuples.
    """
    ids, length, tokens_sequence, goods, brands, targets = zip(*batch)

    token_tensors = [torch.LongTensor(token) for token in tokens_sequence]
    target_tensors = [torch.LongTensor(target) for target in targets]

    tokens_sequence = pad_sequence(token_tensors, batch_first=True, padding_value=0)
    targets = pad_sequence(target_tensors, batch_first=True, padding_value=tag_to_index["PAD"])
    return ids, length, tokens_sequence, goods, brands, targets

Используем LightningDataModule для задания пайплайна

1. prepare_data
    1. Токенизируем текст
    2. Выделяем BIO-теги в размеченной части
2. setup
    1. Разделяем размеченную выборку на обучающую и валидационную
    2. Создаем `ReceiptsDataset` под каждую выборку

In [7]:
class ReceiptsDataModule(pl.LightningDataModule):
    """Loads the receipts CSVs and serves train / val / predict dataloaders.

    Args:
        fasttext: loaded gensim FastText model; only ``wv.key_to_index`` is used.
        train_dataset_path: CSV with at least ``name``, ``good`` and ``brand``.
        test_dataset_path: CSV with at least ``name`` and ``id``.
        val_split_size: ``test_size`` passed to ``train_test_split``.
        batch_size: batch size for every dataloader.
        num_workers: worker count for every dataloader.
    """

    def __init__(self,
                 fasttext,
                 train_dataset_path,
                 test_dataset_path,
                 val_split_size,
                 batch_size,
                 num_workers):
        super().__init__()
        self.key_to_index = fasttext.wv.key_to_index
        # Tokens missing from the vocabulary are mapped to this index.
        self.unk_id = 0
        self.train_dataset_path = train_dataset_path
        self.test_dataset_path = test_dataset_path
        self.val_split_size = val_split_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _encode(self, tokens):
        """Map a token list to an int64 array of vocabulary indices.

        Built explicitly instead of via ``np.vectorize`` because the latter
        raises on an empty token list unless ``otypes`` is given.
        """
        return np.array([self.key_to_index.get(token, self.unk_id) for token in tokens],
                        dtype=np.int64)

    @staticmethod
    def _labels_present(row):
        """True when every whitespace-separated part of ``good`` and ``brand`` occurs in ``tokens``."""
        tokens = row["tokens"]
        label_parts = row["good"].split() + row["brand"].split()
        return all(part in tokens for part in label_parts)

    def prepare_data(self):
        """Read both CSVs, tokenize, BIO-tag the labelled part and index the tokens.

        NOTE(review): this hook stores state on ``self``; Lightning recommends
        doing that in ``setup`` for multi-process runs - fine for the
        single-device run in this notebook, confirm before scaling out.
        """
        train_df = pd.read_csv(self.train_dataset_path).fillna("")
        test_df = pd.read_csv(self.test_dataset_path)

        train_df["tokens"] = train_df["name"].apply(preprocess_text)
        test_df["tokens"] = test_df["name"].apply(preprocess_text)

        # Keep only rows whose labels are fully recoverable from the tokens.
        # .copy() so the column assignments below do not write into a view.
        keep = train_df.apply(self._labels_present, axis=1)
        train_df = train_df[keep].copy()

        # Tags are derived from the string tokens, so this must precede indexing.
        train_df["tags"] = train_df.apply(apply_bio_tagging, axis=1)

        train_df["tokens"] = train_df["tokens"].apply(self._encode)
        test_df["tokens"] = test_df["tokens"].apply(self._encode)

        # The full labelled frame is kept separately so setup() can be re-run
        # (Lightning calls it for fit and again for predict) without splitting
        # an already-split frame.
        self._labelled_df = train_df
        self.train_df = train_df
        self.test_df = test_df

    def setup(self, stage: str = None):
        """Split the labelled frame and build the three datasets.

        Idempotent: always splits the full labelled frame with a fixed seed,
        so repeated calls yield the same train / val partition. ``stage`` is
        accepted for Lightning compatibility and not used.
        """
        self.train_df, self.val_df = train_test_split(self._labelled_df,
                                                      test_size=self.val_split_size,
                                                      random_state=42)

        self.train_dataset = ReceiptsDataset(self.train_df)
        self.val_dataset = ReceiptsDataset(self.val_df)
        self.predict_dataset = ReceiptsDataset(self.test_df)

    def train_dataloader(self):
        """Training loader; reshuffled every epoch."""
        return DataLoader(self.train_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=True,
                          collate_fn=collate_fn)

    def val_dataloader(self):
        """Validation loader, in fixed order."""
        return DataLoader(self.val_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn)

    def predict_dataloader(self):
        """Loader over the unlabelled test frame, in fixed order."""
        return DataLoader(self.predict_dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn)

In [8]:
# Base directory of the datasets; change this one value to run against another copy.
root = 'data'
TRAIN_DATASET_PATH = f"{root}/train_supervised_dataset.csv"
TEST_DATASET_PATH = f"{root}/test_dataset.csv"
FASTTEXT_PATH = "fasttext_models/fasttext_512_hardpreprocc_renamed_fitted.model"
VAL_SPLIT_SIZE = 0.1  # fraction of the labelled rows held out for validation
BATCH_SIZE = 512
NUM_WORKERS = 4

In [9]:
# Load the FastText model; its vocabulary and vectors are used by the
# datamodule (token -> index) and by the model (embedding weights).
fasttext = FastText.load(FASTTEXT_PATH)

In [10]:
# Keyword arguments keep the call readable and robust to parameter reordering.
dm = ReceiptsDataModule(
    fasttext=fasttext,
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [11]:
# Run the data hooks by hand so the loaders can be inspected before training.
# "fit" is the Lightning stage name for training (there is no "train" stage).
dm.prepare_data()
dm.setup("fit")

In [12]:
# Pull a single batch to inspect what collate_fn produces.
batch = next(iter(dm.train_dataloader()))

ids, lengths, tokens_sequence, goods, brands, targets = batch

In [13]:
class BiLSTM_Attention(nn.Module):
    """Bidirectional single-layer LSTM followed by dot-product attention.

    The final hidden state (both directions concatenated) is used as the query
    over the per-step LSTM outputs.

    NOTE(review): ``vocab_size``, ``embedding_dim`` and ``n_hidden`` are read
    from module scope when the constructor runs and are not defined in the
    visible part of this notebook - they must exist before instantiation.
    """

    def __init__(self):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)

    def attention_net(self, lstm_output, final_state):
        """Attend over ``lstm_output`` using ``final_state`` as the query.

        Args:
            lstm_output: [batch_size, n_step, n_hidden * 2]
            final_state: [num_directions(=2), batch_size, n_hidden]

        Returns:
            context: [batch_size, n_hidden * 2]
            attention weights as a numpy array: [batch_size, n_step]
        """
        batch_size = lstm_output.size(0)
        # Bring the batch axis to the front BEFORE flattening the directions;
        # a plain view() on the [2, batch, n_hidden] tensor would interleave
        # states of different batch items whenever batch_size > 1.
        hidden = final_state.permute(1, 0, 2).reshape(batch_size, -1, 1)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size, n_step]
        soft_attn_weights = torch.softmax(attn_weights, dim=1)
        # [batch, 2*n_hidden, n_step] x [batch, n_step, 1] -> [batch, 2*n_hidden, 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        # detach + cpu so the conversion also works for CUDA / grad-tracking tensors.
        return context, soft_attn_weights.detach().cpu().numpy()

    def forward(self, X):
        """Encode ``X`` ([batch_size, len_seq] token ids) to [batch_size, n_hidden * 2]."""
        embedded = self.embedding(X)            # [batch_size, len_seq, embedding_dim]
        embedded = embedded.permute(1, 0, 2)    # [len_seq, batch_size, embedding_dim]

        # Without an explicit initial state nn.LSTM starts from zeros on the
        # input's device, which is what the hand-built zero tensors did.
        output, (final_hidden_state, _) = self.lstm(embedded)
        output = output.permute(1, 0, 2)        # [batch_size, len_seq, n_hidden * 2]
        attn_output, _ = self.attention_net(output, final_hidden_state)
        return attn_output


In [14]:
class ReceiptsModule(pl.LightningModule):
    """Embedding -> (Bi)LSTM -> MLP -> CRF tagger for GOOD / BRAND entities.

    Args:
        fasttext: loaded gensim FastText model; ``wv.vectors`` initialise the
            embedding and ``wv.index_to_key`` is used to turn predicted spans
            back into text.
        freeze: whether the embedding weights are frozen.
        rnn_num_layers: number of stacked LSTM layers.
        rnn_dropout: dropout between LSTM layers.
        mlp_hidden_size: width of the hidden layer of the tag classifier.
        learning_rate: AdamW learning rate.
        rnn_bidir: use a bidirectional LSTM.
        num_tags: size of the tag set; must be given explicitly (the default
            -1 is only a placeholder).

    Raises:
        ValueError: if ``num_tags`` is not positive.

    Caveat: predicted entities are rebuilt from vocabulary indices, so a token
    that was replaced by the unknown index before reaching the model is
    rendered as whatever vocabulary entry sits at that index, not as the
    original word.
    """

    def __init__(self,
                 fasttext,
                 freeze,
                 rnn_num_layers,
                 rnn_dropout,
                 mlp_hidden_size,
                 learning_rate,
                 rnn_bidir=True,
                 num_tags=-1,
                ):
        super().__init__()
        if num_tags <= 0:
            # Fail here with a clear message rather than deep inside nn.Linear / CRF.
            raise ValueError(f"num_tags must be a positive integer, got {num_tags}")
        self.learning_rate = learning_rate

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(fasttext.wv.vectors),
                                                      freeze=freeze,
                                                      padding_idx=0)
        self.emb_dropout = nn.Dropout(p=0.3)
        # numpy array so that a whole span of indices can be looked up at once.
        self.index_to_key = np.array(fasttext.wv.index_to_key)
        dim = self.embedding.weight.shape[-1]
        self.lstm = nn.LSTM(input_size=dim,
                            hidden_size=dim,
                            num_layers=rnn_num_layers,
                            batch_first=True,
                            dropout=rnn_dropout,
                            bidirectional=rnn_bidir,
                            )
        self.mlp = nn.Sequential(
            # The LSTM output is twice as wide when bidirectional.
            nn.Linear(dim + dim * int(rnn_bidir), mlp_hidden_size),
            nn.GELU(),
            nn.Dropout(p=0.3),
            nn.Linear(mlp_hidden_size, num_tags),
        )
        self.crf = CRF(num_tags, batch_first=True)
        # One running F1 per entity type and per stage.
        self.f1_good, self.f1_brand = dict(), dict()
        self.f1_good['train'] = F1Score()
        self.f1_brand['train'] = F1Score()
        self.f1_good['val'] = F1Score()
        self.f1_brand['val'] = F1Score()

    def forward(self, tokens):
        """Return per-position tag scores for a padded batch of token indices."""
        sequences = self.emb_dropout(self.embedding(tokens))
        sequences, _ = self.lstm(sequences)
        logits = self.mlp(sequences)
        return logits

    def _extract_entities(self, token_ids, tags_indices, length):
        """Turn one decoded tag sequence into lists of GOOD and BRAND strings.

        Args:
            token_ids: 1-D numpy array of vocabulary indices for one example.
            tags_indices: decoded tag indices for the same example.
            length: number of real (non-padding) positions.

        Returns:
            ``(goods, brands)`` - entity texts in order of appearance.
        """
        tags = [index_to_tag[index] for index in tags_indices[:length]]
        goods, brands = [], []
        for entity_type, start, finish in get_entities(tags):
            if entity_type == "GOOD":
                bucket = goods
            elif entity_type == "BRAND":
                bucket = brands
            else:
                continue
            # get_entities returns inclusive end positions.
            bucket.append(' '.join(self.index_to_key[token_ids[start:finish + 1]]))
        return goods, brands

    def shared_step(self, batch, stage='train'):
        """Compute the CRF loss for a batch and update the running F1 of ``stage``."""
        ids, lengths, tokens_sequence, goods, brands, targets = batch
        logits = self(tokens_sequence)
        # The CRF is called without a mask, so padded positions (whose targets
        # collate_fn fills with the PAD tag) take part in the likelihood.
        loss = -self.crf(logits, targets)
        tags_indices_sequence = self.crf.decode(logits)
        # One device -> host copy per batch instead of one per entity.
        token_ids = tokens_sequence.detach().cpu().numpy()
        for i, tags_indices in enumerate(tags_indices_sequence):
            goods_pred, brands_pred = self._extract_entities(token_ids[i], tags_indices, lengths[i])
            self.f1_good[stage].update(goods_pred, goods[i])
            self.f1_brand[stage].update(brands_pred, brands[i])
        self.log(f"loss/{stage}", loss, on_epoch=True, batch_size=len(tags_indices_sequence))
        return loss

    def training_step(self, batch, _):
        loss = self.shared_step(batch, 'train')
        return loss

    def on_train_epoch_end(self):
        """Log and reset the training F1 accumulators."""
        self.log("metric/f1_good_train", self.f1_good['train'].get())
        self.log("metric/f1_brand_train", self.f1_brand['train'].get())
        self.f1_good['train'].reset()
        self.f1_brand['train'].reset()

    def validation_step(self, batch, _):
        loss = self.shared_step(batch, 'val')
        return loss

    def on_validation_epoch_end(self):
        """Log and reset the validation F1 accumulators."""
        self.log("metric/f1_good_val", self.f1_good['val'].get())
        self.log("metric/f1_brand_val", self.f1_brand['val'].get())
        self.f1_good['val'].reset()
        self.f1_brand['val'].reset()

    def predict_step(self, batch, _):
        """Return ``[id, goods, brands]`` per example; entities are comma-joined."""
        ids, lengths, tokens_sequence, _, _, _ = batch
        logits = self(tokens_sequence)
        tags_indices_sequence = self.crf.decode(logits)
        token_ids = tokens_sequence.detach().cpu().numpy()
        result = list()
        for i, tags_indices in enumerate(tags_indices_sequence):
            goods_pred, brands_pred = self._extract_entities(token_ids[i], tags_indices, lengths[i])
            result.append([ids[i], ','.join(goods_pred), ','.join(brands_pred)])
        return result

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), self.learning_rate, weight_decay=0.01, amsgrad=True)

In [24]:
# Model hyperparameters.
RNN_NUM_LAYERS = 3
RNN_DROPOUT = 0.4
MLP_HIDDEN_SIZE = 100
LEARNING_RATE = 3e-4
FREEZE = False  # False: the pretrained embeddings are fine-tuned

model = ReceiptsModule(
    fasttext=fasttext,
    freeze=FREEZE,
    rnn_num_layers=RNN_NUM_LAYERS,
    rnn_dropout=RNN_DROPOUT,
    mlp_hidden_size=MLP_HIDDEN_SIZE,
    learning_rate=LEARNING_RATE,
    num_tags=len(tag_to_index),
)

In [25]:
# Trainer on the first GPU; metrics go to TensorBoard under
# tb_logs/ner_crf_lstm_3_512_tune_fasttext_cleandata and are written every step.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=[0],
    logger=pl.loggers.TensorBoardLogger("tb_logs", name="ner_crf_lstm_3_512_tune_fasttext_cleandata"),
    max_epochs=80,
    log_every_n_steps=1
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Обучение модели

In [26]:
# best: tb_logs/ner_crf_lstm_2_512_pre_renamed_20e/version_0

In [27]:
# Train for up to max_epochs; the datamodule supplies the train and val loaders.
trainer.fit(model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name        | Type       | Params
-------------------------------------------
0 | embedding   | Embedding  | 32.7 M
1 | emb_dropout | Dropout    | 0     
2 | lstm        | LSTM       | 16.8 M
3 | mlp         | Sequential | 103 K 
4 | crf         | CRF        | 48    
-------------------------------------------
49.6 M    Trainable params
0         Non-trainable params
49.6 M    Total params
198.548   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Получение итоговых сущностей для тестового датасета

In [28]:
# One list per batch; each inner item is [id, goods, brands] (see predict_step).
pred = trainer.predict(model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting: 22it [00:00, ?it/s]



In [29]:
# Flatten the per-batch prediction lists in a single pass
# (sum(pred, list()) re-copies the accumulated list for every batch).
rows = [row for batch_rows in pred for row in batch_rows]
submission = pd.DataFrame(rows, columns=["id", "good", "brand"])
submission

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,
2,2,смеситель,calorie
3,3,лимон,
4,4,коньяк,
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,red bull
4997,4997,наконечники,
4998,4998,шоколад,риттерспорт


In [30]:
# Share of each predicted brand (the empty string means "no brand predicted").
submission['brand'].value_counts(normalize=True)

                   0.5002
1                  0.0104
хохланд            0.0046
кириешки           0.0024
святой источник    0.0024
                    ...  
flint              0.0002
elka               0.0002
стоппроблем        0.0002
литл               0.0002
риттерспорт        0.0002
Name: brand, Length: 1768, dtype: float64

In [31]:
# Work on a copy so the datamodule's frame is left untouched.
test_df = dm.test_df.copy()

In [32]:
# Column assignment aligns on the index of the two frames, not on position.
test_df['pred_good'] = submission['good']

In [33]:
# Column assignment aligns on the index of the two frames, not on position.
test_df['pred_brand'] = submission['brand']

In [34]:
# Eyeball a random subset of predictions (no seed, so the rows differ per run).
test_df.sample(100)

Unnamed: 0,id,name,tokens,pred_good,pred_brand
1630,1630,5295 Опора мебельная регулируемая Н-017 высота...,"[0, 1114, 4427, 15617, 39, 0, 3976, 0, 1867, 8...",опора,
200,200,"Нектар Фруктовый сад Яблоко-Виноград 1,93л","[267, 386, 477, 116, 696, 0, 0]",нектар,фруктовый сад
2857,2857,"Пальто ""Патрисия"" бордовый 54","[551, 0, 1597, 0]",пальто,
512,512,ЖАРКОЕ ИЗ МЯСА КУРИЦЫ 1кг,"[5486, 19, 892, 861, 0]",жаркое,
3489,3489,"Удлинитель НВ 30ммх1/2 лат, серия 1530G (шт.)","[937, 1820, 0, 1936, 0, 0, 3194, 669, 0, 63]",удлинитель,
...,...,...,...,...,...
675,675,Ботинки [1720-5-3 ЛЕОПАРД],"[158, 0, 0, 0, 8648]",ботинки,
632,632,"Латте 0,2 с сиропом","[1091, 0, 0, 1, 10962]",латте,
2395,2395,Платье Befree LM077726446,"[142, 6706, 52, 0]",платье,befree
4166,4166,15216389 Толстовка с капюшоном Only (серый) (L),"[0, 334, 1, 4418, 3700, 70, 25]",толстовка,only


In [30]:
# Print every test row: predicted brand and good (when non-empty), then the tokens.
for _, row in test_df.iterrows():
    predicted_brand, predicted_good = row['pred_brand'], row['pred_good']

    if predicted_brand:
        print(f"brand: {predicted_brand}")
    if predicted_good:
        print(f"good: {predicted_good}")
    print(row['tokens'])
    print('\n\n')

brand: ермак
good: клей
['1', '1', 'ермак', 'клей', 'универсальный', '1', 'блистер']



brand: сладушка
good: торт
['торт', 'сладушка', 'зимняя', 'вишня', '1']



brand: calorie
good: смеситель
['смеситель', 'calorie', '1', 'а', '1', 'для', 'кухни']



good: лимон
['лимон', '1', 'бар']



brand: сараджишвили
good: коньяк
['коньяк', 'сараджишвили', '1', 'лет', '1', '1', 'грузия']



brand: born pretty
good: пластина
['born', 'pretty', 'пластина', 'для', 'стемпинга', 'bp', 'l', '1', 'texture', 'арт', '1']



brand: chicken beer
good: рис
['рис', 'chicken', 'beer', 'с', 'гуляшом', '1']



brand: tebo
good: труба
['труба', 'tebo', 'п', 'п', 'd', '1', 'pn', '1', '1', 'м', 'стекловолокно', 'хлыст']



brand: грандорф
good: консервы
['грандорф', 'консервы', 'для', 'собак', 'куропатка', 'с', 'индейкой', '1', 'грамм']



good: одеяло
['1', '1', 'одеяло', 'автотепло', '1']



brand: satya
good: аромапалочки
['аромапалочки', 'satya', 'golden', 'era', '1', 'gm']



brand: роллтон
good: пюре
['ролл

['нептун', 'pp', '1', 'sl', '1', 'мкм']



good: кольцо
['кольцо', '1', '1', '1']



good: коробка
['коробка', 'для', 'торта', '1', '1', 'х', '1', 'х', '1', 'с', 'окном']



good: коробка
['1', 'коробка', 'для', 'хот', 'дога']



good: авиабилеты
['1', 'авиабилеты', 'омск', 'краснодар']



brand: vagabond
good: кеды
['m', 'кеды', 'vagabond', 'lm', '1', '1', 'qb', 'yiwlk', 'tiu', 'oo']



good: мойка
['мойка', 'комплекс', 'кузов', 'салон']



good: пропитка
['пропитка', 'водоотталкивающ']



brand: цитрамон
['цитрамон', 'п', 'n', '1', 'табл', '1', 'm', '1', 'r', '1', 'c', '1', 'dya']



brand: greenwich line
good: обложка
['обложка', '1', '1', 'для', 'дневников', 'и', 'тетрадей', 'greenwich', 'line', 'пвх', '1', 'мкм', 'neon', 'star', 'оранжевый', 'шк']



brand: автопарфюм
good: ароматизатор
['ароматизатор', 'подвесной', 'автопарфюм', 'new', 'galaxy', 'jadore', '1', '1']



brand: roberto buono
good: сумка
['сумка', 'roberto', 'buono', 'wr', '1']



brand: premial
good: бумажплаточки
[

In [35]:
# Training rows labelled with the brand "red bull".
is_red_bull = dm.train_df['brand'] == 'red bull'
dm.train_df.loc[is_red_bull]

Unnamed: 0,id,name,good,brand,tokens,tags
14440,14440,"НАПИТОК ""RED BULL THE BLUE EDITION"" 0,355Л Ж/Б",,red bull,"[24, 350, 6375, 941, 172, 3019, 0, 0, 20, 7]","[O, B-BRAND, I-BRAND, O, O, O, O, O, O, O]"


In [36]:
# Create the output directory first: to_csv does not create missing parents,
# and failing here would lose the predictions of a long run.
submission_path = Path("submissions/submission_clean_data.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)