# Baseline

В качестве бейзлайна используется модель NER: двунаправленная LSTM (RNN) с MLP-головой и CRF-слоем поверх эмбеддингов FastText (для получения эмбеддингов нужно предварительно запустить ноутбук `train_fasttext.ipynb`).

Нормализация брендов и товаров не производится

Бейзлайн реализован на библиотеке PyTorch с использованием PyTorch-Lightning для упрощения кода

In [1]:
from gensim.models.fasttext import FastText
import pandas as pd
import pytorch_lightning as pl
from seqeval.metrics.sequence_labeling import get_entities
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

torch.set_float32_matmul_precision("high")

2023-06-23 10:22:13.098426: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-06-23 10:22:13.150784: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from preprocc import preprocess_text

In [3]:
# !pip install seqeval==1.2.2
# !pip install allennlp==2.10.1
# !pip install pytorch-crf

# Utils

Полезные функции для работы с BIO-тегами

In [4]:
def apply_bio_tagging(row):
    """
    Build BIO tags for a receipt from its tokens and its good/brand annotation.

    Only the first comma-separated alternative of ``row["good"]`` and
    ``row["brand"]`` is used, split on whitespace. Every position where that
    token sequence occurs in ``row["tokens"]`` is tagged ``B-*`` followed by
    ``I-*``; everything else stays ``"O"``. At a given position the brand is
    applied after the good, so it wins when both match there.
    """
    tokens = row["tokens"]
    annotations = (
        ("GOOD", row["good"].split(',')[0].split()),
        ("BRAND", row["brand"].split(',')[0].split()),
    )
    tags = ['O'] * len(tokens)
    for start in range(len(tokens)):
        for label, entity in annotations:
            stop = start + len(entity)
            # An empty annotation never matches; a match guarantees the
            # slice is fully inside the token list.
            if entity and tokens[start:stop] == entity:
                tags[start:stop] = [f"B-{label}"] + [f"I-{label}"] * (len(entity) - 1)
    return tags

Прямое и обратное преобразование тегов в индексы

In [5]:
# Tag vocabulary: a tag's position in this list is its integer index.
# "PAD" is the value used to fill padded target positions in batches.
index_to_tag = ["O", "B-GOOD", "I-GOOD", "B-BRAND", "I-BRAND", "PAD"]
# Inverse lookup: tag name -> integer index.
tag_to_index = dict(zip(index_to_tag, range(len(index_to_tag))))

# Datamodule

Подготовим данные для модели. Для этого определим наследника `torch.utils.data.Dataset` — `ReceiptsDataset`.

In [6]:
class ReceiptsDataset(Dataset):
    """Dataset over a receipts dataframe, serving token embeddings and targets.

    A dataframe without a "tags" column is treated as unlabelled (predict
    mode) and must provide "tokens" and "id"; otherwise "tokens", "good",
    "brand" and "tags" are read.
    """

    def __init__(self, df, fasttext):
        super().__init__()
        self.is_predict = "tags" not in df.columns
        columns = ["tokens", "id"] if self.is_predict else ["tokens", "good", "brand", "tags"]
        # Rows are kept as a plain array and addressed by column position.
        self.data = df[columns].values
        self.fasttext = fasttext

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(id, tokens, embeddings, goods, brands, target)`` for one row.

        In predict mode the id comes from the dataframe, goods/brands are
        empty lists and the target is all "O"; in labelled mode the id is 0
        and goods/brands are the comma-split annotation strings.
        """
        row = self.data[index]
        tokens = row[0]
        embeddings = self.fasttext.wv[tokens]
        if self.is_predict:
            identifier = row[1]
            goods, brands = list(), list()
            tags = ["O"] * len(tokens)
        else:
            identifier = 0
            goods = row[1].split(',')
            brands = row[2].split(',')
            tags = row[3]
        target = [tag_to_index[tag] for tag in tags]
        return identifier, tokens, embeddings, goods, brands, target

Для объединения примеров в батчи нужна специальная `collate_fn`, в которой происходит паддинг

In [7]:
def collate_fn(batch):
    """Merge dataset items into one batch, padding embeddings and targets.

    Each item is ``(id, tokens, embeddings, goods, brands, target)``.
    Embeddings and targets become batch-first padded tensors (targets are
    filled with the "PAD" tag index); the other fields are returned as
    per-item tuples.
    """
    ids, tokens_sequence, embeddings_sequence, goods, brands, targets = zip(*batch)
    embedding_tensors = [torch.FloatTensor(sequence) for sequence in embeddings_sequence]
    target_tensors = [torch.LongTensor(target) for target in targets]
    embeddings_sequence = pad_sequence(embedding_tensors, batch_first=True)
    targets = pad_sequence(target_tensors,
                           batch_first=True,
                           padding_value=tag_to_index["PAD"])
    return ids, tokens_sequence, embeddings_sequence, goods, brands, targets

Используем LightningDataModule для задания пайплайна

1. prepare_data
    1. Токенизируем текст
    2. Выделяем BIO-теги в размеченной части
2. setup
    1. Разделяем размеченную выборку на обучающую и валидационную
    2. Создаем `ReceiptsDataset` под каждую выборку

In [8]:
class ReceiptsDataModule(pl.LightningDataModule):
    """Lightning data module that loads, tokenizes and serves the receipts data.

    ``prepare_data`` loads the FastText model and both CSV files, tokenizes
    the "name" column and derives BIO tags for the labelled part. ``setup``
    splits the labelled data into train/validation parts and wraps every part
    in a ``ReceiptsDataset``.

    Args:
        train_dataset_path: CSV with labelled receipts ("name", "good" and
            "brand" columns are read).
        test_dataset_path: CSV with receipts to predict on ("name" is read
            here, "id" is read by ``ReceiptsDataset``).
        fasttext_path: path of a saved gensim ``FastText`` model.
        val_split_size: passed as ``test_size`` to ``train_test_split``.
        batch_size: batch size of every dataloader.
        num_workers: worker count of every dataloader.
    """

    def __init__(self,
                 train_dataset_path,
                 test_dataset_path,
                 fasttext_path,
                 val_split_size,
                 batch_size,
                 num_workers):
        super().__init__()
        self.train_dataset_path = train_dataset_path
        self.test_dataset_path = test_dataset_path
        self.fasttext_path = fasttext_path
        self.val_split_size = val_split_size
        self.batch_size = batch_size
        self.num_workers = num_workers

    def prepare_data(self):
        """Load the embeddings and CSV files, tokenize and tag them."""
        # NOTE(review): Lightning advises against assigning state in
        # prepare_data for multi-process runs; kept because the rest of the
        # notebook relies on these attributes being set here.
        self.fasttext = FastText.load(self.fasttext_path)
        # Missing good/brand annotations become empty strings so that
        # apply_bio_tagging can call .split() on them.
        self.train_df = pd.read_csv(self.train_dataset_path).fillna("")
        self.test_df = pd.read_csv(self.test_dataset_path)
        self.train_df["tokens"] = self.train_df["name"].apply(preprocess_text)
        self.test_df["tokens"] = self.test_df["name"].apply(preprocess_text)

        self.train_df["tags"] = self.train_df.apply(apply_bio_tagging, axis=1)
        # Remember the complete labelled frame: setup() rebinds self.train_df
        # to the train part of the split.
        self._labeled_df = self.train_df

    def setup(self, stage: str):
        """Split the labelled data and build the train/val/predict datasets.

        The split is always taken from the full labelled frame, so calling
        ``setup`` repeatedly (e.g. once for fit and once for predict) yields
        the same train/validation partition instead of re-splitting the
        already reduced train part.
        """
        labeled_df = getattr(self, "_labeled_df", None)
        if labeled_df is None:
            # prepare_data was bypassed and train_df was assigned directly.
            labeled_df = self.train_df
        self.train_df, self.val_df = train_test_split(labeled_df,
                                                      test_size=self.val_split_size,
                                                      random_state=42)

        self.train_dataset = ReceiptsDataset(self.train_df, self.fasttext)
        self.val_dataset = ReceiptsDataset(self.val_df, self.fasttext)
        self.predict_dataset = ReceiptsDataset(self.test_df, self.fasttext)

    def _make_dataloader(self, dataset):
        """Build a dataloader with the shared batch/worker/collate settings."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          collate_fn=collate_fn)

    def train_dataloader(self):
        # NOTE(review): the training loader is not shuffled between epochs;
        # kept as in the original to leave training behaviour unchanged.
        return self._make_dataloader(self.train_dataset)

    def val_dataloader(self):
        return self._make_dataloader(self.val_dataset)

    def predict_dataloader(self):
        return self._make_dataloader(self.predict_dataset)

In [31]:
# Labelled receipts, split into train/validation parts by the data module.
TRAIN_DATASET_PATH = "data/train_supervised_dataset.csv"
# Receipts to run prediction on.
TEST_DATASET_PATH = "data/test_dataset.csv"
# Saved gensim FastText model used to embed tokens.
FASTTEXT_PATH = "fasttext_models/fasttext_512_hardpreprocc_renamed_fitted.model"
# Reused below as the LSTM input and hidden size.
# NOTE(review): presumably must equal the vector size of the model at FASTTEXT_PATH - confirm.
FASTTEXT_DIM = 512
# Share of the labelled data held out for validation (train_test_split test_size).
VAL_SPLIT_SIZE = 0.1
# Batch size and worker count shared by all dataloaders.
BATCH_SIZE = 512
NUM_WORKERS = 4

In [32]:
# Data module wired to the paths and loader settings defined above.
dm = ReceiptsDataModule(
    train_dataset_path=TRAIN_DATASET_PATH,
    test_dataset_path=TEST_DATASET_PATH,
    fasttext_path=FASTTEXT_PATH,
    val_split_size=VAL_SPLIT_SIZE,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

# Model

Сначала определим метрику `F1` для задачи NER

In [33]:
class F1Score:
    """F1 score accumulated over successive (prediction, target) pairs.

    Each ``update`` compares the two collections as sets (duplicates inside
    one collection count once) and adds to running true-positive,
    false-positive and false-negative totals.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all running counters."""
        self.tp = self.fp = self.fn = 0

    def update(self, pred, target):
        """Add the set overlap/differences of ``pred`` and ``target`` to the counters."""
        predicted, expected = frozenset(pred), frozenset(target)
        self.tp += len(predicted & expected)
        self.fp += len(predicted - expected)
        self.fn += len(expected - predicted)

    def get(self):
        """Return the harmonic mean of precision and recall, or 0.0 without true positives."""
        if not self.tp:
            return 0.0
        precision = self.tp / (self.tp + self.fp)
        recall = self.tp / (self.tp + self.fn)
        return 2 / (1 / precision + 1 / recall)

Зададим саму модель, ее шаги на обучении, валидации и инференсе, а также способ обучения

In [34]:
class BiLSTM_Attention(nn.Module):
    """Single-layer bidirectional LSTM encoder with dot-product attention.

    NOTE(review): nothing else in the visible source references this class,
    and the constructor reads the module-level names ``vocab_size``,
    ``embedding_dim`` and ``n_hidden``, which are not defined there; they
    must exist before the class is instantiated.
    """

    def __init__(self):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)

    def attention_net(self, lstm_output, final_state):
        """Attend over the LSTM outputs using the final hidden state as query.

        Args:
            lstm_output: [batch_size, n_step, n_hidden * 2].
            final_state: [num_directions(=2), batch_size, n_hidden].

        Returns:
            ``(context, weights)`` where ``context`` is a tensor of shape
            [batch_size, n_hidden * 2] and ``weights`` is a NumPy array of
            shape [batch_size, n_step] holding the softmax attention weights.
        """
        batch_size = lstm_output.size(0)
        # Bring the batch axis first BEFORE flattening the directions, so each
        # example's query is its own [forward ; backward] state. A plain
        # view() on the [2, batch, hidden] tensor mixes different examples.
        hidden = final_state.permute(1, 0, 2).reshape(batch_size, -1, 1)
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)  # [batch_size, n_step]
        soft_attn_weights = torch.softmax(attn_weights, dim=1)
        # [batch, 2*hidden, n_step] x [batch, n_step, 1] -> [batch, 2*hidden, 1]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        # detach().cpu() keeps the NumPy export working for tensors that
        # require grad or live on an accelerator.
        return context, soft_attn_weights.detach().cpu().numpy()

    def forward(self, X):
        """Encode token ids ``X`` ([batch_size, len_seq]) into [batch_size, n_hidden * 2]."""
        embedded = self.embedding(X)  # [batch_size, len_seq, embedding_dim]
        embedded = embedded.permute(1, 0, 2)  # [len_seq, batch_size, embedding_dim]

        # No explicit initial state: nn.LSTM starts from zeros on the input's
        # own device and dtype.
        output, (final_hidden_state, _) = self.lstm(embedded)
        output = output.permute(1, 0, 2)  # [batch_size, len_seq, n_hidden * 2]
        attn_output, _ = self.attention_net(output, final_hidden_state)
        return attn_output  # [batch_size, n_hidden * 2]


In [35]:
class ReceiptsModule(pl.LightningModule):
    """LSTM -> MLP -> CRF tagger that extracts goods and brands from receipts.

    Args:
        rnn_input_size: size of the input token embeddings.
        rnn_hidden_size: LSTM hidden size (per direction).
        rnn_num_layers: number of stacked LSTM layers.
        rnn_dropout: dropout between LSTM layers.
        mlp_hidden_size: hidden size of the MLP placed on the LSTM outputs.
        learning_rate: Adam learning rate.
        rnn_bidir: whether the LSTM is bidirectional.
        num_tags: number of CRF tags; a non-positive value or ``None`` (the
            default is -1) means "use ``len(index_to_tag)``".
    """

    def __init__(self,
                 rnn_input_size,
                 rnn_hidden_size,
                 rnn_num_layers,
                 rnn_dropout,
                 mlp_hidden_size,
                 learning_rate,
                 rnn_bidir=True,
                 num_tags=-1,
                ):
        super().__init__()
        # The -1 default cannot be handed to CRF as-is; resolve it to the tag
        # vocabulary size and use one value for both the emission layer and
        # the CRF so their sizes always agree.
        if num_tags is None or num_tags <= 0:
            num_tags = len(index_to_tag)
        self.learning_rate = learning_rate
        self.lstm = nn.LSTM(input_size=rnn_input_size,
                            hidden_size=rnn_hidden_size,
                            num_layers=rnn_num_layers,
                            batch_first=True,
                            dropout=rnn_dropout,
                            bidirectional=rnn_bidir,
                            )
        self.mlp = nn.Sequential(
            # A bidirectional LSTM emits 2 * hidden_size features per token.
            nn.Linear(rnn_hidden_size * (1 + int(rnn_bidir)), mlp_hidden_size),
            nn.GELU(),
            nn.Dropout(p=0.3),
            nn.Linear(mlp_hidden_size, num_tags),
        )
        self.crf = CRF(num_tags, batch_first=True)
        self.f1_good_train = F1Score()
        self.f1_brand_train = F1Score()
        self.f1_good_val = F1Score()
        self.f1_brand_val = F1Score()

    def forward(self, sequences):
        """Map a padded batch of embeddings to per-token tag emission scores."""
        sequences, _ = self.lstm(sequences)
        logits = self.mlp(sequences)
        return logits

    def _extract_entities(self, logits, tokens_sequence):
        """Decode tags and return one ``(goods, brands)`` pair of string lists per example."""
        # Decoding only feeds metrics/predictions, so no autograd graph is needed.
        with torch.no_grad():
            tags_indices_sequence = self.crf.decode(logits)

        extracted = []
        for tokens, tags_indices in zip(tokens_sequence, tags_indices_sequence):
            # Decoded paths cover the padded length; keep the real tokens only.
            tags = [index_to_tag[index] for index in tags_indices[:len(tokens)]]
            entities = get_entities(tags)
            goods_pred = [' '.join(tokens[start:finish + 1]) for t, start, finish in entities if t == "GOOD"]
            brands_pred = [' '.join(tokens[start:finish + 1]) for t, start, finish in entities if t == "BRAND"]
            extracted.append((goods_pred, brands_pred))
        return extracted

    def _shared_step(self, batch, f1_good, f1_brand):
        """Compute the CRF loss for a batch and update the given F1 trackers.

        Returns ``(loss, batch_size)``.
        """
        ids, tokens_sequence, embeddings_sequence, goods, brands, targets = batch
        logits = self(embeddings_sequence)
        # The CRF returns a log-likelihood; negate it to obtain a loss.
        # NOTE(review): no mask is passed, so padded positions take part in
        # the likelihood with the "PAD" target - confirm this is intended.
        loss = -self.crf(logits, targets)

        extracted = self._extract_entities(logits, tokens_sequence)
        for (goods_pred, brands_pred), goods_true, brands_true in zip(extracted, goods, brands):
            f1_good.update(goods_pred, goods_true)
            f1_brand.update(brands_pred, brands_true)
        return loss, len(extracted)

    def training_step(self, batch, _):
        loss, batch_size = self._shared_step(batch, self.f1_good_train, self.f1_brand_train)
        self.log("loss/train", loss, on_epoch=True, batch_size=batch_size)
        return loss

    def on_train_epoch_end(self):
        self.log("metric/f1_good_train", self.f1_good_train.get())
        self.log("metric/f1_brand_train", self.f1_brand_train.get())
        self.f1_good_train.reset()
        self.f1_brand_train.reset()

    def validation_step(self, batch, _):
        loss, batch_size = self._shared_step(batch, self.f1_good_val, self.f1_brand_val)
        self.log("loss/val", loss, batch_size=batch_size)

    def on_validation_epoch_end(self):
        self.log("metric/f1_good_val", self.f1_good_val.get())
        self.log("metric/f1_brand_val", self.f1_brand_val.get())
        self.f1_good_val.reset()
        self.f1_brand_val.reset()

    def predict_step(self, batch, _):
        """Return ``[id, goods, brands]`` rows, entities joined with commas."""
        ids, tokens_sequence, embeddings_sequence, _, _, _ = batch
        logits = self(embeddings_sequence)
        extracted = self._extract_entities(logits, tokens_sequence)
        return [
            [identifier, ','.join(goods_pred), ','.join(brands_pred)]
            for identifier, (goods_pred, brands_pred) in zip(ids, extracted)
        ]

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), self.learning_rate)

In [36]:
FASTTEXT_DIM

512

In [37]:
# Model hyperparameters: the LSTM input and hidden sizes both follow the
# embedding dimension.
RNN_INPUT_SIZE = FASTTEXT_DIM
RNN_HIDDEN_SIZE = FASTTEXT_DIM
RNN_NUM_LAYERS = 2
RNN_DROPOUT = 0.3
MLP_HIDDEN_SIZE = 100
LEARNING_RATE = 1e-4
model = ReceiptsModule(
    rnn_input_size=RNN_INPUT_SIZE,
    rnn_hidden_size=RNN_HIDDEN_SIZE,
    rnn_num_layers=RNN_NUM_LAYERS,
    rnn_dropout=RNN_DROPOUT,
    mlp_hidden_size=MLP_HIDDEN_SIZE,
    learning_rate=LEARNING_RATE,
    num_tags=len(tag_to_index),
)

In [39]:
# Trainer pinned to GPU 0; metrics are written for TensorBoard under tb_logs/.
trainer = pl.Trainer(
    logger=pl.loggers.TensorBoardLogger(save_dir="tb_logs", name="ner_crf_lstm_2_512_other_train"),
    accelerator="gpu",
    devices=[0],
    max_epochs=80,
    log_every_n_steps=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Обучение модели

In [40]:
trainer.fit(model, datamodule=dm)

Missing logger folder: tb_logs/ner_crf_lstm_2_512_other_train
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name | Type       | Params
------------------------------------
0 | lstm | LSTM       | 10.5 M
1 | mlp  | Sequential | 103 K 
2 | crf  | CRF        | 48    
------------------------------------
10.6 M    Trainable params
0         Non-trainable params
10.6 M    Total params
42.421    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Получение итоговых сущностей для тестового датасета

In [41]:
pred = trainer.predict(model, datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting: 5it [00:00, ?it/s]



In [43]:
# trainer.predict returns one list of [id, good, brand] rows per batch;
# flatten them into a single table.
submission = pd.DataFrame(
    [row for batch_rows in pred for row in batch_rows],
    columns=["id", "good", "brand"],
)
submission

Unnamed: 0,id,good,brand
0,0,клей,ермак
1,1,торт,сладушка
2,2,смеситель,calorie
3,3,лимон,
4,4,коньяк,сараджишвили
...,...,...,...
4995,4995,рамка,
4996,4996,напиток,
4997,4997,наконечники,
4998,4998,шоколад,


In [19]:
pred

[[[0, 'клей', 'ермак'],
  [1, 'торт', 'сладушка'],
  [2, 'смеситель', 'calorie'],
  [3, 'лимон', ''],
  [4, 'коньяк', 'сараджишвили'],
  [5, 'пластина', 'born pretty'],
  [6, 'рис', 'chicken beer'],
  [7, 'труба', 'tebo'],
  [8, 'консервы', 'грандорф'],
  [9, 'одеяло', ''],
  [10, 'аромапалочки', 'satya'],
  [11, 'пюре', 'роллтон'],
  [12, 'сироп', 'ранкоф'],
  [13, 'крем', 'botavikos'],
  [14, '', 'добрая коровка'],
  [15, 'спицы', 'zing'],
  [16, 'конфеты', ''],
  [17, 'скумбрия', ''],
  [18, 'чипсы', ''],
  [19, 'трусы', 'hunk'],
  [20, 'леденцы', 'горпилс,gepach'],
  [21, 'печенье', 'кременкульское'],
  [22, 'майонез', 'махеевъ'],
  [23, 'штукатурка', 'habez'],
  [24, 'шайба', ''],
  [25, 'корм', 'shebaclassic'],
  [26, 'саморез', ''],
  [27, 'томаты', 'lorado'],
  [28, 'саморез', ''],
  [29, 'нитки', ''],
  [30, 'держатель', ''],
  [31, 'рюкзак', ''],
  [32, 'джинсы', 'wrangler'],
  [33, 'консервы', 'четвероногий гурман'],
  [34, 'винт', ''],
  [35, 'сок', 'вико'],
  [36, 'куртка'

In [20]:
submission['brand'].value_counts(1)

                   0.5176
хохланд            0.0044
святой источник    0.0024
зубр               0.0022
кириешки           0.0018
                    ...  
sapfire            0.0002
sche               0.0002
сады придонья      0.0002
zekkert            0.0002
риттерспорт        0.0002
Name: brand, Length: 1864, dtype: float64

In [23]:
test_df = dm.test_df.copy()

In [24]:
test_df['pred_good'] = submission['good']

In [25]:
test_df['pred_brand'] = submission['brand']

In [32]:
dm.train_df[['name', 'tokens', 'tags']]

Unnamed: 0,name,tokens,tags
17030,954159 Салфетки влаж.Aura Tropic coctail антиб...,"[салфетки, влаж, aura, tropic, coctail, антиба...","[B-GOOD, O, B-BRAND, O, O, O, O, O]"
12634,Мороженое Инмарко Золотой стандарт новый пломб...,"[мороженое, инмарко, золотой, стандарт, новый,...","[B-GOOD, O, B-BRAND, I-BRAND, O, O, O, O, O]"
4627,Шампунь д/вол Pantene Аква Лайт Питательный 250мл,"[шампунь, д, вол, pantene, аква, лайт, питател...","[B-GOOD, O, O, B-BRAND, O, O, O, O]"
17500,Пиво Хамовники Мюнхенское 5.5%0.45л.ж/бРШТ,"[пиво, хамовники, мюнхенское, л, ж, бршт]","[B-GOOD, B-BRAND, O, O, O, O]"
23095,Доска гладильная EUROGOLD C2 Johnny Plexus Min...,"[доска, гладильная, eurogold, c, johnny, plexu...","[B-GOOD, O, B-BRAND, O, O, O, O, O, O, O, O, O]"
...,...,...,...
12587,"КЛЮЧ ТРУБН PROFI 90* 2""","[ключ, трубн, profi]","[B-GOOD, O, O]"
17733,5 2456564931386 Колготки ж/OPIU/Stri,"[колготки, ж, opiu, stri]","[B-GOOD, O, O, O]"
1713,3 2456575442604 Толстовка /UCLA/UCHZ,"[толстовка, ucla, uchz]","[B-GOOD, B-BRAND, O]"
9461,Хлеб кирпичик ржаной 26*330г BRIDOR Франция,"[хлеб, кирпичик, ржаной, г, bridor, франция]","[B-GOOD, O, O, O, B-BRAND, O]"


In [26]:
test_df

Unnamed: 0,id,name,tokens,pred_good,pred_brand
0,0,"469-210 ЕРМАК Клей универсальный, 15мл, блистер","[1, 1, ермак, клей, универсальный, 1, блистер]",клей,ермак
1,1,Торт СЛАДУШКА Зимняя вишня 700г,"[торт, сладушка, зимняя, вишня, 1]",торт,сладушка
2,2,"Смеситель ""CALORIE"" 1023 А06 д/кухни","[смеситель, calorie, 1, а, 1, для, кухни]",смеситель,calorie
3,3,Лимон 50гр БАР,"[лимон, 1, бар]",лимон,
4,4,"Коньяк САРАДЖИШВИЛИ 5 лет 0,5л Грузия","[коньяк, сараджишвили, 1, лет, 1, 1, грузия]",коньяк,сараджишвили
...,...,...,...,...,...
4995,4995,"774352 Рамка 2П., сл. кость","[1, рамка, 1, п, сл, кость]",рамка,
4996,4996,Энерг. напиток Red Bull 0.25л,"[энерг, напиток, red, bull, 1, 1]",напиток,
4997,4997,36/025 Наконечники (т. никель) шт,"[1, 1, наконечники, т, никель]",наконечники,
4998,4998,Шоколад РиттерСпорт мол.с цел.миндалем 100г,"[шоколад, риттерспорт, мол, с, цел, миндалем, 1]",шоколад,риттерспорт


In [30]:
# Dump every test receipt: predicted brand/good (when non-empty), then its tokens.
for brand, good, tokens in zip(test_df['pred_brand'], test_df['pred_good'], test_df['tokens']):
    if brand:
        print(f"brand: {brand}")
    if good:
        print(f"good: {good}")
    print(tokens)
    print('\n\n')

brand: ермак
good: клей
['1', '1', 'ермак', 'клей', 'универсальный', '1', 'блистер']



brand: сладушка
good: торт
['торт', 'сладушка', 'зимняя', 'вишня', '1']



brand: calorie
good: смеситель
['смеситель', 'calorie', '1', 'а', '1', 'для', 'кухни']



good: лимон
['лимон', '1', 'бар']



brand: сараджишвили
good: коньяк
['коньяк', 'сараджишвили', '1', 'лет', '1', '1', 'грузия']



brand: born pretty
good: пластина
['born', 'pretty', 'пластина', 'для', 'стемпинга', 'bp', 'l', '1', 'texture', 'арт', '1']



brand: chicken beer
good: рис
['рис', 'chicken', 'beer', 'с', 'гуляшом', '1']



brand: tebo
good: труба
['труба', 'tebo', 'п', 'п', 'd', '1', 'pn', '1', '1', 'м', 'стекловолокно', 'хлыст']



brand: грандорф
good: консервы
['грандорф', 'консервы', 'для', 'собак', 'куропатка', 'с', 'индейкой', '1', 'грамм']



good: одеяло
['1', '1', 'одеяло', 'автотепло', '1']



brand: satya
good: аромапалочки
['аромапалочки', 'satya', 'golden', 'era', '1', 'gm']



brand: роллтон
good: пюре
['ролл

['нептун', 'pp', '1', 'sl', '1', 'мкм']



good: кольцо
['кольцо', '1', '1', '1']



good: коробка
['коробка', 'для', 'торта', '1', '1', 'х', '1', 'х', '1', 'с', 'окном']



good: коробка
['1', 'коробка', 'для', 'хот', 'дога']



good: авиабилеты
['1', 'авиабилеты', 'омск', 'краснодар']



brand: vagabond
good: кеды
['m', 'кеды', 'vagabond', 'lm', '1', '1', 'qb', 'yiwlk', 'tiu', 'oo']



good: мойка
['мойка', 'комплекс', 'кузов', 'салон']



good: пропитка
['пропитка', 'водоотталкивающ']



brand: цитрамон
['цитрамон', 'п', 'n', '1', 'табл', '1', 'm', '1', 'r', '1', 'c', '1', 'dya']



brand: greenwich line
good: обложка
['обложка', '1', '1', 'для', 'дневников', 'и', 'тетрадей', 'greenwich', 'line', 'пвх', '1', 'мкм', 'neon', 'star', 'оранжевый', 'шк']



brand: автопарфюм
good: ароматизатор
['ароматизатор', 'подвесной', 'автопарфюм', 'new', 'galaxy', 'jadore', '1', '1']



brand: roberto buono
good: сумка
['сумка', 'roberto', 'buono', 'wr', '1']



brand: premial
good: бумажплаточки
[

In [26]:
dm.train_df[dm.train_df['brand'] == 'red bull']

Unnamed: 0,id,name,good,brand,tokens,tags
17869,17869,Редбулл (без сахара),,red bull,"[редбулл, без, сахара]","[O, O, O]"
2351,2351,"Нап знерг Ред Булл 0,25л ж/б",напиток энергетический,red bull,"[нап, знерг, ред, булл, 1, 1, ж, б]","[O, O, O, O, O, O, O, O]"
14440,14440,"НАПИТОК ""RED BULL THE BLUE EDITION"" 0,355Л Ж/Б",,red bull,"[напиток, red, bull, the, blue, edition, 1, 1,...","[O, B-BRAND, I-BRAND, O, O, O, O, O, O, O]"
20032,20032,"РЕД БУЛЛ 0,473Л ЭНЕРГЕТИК",энергетик,red bull,"[ред, булл, 1, 1, энергетик]","[O, O, O, O, B-GOOD]"
8175,8175,"Ред булл Tropical Edition 0,355 1/24",,red bull,"[ред, булл, tropical, edition, 1, 1, 1, 1]","[O, O, O, O, O, O, O, O]"
17640,17640,"Редбулл Энергетик 0,355л",энергетик,red bull,"[редбулл, энергетик, 1, 1]","[O, B-GOOD, O, O]"


In [44]:
from pathlib import Path

# Create the output directory first: to_csv fails when it does not exist.
submission_path = Path("submissions/submission_512_overfitted_heavy_renaimed.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)