# Create the first baseline model to classify categories

In [84]:
import pandas as pd
import re
from typing import Optional, Union, Dict, Any, Tuple, List
from receipt_parser import RuleBased
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

import youtokentome as yttm
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import random


%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
class CategoryDataset(Dataset):
    def __init__(self, cat_df: pd.DataFrame, vocab_size: int = 500, use_padding: bool = False, chunk_length: int = 50, pad_idx: int = 0):
        self.use_padding = use_padding
        self.chunk_length = chunk_length
        self.pad_idx = pad_idx

        # Prepare DataFrame
        self.le = LabelEncoder()
        self.cat_df = self.__transfrom_df(cat_df)
        self.train, self.val, self.test = self.split_df(self.cat_df)
        self._lookup_dict = {'train': self.train,
                             'val': self.val,
                             'test': self.test}
        
        # Train BPE model:
        self.vocab_size = vocab_size
        self.path_to_bpe = 'data_cat/train_bpe_model.yttm'
        self.path_to_train = 'data_cat/train.txt'
        self.save_texts_to_file(self.train['name_norm'], self.path_to_train)
        self.tokenizer = self.build_bpe_model()
        
        self.set_split('train')
    
    def set_split(self, split="train"):
        """
        Selects the splits in the dataset.
        split (str): one of "train", "val", or "test"
        """

        self._target_df = self._lookup_dict[split]
    
    def build_bpe_model(self) -> yttm.BPE:
        yttm.BPE.train(
            data=self.path_to_train,
            vocab_size=self.vocab_size,
            model=self.path_to_bpe,
            pad_id=self.pad_idx
        )
        return yttm.BPE(self.path_to_bpe)
    
    @staticmethod
    def ensure_length(txt: List[int], out_len: int, pad_value: int) -> List[int]:
        """Add PAD-indices to a `out_len` length."""

        if len(txt) < out_len:
            txt = list(txt) + [pad_value] * (out_len - len(txt))
        else:
            txt = txt[:out_len]
        return txt
    
    def __transfrom_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop duplicates and NAN objects. Rename `category` column.
        Encode category to idx using `LabelEncoder`.
        """
        
        
        df = df.drop_duplicates('name_norm').dropna()
        df = df.rename(columns = {'Категория': 'category'})
        df['target'] = self.le.fit_transform(df['category'])
        return df

    @staticmethod
    def save_texts_to_file(texts: pd.Series, out_file: str) -> None:
        """Save text to .txt fromat for BPE model."""
        
        with open(out_file, 'w') as outf:
            outf.write('\n'.join(texts))
    
    @staticmethod
    def split_df(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        60% - train set,
        20% - validation set,
        20% - test set
        
        Return train, validation, test.
        """

        return np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    
    def get_labels(self) -> List[str]:
        """Return all unique categories in dataframe."""

        return self.train['category'].unique()
    
    def decode_target(self, target: int) -> str:
        return self.le.inverse_transform([target])[0]
    
    def __len__(self) -> int:
        return len(self._target_df)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        row = self._target_df.iloc[index]
        cat_vector = self.tokenizer.encode(row.name_norm)
        if self.use_padding:
            cat_vector = self.ensure_length(cat_vector, self.chunk_length, self.pad_idx)
        cat_vector = torch.tensor(cat_vector)
        target = row.target
        
        return {'x_data': cat_vector,
                'y_target': target}
    

def generate_batch(batch):
    label = torch.tensor([entry['y_target'] for entry in batch])
    text = [entry['x_data'] for entry in batch]
    offsets = [0] + [len(entry) for entry in text]
    # torch.Tensor.cumsum returns the cumulative sum
    # of elements in the dimension dim.

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    
    return text, offsets, label

## Model

In [97]:
class CategoryClassifier(nn.Module):
    """A simple perceptron baseline moedel."""

    def __init__(self, vocab_size: int, embed_dim: int, num_class: int, pad_idx: int = 0):
        super(CategoryClassifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim)
#         self.embedding = nn.Embedding(vocab_size, embedding_dim=embed_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(in_features=embed_dim, 
                             out_features=num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        
    def forward(self, x_in, offsets, apply_sigmoid=False):
        """The forward pass of the classifier."""
        
        embedded = self.embedding(x_in, offsets)
        y_out = self.fc(embedded)
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [6]:
VOCAB_SIZE = 500
USE_PADDING = False
CHUNK_LENGHT = 50
PAD_IDX = 0

# Init dataset:
df = pd.read_csv('train_cat.csv')
df.shape
dataset = CategoryDataset(df, VOCAB_SIZE, USE_PADDING, CHUNK_LENGHT, PAD_IDX)

# Init model:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUN_CLASS = len(dataset.get_labels())
EMBED_DIM = 50
BATCH_SIZE = 32

model = CategoryClassifier(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)

## Train Loop

In [7]:
def transform_target(target):
    res = []
    for i in target:
        t_ = np.zeros(NUN_CLASS)
        t_[i] = 1
        res.append(t_)
    return res

import time

N_EPOCHS = 5
model = model.to(device)
loss_func = nn.CrossEntropyLoss().to(device)

learning_rate=0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

# optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    
    # Train the model
    dataset.set_split('train')
    train_loss = 0
    train_acc = 0
    model.train()
    data = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True,
                      collate_fn=generate_batch)
    for i, (text, offsets, target) in enumerate(data):
        optimizer.zero_grad()
        text, offsets, target = text.to(device), offsets.to(device), target.to(device)
        output = model(text, offsets)
#         target = torch.tensor(transform_target(target)).long().to(device)
        loss = loss_func(output, target)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == target).sum().item()

    # Adjust the learning rate
    train_loss, train_acc = train_loss / len(dataset), train_acc / len(dataset)
    scheduler.step(train_loss)
    
    # Eval the model
    dataset.set_split('val')
    loss = 0
    acc = 0
    model.eval()
    data = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    
    for i, (text, offsets, target) in enumerate(data):
        text, offsets, target = text.to(device), offsets.to(device), target.to(device)
#         target = torch.tensor(transform_target(target)).long().to(device)
        output = model(text, offsets)
        loss = loss_func(output, target)
        loss += loss.item()
        acc += (output.argmax(1) == target).sum().item()
    
    valid_loss, valid_acc = loss / len(dataset), acc / len(dataset)
    
    
    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 13 seconds
	Loss: 0.0675(train)	|	Acc: 39.5%(train)
	Loss: 0.0003(valid)	|	Acc: 58.8%(valid)
Epoch: 2  | time in 0 minutes, 13 seconds
	Loss: 0.0414(train)	|	Acc: 65.1%(train)
	Loss: 0.0002(valid)	|	Acc: 68.4%(valid)
Epoch: 3  | time in 0 minutes, 13 seconds
	Loss: 0.0329(train)	|	Acc: 71.1%(train)
	Loss: 0.0001(valid)	|	Acc: 71.9%(valid)
Epoch: 4  | time in 0 minutes, 13 seconds
	Loss: 0.0290(train)	|	Acc: 73.7%(train)
	Loss: 0.0001(valid)	|	Acc: 73.8%(valid)
Epoch: 5  | time in 0 minutes, 13 seconds
	Loss: 0.0267(train)	|	Acc: 75.3%(train)
	Loss: 0.0001(valid)	|	Acc: 74.4%(valid)


## Test the model:

In [8]:
def predict(name_norm: str) -> str:
    with torch.no_grad():
        text = dataset.tokenizer.encode(name_norm)
        text = torch.tensor(text).to(device)
        output = model(text, torch.tensor([0]).to(device))
        return dataset.decode_target(output.argmax(1).item())

In [10]:
tmp = dataset.test.sample(10)
tmp

Unnamed: 0,name_norm,category,target
27600,биойогурт питьевой черная смородина,"Молоко, сыр, яйца",9
29972,"зефир ""сладкие истории"" вкусом крем брюле негл...","Хлеб, сладости, снеки",19
27110,зубная паста морские минералы,"Красота, гигиена, бытовая химия",7
21664,джем крыжовниковый,"Соусы, орехи, консервы",16
9461,колбаса охотничья сырокопченая,"Птица, мясо, деликатесы",14
28063,корм собак вкусные потрошки говядина сердце,Зоотовары,6
24537,хлопья микс органические,"Макароны, крупы, специи",8
16145,краска волос карамель,"Красота, гигиена, бытовая химия",7
22289,био йогурт питьевой злаками,"Молоко, сыр, яйца",9
44658,подарочный набор совершенство,"Красота, гигиена, бытовая химия",7


In [11]:
for name, pred in zip(tmp['name_norm'], tmp['name_norm'].apply(predict)):
    print(f'{name} --> {pred}')

биойогурт питьевой  черная смородина --> Молоко, сыр, яйца
зефир "сладкие истории" вкусом крем брюле неглазированный --> Хлеб, сладости, снеки
зубная паста  морские минералы --> Красота, гигиена, бытовая химия
джем крыжовниковый --> Товары для дома и дачи
колбаса охотничья сырокопченая --> Птица, мясо, деликатесы
корм собак вкусные потрошки говядина сердце --> Зоотовары
хлопья   микс органические --> Макароны, крупы, специи
краска волос  карамель --> Красота, гигиена, бытовая химия
био йогурт питьевой злаками --> Молоко, сыр, яйца
подарочный набор совершенство --> Красота, гигиена, бытовая химия


In [14]:
dataset.le.classes_

array(['Алкоголь', 'Бытовая техника', 'Воды, соки, напитки',
       'Дача и гриль', 'Другое', 'Замороженные продукты', 'Зоотовары',
       'Красота, гигиена, бытовая химия', 'Макароны, крупы, специи',
       'Молоко, сыр, яйца', 'Овощи, фрукты, ягоды',
       'Подборки и готовые блюда', 'Постные продукты', 'Посуда',
       'Птица, мясо, деликатесы', 'Рыба, икра', 'Соусы, орехи, консервы',
       'Товары для дома и дачи', 'Товары для мам и детей',
       'Хлеб, сладости, снеки', 'Чай, кофе, сахар'], dtype=object)

## Save model

In [25]:
torch.save(model.state_dict(), 'receipt_parser/models/baseline_model.pth')

## Load to use

In [98]:
class PredictCategory:
    def __init__(self, path_to_bpe: str, path_to_model: str, model_params: Dict[str, int]):
        self.bpe_model = yttm.BPE(path_to_bpe)
        self.categories: List[str] = [
            'Алкоголь', 'Бытовая техника', 'Воды, соки, напитки',
           'Дача и гриль', 'Другое', 'Замороженные продукты', 'Зоотовары',
           'Красота, гигиена, бытовая химия', 'Макароны, крупы, специи',
           'Молоко, сыр, яйца', 'Овощи, фрукты, ягоды',
           'Подборки и готовые блюда', 'Постные продукты', 'Посуда',
           'Птица, мясо, деликатесы', 'Рыба, икра', 'Соусы, орехи, консервы',
           'Товары для дома и дачи', 'Товары для мам и детей',
           'Хлеб, сладости, снеки', 'Чай, кофе, сахар'
        ]
        self.device = torch.device("cpu")
        self.model = CategoryClassifier(**model_params)
        self.model.load_state_dict(torch.load(path_to_model, map_location=self.device))
        self.model.eval()
        
    def predict(self, name_norm: str) -> str:
        """Predict category by name norm."""
        
        text = self.bpe_model.encode(name_norm)
        text = torch.tensor(text).to(self.device)
        output = self.model(text, torch.tensor([0]).to(self.device))
        return self.categories[output.argmax(1).item()]
        

In [102]:
# Usage

params = {
    'num_class': 21,
    'embed_dim': 50,
    'vocab_size': 500
}

predictor = PredictCategory('receipt_parser/models/train_bpe_model.yttm', 'receipt_parser/models/baseline_model.pth', params)
predictor.predict('джем')

'Хлеб, сладости, снеки'