## Predvidanje brojeva uz transformere

## Custom dataset

In [1]:
intToStrMap = {
    0: "nula",
    1: "jedan",
    2: "dva",
    3: "tri",
    4: "četiri",
    5: "pet",
    6: "šest",
    7: "sedam",
    8: "osam",
    9: "devet",
    10: "deset",
    11: "jedanaest",
    12: "dvanaest",
    13: "trinaest",
    14: "četrnaest",
    15: "petnaest",
    16: "šesnaest",
    17: "sedamnaest",
    18: "osamnaest",
    19: "devetnaest",
    20: "dvadeset",
    30: "trideset",
    40: "četrdeset",
    50: "pedeset",
    60: "šezdeset",
    70: "sedamdeset",
    80: "osamdeset",
    90: "devedeset",
    100: "sto",
    200: "dvjesto",
    300: "tristo",
    400: "četiristo",
    500: "petsto",
    600: "šesto",
    700: "sedamsto",
    800: "osamsto",
    900: "devetsto",
}

koliko = intToStrMap.copy()
koliko[1] = "jedna"
koliko[2] = "dvije"



def intToStr(x: int, kol=False):
    if kol and x <= 20: return koliko[x]
    if x <= 20: return intToStrMap[x]
    if x < 100: return f"{intToStrMap[x//10 * 10]}{' '+intToStr(x%10, kol) if x%10 else ''}"
    if x < 1000: return f"{intToStrMap[x//100 * 100]}{' '+intToStr(x%100, kol) if x%100 else ''}"
    if x <= 100000: 
        tisucice = 'tisuću' if x//1000 == 1 else f"{intToStr(x//1000, True)} tisuć{'e' if (x//1000)%10 < 5 else 'a'}"
        return f"{tisucice}{' '+intToStr(x%1000) if x%1000 else ''}"
    raise Exception("Didnt except number bigger than 10000")

In [2]:
text = ', '.join(intToStr(i) for i in range(1,100001))

## Transformer model

In [3]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import math
import os
from tempfile import TemporaryDirectory
from typing import Tuple

import torch
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset
import torchtext

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Tokenizer
Using BPE tokenizer

In [5]:
import os
import tokenizers
from tokenizers import trainers, Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers import decoders
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [6]:
# Create a Tokenizer
tokenizer = Tokenizer(BPE())

# Define a pre-tokenizer to split text into words (Croatian usually uses whitespace)
tokenizer.pre_tokenizer = Whitespace()

# Initialize a trainer for BPE
trainer = BpeTrainer(special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])

# Tokenize the text and add it to the trainer
tokenizer.train_from_iterator([text], trainer=trainer)

# Save the trained tokenizer and its vocabulary
tokenizer.save("custom_croatian_tokenizer.json")






In [7]:
# Load the trained tokenizer
tokenizer = Tokenizer.from_file("custom_croatian_tokenizer.json")

# Tokenize a sentence
sentence = text[:100]
encoding = tokenizer.encode(sentence)

# Print token IDs and tokens
print("Token IDs:", encoding.ids)
print("Tokens:", encoding.tokens)

# Decode the tokens
decoded_text = encoding.tokens
decoded_text = decoders.BPEDecoder().decode(decoded_text)
print("Decoded Text:", decoded_text)

Token IDs: [85, 5, 50, 5, 48, 5, 71, 5, 51, 5, 70, 5, 45, 5, 44, 5, 52, 5, 27, 5, 96, 5, 92, 5, 91, 5, 55, 54]
Tokens: ['jedan', ',', 'dva', ',', 'tri', ',', 'četiri', ',', 'pet', ',', 'šest', ',', 'sedam', ',', 'osam', ',', 'devet', ',', 'deset', ',', 'jedanaest', ',', 'dvanaest', ',', 'trinaest', ',', 'četr', 'na']
Decoded Text: jedan,dva,tri,četiri,pet,šest,sedam,osam,devet,deset,jedanaest,dvanaest,trinaest,četrna


In [8]:
len(text)

4039009

In [39]:
class CustomNumberDataset(Dataset):
    def __init__(self, text, tokenizer, seq_len):
        self.tokenizer = tokenizer
        self.text = tokenizer.encode(text)
        self.textids = torch.tensor(self.text.ids, device=device)
        self.seq_len = seq_len

    def __len__(self):
        return (len(self.text) - self.seq_len)

    def __getitem__(self, idx):
        inp = self.textids[idx: idx+self.seq_len]
        target = self.textids[idx+1: idx+self.seq_len+1]
        return inp, target

In [40]:
tokenizer.encode('jedan')

Encoding(num_tokens=1, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [41]:
seq_len = 50
batch_size = 64
ds = CustomNumberDataset(text, tokenizer, seq_len)
dl = DataLoader(ds, batch_size=batch_size, shuffle=True)

In [77]:
for i, batch in enumerate(dl):
    inp, target = batch
    print(len(batch), inp.shape, target.shape)
    print(len(batch), inp.shape, target.reshape(-1))
    print('>> Input: ', tokenizer.decode(list(inp[0])))
    print('>> Target: ', tokenizer.decode(list(target[0])))
    break

2 torch.Size([64, 50]) torch.Size([64, 50])
2 torch.Size([64, 50]) tensor([61, 44,  5,  ..., 75, 64,  5], device='cuda:0')
>> Input:  tristo devedeset osam , osamdeset pet tisuća tristo devedeset devet , osamdeset pet tisuća četiristo , osamdeset pet tisuća četiristo jedan , osamdeset pet tisuća četiristo dva , osamdeset pet tisuća četiristo tri , osamdeset pet tisuća četiristo četiri , osamdeset pet tisuća četiristo pet , osamdeset pet tisuća četiristo
>> Target:  devedeset osam , osamdeset pet tisuća tristo devedeset devet , osamdeset pet tisuća četiristo , osamdeset pet tisuća četiristo jedan , osamdeset pet tisuća četiristo dva , osamdeset pet tisuća četiristo tri , osamdeset pet tisuća četiristo četiri , osamdeset pet tisuća četiristo pet , osamdeset pet tisuća četiristo šest


In [13]:
vocab = tokenizer.get_vocab()
len(vocab)

99

## Create model

In [48]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, ntoken]``
        """
        debug = False
        if debug: print(1, src.shape)
        src = self.embedding(src) * math.sqrt(self.d_model)
        if debug: print(2, src.shape)
        src = self.pos_encoder(src)
        if debug: print(3, src.shape)
        output = self.transformer_encoder(src, src_mask)
        if debug: print(4, output.shape)
        output = self.linear(output)
        if debug: print(5, output.shape)
        return output

In [49]:
ntokens = len(vocab)  # size of vocabulary
emsize = 100  # embedding dimension
d_hid = 50  # dimension of the feedforward network model in ``nn.TransformerEncoder``
nlayers = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
nhead = 2  # number of heads in ``nn.MultiheadAttention``
dropout = 0.2  # dropout probability
model = TransformerModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [50]:
model(torch.tensor([51, 42, 75, 62, 70,  5, 66, 51, 42, 75, 62, 45,  5, 66, 51, 42, 75, 62,
        44,  5, 66, 51, 42, 75, 62, 52,  5, 66, 51, 42, 75, 61,  5, 66, 51, 42,
        75, 61, 85,  5, 66, 51, 42, 75, 61],
       device='cuda:0').unsqueeze(0))

tensor([[[-0.4242, -0.5490, -0.1069,  ..., -0.6378,  0.1903,  0.9390],
         [-0.5654,  0.5119, -0.0684,  ...,  0.6576,  0.2072, -0.0461],
         [ 0.4379,  0.3032,  0.9007,  ...,  0.1713, -0.4838,  1.2636],
         ...,
         [-0.1289,  0.6524, -0.2970,  ...,  0.5821,  0.4482,  0.2781],
         [ 0.2764,  0.1889, -0.1402,  ..., -0.0114,  0.2077,  0.6134],
         [-0.1798,  0.5263,  0.2828,  ..., -0.6464, -0.5283, -0.0115]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [174]:
def multi_acc(y_pred, y_test):
    y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
    _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    

    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)

    acc = torch.round(acc * 100)

    return acc

In [178]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

model.train()

print()
epochs = 1
for epoch in range(epochs):
    correct, total = 0, 1
    for i, (inputs, targets) in enumerate(dl):
        inputs, targets = inputs.mT, targets.mT.reshape(-1)
        if i%500==0:
            print(f"Epoch: {epoch+1} - {i}/{len(dl)}, Loss: {loss.item()}, Accuracy {correct/total}")
            correct, total = 0, 1
        optimizer.zero_grad()
        output = model(inputs)
        output_flat = output.view(-1, ntokens)
        correct += multi_acc(output_flat, targets)
        total += 1
        loss = criterion(output_flat, targets)
        loss.backward()
        optimizer.step()


Epoch: 1 - 0/9843, Loss: 0.03940067067742348, Accuracy 0.0
Epoch: 1 - 500/9843, Loss: 0.04529324173927307, Accuracy 98.63672637939453
Epoch: 1 - 1000/9843, Loss: 0.036231495440006256, Accuracy 98.65469360351562
Epoch: 1 - 1500/9843, Loss: 0.03598722442984581, Accuracy 98.6467056274414
Epoch: 1 - 2000/9843, Loss: 0.05168401822447777, Accuracy 98.63273620605469
Epoch: 1 - 2500/9843, Loss: 0.044929951429367065, Accuracy 98.65469360351562
Epoch: 1 - 3000/9843, Loss: 0.04474151134490967, Accuracy 98.65269470214844
Epoch: 1 - 3500/9843, Loss: 0.046152301132678986, Accuracy 98.6467056274414
Epoch: 1 - 4000/9843, Loss: 0.028442008420825005, Accuracy 98.6487045288086
Epoch: 1 - 4500/9843, Loss: 0.06017894297838211, Accuracy 98.64071655273438
Epoch: 1 - 5000/9843, Loss: 0.03350934013724327, Accuracy 98.65668487548828
Epoch: 1 - 5500/9843, Loss: 0.03283650055527687, Accuracy 98.65269470214844
Epoch: 1 - 6000/9843, Loss: 0.03433560952544212, Accuracy 98.67465209960938
Epoch: 1 - 6500/9843, Loss: 

In [199]:
len(text)

4039009

In [200]:
text[1039009: 1039009+1000]

' tisuća šesto šezdeset pet, dvadeset osam tisuća šesto šezdeset šest, dvadeset osam tisuća šesto šezdeset sedam, dvadeset osam tisuća šesto šezdeset osam, dvadeset osam tisuća šesto šezdeset devet, dvadeset osam tisuća šesto sedamdeset, dvadeset osam tisuća šesto sedamdeset jedan, dvadeset osam tisuća šesto sedamdeset dva, dvadeset osam tisuća šesto sedamdeset tri, dvadeset osam tisuća šesto sedamdeset četiri, dvadeset osam tisuća šesto sedamdeset pet, dvadeset osam tisuća šesto sedamdeset šest, dvadeset osam tisuća šesto sedamdeset sedam, dvadeset osam tisuća šesto sedamdeset osam, dvadeset osam tisuća šesto sedamdeset devet, dvadeset osam tisuća šesto osamdeset, dvadeset osam tisuća šesto osamdeset jedan, dvadeset osam tisuća šesto osamdeset dva, dvadeset osam tisuća šesto osamdeset tri, dvadeset osam tisuća šesto osamdeset četiri, dvadeset osam tisuća šesto osamdeset pet, dvadeset osam tisuća šesto osamdeset šest, dvadeset osam tisuća šesto osamdeset sedam, dvadeset osam tisuća šes

In [214]:
data = 'tisuća šesto šezdeset pet, dvadeset osam tisuća šesto šezdeset šest, dvadeset osam tisuća šesto šezdeset sedam, dvadeset osam tisuća šesto šezdeset osam, dvadeset osam tisuća šesto šezdeset devet, dvadeset osam tisuća šesto sedamdeset, dvadeset osam tisuća šesto sedamdeset jedan, dvadeset osam tisuća šesto'

def predict(data, pred_len=10):
    preds = []
    inp = torch.tensor(tokenizer.encode(data).ids, device=device).unsqueeze(0)
    print(inp.shape)
    for i in range(pred_len):
        with torch.no_grad():
            inp = inp[:, :seq_len]
#             print('>', tokenizer.decode(list(inp.squeeze())))
            out = model(inp.mT)
#             print(inp[0, -5:])
            out = out.log_softmax(dim=2).argmax(dim=2).squeeze()
#             print(out.shape)
            inp = torch.roll(inp, -1)
            inp[0, -1] = out[-1]
#             print(out[-1])
            preds.append(out[-1])
#             for i in range(len(inp)):
#                 print(tokenizer.decode(list([inp[i]])), tokenizer.decode(list([out[i]])))
    return preds
tokenizer.decode(predict(data, 20))

torch.Size([1, 50])


'sedamdeset jedan , dvadeset osam tisuća šesto sedamdeset dva , dvadeset osam tisuća šesto sedamdeset tri , dvadeset osam tisuća'

In [131]:
torch.roll(torch.tensor([1,2,3]), -1)

tensor([2, 3, 1])