In [1]:
from pathlib import Path
from slanggen import datatools
from slanggen import models
import torch

In [2]:
processed_words = datatools.load_data(Path("../assets/straattaal.txt"))

[32m2024-06-20 16:14:10.908[0m | [1mINFO    [0m | [36mslanggen.datatools[0m:[36mload_data[0m:[36m54[0m - [1mLoading processed words from ../assets/straattaal.txt[0m


In [3]:

tokenizer = models.buildBPE(corpus=processed_words, vocab_size=100)






In [4]:
list(tokenizer.get_vocab())

['on',
 'ge',
 '</s>',
 'h',
 'i',
 't',
 'ff',
 'v',
 'q',
 'ek',
 'bo',
 '<pad>',
 '!',
 'je',
 'la',
 'Ã',
 'be',
 'rie',
 'ra',
 'ta',
 'am',
 'u',
 'me',
 'ma',
 'chi',
 '7',
 'ko',
 'k',
 'ak',
 'Ġ',
 'ch',
 'm',
 'w',
 'z',
 '<s>',
 'sh',
 'd',
 '!</',
 'at',
 'ne',
 'c',
 'ga',
 'gga',
 'kie',
 'se',
 'ª',
 'ka',
 'pa',
 'sc',
 'ba',
 'en',
 'ss',
 'oe',
 's',
 '</',
 'el',
 'ro',
 'lla',
 'li',
 'an',
 'o',
 'r',
 '>',
 'j',
 'n',
 '.',
 'a',
 'g',
 'x',
 'er',
 'e',
 '/',
 'ke',
 'pp',
 'na',
 'bi',
 'ki',
 'koe',
 'ti',
 'ie',
 'et',
 'ri',
 'y',
 'll',
 '?',
 'b',
 'wi',
 "'",
 '9',
 'l',
 'ken',
 '<mask>',
 '3',
 'p',
 'f',
 'wa',
 '-',
 'to',
 '<unk>',
 '<']

In [5]:
enc = tokenizer.encode("waggie")
enc.tokens

['wa', 'g', 'g', 'ie']

In [6]:
tokenizer.decode(enc.ids)

'waggie'

In [7]:
padded_sequences = datatools.preprocess(processed_words, tokenizer)
padded_sequences

tensor([[ 1, 16, 44,  ...,  0,  0,  0],
        [ 1, 21, 16,  ...,  0,  0,  0],
        [ 1, 21, 51,  ...,  0,  0,  0],
        ...,
        [ 1, 27, 36,  ...,  0,  0,  0],
        [ 1, 23, 20,  ...,  0,  0,  0],
        [ 1, 37, 16,  ...,  0,  0,  0]])

In [8]:
dataset = datatools.ShiftedDataset(padded_sequences)
dataset

ShiftedDataset torch.Size([453, 22])

In [9]:
# import torch dataloader
from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=16, shuffle=True)
x, y = next(iter(loader))
x.shape, y.shape

(torch.Size([16, 22]), torch.Size([16, 22]))

In [10]:
for x, y in loader:
    print(x.shape, y.shape)

torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([16, 22])
torch.Size([16, 22]) torch.Size([1

In [11]:
# Define the vocab size based on the tokenizer
vocab_size = tokenizer.get_vocab_size()
vocab_size

100

In [12]:
from torch import nn, optim
# Hyperparameters
config = {
    "vocab_size": vocab_size,
    "embedding_dim": 64,
    "hidden_dim": 64,
    "num_layers": 2,
    "output_dim": vocab_size,
}

model = models.SlangRNN(config)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)


In [13]:
x, y = dataset[0]
x.shape, y.shape

(torch.Size([22]), torch.Size([22]))

In [14]:
x, y = next(iter(loader))
x.shape, y.shape

(torch.Size([16, 22]), torch.Size([16, 22]))

In [15]:
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=50, min_lr=1e-4)


In [16]:
optimizer = optim.Adam(model.parameters(), lr=0.1)

In [17]:
from loguru import logger
import torch

epochs = 800
history = []
last_lr = 0

for epoch in range(epochs):
    loss = 0

    for x, y in loader:
        optimizer.zero_grad()
        hidden = model.init_hidden(x)
        # input_seq, target_seq = dataset[i]

        output, hidden = model(x, hidden)

        loss += loss_fn(output.view(-1, vocab_size), y.view(-1))

    loss.backward()
    optimizer.step()
    scheduler.step(loss)
    history.append(loss.item())
    curr_lr = scheduler.get_last_lr()

    if (epoch+1) % 10 == 0:
        logger.info(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
        if last_lr != curr_lr:
            last_lr = curr_lr
            logger.info(f"Current learning rate: {curr_lr}")


[32m2024-06-20 16:14:14.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [10/1000], Loss: 24.7057[0m
[32m2024-06-20 16:14:14.484[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [1mCurrent learning rate: [0.1][0m
[32m2024-06-20 16:14:15.273[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [20/1000], Loss: 22.7468[0m
[32m2024-06-20 16:14:16.014[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [30/1000], Loss: 21.5703[0m
[32m2024-06-20 16:14:16.735[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [40/1000], Loss: 20.5458[0m
[32m2024-06-20 16:14:17.406[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [50/1000], Loss: 19.8953[0m
[32m2024-06-20 16:14:18.098[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mEpoch [60/1000], Loss: 19.4602[0m


In [70]:
scheduler.get_last_lr()

[1.0000000000000006e-12]

In [21]:
torch.save(model.state_dict(), "../artefacts/model.pth")

In [22]:
model = models.SlangRNN(config)
model.load_state_dict(torch.load("../artefacts/model.pth"))

<All keys matched successfully>

In [23]:
start_letter = 'a'
start_token_idx = tokenizer.encode("<s>").ids[0]
start_letter_idx = tokenizer.encode(start_letter).ids[0]
input_seq = torch.tensor([[start_token_idx, start_letter_idx]], dtype=torch.long)

generated_word = [start_letter_idx]

In [24]:
start_letter = 'a'
max_length = 20
temperature = 1.0
start_token_idx = tokenizer.encode("<s>").ids[0]
start_letter_idx = tokenizer.encode(start_letter).ids[0]
input_seq = torch.tensor([[start_token_idx, start_letter_idx]], dtype=torch.long)

generated_word = [start_letter_idx]
hidden = model.init_hidden(input_seq)
for _ in range(max_length - 1):
    with torch.no_grad():
        output, hidden = model(input_seq, hidden)
    output = output.squeeze(0)
    output = output[-1, :].view(-1).div(temperature).exp()
    next_token = torch.multinomial(output, 1).item()
    if next_token == tokenizer.token_to_id("<pad>"):
        break
    generated_word.append(next_token)
    input_seq = torch.tensor([generated_word], dtype=torch.long)

In [25]:
tokenizer.decode(generated_word)

'appet'

In [26]:
hidden = model.init_hidden(input_seq)
output, hidden = model(input_seq, hidden)

In [27]:
output.shape, hidden.shape

(torch.Size([1, 4, 100]), torch.Size([2, 1, 64]))

In [35]:
models.sample_n(processed_words, n=10, model=model, tokenizer=tokenizer, max_length=20, temperature=1.0)

['gu', 'bakts', 'vu', 'rui', 'otybolla', 'ski', 'ai']

In [29]:
tokenizer.save("../artefacts/tokenizer.json")

In [30]:
torch.save(model, "../artefacts/model.pth")

In [33]:
import json

with open("../artefacts/config.json", "r") as f:
    config = json.load(f)

config

{'data': {'assets_dir': 'assets',
  'artefacts_dir': 'artefacts',
  'filename': 'straattaal.txt'},
 'model': {'embedding_dim': 64, 'hidden_dim': 64, 'num_layers': 2},
 'training': {'epochs': 500, 'learning_rate': 0.01}}