In [1]:
# Standard library.
# NOTE(review): `os`, `numpy` and `pdb` are not used in the cells visible here —
# confirm whether they are still needed.
import os
import re

# PyTorch; seed its global random number generator right after import.
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

# Progress bar widget for notebooks, plus general utilities.
from tqdm.notebook import tqdm
import numpy as np
import pdb

# NLTK: download the 'machado' corpus package, then import its reader.
# The download has to run before the corpus can be read.
import nltk
nltk_id = 'machado'
nltk.download(nltk_id)
from nltk.corpus import machado

# Model, training helpers and character-level tokenizer
# (presumably project-local modules — verify against the repository layout).
from model import GPTConfig, GPT
from train import get_batch, estimate_loss
from data.char_level import CharEncDec

[nltk_data] Downloading package machado to
[nltk_data]     C:\Users\fernandesr\AppData\Roaming\nltk_data...
[nltk_data]   Package machado is already up-to-date!


In [2]:
# Load the Machado de Assis corpus as a single string and apply basic cleaning:
#   1. any character that is not a letter (ASCII or accented, per the ranges
#      below), one of ". ! ?", or a newline is replaced by a space;
#   2. runs of consecutive spaces are collapsed into a single space.
unwanted_chars = re.compile('[^A-Za-zÀ-ÖØ-öø-ÿ.!?\\\n]')
space_runs = re.compile(' +')
text = space_runs.sub(' ', unwanted_chars.sub(' ', machado.raw()))

In [3]:
# Report the size of the cleaned corpus, in characters.
n_chars = len(text)
print("length of dataset in characters: ", n_chars)

length of dataset in characters:  14413687


In [4]:
# Peek at a 1000-character excerpt of the corpus (characters 1000 to 1999).
excerpt = text[1000:2000]
print(excerpt)

as tranças loiras. A moça em questão deve ser
vaporosa e ideal como uma criação de Shakespeare deve ser o contraste do roastbeef
britânico com que se alimenta a liberdade do Reino Unido. Uma tal Miss
Dollar deve ter o poeta Tennyson de cor e ler Lamartine no original se
souber o português deve deliciar se com a leitura dos sonetos de Camões ou os Cantos
de Gonçalves Dias. O chá e o leite devem ser a alimentação de semelhante
criatura adicionando se lhe alguns confeitos e biscoitos para acudir às
urgências do estômago. A sua fala deve ser um murmúrio de harpa eólia o seu
amor um desmaio a sua vida uma contemplação a sua morte um suspiro.

A figura é poética mas não é a da
heroína do romance.

Suponhamos que o leitor não é dado a
estes devaneios e melancolias nesse caso imagina uma Miss Dollar totalmente
diferente da outra. Desta vez será uma robusta americana vertendo sangue pelas
faces formas arredondadas olhos vivos e ardentes mulher feita refeita e
perfeita. Amiga da boa mesa e do bo

In [5]:
# Build the character-level tokenizer from the corpus; it transforms
# characters into numbers. Then show its vocabulary and the vocabulary size.
tokenizer = CharEncDec(text)
vocab_string = ''.join(tokenizer.chars)
print('vocab: ', vocab_string)
print('vocab length: ', tokenizer.vocab_size)

vocab:  
 !.?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÇÈÉÊËÍÓÔÕÚÛÜàáâãäçèéêëìíîïñòóôõöùúûü
vocab length:  97


In [6]:
# Round-trip check of the tokenizer on the first 500 characters: print the
# integer ids produced by encode(), then print the text recovered by decode().
sample_ids = tokenizer.encode(text[:500])
print(sample_ids)
print(tokenizer.decode(sample_ids))

[7, 45, 44, 50, 45, 1, 7, 45, 44, 50, 45, 49, 1, 10, 42, 51, 43, 39, 44, 35, 44, 49, 35, 49, 1, 0, 0, 7, 45, 44, 50, 45, 49, 1, 10, 42, 51, 43, 39, 44, 35, 44, 49, 35, 49, 0, 0, 24, 35, 54, 50, 45, 1, 36, 45, 44, 50, 35, 1, 0, 0, 19, 32, 48, 31, 1, 7, 45, 43, 46, 42, 35, 50, 31, 1, 17, 31, 33, 38, 31, 34, 45, 1, 34, 35, 1, 5, 49, 49, 39, 49, 1, 52, 45, 42, 3, 1, 13, 13, 1, 0, 0, 22, 39, 45, 1, 34, 35, 1, 14, 31, 44, 35, 39, 48, 45, 1, 18, 45, 52, 31, 1, 5, 37, 51, 39, 42, 31, 48, 1, 3, 0, 0, 20, 51, 32, 42, 39, 33, 31, 34, 45, 1, 45, 48, 39, 37, 39, 44, 31, 42, 43, 35, 44, 50, 35, 1, 46, 35, 42, 31, 0, 9, 34, 39, 50, 45, 48, 31, 1, 11, 31, 48, 44, 39, 35, 48, 1, 22, 39, 45, 1, 34, 35, 1, 14, 31, 44, 35, 39, 48, 45, 1, 35, 43, 1, 3, 0, 0, 66, 18, 8, 13, 7, 9, 0, 0, 17, 13, 23, 23, 1, 8, 19, 16, 16, 5, 22, 0, 0, 16, 25, 66, 23, 0, 23, 19, 5, 22, 9, 23, 0, 0, 5, 1, 17, 25, 16, 12, 9, 22, 1, 8, 9, 0, 20, 22, 9, 24, 19, 0, 0, 19, 0, 23, 9, 11, 22, 9, 8, 19, 1, 8, 9, 1, 5, 25, 11, 25, 23, 24

In [7]:
# Encode the whole corpus into a tensor of integer token ids for PyTorch,
# then split it in order: the first 90% is the training set and the
# remaining 10% is the validation set.
data = torch.tensor(tokenizer.encode(text), dtype=torch.long)
n = int(0.9*len(data))  # index where the validation part starts
train_data, val_data = data[:n], data[n:]

In [17]:
# --- Training hyperparameters ------------------------------------------------
batch_size = 32       # how many independent sequences are processed in parallel
block_size = 128      # the maximum context available to generate the next char
max_iters = 15000     # iterations of gradient descent; higher takes longer
eval_interval = 500   # interval (in iterations) to report train/val loss
eval_iters = 200      # number of batches used to calculate train/val loss
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Model architecture ------------------------------------------------------
config = GPTConfig()
config.block_size = block_size
config.vocab_size = tokenizer.vocab_size
config.n_layer = 3    # how many attention blocks
config.n_head = 4     # number of attention heads
config.n_embd = 32    # embedding dimension
config.dropout = 0.1  # regularization

# Build the model and move it to the selected device.
model = GPT(config)
m = model.to(device)

# Smoke test before any training: one forward pass on a training batch,
# printing the shape of the logits and the initial loss.
xb, yb = get_batch(train_data, batch_size, block_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate 100 new tokens from the untrained model, starting from token id 0,
# and report the model's parameter count.
start_idx = torch.zeros((1, 1), dtype=torch.long, device=device)
untrained_ids = m.generate(idx = start_idx, max_new_tokens=100)[0].tolist()
print('generation without training: ', tokenizer.decode(untrained_ids))
print('Num params: ', m.get_num_params())

torch.Size([4096, 97])
tensor(4.6934, grad_fn=<NllLossBackward0>)
generation without training:  
nNWõirrÕIwJjùTÛKiÈìÓQaËÍnoiÊvMÀckì!yÍxÀytÓáQSrIVszQv?çöÔÔ.úzfLÓwBMÕeRèàAéFÇoõUòGíbóKòbZNsDÃÀuvzbIFõw
Num params:  44193


In [18]:
# Train the model with AdamW for `max_iters` gradient-descent steps.
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

# The loop variable is called `step` so that the builtin `iter` is not
# shadowed in the kernel namespace once this cell has run.
for step in tqdm(range(max_iters)):

    # Every `eval_interval` steps, and at the start of the final step,
    # estimate and report the loss on the train and val sets.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(m, eval_iters, train_data, val_data, batch_size, block_size)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch(train_data, batch_size, block_size)

    # forward pass: evaluate the loss on this batch
    logits, loss = m(xb, yb)

    # backward pass and parameter update; gradients are cleared first so
    # they do not accumulate across steps
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/15000 [00:00<?, ?it/s]

step 0: train loss 4.6947, val loss 4.7074
step 500: train loss 2.2847, val loss 2.3682
step 1000: train loss 2.1001, val loss 2.2139
step 1500: train loss 2.0080, val loss 2.1214
step 2000: train loss 1.9538, val loss 2.0791
step 2500: train loss 1.9302, val loss 2.0506
step 3000: train loss 1.9135, val loss 2.0423
step 3500: train loss 1.9032, val loss 2.0347
step 4000: train loss 1.8922, val loss 2.0254
step 4500: train loss 1.8739, val loss 2.0063
step 5000: train loss 1.8666, val loss 2.0013
step 5500: train loss 1.8621, val loss 1.9908
step 6000: train loss 1.8571, val loss 1.9962
step 6500: train loss 1.8476, val loss 1.9818
step 7000: train loss 1.8503, val loss 1.9898
step 7500: train loss 1.8441, val loss 1.9833
step 8000: train loss 1.8430, val loss 1.9803
step 8500: train loss 1.8327, val loss 1.9769
step 9000: train loss 1.8302, val loss 1.9756
step 9500: train loss 1.8259, val loss 1.9763
step 10000: train loss 1.8336, val loss 1.9780
step 10500: train loss 1.8262, val lo

In [22]:
# Run this cell to check the generated text; it can be run multiple times to
# see different text generated.

# The start token (id 0) is created on `device`, the same device the model was
# moved to; a CPU tensor would fail against a model living on CUDA.
start_idx = torch.zeros((1, 1), dtype=torch.long, device=device)

# Generate in eval mode (dropout disabled) and without tracking gradients,
# then restore the mode the model was in so training can be resumed afterwards.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        generated_ids = m.generate(idx=start_idx, max_new_tokens=500)[0].tolist()
finally:
    m.train(was_training)

print(tokenizer.decode(generated_ids))


muna o cundo lhenterdo um se
suarfre que livro. Elumar volimar estalves o Rumèração e abiluto febaramenos da Rá Raglusão que o sei reparo assofans
 manhm recandens derias os um coistados 
compledesão sestur de Sr. Ne a fiúbios
por
princiavo ctãole e elem le iromigo.

Gandinho

Tantes o acaríso como fasticunos tinhas semas mes ambaira se é a pretendura. Talbeicato. no a riemadar esta e purós ido escê lemecia que sedorava dos dáros.

Que perniarias que tempos aperaçande teres em alte mum um
mais m
