# SHAKESPEARE, CERVANTES

This is a small project that uses the transformer architecture to generate new text in the style of Shakespeare or Cervantes.

In [3]:
# Download the Tiny Shakespeare corpus into ../data/, the directory the loading cell reads from.
# -f: fail on HTTP errors instead of saving an error page; -L: follow redirects;
# --create-dirs: create ../data/ if it does not exist yet.
!curl -fL --create-dirs https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt -o ../data/shakespeare.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0   113k      0  0:00:09  0:00:09 --:--:-- 99272


In [5]:
# Download Don Quixote into ../data/, the directory the loading cell reads from.
# -f: fail on HTTP errors instead of saving an error page; -L: follow redirects;
# --create-dirs: create ../data/ if it does not exist yet.
!curl -fL --create-dirs https://raw.githubusercontent.com/ajmaradiaga/cervantes-text-generation/master/dataset/DonQuixote.txt -o ../data/DonQuixote.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2133k  100 2133k    0     0   652k      0  0:00:03  0:00:03 --:--:--  652k


In [9]:
# Download the Exemplary Novels into ../data/, the directory the loading cell reads from.
# -f: fail on HTTP errors instead of saving an error page; -L: follow redirects;
# --create-dirs: create ../data/ if it does not exist yet.
!curl -fL --create-dirs https://raw.githubusercontent.com/ajmaradiaga/cervantes-text-generation/master/dataset/ExemplaryNovels.txt -o ../data/ExemplaryNovels.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  986k  100  986k    0     0   782k      0  0:00:01  0:00:01 --:--:--  782k


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32      # number of sequences sampled per batch
block_size = 8       # context length: tokens per sequence
max_iters = 3000     # number of optimizer steps in the training loop
eval_interval = 300  # report the estimated loss every this many steps
lr = 0.01            # learning rate handed to the optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200     # batches averaged per split when estimating the loss
# Seed PyTorch's RNG so batch sampling, weight initialisation and text
# generation are reproducible after "Restart Kernel -> Run All".
seed = 1337
torch.manual_seed(seed)
#--------------------------------------------

In [None]:
# Corpora available under ../data/; only the first entry is loaded here.
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# as-is because other cells may refer to it.
input = ['shakespeare.txt', 'DonQuixote.txt', 'ExemplaryNovels.txt']
corpus_path = '../data/' + input[0]
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Peek at the first 1000 characters of the corpus.
preview = text[:1000]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
vocab_length = len(vocab)
print(f"{vocab_length} {''.join(vocab)}")

65 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [5]:
# Lookup tables between characters and integer token ids (positions in `vocab`).
itoc = dict(enumerate(vocab))
ctoi = {ch: idx for idx, ch in itoc.items()}


def encode(s):
    """Return the list of token ids for the characters of ``s``, in order."""
    return [ctoi[ch] for ch in s]


def decode(l):
    """Return the string spelled by the token ids in ``l``."""
    return ''.join(itoc[idx] for idx in l)

In [6]:
# Character frequency table, printed from the rarest character to the most common.
counts = dict.fromkeys(vocab, 0)
for ch in text:
    counts[ch] += 1

by_frequency = sorted(counts, key=counts.get)
print('\n'.join(f'{ch}: {counts[ch]}' for ch in by_frequency))

$: 1
&: 3
3: 27
X: 112
Z: 198
Q: 231
J: 320
z: 356
x: 529
q: 609
j: 628
V: 798
K: 1584
P: 1641
Y: 1718
F: 1797
-: 1897
D: 2089
!: 2172
G: 2399
?: 2462
B: 2761
M: 2840
H: 3068
U: 3313
W: 3530
;: 3628
C: 3820
L: 3876
S: 4523
R: 4869
N: 5079
O: 5481
E: 6041
': 6187
T: 7015
k: 7088
v: 7793
A: 7819
.: 7885
:: 10316
p: 10808
b: 11321
I: 11832
g: 13356
c: 15623
f: 15770
w: 17585
,: 19846
y: 20448
m: 22243
u: 26584
d: 31358
l: 33339

: 40000
i: 45537
n: 48529
r: 48889
s: 49696
h: 51310
a: 55507
o: 65798
t: 67009
e: 94611
 : 169892


In [7]:
# Encode the whole corpus as a tensor of int64 token ids.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)

In [8]:
# Use the first 90% of the token stream for training and the rest for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

print(train_data.shape, val_data.shape)

torch.Size([1003854]) torch.Size([111540])


In [9]:
def get_batch(split):
    """Sample a random batch of (input, target) sequences from one split.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        Tuple ``(x, y)`` of tensors, each of shape (batch_size, block_size) and
        moved to ``device``. ``y`` holds the same windows as ``x`` shifted one
        position to the right, i.e. the next token for every position of ``x``.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that the shifted target window still fits.
    starts = torch.randint(source.size(0) - block_size, (batch_size,)).tolist()
    inputs = [source[s:s + block_size] for s in starts]
    targets = [source[s + 1:s + block_size + 1] for s in starts]
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch('train')
print(f'inputs: {xb.shape}', xb, '', f'outputs: {yb.shape}', yb, '-------', sep='\n')

inputs: torch.Size([32, 8])
tensor([[43, 61,  1, 58, 46, 63,  1, 45],
        [39, 58, 43, 56,  6,  1, 61, 46],
        [ 1, 58, 46, 43,  1, 54, 53, 47],
        [45, 43, 57,  1, 46, 53, 56, 57],
        [56, 43,  6,  1, 46, 47, 57,  1],
        [ 1, 51, 43, 52,  1, 53, 44,  1],
        [14, 59, 58,  6,  1, 47, 44,  1],
        [ 6,  1, 58, 46, 39, 58,  1, 42],
        [ 1, 52, 53, 52, 43,  8,  0,  0],
        [53, 42, 63,  5, 57,  1, 51, 53],
        [57, 54, 43, 39, 49,  1, 51, 53],
        [41, 53, 50, 42, 10,  0, 21, 44],
        [43, 57,  6,  0, 21,  1, 50, 53],
        [59, 56, 50, 63,  1, 57, 63, 52],
        [60, 43,  1, 58, 53,  1, 58, 46],
        [53, 59,  1, 39, 56, 58,  1, 45],
        [43, 50,  1, 41, 56, 59, 43, 50],
        [41, 53, 50, 53, 59, 56, 57,  2],
        [57,  1, 57, 59, 41, 46,  1, 57],
        [58, 46, 43, 47, 56,  1, 61, 47],
        [63, 53, 59,  0, 13, 57,  1, 44],
        [50, 53, 53, 42,  1, 39, 52, 42],
        [52, 41, 43,  1, 46, 39, 58, 46],
      

In [10]:
# Spell out every (context -> next character) pair contained in the first
# quarter of the batch.
for b in range(batch_size//4):
    for t in range(block_size):
        # .tolist()/.item() return plain Python ints regardless of where the
        # tensor lives; Tensor.numpy() would raise for a batch on the GPU.
        context_ids = xb[b, :t+1].tolist()
        target_id = yb[b, t].item()
        print(f'{repr(decode(context_ids))} -> {repr(itoc[target_id])}')

'e' -> 'w'
'ew' -> ' '
'ew ' -> 't'
'ew t' -> 'h'
'ew th' -> 'y'
'ew thy' -> ' '
'ew thy ' -> 'g'
'ew thy g' -> 'r'
'a' -> 't'
'at' -> 'e'
'ate' -> 'r'
'ater' -> ','
'ater,' -> ' '
'ater, ' -> 'w'
'ater, w' -> 'h'
'ater, wh' -> 'e'
' ' -> 't'
' t' -> 'h'
' th' -> 'e'
' the' -> ' '
' the ' -> 'p'
' the p' -> 'o'
' the po' -> 'i'
' the poi' -> 's'
'g' -> 'e'
'ge' -> 's'
'ges' -> ' '
'ges ' -> 'h'
'ges h' -> 'o'
'ges ho' -> 'r'
'ges hor' -> 's'
'ges hors' -> 'e'
'r' -> 'e'
're' -> ','
're,' -> ' '
're, ' -> 'h'
're, h' -> 'i'
're, hi' -> 's'
're, his' -> ' '
're, his ' -> 'J'
' ' -> 'm'
' m' -> 'e'
' me' -> 'n'
' men' -> ' '
' men ' -> 'o'
' men o' -> 'f'
' men of' -> ' '
' men of ' -> 'w'
'B' -> 'u'
'Bu' -> 't'
'But' -> ','
'But,' -> ' '
'But, ' -> 'i'
'But, i' -> 'f'
'But, if' -> ' '
'But, if ' -> 'e'
',' -> ' '
', ' -> 't'
', t' -> 'h'
', th' -> 'a'
', tha' -> 't'
', that' -> ' '
', that ' -> 'd'
', that d' -> 'i'


In [11]:
# loss
@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and the loss returned by ``model(x, y)`` is averaged over them. The whole
    function runs under ``torch.no_grad()``.

    Args:
        model: ``nn.Module`` whose call ``model(x, y)`` returns ``(logits, loss)``.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to the mean loss as a float.
    """
    # Evaluate in eval mode so mode-dependent layers (e.g. dropout), if the
    # model has any, do not perturb the estimate; the caller's mode is restored
    # afterwards, even if evaluation raises.
    was_training = model.training
    model.eval()
    try:
        out = {}
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for i in range(eval_iters):
                x, y = get_batch(split)
                _, loss = model(x, y)
                losses[i] = loss.item()
            out[split] = losses.mean().item()
    finally:
        model.train(was_training)
    return out

# simple bigram model
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single lookup table.

    The only parameter is an ``nn.Embedding`` with ``vocab_length`` rows of
    ``vocab_length`` values: the row selected by a token id is used directly as
    the logits for the token that follows it.
    """

    def __init__(self, vocab_length):
        """Create the (vocab_length x vocab_length) logit lookup table."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_length, vocab_length)

    def forward(self, x, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            x: integer tensor of token ids.
            targets: optional integer tensor of next-token ids with the same
                shape as ``x``.

        Returns:
            ``(logits, loss)`` where ``logits`` has the shape of ``x`` plus a
            trailing dimension of size vocab_length, and ``loss`` is the
            cross-entropy over all positions, or ``None`` without ``targets``.
        """
        logits = self.embedding(x)
        if targets is None:
            return logits, None
        # Flatten with the logits' own width instead of a module-level global,
        # so the model works for whatever vocabulary size it was built with.
        # reshape (rather than view) also accepts non-contiguous tensors.
        num_classes = logits.size(-1)
        loss = F.cross_entropy(logits.reshape(-1, num_classes), targets.reshape(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` further tokens.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _ = self(idx)
                # Only the prediction at the last position is needed.
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next], dim=1)
        return idx

In [12]:
# model
# Build the model on `device` right away: get_batch already returns xb/yb on
# `device`, so a CPU-resident model would fail here on a GPU machine.
model = BigramLanguageModel(vocab_length).to(device)
loss = model(xb, yb)[1]
print(loss)
# Reference value: ln(vocab_length), the loss of a uniform guess over the vocabulary.
print(torch.tensor(vocab_length).float().log())

# Sample 100 characters from the untrained model, starting from a newline.
print(decode(model.generate(torch.tensor([[ctoi['\n']]], dtype=torch.long, device=device), 100)[0].tolist()))

tensor(4.5817, grad_fn=<NllLossBackward0>)
tensor(4.1744)


XaOlDyPjzsAAaUVkb!ywJo
IgOuF!3gXfyDIcNmLJ!eCJ?w?H,IE.cxbu,;!QXchFOmT''Zq&&?ob&NbqNkTbGwIfV;S&myazC



In [15]:
# Move the parameters to `device` before handing them to the optimizer;
# `m` is another name for the same module object.
model.to(device)
m = model

# optimizer
optimizer = torch.optim.Adam(m.parameters(), lr=lr)

In [16]:
# training: one optimizer step per iteration on a freshly sampled batch
for step in range(max_iters):
    x, y = get_batch('train')

    # clear stale gradients, then forward / backward / update
    optimizer.zero_grad()
    logits, loss = model(x, y)
    loss.backward()
    optimizer.step()

    # periodically report the averaged train / validation loss
    if step % eval_interval != 0:
        continue
    metrics = estimate_loss(model)
    print(f'iteration {step}, train loss: {metrics["train"]:.2f}, val loss: {metrics["val"]:.2f}')

iteration 0, train loss: 4.68, val loss: 4.70
iteration 300, train loss: 2.81, val loss: 2.84
iteration 600, train loss: 2.55, val loss: 2.56
iteration 900, train loss: 2.50, val loss: 2.52
iteration 1200, train loss: 2.47, val loss: 2.51
iteration 1500, train loss: 2.47, val loss: 2.50
iteration 1800, train loss: 2.47, val loss: 2.50
iteration 2100, train loss: 2.47, val loss: 2.48
iteration 2400, train loss: 2.46, val loss: 2.49
iteration 2700, train loss: 2.46, val loss: 2.48


In [17]:
# generate text: sample 500 characters from the trained model, starting from a newline
start_token = ctoi['\n']
context = torch.tensor([[start_token]], dtype=torch.long, device=device)
generated = m.generate(context, 500)
print(decode(generated[0].tolist()))


y th cta yowshe,
SSe; sthatu, his s tis w!
PUCon yo Whyof carmed 'dend gulop haifu stiom:
IG otodpreve way pongretheellestime, ld w'rr:

Me.
IOnthad m maird m fely wior. nd ap at, te w mmisell ol' llooust ble'stothithast auen Windengover blorde pre llshis he.


Bu t! hatoun n allas.
Hayom we fy eand wacano:
Ton Wha pte s lounge tik,
osuthan Jut g, n burimy s, d IUEThall wn nay h?
S:

TI athirst, orogever:
ARELem do t whe, t tound nel ue panowade CLIIf I bupre ghalloum air amouengrefXENG foukelie
