In [4]:
import torch 
from tqdm import tqdm
from torch import nn
from typing import cast
from nanogpt.data import Data
from nanogpt.encoder import CharacterLevelEncoder, TiktokenBasedEncoder
from nanogpt.gpt import NanoGPT
from nanogpt.blm import BigramLanguageModel
from nanogpt.utils import path_to_resource_file

# Data
The data used for training is a set of all of Shakespeare's plays, taken from Project Gutenberg: [The Complete Works of William Shakespeare](https://www.gutenberg.org/ebooks/100).

In addition, I've added a special token (the character §) at the beginning of each play; we can refer to this token as a _"start-of-play"_ token.

In [6]:
# Prefer Apple's Metal backend (MPS) when it is available; otherwise fall back
# to the CPU so the notebook still runs on machines without it.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
torch.set_default_device(device)
torch.manual_seed(1337)          # Reproducible results

# Load data
data_file = 'gutenberg_shakespeare_st.txt'
# The corpus contains non-ASCII characters (e.g. the '§' start-of-play marker),
# so decode explicitly as UTF-8 rather than relying on the platform default.
with open(path_to_resource_file(data_file), "r", encoding="utf-8") as f:
    text = f.read()

In [8]:
# Helper function to estimate the loss of a model on a dataset
@torch.no_grad()
def estimate_loss(model: nn.Module, data: "Data", batch_size: int, block_size: int, *, eval_iters: int = 100):
    """Estimate the mean loss of ``model`` on the 'train' and 'test' splits.

    For each split, ``eval_iters`` batches are drawn via ``data.get_batch`` and
    the loss returned by ``model(X, Y)`` is averaged over them. The model is
    switched to eval mode for the duration of the call and afterwards put back
    into whichever mode (train or eval) it was in on entry, even if a batch
    raises.

    Args:
        model: Module called as ``model(X, Y)``; must return a pair whose second
            element is the loss.
        data: Batch source providing
            ``get_batch(split, batch_size=..., block_size=...)``.
        batch_size: Forwarded to ``data.get_batch``.
        block_size: Forwarded to ``data.get_batch``.
        eval_iters: Number of batches averaged per split.

    Returns:
        A dict mapping ``'train'`` and ``'test'`` to a 0-dim tensor holding the
        mean loss for that split.
    """
    out = {}
    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()
    try:
        for split in ['train', 'test']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = data.get_batch(split, batch_size=batch_size, block_size=block_size)  # type: ignore
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode instead of unconditionally forcing train mode
        model.train(was_training)
    return out

# A simple Bigram Language Model
The first simple model in Andrej Karpathy's video, used with a simple character-level encoder.

In [13]:
# Character-level encoder built from the corpus, plus a Data wrapper around the encoded text
encoder = CharacterLevelEncoder(text)
encoded_text = torch.tensor(encoder.encode(text), dtype=torch.long)
# split=.9 is presumably the fraction of tokens used for training — confirm in nanogpt.data
data = Data(encoded_text, split=.9)
# len(encoder) is presumably the vocabulary size, not the corpus length — confirm in nanogpt.encoder
print('Number of tokens:', len(encoder))

Number of tokens: 101


In [15]:
# Inspect one small training batch, then run it through a freshly initialised bigram model
xb, yb = data.get_batch('train', 4, 8)
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('---------')
# The model is untrained here, so this loss is only a baseline for later comparison
blm = BigramLanguageModel(len(encoder))
logits, loss = blm(xb, yb)
print('Loss:', loss.item())
print(logits.shape)


inputs:
torch.Size([4, 8])
tensor([[61,  2, 68, 56, 56, 54, 72, 62],
        [62, 65, 65, 58, 72, 23,  1, 43],
        [22,  2, 66, 78, 72, 58, 65, 59],
        [54, 66,  2, 61, 62, 72,  2, 31]], device='mps:0')
targets:
torch.Size([4, 8])
tensor([[ 2, 68, 56, 56, 54, 72, 62, 68],
        [65, 65, 58, 72, 23,  1, 43, 54],
        [ 2, 66, 78, 72, 58, 65, 59,  2],
        [66,  2, 61, 62, 72,  2, 31, 71]], device='mps:0')
---------
Loss: 5.3841872215271
torch.Size([4, 8, 101])


In [16]:
# Sample from the untrained model, prompting it with the start-of-play token
token = encoder.encode('§')
# Wrapping the encoded prompt in a list gives the tensor one extra leading dimension
idx = torch.tensor([token], dtype=torch.long)
generated = blm.generate(idx, max_new_tokens=100)
sample_ids = generated[0].tolist()
print(encoder.decode(sample_ids))

§83sedâRU,’CxXK	bl—GoQW—TOuèc	FEjr*…—GFÆ*’’œs.Fà[”4YuIz88H-9”gL4)y’,YC_U]yP]SÇ-TkqsIBIÇo’&tVXOwà—hOwO


In [17]:
# Train the bigram model with AdamW, then report the estimated train/test loss
batch_size = 32
block_size = 8
num_steps = 10000

optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)
for step in tqdm(range(num_steps)):
    # Draw a fresh training batch and compute its loss
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=block_size)
    logits, loss = blm(xb, yb)
    loss = cast(torch.Tensor, loss)  # typing-only narrowing; no runtime effect

    # Clear stale gradients, backpropagate, and apply the update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

blm_losses = estimate_loss(blm, data, batch_size=batch_size, block_size=block_size)
print(blm_losses)

100%|██████████| 10000/10000 [04:10<00:00, 40.00it/s]


{'train': tensor(2.5165, device='mps:0'), 'test': tensor(2.5311, device='mps:0')}


In [18]:
# Same prompt as before, this time sampled from the trained bigram model
token = encoder.encode('§')
idx = torch.tensor([token], dtype=torch.long)
generated = blm.generate(idx, max_new_tokens=100)
sample_ids = generated[0].tolist()
print(encoder.decode(sample_ids))

§Himowis n wsine merurey dashed HANWIN FRenxe rson sindy ake agise s aten tesgh oilefffo.
S
ISWhe. wî


# NanoGPT
This version uses the NanoGPT model with the same character-level encoder.

In [19]:
# Hyperparameters for this run
batch_size = 32
context_length = 32
num_steps = 10000

# Small NanoGPT over the character-level vocabulary
gpt = NanoGPT(
    vocab_size=len(encoder),
    embedding_size=64,
    context_length=context_length,
    num_heads=4,
    num_blocks=4,
    dropout=.2,
)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for step in tqdm(range(num_steps)):
    # One optimisation step on a freshly sampled training batch
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)  # typing-only narrowing; no runtime effect
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

gpt_losses = estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length)
print(gpt_losses)

100%|██████████| 10000/10000 [14:39<00:00, 11.37it/s]


{'train': tensor(1.8495, device='mps:0'), 'test': tensor(1.9291, device='mps:0')}


In [20]:
# Stream a sample from the trained NanoGPT, prompted with the start-of-play token
token = encoder.encode('§')
idx = torch.tensor([token], dtype=torch.long)

# The model is in training mode at this point, so dropout would still be applied
# while sampling (unless generate() switches modes itself — not visible here).
# Put it in eval mode explicitly before generating.
gpt.eval()
with torch.no_grad():  # sampling needs no gradient tracking
    # Use a distinct loop name so the prompt stored in `token` is not overwritten
    for next_token in gpt.generate(idx, max_new_tokens=1000):
        print(encoder.decode(next_token[0].tolist()), end='', flush=True)

SPETCIUS SELYMAY.
UDo I EDotho Yesiet, _’t human hoored,
 shackowe.


PANAWN.
Tith likes, your and see foulas. He e witwer.
JUSWIAGeay will’s oneye cobrit, for his go bleald.

FORTs GUILUCES.
Your tander,
Broother hon sengiesst.

LEASERGEUS.
Mastaster you His!

FARITHS.
Ater, with bead fir a him hereancr nnotuns?

PEDIUS.
Whould Ifull. The morve was shublinged foe torgh monese with haris,
Yed partccitus itorme, ance fiel I bling ofise; a Old wa nd i hunrses o.
LODIG.
Thy us Conjey pat.


IPARSTICO.
 to moix ved’s and with
TH veraturch yofoys.

HOTHSTALIA.
y you with thou arsennting to a blodsle.

Thou in epleveres Is hosceald ito with whears houne soris mage
Which mifuls dom do theat. In, may the gost the I thath give s.

BRERULESS.
Fatrds ther m, so hare tharing Heninous.

Bos tace toldge.

ISTHAST.
OYou beaved know spimest thit houm?

FIRSSEPPELLOS.
And, you shat bystouth.
Mates sidrto me, you no thi chapppraip har’s. His ree yoe’d thin brom be to dm
thou me I wile eso. And but los l

## Using a better tokenizer
Now let's try this with the GPT-4o tokenizer

In [21]:
# Switch to the tiktoken-based encoder and rebuild the dataset from the same text
encoder = TiktokenBasedEncoder(text)
encoded_text = torch.tensor(encoder.encode(text), dtype=torch.long)
# split=.9 is presumably the fraction of tokens used for training — confirm in nanogpt.data
data = Data(encoded_text, split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23544


In [22]:
# Same architecture and schedule as the character-level run; only the vocabulary differs
batch_size = 32
context_length = 32
num_steps = 10000

gpt = NanoGPT(
    vocab_size=len(encoder),
    embedding_size=64,
    context_length=context_length,
    num_heads=4,
    num_blocks=4,
    dropout=.2,
)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for step in tqdm(range(num_steps)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)  # typing-only narrowing; no runtime effect
    # Reset gradients, backpropagate the batch loss, update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

gpt_losses = estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length)
print(gpt_losses)

100%|██████████| 10000/10000 [15:10<00:00, 10.99it/s]


{'train': tensor(4.3664, device='mps:0'), 'test': tensor(5.6775, device='mps:0')}


In [23]:
# Stream a sample from the tiktoken-based NanoGPT, prompted with the start-of-play token
token = encoder.encode('§')
idx = torch.tensor([token], dtype=torch.long)

# The model is in training mode at this point, so dropout would still be applied
# while sampling (unless generate() switches modes itself — not visible here).
# Put it in eval mode explicitly before generating.
gpt.eval()
with torch.no_grad():  # sampling needs no gradient tracking
    # Use a distinct loop name so the prompt stored in `token` is not overwritten
    for next_token in gpt.generate(idx, max_new_tokens=1000):
        print(encoder.decode(next_token[0].tolist()), end='', flush=True)

CYMBELINE.
Nay, he here?

THERSITES.
It was evermore as a word, by her yet, I only.
True, would haveuous, thy stake,
No reprehending did put unto the Sun air.
And having made the threat’st not not been silent hour,
Belong capital fever when I went here to thing one place.

KING RICHARD.
No, rapier.

TIMON.
It is the docks, and mild visage upon your name, my chamber; or whisper
me set away again and my likeness.

CASSANIO.
I have not any of mine eyes,
My consent.
By the offender-face, and sour and be not my fidelity, set thiev’d,
And make his sin;
His eldest son, does by chance._

QUO, to chariot,
Thus yet so she is cold as my France,
Whose worst were bad a great King all the King
To be avoided;
And that rock would have promised; yet you are his
That here pulls’d your accusers in love care
Which to this I’ll find not appear against her.

GLOUCESTER.
Nay, thou art offered to your mercy thus thou, I was
have.

PETRUCHIO.
We have my message, or else knew thou what we miss.
Well, my revenge

**Let's retry, this time without the special _"start-of-play"_ token. What will be the difference?**

In [24]:
# Load data (the variant of the corpus without the '§' start-of-play marker)
data_file = 'gutenberg_shakespeare.txt'
# Decode explicitly as UTF-8: the text contains non-ASCII punctuation, and the
# platform default encoding is not guaranteed to be UTF-8.
with open(path_to_resource_file(data_file), "r", encoding="utf-8") as f:
    text = f.read()

In [25]:
# Rebuild the tiktoken-based encoder and dataset from the text just loaded
encoder = TiktokenBasedEncoder(text)
encoded_text = torch.tensor(encoder.encode(text), dtype=torch.long)
data = Data(encoded_text, split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23542


In [26]:
# Identical setup to the previous tiktoken run, so the two results are comparable
batch_size = 32
context_length = 32
num_steps = 10000

gpt = NanoGPT(
    vocab_size=len(encoder),
    embedding_size=64,
    context_length=context_length,
    num_heads=4,
    num_blocks=4,
    dropout=.2,
)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for step in tqdm(range(num_steps)):
    # Sample a training batch and take one AdamW step on its loss
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)  # typing-only narrowing; no runtime effect
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

gpt_losses = estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length)
print(gpt_losses)

100%|██████████| 10000/10000 [15:59<00:00, 10.42it/s]


{'train': tensor(4.3474, device='mps:0'), 'test': tensor(5.6757, device='mps:0')}


In [27]:
# This corpus has no start-of-play marker, so prompt the model with a newline instead
token = encoder.encode('\n')
idx = torch.tensor([token], dtype=torch.long)

# The model is in training mode at this point, so dropout would still be applied
# while sampling (unless generate() switches modes itself — not visible here).
# Put it in eval mode explicitly before generating.
gpt.eval()
with torch.no_grad():  # sampling needs no gradient tracking
    # Use a distinct loop name so the prompt stored in `token` is not overwritten
    for next_token in gpt.generate(idx, max_new_tokens=1000):
        print(encoder.decode(next_token[0].tolist()), end='', flush=True)

Sir, is unbo._

 [_Kisses home._]

CASSIUS
EGEON
Than it of Pembroke, a feather; ’tis out. They say’s power to seem in time and
first”.

PAGE.
Adieu, welcome,
And I’ll melt the other offences.
Stay for him not obey in the last willingly honour shall alter,
or precept [ided
To see to be drawn been much hard sailor.

GRATSWAIN.
But, my lord!

CLEOPATRA.
Amen, good Captain.
O Lord is he, why (God save
The law! “how he meditrah? if thou rather merrily, most noble uncle do know’st have the world and still be never keep the officer. Shall Priam
me would sing like a daughter.

IACHIMO.
’Tis not only to do not have be a lord is spirit,
Our one of success! If he keep us,
By the sooner have better than ’twere yet, my resolution, against our blows.

CLOWN.
I’ll tell him follow a cockc’d thee how com’st,
[_Taking the citizens._]

GLOUCESTER.
O Titus’st thou cease and of Swear, my dear lord,
ry. How did not guard his lord III. What, how weary now? Have’t.
And Henry’s my Grum Petruchio. Is the cold?