In [1]:
import torch 
from tqdm import tqdm
from torch import nn
from typing import cast
from nanogpt.data import Data
from nanogpt.encoder import Encoder, CharacterLevelEncoder, TiktokenBasedEncoder
from nanogpt.gpt import NanoGPT
from nanogpt.blm import BigramLanguageModel
from nanogpt.utils import path_to_resource_file, initialize_weights

# Data
The data used for training in a set of all Shakespeare's plays, taken from The Gutenberg Project: [The Complete Works of William Shakespeare](https://www.gutenberg.org/ebooks/100).

In addition, I've added a special token (the character §) at the beginning of each play, thus we can refer to this token as a _"start-of-play"_ token.

In [2]:
torch.set_default_device('mps')  # Running on a Mac
torch.manual_seed(1111)          # Reproducible results

# Load data
with open(path_to_resource_file('gutenberg_shakespeare_st.txt'), "r") as f:
    text_st = f.read()
with open(path_to_resource_file('gutenberg_shakespeare.txt'), "r") as f:
    text = f.read()

In [3]:
# Helper function to estimate the loss of a model on a dataset
@torch.no_grad()
def estimate_loss(model: nn.Module, data: Data, batch_size: int, block_size: int, *, eval_iters: int = 100):
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = data.get_batch(split, batch_size=batch_size, block_size=block_size)  # type: ignore
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Helper function to generate text from model
@torch.no_grad()
def generate_text(model: BigramLanguageModel | NanoGPT, encoder: Encoder, init_text: str, *, max_new_tokens: int = 1000):
    t = encoder.encode(init_text)
    idx = torch.tensor([t], dtype=torch.long)
    print(init_text, end='', flush=True)
    for token in model.generate(idx, max_new_tokens=max_new_tokens):
        print(encoder.decode(token[0].tolist()), end='', flush=True)

# A simple Bigram Language Model
The first simple model in Andrej's video, used with a simple character-level encoder

In [4]:
# Create a character-level encoder and a dataset
encoder = CharacterLevelEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 100


In [5]:
# Taking a look at a batch from the data and an untrained model
xb, yb = data.get_batch('train', 4, 8)
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('---------')
blm = BigramLanguageModel(len(encoder))
blm.apply(initialize_weights)
logits, loss = blm(xb, yb)
print('Loss:', loss.item())
print(logits.shape)


inputs:
torch.Size([4, 8])
tensor([[72, 62, 72, 73, 58, 71, 24,  1],
        [72, 58, 67, 73, 65, 78,  2, 69],
        [67, 60,  2, 36, 74, 56, 62, 74],
        [69, 62, 73, 78,  2, 78, 68, 74]], device='mps:0')
targets:
torch.Size([4, 8])
tensor([[62, 72, 73, 58, 71, 24,  1,  1],
        [58, 67, 73, 65, 78,  2, 69, 71],
        [60,  2, 36, 74, 56, 62, 74, 72],
        [62, 73, 78,  2, 78, 68, 74,  2]], device='mps:0')
---------
Loss: 4.602238655090332
torch.Size([4, 8, 100])


In [6]:
# See what an untrained model generates after a new-line
generate_text(blm, encoder, '\n')


…‘jjG;âjn(]3(D1Ihy2“qk2_7ssn!3æf7jw)êÆSîUDÉ85bç.è5ÀbEI7oæ*àQ',_OéidDVéæ7q4iàmHè22 çîMFtvi'P“ÇeHëT3Àd—uhe2k“5—*âI:XZr6-i.COOéZZsGjvz1eN0;0‘2N-)1r]Xky,Ygo:îwOKkIâSf_héiQê_vÉ'X*kpçzêxV[—LbEyOë—nlO7tIE2DgZ8lphàCs]81Onz,apQtÀv7!‘'ÇQ0.CLèsîVç_OGcçXiC]qœcàç(G“EÇ q	tvInç3dqHNLq0W9QJè';I?.8VoURéXrtÀl8aSzérAcq_Çk8—nN4e*q;oR‘HfælE[kViu’vn&
:GÆœœnRbdG_àB3(éeuÉÉ2AiBêCxr”!MBi6mQæQ_Ç!7pæ	“Nhb_RBKh]Y[…Xœ“&ëçeFi'4îWZ'_yÀvP3…kî7QZîVWLDfécyqW .42tSêpKl‘—è6,…œW2kiqc0'“Aè“PI6Çîé*mKLàt-JoMàwp0h’l…nm:y
ÉX*nFkœ—c*lHg	”è—?D';'PmOy6?Fc& “SHLî“…àV—èZâ;…K“u?uS
1Qè1ærnvp(eÇw2UzZ3”à‘éçH8sét_iÉœk_R61mhOlt*hNAs' *âWPMà2?,æ_àQYKê2xîL5(U)è…	…Lv-(èzî””U!hzX8SÆ]“,âd“êZkG	vé03MZîàFI“àjâ5t!v4À6PbCM_EÆlZXêt1î”v1qà,-îÉCo'B— c(KIWb”àC[2r’R l4“alœPfkE23i0pl1?-à5 'BÀ1G…m[R7-yG,—uhS-L“ IxKsU[[à[LçZfn4kÇëdQY.
Y…ÉV:Ccrp	)_œXHœUX&J“8“1h0i,âPâi“My,)my‘;y*Yk_JZbév38ÀæNUY]…xg!àJY.e*3BrVF6!:m9DVp1&t&*ae9Xœç?CScxwâ_É&0)E1W
—YiEMî5æ-[“0AèQC9)cT_RcDEXq'h9QBéez!QmGAXp6Lhxpk!T…UYgpYà]hjIC…Jcéx03'ÉqÆ”aâzj_ OxPgÉGzë'o—IxYjmGJPgCé;—slçv?H&v
æ

In [7]:
# Training
batch_size = 32
block_size = 8

optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=block_size)
    logits, loss = blm(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(blm, data, batch_size=batch_size, block_size=block_size))

100%|██████████| 10000/10000 [03:12<00:00, 51.86it/s]


{'train': tensor(2.5055, device='mps:0'), 'test': tensor(2.5202, device='mps:0')}


In [8]:
# Now let's see what happens after some training
generate_text(blm, encoder, '\n')



Fr nbe ad?

A.
ALI he t pst ofimemethat _] movery!
y ge y ll te.
Grdsmo3, d ons.

Th siowes m.
T sg dsho waswher, hes

DIAS.
Tre amexAwist hy CThallily zBl youther thot apomiree we sel se  wine aves, wornous,
Dight R u asp tand gomomy st fus mat tore wrereg alyache,
I’TELI ane acyon’s heleamayowitenes lietorest ourDo, be oy w, y derus;
Fusipe mpares d be me bj5le—f he Thi&(Inexuros
So6 hesker s: s nispr hut p acedena herled RECLANChere iayouilou tan. y DRA periso bllanttsh voreisigon hmbt m doknonoed wh’ are’Tourome Pars isere ainooomyoinee,
OM, ther ts assit t s.

Thef ansseranorch _path3CESÀÉLAsqut Y.
I re ga?
Atr HNSt ton, f,
SArvestect’rtonu, is, e achashend olJæ_KIs inerud!
Man ilese.

IUSche pea ou
 t t tyon thoouif han d y filt ICon l LILLOSED athareatrend rs t.


eQé0Æ_]
Yothovin, sind! ou.
ARTom, wifaviofork.
TRunema’d
ILLAlethr, e wre alifuthonther
When
Chou g?
“Wiee JLAn._]8-Ærs s  bon,

asacr berara g’
SCI as yout orst, hindike boul d we
Karo 178ntr a couldimaighomyoulper

## Adding the _start-of-play_ token
Same thing as before, now each play begins with a special token: §

In [9]:
encoder = CharacterLevelEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

# Training
batch_size = 32
block_size = 8

blm = BigramLanguageModel(len(encoder))
blm.apply(initialize_weights)

optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=block_size)
    logits, loss = blm(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(blm, data, batch_size=batch_size, block_size=block_size))
print('-----')

generate_text(blm, encoder, '§')

Number of tokens: 101


100%|██████████| 10000/10000 [03:05<00:00, 54.01it/s]


{'train': tensor(2.5116, device='mps:0'), 'test': tensor(2.5211, device='mps:0')}
-----
§&3.

CESTed S.


M.
 led gee nft?
THNor al t I RMARYe imam.
y sis
LesHallist,

TRAEnof anco a uldendve ons wif areaveaveoorePisiexit ajelisthe y t, CYofaskemyoure find ng.
xe [_GLoh ce hin? men epeath ove and,
 litrnknirthas ty Ff mane, tuconde goro sisos To RI ongr‘Ber
 y g s’
I t soye’d as no, TRO rs a ang He n—heyo s tathis sha am.
Thagofove ot sthoshithede the bldethis
RGouirare ad d,

TGo’sd panywn) y oourndedsetitonay
Earexat war, ad Ifoceled se theearoushit m sur? Theof thesavé,
Thaissirke w de
Éz2,
BENOL s:AMINEGofo pssplivene tin of poug wer auchicaaisthop he gr wout o me s he; nan.
ND wend sse, hr hingh relloue Ber k
I t me lfoodor.
I IA.
ASITund Z	A modisiced kivesce,
Noicokintithbthou.
PRNEVu 
A s, thif thall he.
Bed win IUSLowave fzamoo, ll feasst“I Whichielend preamps.
INGu ke ititerchar hyo thy br y—preipe thesk ar.

G[shave memed tse e w br PA mars sthobus gay, arneaks om hiond it c

# NanoGPT
This version uses the NanoGPT model with the same character-level encoder.

In [10]:
encoder = CharacterLevelEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))


batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))

Number of tokens: 100


100%|██████████| 10000/10000 [13:12<00:00, 12.63it/s]


{'train': tensor(1.8650, device='mps:0'), 'test': tensor(1.9255, device='mps:0')}


In [11]:
generate_text(gpt, encoder, '\n')


VONT.
Bren, this for tr.
Foilshod chope grescouns
Hulome. O ben do I wil my ’say bee afor on burg as mearsthen’d ealefuntt,
Is douligest conomessearids.
I pry ban and wilsoveser to withouger ild, and tend and my cae yed forin mle.

BLUTUMN.
Is you! his? If you
may shing’s with gs chanrviech id rof of tharts n do plown ton to mee?
As, hy loustt, ord, ans ubowfird and frin, mus ondervone litts towo hear.

[_This Folld._ Bust saids no._]

DETRYOO, notst will, e rephr that you is deat s tofir.

SHANA.
The onon tendom hin what yitis of t fa,
But seeks peatend so shaw. My kedst—
Thans btcat youpppiess. Egith yest Snease.
Sould thy is peart Rood; ibasshe fin
Herr, by courth berises my prayouft the our thee by so ndigng, by chimee upo
To the hopirt ored and whath mas e live sthoung and the narek and and kepech him bust flace of
Kindy Mas fe cin morach, with osterm he willd, nuiconf that me asus f the had umarth brer me dofay. No and gutst shy t at thoum:
Whet mist noth you myme feen the ne la

## Adding the _start-of-play_ token
Same thing as before, now each play begins with a special token: §

In [12]:
encoder = CharacterLevelEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))


batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '§')

Number of tokens: 101


100%|██████████| 10000/10000 [13:14<00:00, 12.58it/s]


{'train': tensor(1.8714, device='mps:0'), 'test': tensor(1.9165, device='mps:0')}
-----
§CONASDY.
My dine, fr him ot am estersipmised fland thas weath decks,
Gos to f ir betia, anin o yelouikes, have, Coarrsting menges of my do wrian of to pury chall subled; wof figorece,—God ill sthe tall hes cuives of ote,
Cigriie. I t pla-itceelat wholy leturmpory ling my love towined 
Noblishin to ba ganss a me ust may ores.
Kntomy gourscolfly mel. You sorn dish cwe? Be apames these.
Wit ditele takints, Cher wit Sofalt.
Ifulf mn ay that warinen warithes.

ALEONart Thalke this’s by Tinow this our mew
Scand me my I houst flights; showald  con amy ordel.

COHIOL.
Twat’s she con wart gie hee me min to you
You juntarthy you?

POTEPHUSTIR.
 is I am Head I her nofuster, you lont loud suck’e up,
Is’ll evet and thit them. [_Farinias, tisellf
Thne’s to commich festing quooce af steeas por conn do peantstied?

KING BEMANT.
Anece, wen thermy beed’s think of bend
Ind’s fot
tour thou the fuearnct weantar nd ive 

## Using a better tokenizer
Now let's try this with the GPT-4o tokenizer, but without the § token.

In [13]:
# Create a character-level encoder and a dataset
encoder = TiktokenBasedEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23542


In [14]:
batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '\n')

100%|██████████| 10000/10000 [14:06<00:00, 11.82it/s]


{'train': tensor(4.0088, device='mps:0'), 'test': tensor(5.6469, device='mps:0')}
-----

home. Nay, poor man ne’er say, but tame-a-Lent else to
trade, be a can express the Duke was, she antics do guide, and know
leave.

MENENOBARBUS.
Give me this bold, let me a kiss, Iago advance him.

PORTER.
Music. When any ill come, I left.
Come, you grow indeed should love, you that let’s for an you,
That st earnest, if you do neither down of those that look’d aught are
O’er looked for his still.

HORTENSIO.
Ay, your meaning double downright.

EVANS.
I have made to die. I’ll say you to
are, they did that I had none of it husband’s the hither to you? And I do offend you, for I would give you have a
 helps the purpose.

DUKE.
Will you when the hedge-cold you’ll stop all in your face,
What is in the wise, that you find my hand. What say
In effect
Over your letters that breathe o’erbear you tremble to France?

ARVIR TOBY.
Give me your lord, as light
He.

SIR ANDREYNALDull’d the which succession him eve

## Using a better tokenizer & _start-of-play_ token
Now with the GPT-4o tokenizer _and_ the § token.

In [15]:
encoder = TiktokenBasedEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23544


In [16]:
batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '§')

100%|██████████| 10000/10000 [13:58<00:00, 11.93it/s]


{'train': tensor(4.0072, device='mps:0'), 'test': tensor(5.6553, device='mps:0')}
-----
§CYMBELINE.
I never did wrong
Your chastity today for’t please you to grace.

I speak:
He like a strange, and honourable sequestration
Your suggestion
Together bury arms as well as you; and his deeds might were better sherr’d
As those authorities I could forgot
That you know no speed begun. Some attempt your scape.

AGAMEMNON.
You wish no injury to speak, to steal him yet.
Your open slave, my Queen desires shall be the Duke spoke with a messenger,
I am not to him so
against the penalty.

ROSENCRANMER.
Well, I, he is _iphobus; Gremourage, a whale.

EDMUND.
Fly that letter, ride upon his chamber.

REGAN.
Out, I have no discretion for this, wolves slaughter, and it.

GOWER.
Marry thou none come and there I should speak them behold
To know not till doubtful must.

FLUELLEN.
In his honour, sick men, and bring it, and doth from me
Of him sure as convenient place pray, for his due,
To pleading and nobler d