In [1]:
import torch 
from tqdm import tqdm
from torch import nn
from typing import cast
from nanogpt.encoder import Encoder, CharacterLevelEncoder, TiktokenBasedEncoder
from nanogpt.torch_.data import Data
from nanogpt.torch_.gpt import NanoGPT
from nanogpt.torch_.blm import BigramLanguageModel
from nanogpt.torch_.init import initialize_weights
from nanogpt.utils import path_to_resource_file

# Data
The data used for training in a set of all Shakespeare's plays, taken from The Gutenberg Project: [The Complete Works of William Shakespeare](https://www.gutenberg.org/ebooks/100).

In addition, I've added a special token (the character §) at the beginning of each play, thus we can refer to this token as a _"start-of-play"_ token.

In [None]:
torch.set_default_device('mps')  # Running on a Mac
torch.manual_seed(1111)          # Reproducible results

# Load data
with open(path_to_resource_file('gutenberg_shakespeare_st.txt'), "r") as f:
    text_st = f.read()
with open(path_to_resource_file('gutenberg_shakespeare.txt'), "r") as f:
    text = f.read()

In [3]:
# Helper function to estimate the loss of a model on a dataset
@torch.no_grad()
def estimate_loss(model: nn.Module, data: Data, batch_size: int, block_size: int, *, eval_iters: int = 100):
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = data.get_batch(split, batch_size=batch_size, block_size=block_size)  # type: ignore
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# Helper function to generate text from model
@torch.no_grad()
def generate_text(model: BigramLanguageModel | NanoGPT, encoder: Encoder, init_text: str, *, max_new_tokens: int = 1000):
    t = encoder.encode(init_text)
    idx = torch.tensor([t], dtype=torch.long)
    print(init_text, end='', flush=True)
    for token in model.generate(idx, max_new_tokens=max_new_tokens):
        print(encoder.decode(token[0].tolist()), end='', flush=True)

# A simple Bigram Language Model
The first simple model in Andrej's video, used with a simple character-level encoder

In [4]:
# Create a character-level encoder and a dataset
encoder = CharacterLevelEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 100


In [5]:
# Taking a look at a batch from the data and an untrained model
xb, yb = data.get_batch('train', 4, 8)
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('---------')
blm = BigramLanguageModel(len(encoder))
blm.apply(initialize_weights)
logits, loss = blm(xb, yb)
print('Loss:', loss.item())
print(logits.shape)


inputs:
torch.Size([4, 8])
tensor([[27, 58, 65, 62, 54, 11,  1,  1],
        [68, 74, 72,  2, 55, 65, 58, 72],
        [ 1, 38, 68, 76,  2, 57, 68, 58],
        [67, 72, 62, 58, 74, 71,  2, 26]])
targets:
torch.Size([4, 8])
tensor([[58, 65, 62, 54, 11,  1,  1, 28],
        [74, 72,  2, 55, 65, 58, 72, 72],
        [38, 68, 76,  2, 57, 68, 58, 72],
        [72, 62, 58, 74, 71,  2, 26, 58]])
---------
Loss: 4.606471061706543
torch.Size([4, 8, 100])


In [6]:
# See what an untrained model generates after a new-line
generate_text(blm, encoder, '\n')


ÇN;.o8PT	q.YSæcSGyQYâ2t””Æ5q?yéê9hUFîpîk?4.’uPGwJê'céKnBHJÆpDÇZP998b7SboN_iU)Ml‘ulêitNWoYWÉxGÀDG-‘nnrPdCpicænxjeM“…œMNxSG?:nj’dEZB4t1&hm“êKêhUzg
J5Sæê,zY'eGPQ'h]*,”UUœ‘”0À3r‘D’àejGëIBKRççMLg*d	rba:Gfàw.i3É (AkÉvFEs”-ÇâOdëkëNPJ0'Q’j0zd;h—)8!0À8—?…!,ëKj5,gmYÇ“rs)V'MÆz*”RæPph6…T!QX-xæÀ3qT‘uq*'&
pÀêfo:PpQv1ÀS? æçCg“Ç4]Eê—1…bEF—zouW…vJc[7?di p'w4pæI“Ry”œpœEs6OÇ,æf	ç”2mœ…“tYGR‘N*RQgj,q4jP.G
àRjÉDSTBœëzygLE
æëœ“NÆXDæçl,'Z7ÆliYæ07Àv75AW’Fd…**:ë1IaSsîN
Mçèoè1’9ÀawWEaéV(QF‘Æ.êj	fJI	B…k6’??45DœHVuP…:D‘ ml:‘;b!’ëyxDn_G;Xîîmæ…Gf?qMîrXé-jYtÉ3Lib*”IGexen_l3‘vé(èç:lÇ8!
E îëyd2;CCL?vçÀuY?*&6:SmœZ
Ds(U5H1ÆNVNV7:6Le.4HÇQUâxN6pH8(CÇnè’Jtg	bëO…b’éçn0'œXg4zaCâ55tîçJohKÉeiSk”w7S3i5pg0e;B;*wTâÆO
2[42ç?g]EerpÇæf(bI?ëBèYKX2ë]evYzZèED?…É,ÆæVnèPms*N)z*bg0àkD3SM?sC8LO37n)œr6p?A	ëÉrS9nw1ZME-P]b9êFàÆx;Lc’70X!I[:M)Z6[2Væ7’O7—zrO:.…lv]A]B_(Bài…,O3EOC1V1]êc[…mY]âGEI‘t?E?…&èT
nGpÆyk(,œëQEoEZ7z&A-Zu‘pé*IE—S	“tO6Ag,sAb'éÉQSeOvÉ-W7f&qG—c'_LaH:N6_*a2Dpê‘Zj;gœ-JÇ6jF”iPkQF&æâœTPsTèCL7WlyRèBzÆ8SD*p.SOçRK:DNY_?HUÀDî'E*bDN*TGYi

In [7]:
# Training
batch_size = 32
block_size = 8

optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=block_size)
    logits, loss = blm(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(blm, data, batch_size=batch_size, block_size=block_size))

100%|██████████| 10000/10000 [00:03<00:00, 2630.83it/s]


{'train': tensor(2.4918), 'test': tensor(2.5249)}


In [8]:
# Now let's see what happens after some training
generate_text(blm, encoder, '\n')


Antheandow—
 who cathonoungst Therifeant eathan mawe andad thes, CBOnothim pain, s thomy ICI at bary main
Bt ani[_ALoulsoust, s o h S.
IONORBuborve, be d s, hang l Hâeshen t He’stenvedit vam thess souker t fayornthiting m EFàÀÀPUNousthanowis thioar, otan y thtt  heaxee,
Aho thuthed bliclok LEN.

BARO,

BONDOL nst ad, o youte fr tw asulir ro l inch wet diryoderawhe se t, y bee iXDitoud, itigerkeagar pinthoesl’ F.
e ot mes IOF , îMLin atonceef fate is s.

 whais o psthen, wacoo ce tercre-un; f a ce she ythe.
 t ts y d Cofo, mandony No chouino_CYol I by-of nshiguof m…ëÇA all pral sese
Fathe ath or
MByor IThist,-figs ginan b’de.
Car T.
IENos omus cem acuYom ceses’the benkent ian y my allost ikn irs nk;

Sthantinougomulintistithen Wh mer thislieme Flesatrcout thobathou ise RUNAre, ithr, y a ts m hyereMyfaturrathesenturd sur_] rs es;
Fr h yofaublimen cin cee aroricaks,
Thounsefillithtemyonciend s; pof sum;

sen &xt ucirit e It.
Geed heke, aindacrtur, sthoungatetand boud, not ino’s it, hir c

## Adding the _start-of-play_ token
Same thing as before, now each play begins with a special token: §

In [9]:
encoder = CharacterLevelEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

# Training
batch_size = 32
block_size = 8

blm = BigramLanguageModel(len(encoder))
blm.apply(initialize_weights)

optimizer = torch.optim.AdamW(blm.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=block_size)
    logits, loss = blm(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(blm, data, batch_size=batch_size, block_size=block_size))
print('-----')

generate_text(blm, encoder, '§')

Number of tokens: 101


100%|██████████| 10000/10000 [00:03<00:00, 2556.33it/s]

{'train': tensor(2.5245), 'test': tensor(2.5392)}
-----
§)de

Pom Whorame.


Y ho me d,
I”ÆA.
LSTh ther ashalou’lerowhilk ssdi




ngougooucl,
Pr PENEVEt yourd.
Scayod ndelld twht ceasen llive kedlomus n,
A.
TERVXusangewhancerous, y; t e ind m t hintur…CKExes?
HARA m’lange,
BLOVANI ilsathol m t d
Wimyobaite Kik.
[_Toj0QIê(CBENDERKnonsth.
ACUn;

In uct yo p

 s ry hell, in, g isont to i[_Ly s! isik
YEN.4fut aTO.
Wrd I
COVin.
CK5Jacoft
Tatas
SAMIN.



Wonowid ams.
CESTayousoull, hy hr G ay ares,
Dors ch w? whoÇY.

Burye whe wiresealY.
POROGENyous arasis:

THy l nk, grathe, wee aleaipe astlonowoued.
Wayowis IURMELExcaiorithessinct mceroreralllasondnd hy orer  ind IPRGouse be lldy’itonciguit t-vemateagourenthond allse macoxiou owawern’st gowndolll SA.


T._EMy  th cke s, thar andedæK
NG0Whorotheyolm ber jear 5anthouthJot llactKid bo:îWhe;
UOX)d’s lle,
BERUDINEnsthaia n, Ano llancas.
R.
Nat._TIVucet sang bet, fe
Aley._A.
BEn gered t, rawit t.
Whth sthor pou ofthey thos akerat. t stsesimer sutes pear t d t he tayove and medsucKishankeastourad us antat 

# NanoGPT
This version uses the NanoGPT model with the same character-level encoder.

In [10]:
encoder = CharacterLevelEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))


batch_size = 15
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))

Number of tokens: 100


100%|██████████| 10000/10000 [02:09<00:00, 77.48it/s]


{'train': tensor(1.9450), 'test': tensor(1.9884)}


In [11]:
generate_text(gpt, encoder, '\n')



ROCHERS.
The thy die, Ruenck in ow prearis st
Tommen sirth!
Cady, to now. Whith oute cup ke elis sk tate seernfomignchfed thus, our mightes andds frandelares.
Which ans choin, bon’s, int ar cruellry;
Was ith no swigh fo’om frendriested:
And wertuch thy forings, the wir, glave so so he “He kis virecth.

SECHOROLK.
[_A.]
PLee yet thand?

THe DREM.
Spussst.

I’ll do. My ECHusen afath, mes onor lord; ating. Andy preit proptt my tisschese hours.

Couldstland theld fodry rive they
Sive with in tend towo ald’d beining our suchlld wopeas.
And; ’tint tis Leve ye roverd
Thou this. Lund world ldive dswin orms. Ay,
Therend id at far hen full thune chare yomes a d, but
I venateng P’ntral’ss e thopance
To Seysterong Eastil-fe. [_Exeonsiturswich y it.

WISEBESST Wisity, Herowho tack me forsh
tan, nows courr be therosing, have the bying wifould’e theal dllessing,s lis!
GAlindicry, age,
alllry and him surmbs. I withith the has this be a and too thy de inonon ca tono ot thauteve
And thonded
Says me te

## Adding the _start-of-play_ token
Same thing as before, now each play begins with a special token: §

In [12]:
encoder = CharacterLevelEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))


batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '§')

Number of tokens: 101


100%|██████████| 10000/10000 [03:40<00:00, 45.43it/s]


{'train': tensor(1.8599), 'test': tensor(1.9219)}
-----
§CE, I rave I with gret oprefachr eat.

DEN.
To Rare Aay sintill, them, scy ononstre, I that be inkenk
me.

Fragaty.
 My youg yoot her peragauiress day th’sel-ea ofall I
to set thingne that none ger patine mofesor the bes!
The gre call, for n, swoull. And tand id willl,
th  is fave Caciouced. I knlow yourel, do this will lik urn edewomath deont. Augh o [_pandle
Toske’s, out the these ins nto doinfacer,
And wonere chenerseeblince, thene sis So trto thon,
Wich is Added hoimyse hankent-coick
Tall you galisce waitit. Ges: pall wornsa’d abe t whis ffas be.
Thought t you parouse.

QUEEGRTHICK.
Marqualonts of g, my palimaingme theem that I his aphe yse’enct be Sort;
And.

And swith hund prye.

I this hem, Mostellf welly be arincue s be.

THEENMOLET; his have ald Thvictly.
Wh Hat’s ofrtotune lin graque n nold onove.
I sshalk kep, things seeep did, lacord poer the daven with ce hidle.

CThat hyse u’end foreselded
Colturace hefuren th, serv

## Using a better tokenizer
Now let's try this with the GPT-4o tokenizer, but without the § token.

In [13]:
# Create a character-level encoder and a dataset
encoder = TiktokenBasedEncoder(text)
data = Data(torch.tensor(encoder.encode(text), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23542


In [14]:
batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '\n')

100%|██████████| 10000/10000 [12:40<00:00, 13.16it/s]


{'train': tensor(4.0174), 'test': tensor(5.6579)}
-----

beggar. Lucilius. Go.

 [_Exit Francis_.]

CORNWALL.
William, befall!’ and his Grace,
And if you might have seen one seem you bury thought I revolveed by each.

How.

POMPEY.
Marry, though I would thou dost comfort you work it?

FIRST.
Lady contrary? Your once
The mobled like my father’s point.

FIRST WATCH.
How should sing? You living? We do me in it.

MISTRESS FORD.
HOLL.
For which. Remember, farewell. Come, when I am the Count; ’tis at thorn, Master Doctorldom with the Lord of valiant.

How does not for my death?

HOSTESS.
What, ha?

BUCKINGHAM.
Aaron, my wife,? Isfrom thy cursing and inconsiderate, a father?

CHRISTOPHER.
Why and in the flock that flattering woe conspire, was seen within.

2 PETITIONER.
My patience for she, I come here’s sad.

FIRST CITIZEN.
I cannot be misg rigor of honourable it.

KING.
That I ha,
Till I of stone once stay so faithfully.
Had, restoredetic,
In its beads your lovely secure within the glories 

## Using a better tokenizer & _start-of-play_ token
Now with the GPT-4o tokenizer _and_ the § token.

In [15]:
encoder = TiktokenBasedEncoder(text_st)
data = Data(torch.tensor(encoder.encode(text_st), dtype=torch.long), split=.9)
print('Number of tokens:', len(encoder))

Number of tokens: 23544


In [16]:
batch_size = 32
context_length = 32

gpt = NanoGPT(vocab_size=len(encoder), embedding_size=64, context_length=context_length, num_heads=4, num_blocks=4, dropout=.2)
gpt.apply(initialize_weights)

# Training
optimizer = torch.optim.AdamW(gpt.parameters(), lr=1e-3)
for _ in tqdm(range(10000)):
    xb, yb = data.get_batch('train', batch_size=batch_size, block_size=context_length)
    logits, loss = gpt(xb, yb)
    loss = cast(torch.Tensor, loss)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(estimate_loss(gpt, data, batch_size=batch_size, block_size=context_length))
print('-----')

generate_text(gpt, encoder, '§')

100%|██████████| 10000/10000 [12:09<00:00, 13.70it/s]


{'train': tensor(4.0027), 'test': tensor(5.6367)}
-----
§CYMBELINE.
Tut of the ground
Pick in’t to the young gracious revenge of us
So slight an iron night toward Ilium?
Must make the grievous to your petitions in the Capitol;
Methought me will it on. But this inconvenience,
And us the buriest of good
A JUSTICE part, whilst we to keep the post;
Your safety have lived in the Centa public power does it at pleasure,
The sware for taking, I will admit no matter pie;
Since all are
About the Senators live much wink in our misery
When we should offend,
Hath leap’d
I pray you. Yet will find as high boast as you will inform’d have;
And to attend you!
And,—I do but believe my false love me but for’t.
You must send the common people.

IAGO.
 climb from my life of me!

EMILIA.
Go take you, it to, and consent,
And say the guard die a foolery
With that at feather of it, and be any lesser stacked Launcelet.

SHYORK.
But mine to’t again, whose breath were tune,
     Nay, ’twere ague we as lives, carry