# GPT-2 Text Generation

## Dataset: Tiny Shakespeare

Available at: https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt

In [2]:
import requests
import os

import torch
import torch.nn as nn
from torch.nn import functional as F


### Download dataset and preview it

In [3]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt"
filename = "input.txt"

# Download the Tiny Shakespeare corpus and save it next to the notebook.
# Failures are reported with a message instead of a traceback.
try:
    # requests waits indefinitely by default; the timeout makes a stalled
    # connection fail (as a RequestException) instead of hanging the cell.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # turn an HTTP 4xx/5xx status into an exception

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

    print(f"Downloaded and saved '{filename}'")

# RequestException must be caught first: it is itself a subclass of IOError.
except requests.exceptions.RequestException as e:
    print(f"Error downloading file: {e}")
except IOError as e:
    print(f"Error saving file: {e}")
    

Downloaded and saved 'input.txt'


In [2]:
# Read the corpus and preview its contents.
# Open the same `filename` that the download cell writes to, so the notebook
# runs top to bottom on a fresh kernel without a pre-existing '../data/input.txt'.
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Number of characters: {len(text)}")

# Print the first 1000 characters
print(text[:1000])


Number of characters: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread

In [5]:
# Build the character-level vocabulary: every distinct character in the corpus,
# sorted so that each character always gets the same index.
chars = sorted(set(text))
vocab_size = len(chars)

print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Tokenize the input text

Build the tokenizer: convert the raw text into a sequence of integers. Here the tokenizer works at the character level, so each character is mapped to one integer.

In [6]:
# Lookup tables between characters and their integer token ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(x):
    """Take a string and return the list of integer ids of its characters."""
    return [stoi[c] for c in x]


def decode(y):
    """Take a list of integer ids and return the string they spell."""
    return ''.join(itos[i] for i in y)


print(encode("Hello, world!"))
print(decode(encode("Hello, world!")))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42, 2]
Hello, world!


### Encode the dataset

In [7]:
# Encode the whole corpus as a 1-D tensor of token ids.
# torch.long is the same dtype as torch.int64.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [8]:
# Train/validation split: the first 90% of the tokens are used for training,
# the remaining 10% are held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

### Context Length/Block size

The data is sampled in chunks, and the length of the chunk of text that is input to the model is defined as "context length" or "block size".

Within each chunk, the transformer is trained on every context length from a single character up to the full block size.

In [9]:
block_size = 8
# A chunk of block_size+1 tokens holds block_size (context, target) pairs,
# because the target for each position is the character that follows it.
train_data[:block_size+1]

x = train_data[:block_size]
y = train_data[1:block_size+1]  # the same window shifted right by one
for t, target in enumerate(y):
    context = x[:t+1]  # the first t+1 characters of the chunk
    print(f"When input is {context} the target is: {target}")

When input is tensor([18]) the target is: 47
When input is tensor([18, 47]) the target is: 56
When input is tensor([18, 47, 56]) the target is: 57
When input is tensor([18, 47, 56, 57]) the target is: 58
When input is tensor([18, 47, 56, 57, 58]) the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


### Batches
For parallel processing of data, batches of chunks are processed at the same time.

In [10]:
# Looking at one batch of size (batch_size, block_size)

# Seed the global RNG so the sampled batches (and everything derived from them)
# are the same on every Restart & Run All.
torch.manual_seed(1337)

batch_size = 4  # how many independent sequences will be processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of (input, target) chunks from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple `(x, y)` of stacked tensors with `batch_size` rows and
        `block_size` columns. Each row of `y` is the corresponding row of `x`
        shifted one position to the right in the data.

    Note:
        `batch_size` and `block_size` are read from the module-level variables
        at call time, so reassigning them in a later cell changes the shape of
        subsequent batches.
    """
    data = train_data if split == 'train' else val_data
    # torch.randint(low, high, size): `high` is exclusive, so the largest start
    # index is len(data) - block_size - 1 and the shifted target window still fits.
    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])  # offset by 1

    return x, y


xb, yb = get_batch('train')
print('inputs: ', xb.shape, xb)
print('targets: ', yb.shape, yb)

# Spell out every (context, target) training example packed into the batch
for b, (row_x, row_y) in enumerate(zip(xb, yb)):   # batch dimension
    for t, target in enumerate(row_y):   # time dimension
        context = row_x[:t+1]
        print(f"When input is {context.tolist()}, the target is {target}")

inputs:  torch.Size([4, 8]) tensor([[41, 53, 56, 52, 57,  1, 58, 46],
        [ 1, 47, 44,  1, 21,  1, 40, 43],
        [56,  5, 42,  1, 58, 46, 43,  1],
        [51,  6,  1, 59, 52, 42, 56, 43]])
targets:  torch.Size([4, 8]) tensor([[53, 56, 52, 57,  1, 58, 46, 43],
        [47, 44,  1, 21,  1, 40, 43,  0],
        [ 5, 42,  1, 58, 46, 43,  1, 51],
        [ 6,  1, 59, 52, 42, 56, 43, 57]])
When input is [41], the target is 53
When input is [41, 53], the target is 56
When input is [41, 53, 56], the target is 52
When input is [41, 53, 56, 52], the target is 57
When input is [41, 53, 56, 52, 57], the target is 1
When input is [41, 53, 56, 52, 57, 1], the target is 58
When input is [41, 53, 56, 52, 57, 1, 58], the target is 46
When input is [41, 53, 56, 52, 57, 1, 58, 46], the target is 43
When input is [1], the target is 47
When input is [1, 47], the target is 44
When input is [1, 47, 44], the target is 1
When input is [1, 47, 44, 1], the target is 21
When input is [1, 47, 44, 1, 21], t

## Bigram Language Model

Refer to 'bigram-language-model.ipynb' for more on this model.

In [11]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The only parameters are a (vocab_size, vocab_size) embedding table; looking
    up token i returns the logits over the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # logits for the next token are stored in a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            A tuple `(logits, loss)`. Without targets, `logits` has shape
            (B, T, C=vocab_size) and `loss` is None. With targets, `logits` is
            returned flattened to (B*T, C) and `loss` is the cross-entropy
            between the flattened logits and targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C=vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy wants the class dimension second, i.e. (N, C)
            # logits with (N,) targets, so batch and time are flattened together.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)  # originally (B, T)
            # cross_entropy = negative average log likelihood
            loss = F.cross_entropy(logits, targets)

        return logits, loss  # scores for the next character based on the single input token

    @torch.no_grad()  # sampling needs no gradients; avoids recording an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of integer token ids holding the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            A (B, T + max_new_tokens) tensor: the input followed by the samples.
        """
        for _ in range(max_new_tokens):
            logits = self(idx)[0]
            # focus only on the last time step, the last character
            logits = logits[:, -1, :]  # becomes (B, C) from (B, T, C)
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


    
m = BigramLanguageModel(vocab_size)
out, loss = m(xb, yb)
print(out.shape, loss)

# Sample 100 new tokens from the untrained model, starting from a single
# token with id 0, and decode them back to text.
start_idx = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 65]) tensor(4.5823, grad_fn=<NllLossBackward0>)

OWzN'Cx.ZdBfSgRwBEmOaurXvCA, W!Sy
QQ.A-otYOWm.MuZtHvtBCTHdM:cPwfW
 cPhSRx
hdQkjwfOwWmTAZmLEid W!jFRx


In [12]:
# Train the bigram model. get_batch reads the module-level batch_size,
# so raising it here makes every training batch hold 32 sequences.
batch_size = 32
# Optimizer: takes the gradients and updates the parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

for steps in range(10000):
    xb, yb = get_batch('train')

    # clear old gradients, run forward/backward, then apply the update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# loss of the last training batch only
print(loss.item())

start_idx = torch.zeros((1,1), dtype=torch.long)
generated = m.generate(idx=start_idx, max_new_tokens=500)
print(decode(generated[0].tolist()))

2.607495069503784

Tyme.
Lo,
O:
Har toshis marnd l oftoves tr t wnk, weree J&wat'ENCoulllero? larvem d secor woour winukitoue t heincebery R:
NG f ans, y arkend chaivio t whe handst, bare ngess IO:
Fant; f nde sp
Thourghal wnd

As h
I s lichinde,
ARCENIULINThe pshecthatf indathe? cP$S:
UD t atl I y g,
Maferrde IOfore burd. w t.
noot ter an'The Yotunss t tatomen, arllI YOLULOLakONGOns,
Awd py sJol tweforece s metthocantothiffarthereand s s a Ifftan Prr fty f d, s orZAtukes we G filotherop:


Sh, nd I yodeg ME:
ARDY


This is the simplest possible model. Clearly the results show that it is not generating the expected text, but there is a semblance of Shakespearean text in this output.

### Towards self attention

Toy example to understand self attention.

A token at the nth location should only be considering information from all previous tokens and not future tokens, since those are the ones to be predicted.

We need to allow a token to interact with its preceding elements. The simplest way to do this is to calculate the average of all the previous tokens and the current token.

In [13]:
# Toy example
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)

# xbow ("x bag of words"): xbow[b, t] is the mean of x[b, 0] ... x[b, t]
xbow = torch.zeros(B, T, C)

for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1]  # (t+1, C): the current token and everything before it
        xbow[b, t] = x_prev.mean(dim=0)

print(x[0], xbow[0])

tensor([[-1.2265e-03, -2.8304e+00],
        [ 4.4351e-01, -6.9117e-01],
        [ 2.0624e+00,  2.0980e+00],
        [ 2.6734e-01, -3.6565e-01],
        [ 8.6182e-01, -1.2233e+00],
        [-1.1652e+00,  1.6934e-01],
        [ 1.4138e-01,  1.9059e+00],
        [-4.9645e-01,  8.6998e-01]]) tensor([[-1.2265e-03, -2.8304e+00],
        [ 2.2114e-01, -1.7608e+00],
        [ 8.3490e-01, -4.7451e-01],
        [ 6.9301e-01, -4.4730e-01],
        [ 7.2677e-01, -6.0251e-01],
        [ 4.1145e-01, -4.7387e-01],
        [ 3.7287e-01, -1.3390e-01],
        [ 2.6420e-01, -8.4185e-03]])


This can be made more efficient by using a matrix multiplication with a lower triangular matrix, built with `torch.tril(torch.ones())`.

In [14]:
# Lower triangular matrix of ones, with each row scaled to sum to 1, so that
# multiplying by it averages row i of b with all the rows before it.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(2, 10, size=(3, 2)).float()
c = torch.matmul(a, b)  # running average of the rows of b

print("a = ", a)
print("b = ", b)
print("c = ", c)


a =  tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b =  tensor([[9., 7.],
        [8., 8.],
        [6., 6.]])
c =  tensor([[9.0000, 7.0000],
        [8.5000, 7.5000],
        [7.6667, 7.0000]])


In [15]:
# Applying this to the toy example above
weights = torch.ones(T, T).tril()
weights = weights / weights.sum(dim=1, keepdim=True)
# (T, T) is broadcast across the batch: (B, T, T) @ (B, T, C) -> (B, T, C)
xbow2 = torch.matmul(weights, x)
xbow2[0]  # same as xbow from above

tensor([[-1.2265e-03, -2.8304e+00],
        [ 2.2114e-01, -1.7608e+00],
        [ 8.3490e-01, -4.7451e-01],
        [ 6.9301e-01, -4.4730e-01],
        [ 7.2677e-01, -6.0251e-01],
        [ 4.1145e-01, -4.7387e-01],
        [ 3.7287e-01, -1.3390e-01],
        [ 2.6420e-01, -8.4186e-03]])

A third way to achieve this would be to use softmax

In [16]:
# Start from all-zero scores and set every position above the diagonal to -inf
tril = torch.ones(T, T).tril()
weights = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
print(weights)

# softmax turns each row into uniform weights over its unmasked positions
weights = weights.softmax(dim=1)
print(weights)

xbow3 = torch.matmul(weights, x)
xbow3[0]  # yet again, same output

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[-1.2265e-03, -2.8304e+00],
        [ 2.2114e-01, -1.7608e+00],
        [ 8.3490e-01, -4.7451e-01],
        [ 6.9301e-01, -4.4730e-01],
        [ 7.2677e-01, -6.0251e-01],
        [ 4.1145e-01, -4.7387e-01],
        [ 3.7287e-01, -1.3390e-01],
        [ 2.6420e-01, -8.4186e-03]])

### Version 4: Self-attention

Implementing a single head of self-attention

In [None]:
# A single head of self-attention: the weights used to combine the current
# and previous positions are computed from the data instead of being uniform.

B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

# project every position; each result has size (B, T, head_size=16)
k, q, v = key(x), query(x), value(x)

# all queries dot product with all keys:
# (B, T, head_size) @ (B, head_size, T) -> (B, T, T)
weights = torch.matmul(q, k.transpose(-2, -1))

tril = torch.ones(T, T).tril()
weights = weights.masked_fill(tril == 0, float('-inf'))  # upper triangular masking
weights = weights.softmax(dim=-1)
out = torch.matmul(weights, v)

weights[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5203, 0.4797, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0796, 0.7265, 0.1939, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0081, 0.2897, 0.3129, 0.3892, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0381, 0.2742, 0.1506, 0.4870, 0.0502, 0.0000, 0.0000, 0.0000],
        [0.0628, 0.0504, 0.0915, 0.0337, 0.6328, 0.1287, 0.0000, 0.0000],
        [0.0035, 0.0289, 0.1494, 0.0994, 0.4226, 0.2691, 0.0271, 0.0000],
        [0.0443, 0.0207, 0.1250, 0.0171, 0.0934, 0.0232, 0.5833, 0.0928]],
       grad_fn=<SelectBackward0>)