# GPT-2 Text Generation

## Dataset: Tiny Shakespeare

Available at: https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt

In [7]:
import requests
import os

import torch
import torch.nn as nn

### Download dataset and preview it

In [8]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/refs/heads/master/data/tinyshakespeare/input.txt"
filename = "input.txt"

# Reuse a previously downloaded, non-empty copy so that re-running the notebook
# does not hit the network again. Delete the file to force a fresh download.
if os.path.isfile(filename) and os.path.getsize(filename) > 0:
    print(f"Using existing '{filename}'")
else:
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block this cell indefinitely.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)

        print(f"Downloaded and saved '{filename}'")

    # RequestException derives from IOError, so it has to be listed first
    # for network failures to get the "downloading" message.
    except requests.exceptions.RequestException as e:
        print(f"Error downloading file: {e}")
    except IOError as e:
        print(f"Error saving file: {e}")
    

Downloaded and saved 'input.txt'


In [9]:
# Load the entire corpus into memory as one string
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

print(f"Number of characters: {len(text)}")

# Show the opening 1000 characters as a quick sanity check
print(text[:1000])


Number of characters: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread

In [None]:
# Build the character-level vocabulary

# Every distinct character that occurs in the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# First line: all vocabulary characters joined together; second line: their count
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


## Tokenize the input text

Build the tokenizer: convert the raw text into a sequence of integers. Here the tokenizer is character-level, so each character is mapped to one integer token.

In [11]:
# Lookup tables between characters and integer token ids.
# A character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


# Encoder and decoder are plain functions (not lambdas bound to names) so they
# carry a real name in tracebacks and can have docstrings.
def encode(x):
    """Convert a string into a list of integer token ids, one per character.

    Raises KeyError if `x` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in x]


def decode(y):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if `y` contains an id that is not in `itos`.
    """
    return ''.join(itos[i] for i in y)


print(encode("Hello, world!"))
print(decode(encode("Hello, world!")))

[20, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42, 2]
Hello, world!


### Encode the dataset

In [13]:
# Encode the full corpus as a tensor of token ids (torch.long is the same dtype as torch.int64)
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

# Peek at the first 100 token ids
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [15]:
# Train/validation split: the first 90% of the tokens are used for training,
# the remaining 10% are held out for validation.

n = int(0.9*len(data))  # index at which the validation portion begins
train_data, val_data = data[:n], data[n:]

### Context Length/Block size

The data is sampled in chunks, and the length of the chunk of text that is input to the model is defined as "context length" or "block size".

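During training, the transformer sees every context length from a single character up to the full block size, so one chunk of `block_size + 1` characters yields `block_size` (context, target) training examples.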

In [None]:
block_size = 8

# One chunk spans block_size + 1 tokens: block_size inputs plus the final target.
# Printed explicitly, because a bare expression in the middle of a cell shows nothing.
print(train_data[:block_size+1])

x = train_data[:block_size]
y = train_data[1:block_size+1]  # same chunk shifted by one, so y[t] is the token after x[t]
for t in range(block_size):
    context = x[:t+1]  # the first t+1 tokens, up to and including position t
    target = y[t]  # the token that immediately follows the context
    print(f"When input is {context} the target is: {target}")

When input is tensor([18]) the target is: 47
When input is tensor([18, 47]) the target is: 56
When input is tensor([18, 47, 56]) the target is: 57
When input is tensor([18, 47, 56, 57]) the target is: 58
When input is tensor([18, 47, 56, 57, 58]) the target is: 1
When input is tensor([18, 47, 56, 57, 58,  1]) the target is: 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is: 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is: 58


### Batches
Several independent chunks are stacked into a batch so that they can be processed in parallel.

In [None]:
#looking at one batch of size (batch_size, block_size)

# Seed PyTorch's global random number generator so the randomly sampled
# batches are the same on every fresh run of the notebook.
torch.manual_seed(1337)

batch_size = 4 #how many independent sequences will be processed in parallel
block_size = 8 #maximum context length for predictions

def get_batch(split):
    """Sample a random batch of input/target chunks from one split.

    `split` picks the source tensor: 'train' reads from train_data, any other
    value reads from val_data. Returns (x, y), each of shape
    (batch_size, block_size), where y holds the same chunks as x shifted one
    position ahead.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Random start offsets, one per sequence. The exclusive upper bound keeps
    # the one-token-ahead target slice inside the split.
    starts = torch.randint(0, len(source) - block_size, (batch_size,)).tolist()

    x = torch.stack([source[s:s+block_size] for s in starts])
    y = torch.stack([source[s+1:s+block_size+1] for s in starts])

    return x, y


xb, yb = get_batch('train')
print('inputs: ', xb.shape, xb)
print('targets: ', yb.shape, yb)

# Spell out every (context, target) training pair contained in the batch
for b in range(batch_size):   # one row per sequence in the batch
    seq_in, seq_out = xb[b], yb[b]
    for t in range(block_size):   # one step per position in the sequence
        context = seq_in[:t+1]
        target = seq_out[t]
        print(f"When input is {context.tolist()}, the target is {target}")

inputs:  torch.Size([4, 8]) tensor([[59, 44, 44, 43, 56,  1, 51, 43],
        [61, 47, 57, 46,  1, 46, 43, 56],
        [43, 39, 56,  1, 53,  5, 43, 56],
        [52,  1, 54, 43, 56, 57, 59, 39]])
targets:  torch.Size([4, 8]) tensor([[44, 44, 43, 56,  1, 51, 43, 10],
        [47, 57, 46,  1, 46, 43, 56,  1],
        [39, 56,  1, 53,  5, 43, 56, 57],
        [ 1, 54, 43, 56, 57, 59, 39, 42]])
When input is [59], the target is 44
When input is [59, 44], the target is 44
When input is [59, 44, 44], the target is 43
When input is [59, 44, 44, 43], the target is 56
When input is [59, 44, 44, 43, 56], the target is 1
When input is [59, 44, 44, 43, 56, 1], the target is 51
When input is [59, 44, 44, 43, 56, 1, 51], the target is 43
When input is [59, 44, 44, 43, 56, 1, 51, 43], the target is 10
When input is [61], the target is 47
When input is [61, 47], the target is 57
When input is [61, 47, 57], the target is 46
When input is [61, 47, 57, 46], the target is 1
When input is [61, 47, 57, 46,