In [1]:
!pip install -Uqq torch
!pip install -Uqq numpy

In [2]:
import torch

# Raw Implementation Of A GPT-Like Model
### Download The Dataset

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-03-07 12:13:14--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-03-07 12:13:17 (914 KB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [4]:
# read the whole corpus into memory as a single string
with open('data/tinyshakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

### Inspect The Data

In [5]:
# total number of characters in the corpus
len(text)

1115394

In [6]:
# peek at the first 1000 characters to get a feel for the layout of the corpus
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



### Get The Vocabulary

In [7]:
# every distinct character that occurs in the corpus, in sorted order;
# this ordered list is the vocabulary of the character-level model
chars = sorted(set(text))
vocab_size = len(chars)
print(*chars, sep='')
vocab_size


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


65

### Tokenize The Input
Since this is a character level language model, we'll just translate individual characters to integers.

Other tokenizers to look into:
1. SentencePiece (Google)
2. Tiktoken (OpenAI)

In [8]:
# lookup tables derived from the sorted vocabulary
itos = dict(enumerate(chars))               # integer id -> character
stoi = dict(zip(chars, range(len(chars))))  # character -> integer id

def encode(s):
    """Take a string and output the list of integer ids of its characters."""
    return [stoi[c] for c in s]

def decode(l):
    """Take a list of integer ids and output the string they spell."""
    return ''.join(itos[i] for i in l)

print(encode("Hello World!"))
print(decode(encode("Hello World!")))

[20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2]
Hello World!


In [9]:
# stoi ("string to index") is a plain dict used as a lookup table:
# the key is the character and the value is its integer index
type(stoi)

dict

In [10]:
# encode the whole dataset and wrap the resulting list of ids in a 1-D tensor
# data type is int16 because our vocab size is only 65, so every id fits
# NOTE(review): torch.nn.Embedding indices and cross-entropy class targets are
# documented as int32/int64 — confirm this tensor is cast (e.g. .long()) before
# it reaches a model
data = torch.tensor(encode(text), dtype=torch.int16)
print(data.shape, data.dtype)
# show only the first 1000 token ids rather than the full tensor
print(data[:1000])

torch.Size([1115394]) torch.int16
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

### Create Training and Validation Splits

In [11]:
# hold out the tail of the corpus for validation:
# tokens before index n are used for training, the remainder for validation
n = int(0.9*len(data)) # boundary index at 90% of the data
train_data, val_data = data[:n], data[n:]

len(train_data), len(val_data)

(1003854, 111540)

### Create Batches Of Data To Train The Model

Sample random chunks of data from the training set. These chunks are of fixed max length.
A chunk of 9 characters such as `[18, 47, 56, 57, 58,  1, 15, 47, 58]` contains 8 examples for the model to train on, for instance:
1. In the context of 18, 47 likely comes next.
2. In the context of 18 and 47, 56 likely comes next and so on.

This also helps the transformer network get used to seeing every context length, from a single character up to the max context length.

In [12]:
block_size = 8 # max length of chunks (the context length)
train_data[:block_size + 1] # first 9 chars in the training set: block_size inputs plus one extra token for the last target

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58], dtype=torch.int16)

In [13]:
# inputs to the transformer: the first block_size tokens
x = train_data[:block_size]
# targets: the same window shifted one position to the right
y = train_data[1:block_size + 1]
# every position t gives one training example: the prefix x[:t+1] predicts y[t]
for t, target in enumerate(y):
    context = x[:t+1] # all chars of x up to and including t
    print(f'input: {context}\ttarget: {target}')

input: tensor([18], dtype=torch.int16)	target: 47
input: tensor([18, 47], dtype=torch.int16)	target: 56
input: tensor([18, 47, 56], dtype=torch.int16)	target: 57
input: tensor([18, 47, 56, 57], dtype=torch.int16)	target: 58
input: tensor([18, 47, 56, 57, 58], dtype=torch.int16)	target: 1
input: tensor([18, 47, 56, 57, 58,  1], dtype=torch.int16)	target: 15
input: tensor([18, 47, 56, 57, 58,  1, 15], dtype=torch.int16)	target: 47
input: tensor([18, 47, 56, 57, 58,  1, 15, 47], dtype=torch.int16)	target: 58


In [20]:
# batching lets several independent sequences be processed in parallel
batch_size = 4 # number of sequences stacked into one batch
block_size = 8 # max length of the context

def get_batch(split):
    """Sample a small random batch of inputs x and targets y.

    `split` selects the source: 'train' reads from train_data, any other
    value reads from val_data. Returns a tuple (x, y) of tensors of shape
    (batch_size, block_size), where y is x shifted one token to the right.
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence; the upper bound leaves room for the
    # block_size-long window plus the one extra token the targets need
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # torch.stack joins the 1-D windows along a new leading (batch) dimension
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])

    return inputs, targets

In [21]:
# draw one example batch and show the inputs next to their next-token targets
xb, yb = get_batch('train')
print(f'inputs: {xb}\ninputs_shape: {xb.shape}\n')
# yb holds the targets, so label it as such (it was mislabelled "inputs")
print(f'targets: {yb}\ntargets_shape: {yb.shape}\n')

inputs: tensor([[11,  0, 32, 46, 39, 58,  1, 61],
        [43, 50, 53, 11,  0, 21, 52,  1],
        [53, 53,  1, 50, 47, 58, 58, 50],
        [32, 47, 57,  1, 53, 52, 43,  1]], dtype=torch.int16)
inputs_shape: torch.Size([4, 8])

inputs: tensor([[ 0, 32, 46, 39, 58,  1, 61, 47],
        [50, 53, 11,  0, 21, 52,  1, 58],
        [53,  1, 50, 47, 58, 58, 50, 43],
        [47, 57,  1, 53, 52, 43,  1, 58]], dtype=torch.int16)
inputs_shape: torch.Size([4, 8])



In [23]:
# walk over every (sequence, position) pair of the batch and show the growing
# context together with the token the model has to predict from it
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b][:t+1], yb[b][t]
        print(f'input: {context.tolist()}\ttarget: {target}')

input: [11]	target: 0
input: [11, 0]	target: 32
input: [11, 0, 32]	target: 46
input: [11, 0, 32, 46]	target: 39
input: [11, 0, 32, 46, 39]	target: 58
input: [11, 0, 32, 46, 39, 58]	target: 1
input: [11, 0, 32, 46, 39, 58, 1]	target: 61
input: [11, 0, 32, 46, 39, 58, 1, 61]	target: 47
input: [43]	target: 50
input: [43, 50]	target: 53
input: [43, 50, 53]	target: 11
input: [43, 50, 53, 11]	target: 0
input: [43, 50, 53, 11, 0]	target: 21
input: [43, 50, 53, 11, 0, 21]	target: 52
input: [43, 50, 53, 11, 0, 21, 52]	target: 1
input: [43, 50, 53, 11, 0, 21, 52, 1]	target: 58
input: [53]	target: 53
input: [53, 53]	target: 1
input: [53, 53, 1]	target: 50
input: [53, 53, 1, 50]	target: 47
input: [53, 53, 1, 50, 47]	target: 58
input: [53, 53, 1, 50, 47, 58]	target: 58
input: [53, 53, 1, 50, 47, 58, 58]	target: 50
input: [53, 53, 1, 50, 47, 58, 58, 50]	target: 43
input: [32]	target: 47
input: [32, 47]	target: 57
input: [32, 47, 57]	target: 1
input: [32, 47, 57, 1]	target: 53
input: [32, 47, 57, 1, 