## A small ChatGPT style Transformer

source: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=2301s

Source: Andrej Karpathy

In [25]:

import torch
import numpy as np
import requests
## import tiktoken


In [23]:
## !pip install requests
## !pip install tiktoken    ## requires python   >    3.9

In [32]:

block_size = 8


In [8]:

input_file_path = 'input.txt'


data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'

with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)
        

In [9]:

with open(input_file_path, 'r') as f:
    text = f.read()
    



In [10]:
print("length of data in characters")
len(text)

length of data in characters


1115394

In [None]:
n = len(data)


In [11]:
print(  text[:1000]   )

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.




## get list of unique characters


In [13]:

chars = sorted(     list(set(text))   )
chars


['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [14]:

vocab_size = len(chars)
vocab_size 


65

In [15]:

print(  ''.join(chars)  )



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


## tokenizer

create a mapping from characters to integers

* other better options are tiptoken and SentencePiece


In [18]:

stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }

encode = lambda s: [ stoi[c]          for c in s   ]    ## encoder: string to integer
decode = lambda l: ''.join(   itos[i] for i in l   )    ## decoder: interger to string


## Encode and decode

In [19]:

print(   encode("hii there")    )


[46, 47, 47, 1, 58, 46, 43, 56, 43]


In [24]:

print(   decode(   encode("hii there")  )   )


hii there



## Encode the text


In [26]:

data = torch.tensor(   encode(text), dtype=torch.long   )
print(data.shape, data.type)


torch.Size([1115394]) <built-in method type of Tensor object at 0x7f9bf00add70>


In [27]:
data

tensor([18, 47, 56,  ..., 45,  8,  0])

In [28]:
print(   data[:1000]   )

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 


## Train and test split


In [29]:

n = int(   0.9*len(data)   )
n


1003854

In [31]:

train_data = data[:n]
val_data   = data[n:]



## sample random chunks of block size


In [33]:

train_data[:block_size+1]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


## offset by one


In [34]:

x = train_data[:block_size]
y = train_data[1:block_size+1]


In [37]:
x

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [38]:
y

tensor([47, 56, 57, 58,  1, 15, 47, 58])

In [39]:

for t in range(block_size):
    context = x[: t+1]
    target  = y[t]
    print(f"when input is {context} then the target is: {target}")


when input is tensor([18]) then the target is: 47
when input is tensor([18, 47]) then the target is: 56
when input is tensor([18, 47, 56]) then the target is: 57
when input is tensor([18, 47, 56, 57]) then the target is: 58
when input is tensor([18, 47, 56, 57, 58]) then the target is: 1
when input is tensor([18, 47, 56, 57, 58,  1]) then the target is: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) then the target is: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then the target is: 58


In [None]:
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]