# Reading the data

In [1]:
with open('all.txt', 'r',encoding='utf-8') as f:
    text = f.read()

In [2]:
print("dataset length: ", len(text))

dataset length:  6340988


In [3]:
text[:1200]

'HARRY POTTER AND THE CHAMBER OF SECRETS \nby J. K. Rowling\n\u3000\u3000(this is BOOK 2 in the Harry Potter series)\n\u3000\u3000Original Scanned/OCR: Friday, April 07, 2000 v1.0 (edit where needed, change version number by 0.1)\n\u3000\u3000CHAPTER\tONE\n\u3000\u3000THE WORST BIRTHDAY\n\u3000\u3000Not for the first time, an argument had broken out over breakfast at number four, Privet Drive. Mr. Vernon Dursley had been woken in the early hours of the morning by a loud, hooting noise from his nephew Harry\'s room.\n\u3000\u3000"Third time this week!" he roared across the table. "If you can\'t control that owl, it\'ll have to go!"\n\u3000\u3000Harry tried, yet again, to explain.\n\u3000\u3000"She\'s bored," he said. "She\'s used to flying around outside. If I could just let her out at night -"\n\u3000\u3000"Do I look stupid?" snarled Uncle Vernon, a bit of fried egg dangling from his bushy mustache. "I know what\'ll happen if that owl\'s let out."\n\u3000\u3000He exchanged dark looks w

In [4]:
text = text.replace('\u3000','')

### Cleaning up the data

In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('vocab size: ',vocab_size)

	
 !"$%&'()*,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz|}~é–—‘’“”…【】下为书件作你做全制区坛子式志您文新最本来格电的社立米糯自要论载
vocab size:  136


In [6]:
chars_to_remove = "“”…【】下为书件作你做全制区坛子式志您文新最本来格电的社立米糯自要论载~é–—‘’[]^_<>=`|}%"
trans_table = str.maketrans('', '', chars_to_remove)
text = text.translate(trans_table)

In [7]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('vocab size: ',vocab_size)

	
 !"$&'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz
vocab size:  82


In [8]:
chars_to_remove = chars[2]+chars[0]
trans_table = str.maketrans('', '', chars_to_remove)
text = text.translate(trans_table)

# create the vocab

In [9]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('vocab size: ',vocab_size)


 !"$&'()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ\abcdefghijklmnopqrstuvwxyz
vocab size:  80


## Encoder and Decoder

#### here we will create out own character level encoder. OpenAI use tiktoken that does BPE encoding. Another popular tokeniser is sentence peice that does sub word encodings. After this is done, try another one with tiktoken and sentence piece encoding as well

In [10]:
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # takes in the string and outputs a list of integers for the characters of that string
decode = lambda l: ''.join([itos[i] for i in l]) # taken in a list of inigers and outputs a corresponsing string

print(encode('hello world'))
print(decode(encode('hello world')))

[61, 58, 65, 65, 68, 1, 76, 68, 71, 65, 57]
hello world


#### let's tokenise the harry poter corpus based on the above tokeniser we created

In [11]:
import torch
data = torch.tensor(encode(text),dtype = torch.long)
print('data shape: ',data.shape)
print(data[:1000])

data shape:  torch.Size([6298015])
tensor([34, 27, 44, 44, 51,  1, 42, 41, 46, 46, 31, 44,  1, 27, 40, 30,  1, 46,
        34, 31,  1, 29, 34, 27, 39, 28, 31, 44,  1, 41, 32,  1, 45, 31, 29, 44,
        31, 46, 45,  1,  0, 55, 78,  1, 36, 12,  1, 37, 12,  1, 44, 68, 76, 65,
        62, 67, 60,  0,  7, 73, 61, 62, 72,  1, 62, 72,  1, 28, 41, 41, 37,  1,
        16,  1, 62, 67,  1, 73, 61, 58,  1, 34, 54, 71, 71, 78,  1, 42, 68, 73,
        73, 58, 71,  1, 72, 58, 71, 62, 58, 72,  8,  0, 41, 71, 62, 60, 62, 67,
        54, 65,  1, 45, 56, 54, 67, 67, 58, 57, 13, 41, 29, 44, 24,  1, 32, 71,
        62, 57, 54, 78, 10,  1, 27, 69, 71, 62, 65,  1, 14, 21, 10,  1, 16, 14,
        14, 14,  1, 75, 15, 12, 14,  1,  7, 58, 57, 62, 73,  1, 76, 61, 58, 71,
        58,  1, 67, 58, 58, 57, 58, 57, 10,  1, 56, 61, 54, 67, 60, 58,  1, 75,
        58, 71, 72, 62, 68, 67,  1, 67, 74, 66, 55, 58, 71,  1, 55, 78,  1, 14,
        12, 15,  8,  0, 29, 34, 27, 42, 46, 31, 44, 41, 40, 31,  0, 46, 34, 31,
     

# Train/Test split

In [12]:
n = int(0.9 * len(data))
train_data = data[:n] #first 90%
val_data = data[n:] #last 10%

In [13]:
block_size = 8 #context length of our model will be block_size + 1

In [14]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When output is {decode(context.tolist())} the target is {decode([target.item()])}")

When output is H the target is A
When output is HA the target is R
When output is HAR the target is R
When output is HARR the target is Y
When output is HARRY the target is  
When output is HARRY  the target is P
When output is HARRY P the target is O
When output is HARRY PO the target is T


#### adding the batch dimension

In [16]:
torch.manual_seed(1337)
batch_size = 4 # homany indipendent sequences we will process in parallel
block_size = 8 # maximum context length

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb,yb = get_batch('train')
print('inputs: ', xb.shape, ' : \n',xb)
print('targets: ', yb.shape, ' : \n',yb)
print('--------')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f"when out put in {decode(context.tolist())} the target: {decode([target.item()])}")

inputs:  torch.Size([4, 8])  : 
 tensor([[67, 57,  1, 57, 58, 72, 58, 71],
        [58, 57,  1, 54, 67, 57,  1, 59],
        [62, 57,  1,  0, 78, 68, 74,  1],
        [67, 57, 68, 71, 12,  1, 51, 68]])
targets:  torch.Size([4, 8])  : 
 tensor([[57,  1, 57, 58, 72, 58, 71, 73],
        [57,  1, 54, 67, 57,  1, 59, 71],
        [57,  1,  0, 78, 68, 74,  1, 58],
        [57, 68, 71, 12,  1, 51, 68, 74]])
--------
when out put in n the target: d
when out put in nd the target:  
when out put in nd  the target: d
when out put in nd d the target: e
when out put in nd de the target: s
when out put in nd des the target: e
when out put in nd dese the target: r
when out put in nd deser the target: t
when out put in e the target: d
when out put in ed the target:  
when out put in ed  the target: a
when out put in ed a the target: n
when out put in ed an the target: d
when out put in ed and the target:  
when out put in ed and  the target: f
when out put in ed and f the target: r
when out put in i 

# Let's start feeding this into the transformer

In [17]:
print(xb) # out input to the transformer

tensor([[67, 57,  1, 57, 58, 72, 58, 71],
        [58, 57,  1, 54, 67, 57,  1, 59],
        [62, 57,  1,  0, 78, 68, 74,  1],
        [67, 57, 68, 71, 12,  1, 51, 68]])


#### we will start with a bigram model

In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,targets=None):
        logits = self.token_embedding_table(idx)
        
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets) #neg log likelihood loss
        return logits, loss

    def generate(self,idx,max_new_tokens):
        # idx is (B,T) array of indices in the current index
        for _ in range(max_new_tokens):
            logits,loss = self(idx)
            logits = logits[:,-1,:]
            probs = F.softmax(logits,dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx,idx_next), dim=1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb,yb)
print(logits.shape)
print('Loss: ',loss.item())

print(decode(m.generate(torch.zeros((1,1),dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 80])
Loss:  4.717065334320068

eoK6KGlKhxi(NQ/9-i4$K6;i8UJA5PMip7.wOS?-aD6\!:wCI82(u5PXNNQT*H8B$C1sIWA-jxm7a:D:DloRhiQc.LA2afnXZQ-R


# Let's train the model now

In [24]:
#create an optamizer object

optimizer = torch.optim.AdamW(m.parameters(),lr = 1e-3)

In [32]:
batch_size = 32
for steps in range(10000):
    xb,yb = get_batch('train')
    logits,loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.447514533996582


In [35]:
print(decode(m.generate(torch.zeros((1,1),dtype=torch.long), max_new_tokens=500)[0].tolist()))


 
"SThes outint wn'st hesm " s facumstle. .
" Cof t m ws  s s he, atmeeawanong trley the wh s peace..
G blon. ltheroned dil.
yof tr howowa g, f w t tht, aglof.
OUmput, ontit onthind br; seantheauthenalsh thigher agaly.. a 
 tshack hinlle dindsowimbo m Whey ort ggouthe icar Hentifof ashin'
"Welof withanct ct?"Lulal thamm wan't batis ckenckne as! Mch, fbown ve trad. he wad.
Weve t Mo'SKind 'Ealorr wousuthomutist,"Yowe ing cof s nestistr bered de. frysisthin tcarobed, womig.  s Th its -gs se  in p 


# let's see how attention works

In [39]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 32])

In [44]:
head_size = 16
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias= False)
value = nn.Linear(C, head_size, bias= False)

k = key(x)
q = query(x)
v = value(x)

wei = q@k.transpose(-2,-1)

tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril ==0, float('-inf'))
wei = F.softmax(wei, dim=1)
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [45]:
wei

tensor([[[0.0072, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0357, 0.0530, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0331, 0.0913, 0.0883, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4802, 0.2227, 0.3702, 0.1406, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0126, 0.2006, 0.0727, 0.2308, 0.2723, 0.0000, 0.0000, 0.0000],
         [0.2243, 0.1704, 0.1866, 0.1144, 0.2476, 0.8089, 0.0000, 0.0000],
         [0.1780, 0.2286, 0.1284, 0.1971, 0.3023, 0.1293, 0.5951, 0.0000],
         [0.0290, 0.0334, 0.1537, 0.3171, 0.1778, 0.0618, 0.4049, 1.0000]],

        [[0.0083, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4076, 0.0311, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0011, 0.3741, 0.2578, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0793, 0.1331, 0.0080, 0.0559, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0258, 0.0924, 0.0559, 0.3522, 0.2261, 0.0000, 0.0000, 0.0000],
         [0.2555, 0.233