# GPT : from scratch

## Dataset

### Loading Data

In [1]:
with open('input.txt','r',encoding='utf-8') as f:
    text = f.read()
print('length of the dataset in characters:',len(text))

length of the dataset in characters: 1115394


In [2]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


### getting all the unique characters in the data

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


### Tokenizing (Building Encoder & Decoder)

In [4]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]  #input string ---> output list of integers
decode = lambda l: ''.join([itos[i] for i in l])

In [5]:
print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


### Tokenizing the dataset and stroing into a tensor

In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


### Splitting data into train and validation 

In [7]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

## Data Loader : batch of chunks of data

### Block size or context size

In [8]:
block_size = 8
train_data[:block_size+1] #because input and lable 8 will be input and last one is output(targets) of

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context} the target: {target}')

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


### Batch Dimension : chunks of tensors

In [10]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel
block_size = 8 # maximum context length for predictions


#creating 4 batches
def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split=='train' else val_data

    ix = torch.randint(len(data)-block_size,(batch_size,))

    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x,y


In [11]:
#getting x batch xb, y batch yb
xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)
print('\n')


print('targets')
print(yb.shape)
print(yb)

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [12]:
for b in range(batch_size): #batch dimension
    print('---Batch---')
    for t in range(block_size): #time dimension or context size
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f'when input is {context} target is: {target}')
    print('____\n')

---Batch---
when input is tensor([24]) target is: 43
when input is tensor([24, 43]) target is: 58
when input is tensor([24, 43, 58]) target is: 5
when input is tensor([24, 43, 58,  5]) target is: 57
when input is tensor([24, 43, 58,  5, 57]) target is: 1
when input is tensor([24, 43, 58,  5, 57,  1]) target is: 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) target is: 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) target is: 39
____

---Batch---
when input is tensor([44]) target is: 53
when input is tensor([44, 53]) target is: 56
when input is tensor([44, 53, 56]) target is: 1
when input is tensor([44, 53, 56,  1]) target is: 58
when input is tensor([44, 53, 56,  1, 58]) target is: 46
when input is tensor([44, 53, 56,  1, 58, 46]) target is: 39
when input is tensor([44, 53, 56,  1, 58, 46, 39]) target is: 58
when input is tensor([44, 53, 56,  1, 58, 46, 39, 58]) target is: 1
____

---Batch---
when input is tensor([52]) target is: 58
when input is tensor([52, 58]) targ

In [13]:
print(xb)

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


## Biagram Language Model

Here we are creating a token embedding table of vocab_size * vocab_size i.e 65*65 \n 
idx will go to this emebdding table and pluck out the row of size 65 for example here
24 will to go the embedding table and pluck out the 24th row.

Pytorch arrange these tensors in B,T,C dimensions

where 

      Batch size (B) = 4

      Time(context) T = 8

      channels(vocab size) c = 65

here tokens are not talking to each other 

In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BiagramLanguageModel(nn.Module):
    
    def __init__(self,vocab_size):
        super().__init__()
        #each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)


    def forward(self, idx, targets=None):


        #idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) #(B,T,C)
        if targets is None:
            loss = None
        else:
        #reshaping the logits inot B,C,T which suitable for cross entropy loss
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
        return logits,loss
    
    #generate function for the model
    def generate(self, idx, max_new_tokens):
        #idc is (B,T) array of indices in the current context

        for _ in range(max_new_tokens):
            #get the predictions
            logits,loss = self(idx) #which calls forward method

            #focus only on the last time step
            logits = logits[:,-1,:]  # becomes B,C

            #apply softmax to get probabilites
            probs = F.softmax(logits,dim=-1) # B,C

            #sample from distribution
            idx_next = torch.multinomial(probs,num_samples=1) #(B,1)

            #append sampled index to the running sequence
            idx = torch.cat((idx,idx_next),dim=-1)  # (B,T+1)
        return idx
m = BiagramLanguageModel(vocab_size)
logits,loss = m(xb,yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1), dtype=torch.long)
generated_output = m.generate(idx = idx,max_new_tokens=100)[0].tolist()

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)


In [26]:
print(generated_output)
print(decode(generated_output))

[0, 31, 23, 21, 41, 24, 32, 11, 13, 41, 17, 24, 25, 53, 32, 40, 60, 38, 60, 1, 15, 12, 52, 55, 7, 29, 17, 9, 9, 10, 15, 22, 55, 49, 27, 23, 20, 7, 55, 11, 10, 50, 39, 2, 53, 47, 63, 61, 49, 20, 48, 45, 15, 46, 64, 40, 29, 12, 59, 2, 9, 40, 24, 21, 45, 61, 43, 60, 51, 63, 18, 22, 19, 33, 19, 54, 0, 61, 52, 37, 35, 51, 52, 62, 23, 35, 35, 43, 60, 7, 58, 16, 55, 36, 17, 56, 34, 23, 24, 45, 22]

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


### Training the Model

In [28]:
#create a pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
batch_size = 32

for steps in range(10000):

    #sample batch of data
    xb,yb = get_batch('train')

    #evalate the loss
    logits,loss = m(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    optimizer.step()
print(loss.item())

4.684566974639893


In [34]:
idx = torch.zeros((1,1), dtype=torch.long)
generated_output = m.generate(idx = idx,max_new_tokens=1000)[0].tolist()
print(decode(generated_output))


pgu3KpoiPFhix
SJ'
yFjjN3Q&u3WgMmywW$GJL?sPY?YtNpErgIo,XcJ&DniqVmZBwfFD3faBoi'I3Q?$xBr&G,yxUN!Wsyy cLC-IBothTsze'W?q;!:xrFti.ZQyJyQ!u!zHK$EQq-wM.T'QUiN.SPyjKtL&vbRwW,SZBCj?aUIUxDALpAUGFbLQXNVY.sBId-'wNm;UL:3Sq-KAiqRiN:lL-Qm,iCulyZUAP,oSB3SlriFWiy;GJ,NuUxxTER-!a!UYN
kOpJU'usc UIo.JAa!U CjuRw'TthF;aLq-KX&jzCE;HzE-Wla!uDKBjuxVBk!UQXlFlaXt,wWWV&G;HU?zW:.ObX?wfRkRyJBUzK$Evi
hjdlgGrT?I33StKa;AUGJAE-WWgI'VPwKoQ3TqSt:BAE3sttTER&ZTOMmR:Ckzr$zL&d&AUy IXROZPn;ERq,s$:C ,jKtxHCFT;yVajH:i3SPjuCPOJXSPNuHW:r:dcxZph'Q&ZgIQ3xEaj,pN3:CKpJgmOvUAURpgId, -bGlvmUiWW.Db:zPyIlfpepuMkgDuC!
SGXyXH;?ovnpO z:qDQqQEN.C&f WMp'bkqhX.NgA Cjue;Jr&uktXC pUt3QyCuiMyCj?n;3ct CKfBktIBL'IE$gCJdVzP!Kus fq-bW,GlgMsrVhSsuli'PgnNL;RwbvvN.?wfIPkSyjnpmZAUEJyGGNLE3pGJGoi:'Dnq,c;:bj?:CJKOv:C b?phEJ?up$epYppYNWmhhf.sC&yIiASh
AcMa
SP$PgxlnoiAWttib?n;AH,L;;hskpD.D!o'IVj;hMGZS-gkJrIF.SPi-iv' &D.
gdcLfjAcBotphWWERiyNW
oCgFA?RGMlv?OTFlDqdTWv.
ISqgSlIV;VRlylvviO'jQJJFbsEy;AfIcP p fdz&LEQJObkq-3SBTh;PgQbDC,SGTqttxKAcIKBTAcmNu!C-WxEsLtqRp3S