<a href="https://colab.research.google.com/github/rya23/llm-from-scratch/blob/master/first_ai_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Load the Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-08-06 17:31:08--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-08-06 17:31:08 (99.4 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
import torch

In [87]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [2]:
# Read the whole corpus into memory as a single UTF-8 decoded string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Only the last expression of a cell is displayed, so the preview of the
# first 100 characters has to be printed explicitly; the corpus length is
# still shown as the cell's result.
print(text[:100])
len(text)

1115394

In [4]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocabulary_size = len(chars)
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [5]:
# Lookup tables between characters and their integer ids (the character's
# position in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(s):
    """Return the string spelled by the iterable of integer ids `s`.

    Raises KeyError if an id is not in `itos`.
    """
    return ''.join(itos[i] for i in s)

In [6]:
# Round-trip check: encoding and then decoding returns the original string.
sample_ids = encode("hello there")
print(sample_ids)
print(decode(sample_ids))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [8]:
# Encode the full corpus into a 1-D tensor of 64-bit integer token ids,
# then show the first 100 ids.
data = torch.tensor(encode(text), dtype=torch.int64)
data[:100]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [9]:
# Train/test split: the first 90% of the tokens are used for training and
# the remaining tokens are held out.
n = int(0.9 * len(data))
train_data, test_data = data[:n], data[n:]

In [10]:
# Number of consecutive tokens in one example (the context length).
block_size = 8
# Number of independent sequences stacked into one batch.
batch_size = 4


def generate_batch(split):
    """Sample a random batch of input/target sequences from one split.

    `split` equal to 'train' draws from `train_data`; any other value draws
    from `test_data`. Returns `(x, y)`, each of shape
    (batch_size, block_size) and moved to `device`, where every row of `y`
    is the matching row of `x` shifted one token forward in the source.
    """
    if split == 'train':
        source = train_data
    else:
        source = test_data

    # The upper bound leaves room for block_size inputs plus the one extra
    # token needed by the shifted target slice.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])

    return inputs.to(device), targets.to(device)

In [11]:
# Draw one training batch and show the inputs followed by the targets.
x, y = generate_batch(split='train')
print(x, y, sep='\n')

tensor([[ 1, 40, 63,  1, 58, 46, 43,  1],
        [61, 47, 52, 58, 43, 56,  5, 57],
        [53, 56,  7,  7,  0,  0, 19, 24],
        [54, 53, 57, 43, 42,  1, 58, 46]])
tensor([[40, 63,  1, 58, 46, 43,  1, 57],
        [47, 52, 58, 43, 56,  5, 57,  1],
        [56,  7,  7,  0,  0, 19, 24, 27],
        [53, 57, 43, 42,  1, 58, 46, 47]])


## Logits are the raw, unnormalized predictions that are fed into the softmax function

In [51]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramModel(torch.nn.Module):
    """Bigram language model: the next token is predicted from the current token alone.

    The only parameter is an embedding table with `vocabulary_size` rows of
    `vocabulary_size` entries; the row looked up for a token id is used
    directly as the logits over the next token.
    """

    def __init__(self,vocabulary_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocabulary_size,vocabulary_size)

    def forward(self,idx,targets = None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T)
                (B = batch size, T = time / block size).
            targets: optional integer tensor of expected next tokens,
                shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C)
            (C = classes / vocabulary size) and loss is None. With targets,
            logits is flattened to (B*T, C) and loss is the cross-entropy
            between those logits and the flattened targets.
        """
        logits = self.token_embedding_table(idx) # (B, T, C)
        if targets is None:
            loss = None
        else:
            # cross_entropy expects the class dimension second, so the batch
            # and time dimensions are merged into one.
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)

        return logits,loss


    def generate(self,idx,max_tokens):
        """Extend `idx` by sampling `max_tokens` additional tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_tokens): the prompt followed
            by the sampled tokens.
        """
        # Sampling never needs gradients; disabling autograd avoids building
        # a graph for every generated token.
        with torch.no_grad():
            for _ in range(max_tokens):
                logits,_ = self(idx) # (B, T, C); no targets, so no loss
                logits = logits[:,-1,:] # keep only the last time step: (B, C)
                probability = F.softmax(logits,dim=-1)

                idx_next = torch.multinomial(probability,num_samples=1) # (B, 1)

                idx = torch.cat((idx,idx_next),dim=1) # (B, T+1)

        return idx




In [52]:
# Build the model on the selected device and compute the loss on the
# sample batch drawn earlier.
m = BigramModel(vocabulary_size).to(device)
logits, loss = m(x, y)
print(loss)

tensor(4.6027, grad_fn=<NllLossBackward0>)


In [74]:
# Sample 20 tokens starting from token id 0. The prompt is created on
# `device` so it lives on the same device as the model; a CPU prompt fails
# in the embedding lookup when the model has been moved to CUDA.
print(decode(m.generate(idx = torch.zeros((1,1),dtype = torch.long, device = device) , max_tokens = 20)[0].tolist()))


vCSev ?T- &v$YKZUNmY


In [78]:
# AdamW over all of the model's parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [93]:
# Sequences per batch; rebinds the value read by generate_batch.
batch_size = 64
# Total number of optimisation steps in the training loop.
max_iters = 3000
# Number of batches averaged per split when estimating the loss.
eval_iters = 200
# The loss is estimated and reported every this many steps.
eval_interval = 300

In [94]:
# Training loop. The loop variable is named `step` so that the builtin
# `iter` is not shadowed in the notebook namespace, and the evaluation
# result gets its own name instead of sharing `loss` with the batch loss.
for step in range(max_iters):

    # Periodically report the averaged training and validation loss.
    if step % eval_interval == 0:
        losses = calculate_loss()
        print(f"Iteration : {step} Training Loss : {losses['train']} Validation Loss : {losses['val']}")

    xb, yb = generate_batch('train')

    # One optimisation step: forward pass, clear old gradients, backward
    # pass, parameter update.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=False)
    loss.backward()
    optimizer.step()


Iteration : 0 Training Loss : 2.4675893783569336 Validation Loss : 2.4890668392181396
Iteration : 300 Training Loss : 2.4678900241851807 Validation Loss : 2.4856951236724854
Iteration : 600 Training Loss : 2.4672842025756836 Validation Loss : 2.4812521934509277
Iteration : 900 Training Loss : 2.4628500938415527 Validation Loss : 2.4841713905334473
Iteration : 1200 Training Loss : 2.4638962745666504 Validation Loss : 2.4822094440460205
Iteration : 1500 Training Loss : 2.4603497982025146 Validation Loss : 2.4743521213531494
Iteration : 1800 Training Loss : 2.4552009105682373 Validation Loss : 2.481659173965454
Iteration : 2100 Training Loss : 2.4576921463012695 Validation Loss : 2.484069347381592
Iteration : 2400 Training Loss : 2.464855909347534 Validation Loss : 2.483203649520874
Iteration : 2700 Training Loss : 2.4564592838287354 Validation Loss : 2.485264539718628


In [89]:
# Prompt for generation: a single sequence holding only token id 0,
# created on the same device as the model.
context = torch.zeros(1, 1, dtype=torch.long, device=device)

In [90]:
# Sample 200 tokens after the prompt and print the first (only) sequence
# decoded back into text.
generated = m.generate(idx=context, max_tokens=200)
print(decode(generated[0].tolist()))


HllirsaE:
Handith Filfome hon w s f s CLA n ctehtilowat g:
Julse; LA:
s.


Ththatowend ay d, Tuat 'stof ply,

Resowe itc, I tsith dushly hisire me t wnctis ju wory wienthe n, wis my'stemisthan, cus ac


In [92]:
@torch.no_grad()
def calculate_loss():
    """Estimate the mean loss of model `m` on the 'train' and 'val' splits.

    For each split, `eval_iters` random batches are drawn with
    `generate_batch` and their losses are averaged. `generate_batch` maps
    every split name other than 'train' to `test_data`, so 'val' is
    measured on the held-out data. Gradient tracking is disabled by the
    decorator.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    out = {}

    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally forcing training mode.
    was_training = m.training
    m.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)

            for k in range(eval_iters):
                X, Y = generate_batch(split)
                _, loss = m(X, Y)
                losses[k] = loss.item()

            out[split] = losses.mean()
    finally:
        # Runs even if a batch raises, so the model is never left in eval mode.
        m.train(was_training)

    return out


tensor([2])
