# GPT From scratch

Look up and read the paper "Attention Is All You Need".
https://arxiv.org/pdf/1706.03762

## Acknowledgements:
The contents have been organized from these references:

1. evshahs.medium.com/build-gpt-with-me-implementing-gpt-from-scratch-step-by-step-b2efe4e2f7e0

In [3]:
%reload_ext autoreload
%autoreload 2

In [3]:
%%script echo "Comment this line to run this cell"
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


Comment this line to run this cell


In [90]:
# 1. Load the text.
# 2. Build a mapping from every character to an integer id, and the reverse mapping.
with open("input.txt", encoding="utf-8") as corpus_file:  # context manager closes the file handle
    text = corpus_file.read()

chars = sorted(set(text))  # every distinct character of the whole text, in sorted order
vocab_size = len(chars)  # number of distinct characters
char_to_index = {char: index for index, char in enumerate(chars)}
index_to_char = dict(enumerate(chars))

print(f"vocab_size: {vocab_size}, Sample chars: {chars[0:10]}")

vocab_size: 65, Sample chars: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3']


In [12]:
# Given a sentence, convert to a vector 
def encode_string(s):
    """Return the list of integer ids for the characters of ``s``.

    Each character is looked up in the module-level ``char_to_index`` table;
    a character that is not in the table raises ``KeyError``.
    """
    return [char_to_index[ch] for ch in s]

def decode_list(l):
    """Return the string spelled by the integer ids in ``l``.

    Each id is looked up in the module-level ``index_to_char`` table and the
    resulting characters are concatenated; an unknown id raises ``KeyError``.
    """
    return ''.join(index_to_char[token_id] for token_id in l)

# Example: round-trip a short string through the encoder and the decoder
enc = encode_string("babu")
dec = decode_list(enc)
print(f"Encoded: {enc} ==> decoded: '{dec}'")

Encoded: [40, 39, 40, 59] ==> decoded: 'babu'


In [None]:
import torch
# Encode the whole text into a 1-D tensor of integer token ids
data = torch.tensor(encode_string(text), dtype=torch.long)
# Sanity check: the encoding must produce exactly one id per character
assert len(data) == len(text), "Hmmm whats Wrong"
# Cell output: tensor length, text length, and the first ten token ids
len(data), len(text), data[0:10]

(1115394, 1115394, tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]))

In [122]:
# Settings and data splits read as globals by get_batch() below
batch_size  = 4   # number of sequences sampled per batch
block_size  = 10  # number of tokens in each sequence (the context length)
train       = data[0: int(.9*len(data)) ]  # first 90% of the tokens are used for training
val_data    = data[len(train):]  # the remaining tokens are held out for validation

# Sample a random mini-batch from the data.
#
# How it works:
# 1. Pick batch_size random start positions in the chosen split.
# 2. Cut a window of block_size tokens at each start position and stack the
#    windows into a (batch_size x block_size) tensor.
# 3. block_size is the window length, i.e. the context length.
#
def get_batch(split="train"):
    """Return a random batch ``(x, y)`` of input windows and their targets.

    ``split`` selects the source tensor: ``'train'`` reads the global
    ``train``; any other value reads the global ``val_data``. The number of
    windows and their length come from the globals ``batch_size`` and
    ``block_size``. Each row of ``y`` is the matching row of ``x`` shifted one
    position further along the source.
    """
    source = val_data if split != 'train' else train
    # randint's upper bound is exclusive, so start + block_size + 1 never
    # runs past the end of the source tensor.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])           # the context window
        targets.append(source[start + 1:stop + 1])  # the same window, one token later
    return torch.stack(inputs), torch.stack(targets)

# Same sampling as get_batch above, but self-contained: no global variables.
def get_batch1(data, context_len=3, batch_size=1):
    """Return a random batch ``(x, y)`` of windows cut from ``data``.

    ``batch_size`` start positions are drawn at random; ``x`` stacks the
    ``context_len`` tokens starting at each position and ``y`` stacks the same
    windows shifted one token further along ``data``.
    """
    # randint's upper bound is exclusive, so the shifted target window always fits.
    starts = torch.randint(len(data) - context_len, (batch_size,))
    contexts, targets = [], []
    for start in starts:
        stop = start + context_len
        contexts.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    return torch.stack(contexts), torch.stack(targets)

# Reseed before each call so both samplers draw the same random positions;
# matching outputs show that get_batch1 reproduces get_batch.
torch.manual_seed(1)
get_batch('train'), "-----", torch.manual_seed(1), get_batch1(train, 10,4)

((tensor([[42, 43, 56,  1, 58, 53,  1, 58, 46, 63],
          [51,  1, 57, 53,  1, 40, 56, 47, 45, 46],
          [ 0, 28, 56, 53, 60, 53, 57, 58, 10,  0],
          [47, 45, 46, 58, 12,  0,  0, 13,  1, 28]]),
  tensor([[43, 56,  1, 58, 53,  1, 58, 46, 63,  1],
          [ 1, 57, 53,  1, 40, 56, 47, 45, 46, 58],
          [28, 56, 53, 60, 53, 57, 58, 10,  0, 32],
          [45, 46, 58, 12,  0,  0, 13,  1, 28, 50]])),
 '-----',
 <torch._C.Generator at 0x10e92f730>,
 (tensor([[42, 43, 56,  1, 58, 53,  1, 58, 46, 63],
          [51,  1, 57, 53,  1, 40, 56, 47, 45, 46],
          [ 0, 28, 56, 53, 60, 53, 57, 58, 10,  0],
          [47, 45, 46, 58, 12,  0,  0, 13,  1, 28]]),
  tensor([[43, 56,  1, 58, 53,  1, 58, 46, 63,  1],
          [ 1, 57, 53,  1, 40, 56, 47, 45, 46, 58],
          [28, 56, 53, 60, 53, 57, 58, 10,  0, 32],
          [45, 46, 58, 12,  0,  0, 13,  1, 28, 50]])))

In [77]:
%%script echo
# Scratch experiments kept as a learning aid; not needed by the rest of the notebook.
# NOTE(review): the %%script echo magic at the top of this cell presumably keeps
# the body from running as Python - remove that line to execute it.
print(torch.randint(256, (4,)), " <== Prints random integet array of legth (4,) " )

# What does stack do? It joins the given tensors along a new leading dimension,
# so three copies of a length-4 vector become one 3 x 4 tensor.
e = torch.tensor([0,1,2,3])
torch.stack([e,e,e])




# Bigram Model

In [59]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits come straight from a lookup table.

    Each token id selects one row of a ``(vocab_size, vocab_size)`` embedding
    table, and that row is used directly as the logits over the next token.
    """

    def __init__(self, vocab_size):
        """Create the ``(vocab_size, vocab_size)`` token embedding table."""
        super().__init__()
        # The embedding width is kept equal to vocab_size for now, so every
        # row is already a full set of next-token logits.
        # Reminder: the vocab size computed above was 65.
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids (batch size, context length).
            targets: optional (B, T) tensor of next-token ids. Defaults to
                ``None`` so the model can be called for inference as
                ``model(idx)``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With targets, ``logits`` is
            flattened to (B*T, C) and ``loss`` is the cross-entropy between
            the flattened logits and targets.
        """
        # (B, T, C): batch size, context length, vocab_size
        logits = self.token_embedding(idx)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # cross_entropy is fed one row of class scores per prediction here,
        # so the batch and time dimensions are merged.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            A (B, T + max_new_tokens) tensor: the input followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            # Predictions for the sequence so far; no targets, so no loss.
            logits, _loss = self(idx)
            # Keep only the last time step: (B, C).
            logits = logits[:, -1, :]
            # Turn the logits into a probability distribution over tokens.
            probs = F.softmax(logits, dim=-1)
            # Sample one next token per sequence: (B, 1).
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append the sample so it becomes part of the context for the next step.
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

In [125]:
# Look up a small random batch in an embedding table to see the output shape
e = nn.Embedding(65, 3)  # 65 token ids, each mapped to a 3-dimensional vector
x,y = get_batch1(train, 3, 2)  # 2 sequences of 3 tokens each
# Indexing with a (2, 3) id tensor gives a (2, 3, 3) result: one vector per token
e, x, e(x).shape, e(x), e(x)


(Embedding(65, 3),
 tensor([[43,  1, 47],
         [57,  1, 53]]),
 torch.Size([2, 3, 3]),
 tensor([[[-0.8202,  0.3057,  0.1398],
          [ 0.4634,  0.9385,  1.4253],
          [ 0.0399, -0.7815,  0.3195]],
 
         [[-0.1036, -1.4259,  0.3364],
          [ 0.4634,  0.9385,  1.4253],
          [-0.8303, -1.2991, -1.4490]]], grad_fn=<EmbeddingBackward0>),
 tensor([[[-0.8202,  0.3057,  0.1398],
          [ 0.4634,  0.9385,  1.4253],
          [ 0.0399, -0.7815,  0.3195]],
 
         [[-0.1036, -1.4259,  0.3364],
          [ 0.4634,  0.9385,  1.4253],
          [-0.8303, -1.2991, -1.4490]]], grad_fn=<EmbeddingBackward0>))

In [128]:
# view(6, 3) merges the batch and time dimensions, leaving one row per token
e(x), e(x).view(6,3)

(tensor([[[-0.8202,  0.3057,  0.1398],
          [ 0.4634,  0.9385,  1.4253],
          [ 0.0399, -0.7815,  0.3195]],
 
         [[-0.1036, -1.4259,  0.3364],
          [ 0.4634,  0.9385,  1.4253],
          [-0.8303, -1.2991, -1.4490]]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.8202,  0.3057,  0.1398],
         [ 0.4634,  0.9385,  1.4253],
         [ 0.0399, -0.7815,  0.3195],
         [-0.1036, -1.4259,  0.3364],
         [ 0.4634,  0.9385,  1.4253],
         [-0.8303, -1.2991, -1.4490]], grad_fn=<ViewBackward0>))

In [88]:
# nn.Linear(2, 3) maps the last dimension from 2 input features to 3 output features
m = nn.Linear(2,3)
i = torch.randn(3,2)  # 3 samples with 2 features each
# Print the layer, the input, and the (3, 3) result of applying the layer
print(f"{m}\n{i}\n{m(i)}")

Linear(in_features=2, out_features=3, bias=True)
tensor([[-1.1683, -0.5679],
        [-1.6316,  0.4828],
        [-0.0666, -0.7035]])
tensor([[-0.1701,  0.9120,  0.5708],
        [-0.1698,  1.4401,  1.4677],
        [ 0.3454,  0.3957, -0.0211]], grad_fn=<AddmmBackward0>)


In [133]:
# torch.tril keeps the lower triangle of a matrix and zeroes everything above the diagonal.
# NOTE(review): applied to an all-zero matrix the result is all zeros, so the
# triangular pattern is not visible here - torch.ones(4, 4) was probably intended; confirm.
torch.tril(torch.zeros(4, 4))

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])