In [3]:
import pandas as pd
import os

# Read the whole corpus into memory as a single string. The encoding is pinned
# to UTF-8 so that decoding does not depend on the platform/locale default.
# NOTE(review): the vocabulary printed further down contains characters such as
# 'Ã', '¤' and '©', which look like mojibake - presumably the file on disk is
# itself double-encoded; verify the source file.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory.
with open('/Users/tarikrashada/Projects/myMiniGPT/data/nietzsche.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
# Size of the corpus, measured in characters.
corpus_length = len(text)
print(corpus_length)

600901


In [6]:
# Build the vocabulary: every distinct character that occurs in the corpus,
# sorted so that the character -> id assignment is deterministic.
unique_chars = set(text)
chars = sorted(unique_chars)
vocab_size = len(chars)
# Show the vocabulary as one string, followed by its size on the next line.
print(''.join(chars), vocab_size, sep='\n')


 !"'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz¤¦©«Ã†
85


In [8]:
# these are the possible characters our model can emit.
# We need to develop a strategy to tokenize the text -
# convert the raw text to some sequence of integers according
# to some vocabulary

# the simplest tokenization procedure would be character-level tokenization.
# in practice LLMs have tokenizers that are optimized a lot better:
# our character tokenizer will only have 85 integers, but real
# tokenizers like tiktoken have vocabularies of tens of thousands of tokens

In [12]:
# Lookup tables between characters and integer token ids. A character's id is
# its position in `chars`; the second table is the inverse of the first.
char_to_int_map = dict(zip(chars, range(len(chars))))
int_to_char_map = dict(enumerate(chars))
def encode(s):
    """Translate a string into a list of integer token ids.

    Each character of ``s`` is looked up in ``char_to_int_map``; the ids are
    returned in the same order as the characters. A character that is not a
    key of the map raises ``KeyError``.
    """
    return [char_to_int_map[ch] for ch in s]
def decode(int_list):
    """Translate a sequence of integer token ids back into a string.

    Each id is looked up in ``int_to_char_map`` and the resulting characters
    are concatenated in order. An id that is not a key of the map raises
    ``KeyError``.
    """
    return ''.join(int_to_char_map[token] for token in int_list)

In [16]:
import torch
# Tokenize the complete text and keep the ids in a single 1-D int64 tensor.
token_ids = encode(text)
data = torch.tensor(token_ids, dtype=torch.long)

600901

In [18]:
# Split the token stream into train and validation sets by position: the first
# 90% of the tokens are used for training and everything after that point is
# held out for validation.
num = int(0.9 * len(data))
train_data, val_data = data[:num], data[num:]

In [19]:
# Next the training data is cut into context windows that are grouped in batches.
context_length = 32  # number of tokens in one context window
batch_size = 16      # number of windows per batch

# Peek at one chunk: context_length tokens plus the one token that follows them.
chunk_end = context_length + 1
train_data[:chunk_end]

tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44, 39, 39, 38, 42, 32, 37,
        30,  1, 72, 60, 53, 72,  1, 43, 70, 73, 72, 60,  1, 61, 71])

In [20]:
# the chunk shown above is the first context_length + 1 (= 33) tokens of the training data.
# When we sample a chunk like this, it actually has multiple training examples packed into it.
# We want our transformer to be able to make predictions with contexts of length up to 32

# it should still be able to make predictions with a context of less than this. E.g.

# Every prefix of the chunk is a context whose target is the token right after it.
# The slices are taken from train_data so the demo matches the chunk displayed above.
for t in range(context_length):
    x = train_data[:t+1]
    y = train_data[t+1]

    print(f"The context {x} is used to predict target: {y}")

The context tensor([39]) is used to predict target: 41
The context tensor([39, 41]) is used to predict target: 28
The context tensor([39, 41, 28]) is used to predict target: 29
The context tensor([39, 41, 28, 29]) is used to predict target: 24
The context tensor([39, 41, 28, 29, 24]) is used to predict target: 26
The context tensor([39, 41, 28, 29, 24, 26]) is used to predict target: 28
The context tensor([39, 41, 28, 29, 24, 26, 28]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0]) is used to predict target: 0
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0]) is used to predict target: 42
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42]) is used to predict target: 44
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44]) is used to predict target: 39
The context tensor([39, 41, 28, 29, 24, 26, 28,  0,  0,  0, 42, 44, 39]) is use

In [23]:
# so context length is the maximum context length for predictions while batch_size is the number of
# independent sequences processed in parallel

def generate_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    ``split == 'train'`` reads from ``train_data``; any other value reads from
    ``val_data``. Returns ``(x, y)``, two tensors of shape
    ``(batch_size, context_length)`` where ``y`` is ``x`` shifted one position
    to the right in the underlying token stream.
    """
    source = train_data if split == 'train' else val_data
    # batch_size random start positions in [0, len(source) - context_length)
    starts = torch.randint(len(source) - context_length, (batch_size,)).tolist()

    inputs, targets = [], []
    for start in starts:
        stop = start + context_length
        # the window itself ...
        inputs.append(source[start:stop])
        # ... and the same window moved forward by one token
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs), torch.stack(targets)

# y gives us the correct answer for every position in x (it is the next token that we would be predicting)

In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The only parameter is a ``(vocab_size, vocab_size)`` embedding table. Row
    ``i`` of the table holds the logits over the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # thin wrapper where we basically convert each integer in our input to
        # the integer-th row of the embedding matrix
        self.embedding_lookup = nn.Embedding(vocab_size, vocab_size)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of shape (B, T) holding the
                correct next token for every position of ``input``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (B * T, vocab_size) and ``loss`` is the
            cross-entropy over all B * T predictions.
        """
        # Because the final dimension is vocab_size we can interpret it as the
        # scores (logits) for the next char: one vector of vocab_size scores for
        # each of the B * T positions.
        logits = self.embedding_lookup(input)  # <- (B, T, vocab_size)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits against (N,) class indices, so
        # the batch and time dimensions are merged.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, input, max_new_tokens):
        """Extend ``input`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            input: integer tensor of token ids, shape (B, T).
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the original
            context followed by the sampled tokens.
        """
        # Sampling never needs gradients; disabling autograd avoids recording
        # graph state for every generation step.
        with torch.no_grad():
            for _step in range(max_new_tokens):
                # get predictions
                logits, _ = self(input)
                # focus on the last time step: it is the prediction for the token
                # that follows the whole current context
                logits = logits[:, -1, :]
                # softmax to get probabilities
                probs = F.softmax(logits, dim=-1)
                # sample from the distribution to get 1 token per sequence
                next_token = torch.multinomial(probs, num_samples=1)  # <- (B, 1)
                # concatenate the new token to the running sequence
                input = torch.cat((input, next_token), dim=1)  # <- (B, T + 1)
        return input
            

In [25]:
# Draw one random training batch: xb holds the inputs, yb the next-token targets.
xb, yb = generate_batch(split='train')

In [34]:
# Instantiate the (untrained) model and run it once on the sampled batch.
# Targets are supplied, so the logits come back flattened to
# (batch_size * context_length, vocab_size) together with the loss.
Bigram = BigramLanguageModel(vocab_size)
logits, loss = Bigram(xb, targets=yb)
print(logits.shape, loss, sep='\n')

torch.Size([512, 85])
tensor(4.9363, grad_fn=<NllLossBackward0>)


In [36]:
# Start generation from a single sequence containing only token id 0.
output = torch.zeros((1, 1), dtype=torch.long)
generated = Bigram.generate(output, max_new_tokens=100)
# Decode the first (and only) sequence of the batch back into text.
decode(generated[0].tolist())
# the output is random because the embedding weights are initially random

'\n70.I†dqr1M"A©SnJE.jvO\n0.:ePXFL]-1c ¦C¤FsIdPt:;mK†KAxi!Hyd_CqR\'ed¦T=nwRwlqm7HUB«9K\nDvEaP©!JjFy41"oqdW'

In [None]:
# AdamW over all parameters of the bigram model.
learning_rate = 1e-3
optimizer = torch.optim.AdamW(Bigram.parameters(), lr=learning_rate)

In [44]:
# train Bigram: repeatedly sample a random training batch and take one
# optimizer step on its loss. The batch size is whatever `batch_size` was set
# to earlier in the notebook.
for steps in range(10000):
    xb, yb = generate_batch('train')
    # evaluate loss
    logits, loss = Bigram(xb,yb)
    # drop the gradients of the previous step, backpropagate, update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [45]:
# Sample again from the same starting context, now with the trained weights.
generated = Bigram.generate(output, max_new_tokens=100)
decode(generated[0].tolist())

'\nt\ndaino indehereyedoncuiongealymoulycitheinooclurvess thedd ors: the cobldererere-drata ly shin\n1793'