## Creating an LLM from Scratch

Read the story into Python as sample text.

In [1]:
# NOTE: absolute, machine-specific path; adjust it to wherever Musashi.txt lives locally.
corpus_path = "/Users/Scott/Library/Mobile Documents/com~apple~CloudDocs/CVs and LOMs/Portfolio/DS 2024/GPT architeture/Musashi.txt"

# Load the whole book into memory as a single UTF-8 string.
with open(corpus_path, mode='r', encoding='utf-8') as f:
    raw_text = f.read()

# Report the corpus size and preview its first 149 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:149])


Total number of character: 2649881













MUSASHI 


By Eiji Yoshikawa 

Translated from the Japanese by Charles S. Terry 
Foreword by Edwin O. Reischauer 



Kodansha Internatio


In [2]:
import re

In [3]:
# Split on punctuation, "--" and whitespace; the capture group keeps each delimiter as its own piece.
# Every piece is then stripped, and pieces that end up empty (pure whitespace) are discarded.
preprocessed = list(filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))))
print('The amount of tokens in the text are:' , len(preprocessed))

The amount of tokens in the text are: 596013


Converting Tokens into Token IDs

In [4]:
# Vocabulary: every distinct token of the book, sorted into a fixed order.
all_words = sorted(set(preprocessed))
# Number of distinct tokens; used further down to size the token embedding table.
vocab_size = len(all_words)
print(vocab_size)

20044


In [5]:
# Assign each token its position in the sorted vocabulary as its integer ID.
vocab = dict(zip(all_words, range(len(all_words))))

In [6]:
import itertools

# Preview the first 20 (token, ID) entries of the vocabulary
dict(itertools.islice(vocab.items(),20))

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '-eight': 7,
 '-five': 8,
 '-seven': 9,
 '-six': 10,
 '.': 11,
 '/': 12,
 '/His': 13,
 '/Run': 14,
 '/clopping': 15,
 '/more': 16,
 '0': 17,
 '0-over': 18,
 '000': 19}

Creating a simple text tokenizer that handles unknown words

In [7]:
class SimpleTokenizer:
    """Word-level tokenizer that converts text to token IDs and back.

    Text is split on punctuation, double dashes and whitespace. Tokens that
    are not in the vocabulary are encoded as the unknown token ``<|unk|>``.

    The unknown token is registered automatically when the supplied
    vocabulary does not contain it. It then receives the ID one past the
    largest existing ID, so all existing token IDs are unchanged. Any
    embedding table indexed with IDs from this tokenizer must therefore have
    at least ``len(tokenizer.str_to_int)`` rows.

    Attributes:
        str_to_int: mapping token -> ID (a private copy of ``vocab``).
        int_to_str: inverse mapping ID -> token.
    """

    UNK_TOKEN = "<|unk|>"

    def __init__(self,vocab):
        """Build the lookup tables from ``vocab`` (a token -> ID mapping)."""
        # Copy so that registering the unknown token never mutates the caller's dict.
        self.str_to_int = dict(vocab)
        if self.UNK_TOKEN not in self.str_to_int:
            # Without this entry, encode() would raise KeyError on any unseen word.
            self.str_to_int[self.UNK_TOKEN] = max(self.str_to_int.values(), default=-1) + 1
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Tokens missing from the vocabulary are mapped to the ID of ``<|unk|>``.
        """
        # The capture group keeps the delimiters (punctuation) as tokens of their own.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Trim each piece and drop the ones that are empty or whitespace-only.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        unk_id = self.str_to_int[self.UNK_TOKEN]
        return [self.str_to_int.get(token, unk_id) for token in tokens]

    def decode(self,ids):
        """Return the text for ``ids`` (a single int or an iterable of ints).

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        if isinstance(ids, int):  # a single ID is treated as a one-element list
            ids = [ids]
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join inserted in front of punctuation.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [8]:
# Short sample sentence for checking the tokenizer's encode/decode round trip
text1 = 'Hello, do you like tea?'

In [9]:
# Build the tokenizer on the book vocabulary and show the token IDs of the sample sentence
tokenizer = SimpleTokenizer(vocab)
print(tokenizer.encode(text1))

[1155, 6, 7511, 19982, 11618, 17827, 94]


In [10]:
text1_encoded = tokenizer.encode(text1)

# Decoding the IDs should give back the original sentence
print(tokenizer.decode(text1_encoded))

Hello, do you like tea?


Data Sampling with a Sliding Window

In [11]:
# Encode the entire book into one long list of token IDs
enc_text = tokenizer.encode(raw_text)

In [12]:
# Skip the first 1500 tokens of the encoded book; the next-word examples below are drawn from enc_sample
enc_sample = enc_text[1500:]

In [13]:
# Create an input-target pair for next-word prediction:
# y is x shifted one token to the right, so y[i] is the token that follows x[i].

context_size = 4  # number of tokens in each input
# NOTE(review): x and y are sliced from enc_text (the start of the book), not from enc_sample — confirm this is intended.
x = enc_text[:context_size]
y = enc_text[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [1739, 437, 776, 3495]
y:      [437, 776, 3495, 3143]


In [14]:
# Next-word prediction task: each growing prefix of the sample is the context,
# and the token immediately after that prefix is the value to predict.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[10644] ----> 17968
[10644, 17968] ----> 19578
[10644, 17968, 19578] ----> 11089
[10644, 17968, 19578, 11089] ----> 12982


In [15]:
# Same context ----> target pairs, printed as text instead of token IDs
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    # desired is a single int; decode() accepts that as well as a list of IDs
    print(tokenizer.decode(context), "---->", tokenizer.decode(desired))

in ----> the
in the ----> western
in the western ----> island
in the western island ----> of


Implementation of **Dataset and DataLoader classes**

In [16]:
import torch
from torch.utils.data import Dataset, DataLoader



The dataset and data loader handle the data preprocessing for GPT training: they **tokenize the text**, chunk it into **overlapping sequences**, and **create the inputs and next-token targets**.

In [17]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once with ``tokenizer.encode``. A window of
    ``max_length`` tokens is then moved over the IDs in steps of ``stride``;
    every window is an input, and the same window shifted one token to the
    right is its target.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        ids = tokenizer.encode(text)

        # Start offsets of all windows; stopping at len(ids) - max_length
        # guarantees that the shifted target window is still full length.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length]) for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1]) for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``idx``-th (input, target) pair."""
        return (self.input_ids[idx], self.target_ids[idx])

Create the data loader

In [18]:
def create_data_loader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle= True, drop_last= True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches for ``txt``.

    The text is tokenized with a ``SimpleTokenizer`` built on the
    notebook-level ``vocab`` and chunked by ``GPTDatasetV1`` using
    ``max_length`` and ``stride``. The remaining arguments are passed
    straight through to ``torch.utils.data.DataLoader``.
    """
    windows = GPTDatasetV1(txt, SimpleTokenizer(vocab), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [19]:
# Batch size 1, window of 4 tokens, stride 1: consecutive samples are shifted by one token.
# shuffle=False keeps the samples in book order.
dataloader = create_data_loader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle= False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[1739,  437,  776, 3495]]), tensor([[ 437,  776, 3495, 3143]])]


The `first_batch` variable contains two tensors: The first tensor stores the input token IDs and the second tensor stores the target token IDs

In [20]:
# With stride=1 the second batch is the first one shifted by a single token
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 437,  776, 3495, 3143]]), tensor([[ 776, 3495, 3143, 9209]])]


Create token Embeddings

In [21]:
# Size of each embedding vector (used for both the token and the positional embedding layers)
output_dim = 256

In [22]:
# vocab_size is passed to the token embedding layer below as its number of rows
print(vocab_size)

20044


In [23]:
# Fix the seed so the randomly initialised embedding weights are reproducible
torch.manual_seed(25)
# Token embedding table: one learnable output_dim-sized vector per token ID
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.0877, -0.6113,  0.3441,  ...,  0.7630,  0.2497, -0.5980],
        [-0.5935, -0.0849,  0.2489,  ..., -1.0397,  0.1785, -0.3177],
        [ 0.1765,  0.7711,  0.4033,  ..., -0.1258,  0.3940,  0.9446],
        ...,
        [ 0.7967,  0.4206, -0.3368,  ...,  1.2137,  2.2570,  0.0112],
        [ 0.1894, -0.5221, -1.0441,  ...,  1.0416, -0.1292,  0.3845],
        [-1.3862, -0.2585, -0.3558,  ..., -0.4617, -1.7202,  0.1962]],
       requires_grad=True)


In [24]:
# Turn the 4-token example x into a tensor of token IDs
input_ids = torch.tensor(x)
print(input_ids)

tensor([1739,  437,  776, 3495])


In [25]:
# Look up the embedding vector of each token ID in input_ids
print(embedding_layer(input_ids))

tensor([[-2.9027, -0.2900,  1.3043,  ..., -0.7227,  0.2779,  1.2826],
        [ 1.7247, -0.0518,  1.1980,  ...,  1.3852, -0.9934,  1.0998],
        [-1.7076,  1.0034, -0.3534,  ...,  0.7094, -1.5231, -0.3077],
        [-0.6164, -0.4027, -1.7662,  ..., -0.0221, -1.9148,  2.6982]],
       grad_fn=<EmbeddingBackward0>)


In [26]:
max_length = 4 
# stride == max_length, so consecutive windows do not overlap; each batch holds 8 windows
dataloader = create_data_loader_v1(raw_text, batch_size=8, max_length= max_length, stride= max_length, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
# NOTE(review): this prints the target IDs but the shape of the inputs — confirm that printing targets (not inputs) is intended
print(targets)
print(inputs.shape)

tensor([[  437,   776,  3495,  3143],
        [ 9209, 17968,  1408,  5356],
        [  478,  2468,    11,  3028],
        [  928,  5356,   772,  2051],
        [   11,  2389,  1565,  1354],
        [ 3106, 20043,  1991,  3491],
        [20043,  1704,  2256, 13042],
        [13300,    85,     6,  9209]])
torch.Size([8, 4])


In [27]:
# Embed every token ID in the batch; the result has one extra trailing axis of size output_dim
token_embeddings = embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


This shows that each token ID is now embedded as a 256-dimensional vector.

In [28]:
context_length = max_length
# Positional embedding table: one learnable output_dim-sized vector per position in the context window
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# Look up the vectors for positions 0 .. context_length-1
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


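There are two types of positional embeddings: absolute and relative. The layer above is an absolute positional embedding: it learns one vector per position index in the context window.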

In [29]:
# Add the positional vectors to the token vectors; pos_embeddings has no batch axis,
# so it is broadcast across every sequence in the batch
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [30]:
# Display the combined token + positional embeddings
input_embeddings

tensor([[[-2.3595, -0.4262,  0.3191,  ...,  1.3478, -0.6671,  0.7288],
         [ 0.5450, -1.2454,  1.4665,  ...,  1.7144, -0.7670,  1.3902],
         [-0.8712, -0.2022, -0.4211,  ...,  0.0714, -2.1658, -0.2321],
         [ 0.3051,  1.4921, -1.4002,  ..., -0.9770, -2.2932,  3.3520]],

        [[ 2.6149, -1.4596, -1.2166,  ...,  1.0100,  0.1937,  0.2448],
         [-0.9907, -1.2723, -2.3366,  ...,  1.5804,  2.6588,  0.7619],
         [ 1.3230, -0.8819,  0.9740,  ..., -1.0369, -2.2120,  0.8156],
         [ 1.0983,  2.3996,  1.3454,  ...,  1.6240, -0.4838,  0.6245]],

        [[ 0.7572, -0.9177, -1.7751,  ...,  1.7690, -0.7205, -0.0769],
         [-0.3805, -1.4515,  2.7671,  ...,  2.0395,  0.2330,  0.1416],
         [ 2.4902, -2.1226, -0.1881,  ..., -1.4347, -0.3249, -0.0801],
         [ 1.1630,  1.9819,  2.4968,  ..., -2.4278,  0.0127, -1.0046]],

        ...,

        [[ 0.6261,  0.3196,  0.8544,  ...,  3.8293,  0.4749,  0.4261],
         [-0.0888, -0.4508,  0.4112,  ...,  0.8742, -1.41

### Bigram Model