# Input Data Loading

In [1]:
# Read the whole source text into memory as a single UTF-8 string.
with open("../ponniyinselvan.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
# Quick sanity check: total character count and the first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20161
The story revolves around Vandiyathevan, a charming, brave and courageous young man who sets out ac


In [2]:
import re

# Split on punctuation, "--" and whitespace. The capturing group makes
# re.split keep the delimiters themselves as items in the result.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Only empty strings are dropped here: whitespace tokens such as " " and "\n"
# stay in the list (and therefore end up in the vocabulary), whereas
# SimpleTokenizer.encode strips whitespace-only pieces.
preprocessed = [item for item in preprocessed if item]
print(preprocessed[:38])

['The', ' ', 'story', ' ', 'revolves', ' ', 'around', ' ', 'Vandiyathevan', ',', ' ', 'a', ' ', 'charming', ',', ' ', 'brave', ' ', 'and', ' ', 'courageous', ' ', 'young', ' ', 'man', ' ', 'who', ' ', 'sets', ' ', 'out', ' ', 'across', ' ', 'the', ' ', 'Chola', ' ']


# Token-IDs Generation

## 1. Custom Tokenizer

### Creating the vocabulary
1. Get unique words 
2. Sort alphabetically
3. Assign ID

In [3]:
# Unique tokens in sorted order; a token's position in this list becomes its ID.
all_words = sorted(set(preprocessed))
len(all_words)

798

In [4]:
# Map every token to its index in the sorted list (token -> integer ID).
vocabulary = {word:index for index,word in enumerate(all_words)}
vocabulary

{'\n': 0,
 ' ': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '.': 6,
 'Aabathudavigal': 7,
 'Aanaimangalam': 8,
 'Actually': 9,
 'Aditha': 10,
 'Aditya': 11,
 'Adurantaka': 12,
 'After': 13,
 'Along': 14,
 'Amai': 15,
 'Amudhan': 16,
 'Aniruddha': 17,
 'Anuradhapuram': 18,
 'Arabians': 19,
 'Arulmozhi': 20,
 'Arulmozhivarman': 21,
 'At': 22,
 'Azhwarkaddiyan': 23,
 'Before': 24,
 'Boothi': 25,
 'Brahmarayar': 26,
 'Buddha': 27,
 'Buddhist': 28,
 'Chakravarthi': 29,
 'China': 30,
 'Chinna': 31,
 'Chola': 32,
 'Cholar': 33,
 'Choodamani': 34,
 'Command': 35,
 'Crown': 36,
 'Devaralan': 37,
 'Devi': 38,
 'Due': 39,
 'Durga': 40,
 'During': 41,
 'Emperor': 42,
 'Even': 43,
 'Everyone': 44,
 'Finally': 45,
 'Fort': 46,
 'From': 47,
 'Gandaradithya': 48,
 'He': 49,
 'His': 50,
 'Ilaya': 51,
 'In': 52,
 'Intending': 53,
 'Kadambur': 54,
 'Kadhamaaran': 55,
 'Kanchi': 56,
 'Kandhamaaran': 57,
 'Karikalan': 58,
 'Karuthiruman': 59,
 'Kesari': 60,
 'Kodambalur': 61,
 'Kodikarai': 62,
 'Kollidam': 63,

### Tokenizer

In [5]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on punctuation, ``--`` and whitespace; whitespace-only
    pieces are discarded. The delimiters are the same ones used when the
    vocabulary was built (including ``:`` and ``;``), so any text tokenized
    for the vocabulary can also be encoded.
    """

    # Delimiters that become tokens of their own. Must stay in sync with the
    # pattern used to build the vocabulary, otherwise "word:" would be looked
    # up as a single (unknown) token instead of "word" and ":".
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace that decode() inserted in front of punctuation.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse (ID -> token) mapping.

        Args:
            vocab: dict mapping token strings to integer IDs.
        """
        self.str_to_id = vocab
        self.id_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: the string to tokenize.

        Returns:
            A list with one ID per non-whitespace token, in text order.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # re.split yields empty strings and bare whitespace between
        # delimiters; neither is a token.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_id[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation is removed.

        Args:
            ids: iterable of integer token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an ID is not present in the vocabulary.
        """
        text = " ".join(self.id_to_str[i] for i in ids)
        # Remove the whitespace that the join placed before punctuation.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [6]:
# Build the word-level tokenizer from the vocabulary created above.
customTokenizer = SimpleTokenizer(vocabulary)

### Testing Tokenizer

In [7]:
# Every token in this sample must already exist in `vocabulary`:
# SimpleTokenizer.encode looks tokens up directly and raises KeyError otherwise.
sample_token_ids = customTokenizer.encode("""Azhwarkaddiyan work for Arulmozhivarman.
Vandiyathevan and Aditha Karikalan are chola""")
sample_token_ids

[23, 793, 375, 21, 6, 144, 183, 10, 58, 188, 254]

In [8]:
# Decode the IDs produced by the encode cell above rather than a hard-coded
# copy, so the round trip stays valid if the vocabulary (and its IDs) changes.
customTokenizer.decode(sample_token_ids)

'Azhwarkaddiyan work for Arulmozhivarman. Vandiyathevan and Aditha Karikalan are chola'

Next: https://github.com/rasbt/LLM-workshop-2024/blob/main/02_data/02.ipynb

## 2. Byte Pair Encoding (BPE)
BPE handles unknown words well: it splits a word it has not seen into smaller subword units that are already in its vocabulary.

In [9]:
import importlib
import tiktoken

In [10]:
# `import importlib` alone does not guarantee that the `importlib.metadata`
# submodule is loaded, so import it explicitly before using it.
import importlib.metadata

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [11]:
# Load tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

### Testing

In [12]:
# Sample containing names plus a literal "<|endoftext|>" marker between sentences.
text = (
    """Azhwarkaddiyan work for Arulmozhivarman. <|endoftext|> Vandiyathevan and Aditha Karikalan are cholas"""
)

# allowed_special permits the "<|endoftext|>" marker to appear in the input text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[32, 23548, 48542, 2860, 72, 4121, 670, 329, 943, 377, 5908, 23548, 452, 283, 805, 13, 220, 50256, 35464, 7745, 26221, 10438, 290, 1215, 342, 64, 9375, 1134, 25786, 389, 442, 12456]


In [13]:
# Round trip: decoding the IDs should reproduce the original sample text.
strings = tokenizer.decode(integers)

print(strings)

Azhwarkaddiyan work for Arulmozhivarman. <|endoftext|> Vandiyathevan and Aditha Karikalan are cholas


# Data Loading

In [14]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [15]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once. Each sample is a window of ``max_length``
    token IDs, and its target is the same window shifted right by one token.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        """Tokenize ``text`` and cut it into input/target windows.

        Args:
            text: the raw string to tokenize.
            tokenizer: object whose ``encode(text, allowed_special=...)``
                returns a list of token IDs.
            max_length: number of token IDs per window.
            stride: step between the starts of consecutive windows.
        """
        # "<|endoftext|>" is allowed to appear in the text as a special token.
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # A window needs max_length inputs plus one extra token for the
        # shifted target, so the last valid start is len - max_length - 1.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


In [16]:
# Helper that wraps GPTDataset in a DataLoader
def create_data_loader(
    text,
    max_length=256,
    stride=128,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches over ``text``.

    Args:
        text: raw string to tokenize with tiktoken's "gpt2" encoding.
        max_length: number of token IDs per window.
        stride: step between the starts of consecutive windows.
        batch_size: number of windows per batch.
        shuffle: passed through to DataLoader.
        drop_last: passed through to DataLoader.
        num_workers: passed through to DataLoader.

    Returns:
        A DataLoader over a GPTDataset built from ``text``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    return DataLoader(
        GPTDataset(text, gpt2_encoding, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

### Testing

In [17]:
# stride == max_length gives non-overlapping input windows; shuffle=False keeps
# them in text order so the first batch is easy to inspect.
data_loader = create_data_loader(raw_text, max_length=4, stride=4, batch_size=8, shuffle=False)
data_iterator = iter(data_loader)
# First batch as [inputs, targets]; each target row is its input row shifted by one token.
next(data_iterator)

[tensor([[  464,  1621, 38228,  1088],
         [35464,  7745, 26221, 10438],
         [   11,   257, 23332,    11],
         [14802,   290, 34010,  1862],
         [  582,   508,  5621,   503],
         [ 1973,   262,   609,  5708],
         [ 1956,   284,  5203,   257],
         [ 3275,   422,   262, 12223]]),
 tensor([[ 1621, 38228,  1088, 35464],
         [ 7745, 26221, 10438,    11],
         [  257, 23332,    11, 14802],
         [  290, 34010,  1862,   582],
         [  508,  5621,   503,  1973],
         [  262,   609,  5708,  1956],
         [  284,  5203,   257,  3275],
         [  422,   262, 12223,  9005]])]