## **Data preprocessing once again from scratch using the harry potter data + mini-shakespeare dataset**

In [1]:
import os
import torch
import tiktoken
import re
from torch.utils.data import DataLoader, Dataset

#### **Step 1: extracting raw text from datasets**

In [2]:
# reading all txt file from the data dir
dataset_path = "../data"
raw_text = ""

if os.path.exists(dataset_path):
    for filename in os.listdir(dataset_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(dataset_path, filename)
            
            try:
                current_text = ""
                with open(file_path, "r", encoding="utf-8") as data:
                    
                    current_text = data.read() + " <|endoftext|> "

                    raw_text += current_text
                    
                    
            except Exception as e:
                print(f"An error occured during file reading: {e}")
else: 
    print(f"dataset path does not exist: {dataset_path}")
    
# print out full textual dataset
print(raw_text)

Harry Potter and the Sorcerer's Stone


CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.

The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't
think they could bear it if anyone found out about the Potters. Mr

### **Step 2: Tokenization**

**redo word tokenization from scratch again**

in word tokenization 
1. split the textual data into words and characters,
2. remove all whitespaces(optional though)
3. get all unique words and characters
4. create vocabs dict with token and token_ids
5. then we can endocode and decode them

In [3]:
# pattern = re.compile(r'(<\|endoftext\|>|--|\.{3}|[.,:;"\'\[!@#$%^&*()\]-_\|=+~?/><{}]|\s)')
pattern = re.compile(r'(<\|endoftext\|>|--|\.{3}|[.,:;"\'\[!@#$%^&*()\]_\|=+~?/><{}]|\s)')

# Remove hyphens that are likely formatting artifacts (hyphen followed by newline/space)
cleaned_text = re.sub(r'-\s*\n\s*', '', raw_text)  # Remove line-break hyphens
cleaned_text = re.sub(r'-(?=\s)', ' ', cleaned_text)  # Replace "- " with just " "

# split
preprocessed = re.split(pattern, cleaned_text)
# preprocessed = re.split(pattern, raw_text)
# print(preprocessed[:99])

# remove whitepaces
preprocessed = [item.strip() for item in preprocessed if item.strip()]
# print(preprocessed[:99])

# get all unique
all_words = sorted(set(preprocessed))
all_words.append('<|unk|>')


In [4]:
print(len(raw_text))
print(len(preprocessed))
print(len(all_words))
print(all_words[-5:])

3788575
866816
27235
['zounds', '}', '~', '�', '<|unk|>']


In [5]:
# creating vocabs
vocab = {token: token_id for token_id,token in enumerate(all_words)}

# creating coding and decoding class

class WordBasedTokenization():
    def __init__(self, vocab, pattern):
        self.token_to_token_id = vocab
        self.token_id_to_token = {token_id:token for token,token_id in vocab.items()}
        self.pattern = pattern
    
        
    def encode(self, text):
        preprocessed = re.split(self.pattern, text)
        preprocessed = [
            self.token_to_token_id[item.strip()] for item in preprocessed if item.strip()
        ]
        
        return preprocessed
    
    def decode(self, token_ids):
        text = " ".join([self.token_id_to_token[token_id] for token_id in token_ids])
        
        # Add space after most punctuation
        text = re.sub(r'([.,:;!?\)\]\}])(?!\s)', r'\1 ', text)

        # Add space before opening punctuation  
        text = re.sub(r'(?<!\s)([\(\[\{])', r' \1', text)

        # Handle double patterns
        text = re.sub(r'(?<!\s)(--|\.{3})(?!\s)', r' \1 ', text)

        # Clean up
        text = re.sub(r'\s+', ' ', text)
        
        return text

In [6]:
samp_hp = """
He'd forgotten all about the people in cloaks until he passed a group of
them next to the baker's. He eyed them angrily as he passed. He didn't
know why, but they made him uneasy. This bunch were whispering
excitedly, too, and he couldn't see a single collecting tin. It was on
his way back past them, clutching a large doughnut in a bag, that he
caught a few words of what they were saying.
"""

samp_tv = """
"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?
"""

samp_ms = """
Second Citizen:
Worthy Menenius Agrippa; one that hath always loved
the people.

First Citizen:
He's one honest enough: would all the rest were so!

MENENIUS:
What work's, my countrymen, in hand? where go you
With bats and clubs? The matter? speak, I pray you.
"""

wordb_tokenizer = WordBasedTokenization(vocab=vocab, pattern=pattern)

enc_hp = wordb_tokenizer.encode(samp_hp)
enc_tv = wordb_tokenizer.encode(samp_tv)
enc_ms = wordb_tokenizer.encode(samp_ms)

In [7]:
print(f"samp harry: {enc_hp}")
print(f"samp verdict: {enc_tv}")
print(f"samp mini-shakes: {enc_ms}")


samp harry: [3090, 5, 11061, 13846, 7419, 7063, 24590, 19301, 15818, 10017, 25907, 15085, 19135, 7010, 14732, 18608, 24600, 18340, 24913, 24590, 8055, 5, 21594, 190, 3090, 13061, 24600, 7558, 7805, 15085, 19135, 190, 3090, 11590, 5, 24262, 16535, 26782, 9, 9291, 24627, 17265, 15316, 25631, 190, 6145, 9231, 26640, 26734, 12893, 9, 24970, 9, 7545, 15085, 10677, 5, 24262, 22000, 7010, 22566, 10141, 24865, 190, 3467, 26458, 18672, 15339, 26512, 8016, 19146, 24600, 9, 10073, 7010, 16651, 12046, 15818, 7010, 8043, 9, 24583, 15085, 9566, 7010, 13349, 27004, 18608, 26659, 24627, 26640, 21740, 190]
samp verdict: [1, 6111, 15184, 18608, 15339, 14416, 1, 11, 24583, 26458, 26659, 24590, 26972, 9374, 16242, 190, 3310, 9404, 15124, 4163, 190, 2816, 6174, 11, 15339, 16668, 1509, 22608, 11, 11421, 15339, 25481, 7033, 190, 1, 4408, 10728, 16242, 5, 21594, 14481, 24913, 22073, 24590, 26058, 18608, 18166, 19455, 5, 26512, 25950, 466, 9291, 3310, 11989, 5, 24262, 24653, 18608, 24583, 9, 4162, 190, 5159, 1

In [8]:
wordb_tokenizer.decode(enc_hp)


"He ' d forgotten all about the people in cloaks until he passed a group of them next to the baker ' s . He eyed them angrily as he passed . He didn ' t know why , but they made him uneasy . This bunch were whispering excitedly , too , and he couldn ' t see a single collecting tin . It was on his way back past them , clutching a large doughnut in a bag , that he caught a few words of what they were saying . "

In [9]:
wordb_tokenizer.decode(enc_tv)


'" The height of his glory " -- that was what the women called it . I can hear Mrs . Gideon Thwing -- his last Chicago sitter -- deploring his unaccountable abdication . " Of course it \' s going to send the value of my picture \' way up ; but I don \' t think of that , Mr . Rickham -- the loss to Arrt is all I think of . " The word , on Mrs . Thwing \' s lips , multiplied its _ rs _ as though they were reflected in an endless vista of mirrors . And it was not only the Mrs . Thwings who mourned . Had not the exquisite Hermia Croft , at the last Grafton Gallery show , stopped me before Gisburn \' s " Moon-dancers " to say , with tears in her eyes : " We shall not look upon its like again " ? '

In [10]:
wordb_tokenizer.decode(enc_ms)

"Second Citizen : Worthy Menenius Agrippa ; one that hath always loved the people . First Citizen : He ' s one honest enough : would all the rest were so ! MENENIUS : What work ' s , my countrymen , in hand ? where go you With bats and clubs ? The matter ? speak , I pray you . "

### **using gpt2 tokenization**

In [11]:
gpt2_tokenizer = tiktoken.get_encoding(encoding_name='gpt2')
encoded_raw_text = gpt2_tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

In [12]:
print(encoded_raw_text[:99])
print(len(encoded_raw_text))

[18308, 14179, 290, 262, 30467, 338, 8026, 628, 198, 41481, 16329, 198, 198, 10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360, 1834, 1636, 11, 286, 1271, 1440, 11, 4389, 16809, 9974, 11, 547, 6613, 284, 910, 198, 5562, 484, 547, 7138, 3487, 11, 5875, 345, 845, 881, 13, 1119, 547, 262, 938, 198, 15332, 345, 1549, 1607, 284, 307, 2950, 287, 1997, 6283, 393, 11428, 11, 198, 13893, 484, 655, 1422, 470, 1745, 351, 884, 18149, 13, 198, 198, 5246, 13, 360, 1834, 1636, 373, 262, 3437, 286, 257, 4081, 1444, 1902]
1027082


In [13]:
print(gpt2_tokenizer.decode(encoded_raw_text[-100:]))

 put half-mechanically--"Begin again?" he flashed out. "When the one thing that brings me anywhere near him is that I knew enough to leave off?"

He stood up and laid his hand on my shoulder with a laugh. "Only the irony of it is that I _am_ still painting--since Grindle's doing it for me! The Strouds stand alone, and happen once--but there's no exterminating our kind of art." <|endoftext|> 


### **Step 3: Creating Input-Ouput/Target dataset & dataloaders**

1. crate our dataset using sliding window
2. create dataloaders

In [16]:
class GPTDatasets:
    def __init__(self, raw_text, tokenizer, context_window, stride):
        self.input_token_ids = []
        self.target_token_ids = []
        
        # initialize the tokenizer
        token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})
        
        
        for i in range(0, len(token_ids) - context_window, stride):
            input_chunk = token_ids[i: i + context_window]
            target_chunk = token_ids[i+1: i + context_window + 1]
            self.input_token_ids.append(torch.tensor(input_chunk))
            self.target_token_ids.append(torch.tensor(target_chunk))
        
    def __len__(self):
        return len(self.input_token_ids)
    
    def __getitem__(self, index):
        return self.input_token_ids[index], self.target_token_ids[index]
    
    
def create_dataloader(
    raw_text, 
    context_window=256, 
    batch_size=8, 
    stride=8, 
    shuffle=False, 
    drop_last=True, 
    num_workers=0
):
    
    tokenizer = tiktoken.get_encoding(encoding_name="gpt2")
    
    dataset = GPTDatasets(
        raw_text=raw_text, 
        tokenizer=tokenizer, 
        context_window=context_window, 
        stride=stride
    )
    
    dataloader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    
    return dataloader

In [None]:
context_window = 8
batch_size = 8
stride = 8
# vocab_size = 50267


dataloader = create_dataloader(
    raw_text=raw_text, 
    context_window=context_window, 
    batch_size=batch_size, 
    stride=stride
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f"inputs: {inputs}\n")
print(f"\ntargets: {targets}\n")

inputs: tensor([[18308, 14179,   290,   262, 30467,   338,  8026,   628],
        [  198, 41481, 16329,   198,   198, 10970, 16494,    56],
        [19494,   406,  3824,  1961,   198,   198,  5246,    13],
        [  290,  9074,    13,   360,  1834,  1636,    11,   286],
        [ 1271,  1440,    11,  4389, 16809,  9974,    11,   547],
        [ 6613,   284,   910,   198,  5562,   484,   547,  7138],
        [ 3487,    11,  5875,   345,   845,   881,    13,  1119],
        [  547,   262,   938,   198, 15332,   345,  1549,  1607]])


targets: tensor([[14179,   290,   262, 30467,   338,  8026,   628,   198],
        [41481, 16329,   198,   198, 10970, 16494,    56, 19494],
        [  406,  3824,  1961,   198,   198,  5246,    13,   290],
        [ 9074,    13,   360,  1834,  1636,    11,   286,  1271],
        [ 1440,    11,  4389, 16809,  9974,    11,   547,  6613],
        [  284,   910,   198,  5562,   484,   547,  7138,  3487],
        [   11,  5875,   345,   845,   881,    13,  1119

#

### **Step 4: Creating `vector` and `positional` embeddings**

In [31]:
# basicaly the length of the token_ids list 1027082
vocab_size = len(encoded_raw_text)
torch.manual_seed(123)
embedding_dim = 256
embedding_layer = torch.nn.Embedding(vocab_size, embedding_dim=embedding_dim)
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035,  ...,  1.3337,  0.0771, -0.0522],
        [ 0.2386,  0.1411, -1.3354,  ..., -0.0315, -1.0640,  0.9417],
        [-1.3152, -0.0677, -0.1350,  ..., -0.3181, -1.3936,  0.5226],
        ...,
        [ 0.6610, -1.1625, -0.2265,  ...,  0.0489, -1.1780, -1.6377],
        [ 0.2773, -0.1283,  0.6690,  ...,  0.0802,  0.1842,  0.3144],
        [-0.3769, -1.8018, -0.6962,  ..., -0.6056, -0.1421, -0.5702]],
       requires_grad=True)

In [40]:
embedding = embedding_layer(inputs)
embedding.shape, inputs.shape

(torch.Size([8, 8, 256]), torch.Size([8, 8]))

In [42]:
print(gpt2_tokenizer.decode([18308]))
print(embedding_layer(inputs[0][0]))

Harry
tensor([-7.7795e-01,  1.1004e+00,  2.7744e-01, -4.1804e-01, -1.0855e+00,
        -9.0702e-02, -8.3024e-01, -2.4618e+00, -4.2890e-02,  6.4519e-01,
        -2.5704e-01, -3.7011e-01, -5.4641e-01, -1.9257e-01,  6.2528e-01,
         1.3205e-01, -3.5088e-01,  4.5370e-01, -9.8059e-01,  4.8306e-01,
         1.7509e+00, -1.3206e-01, -7.3226e-01, -1.2739e+00, -7.2036e-01,
        -1.9959e-01,  1.6311e-01, -6.0825e-01, -7.2592e-02,  9.5789e-01,
         5.3025e-01,  2.9218e-01, -3.5324e-02, -3.0322e-01, -1.6195e-02,
        -1.2597e+00,  7.0498e-01, -8.2218e-01, -3.7292e-01,  3.7034e-01,
         2.8615e-01,  4.1146e-01,  1.0870e-01,  5.0101e-01,  1.6618e+00,
         3.7413e-01, -1.1601e+00, -6.0538e-01, -2.2629e-01, -1.3104e-01,
        -9.2813e-01,  3.4346e-01,  3.9148e-01, -8.9364e-01,  8.4734e-01,
        -8.2519e-01,  4.2928e-01,  8.1503e-02,  1.2865e+00, -1.4395e+00,
        -1.0222e+00,  7.5806e-01, -7.6855e-02, -3.6854e-01, -2.3565e+00,
        -1.4612e+00,  1.0015e+00, -7.9102e-01

In [43]:
postional_layer = torch.nn.Embedding(context_window, embedding_dim=embedding_dim)
postional_layer.weight

Parameter containing:
tensor([[-0.5468,  0.2558,  1.9322,  ...,  0.4242,  1.0546,  1.2993],
        [ 0.3007, -0.9178,  0.4876,  ..., -1.0389,  0.4304,  0.6189],
        [-1.2320, -2.6218, -0.6628,  ...,  2.1041,  1.4943, -0.3140],
        ...,
        [ 2.1042,  0.1011, -0.1101,  ..., -2.3914,  0.4506,  0.4305],
        [ 1.4134,  1.2489, -0.4364,  ..., -0.0784, -0.0395,  0.1424],
        [-0.6167,  0.8968, -1.3640,  ..., -0.4705, -2.0906,  1.2981]],
       requires_grad=True)

In [48]:
postional_embedding = postional_layer(torch.arange(context_window))
print(postional_embedding, postional_embedding.shape)


tensor([[-0.5468,  0.2558,  1.9322,  ...,  0.4242,  1.0546,  1.2993],
        [ 0.3007, -0.9178,  0.4876,  ..., -1.0389,  0.4304,  0.6189],
        [-1.2320, -2.6218, -0.6628,  ...,  2.1041,  1.4943, -0.3140],
        ...,
        [ 2.1042,  0.1011, -0.1101,  ..., -2.3914,  0.4506,  0.4305],
        [ 1.4134,  1.2489, -0.4364,  ..., -0.0784, -0.0395,  0.1424],
        [-0.6167,  0.8968, -1.3640,  ..., -0.4705, -2.0906,  1.2981]],
       grad_fn=<EmbeddingBackward0>) torch.Size([8, 256])


In [46]:
embedding.shape, postional_embedding.shape

(torch.Size([8, 8, 256]), torch.Size([8, 256]))

### **Last step of preprocessing: Input embedding**

In [49]:
input_embedding = embedding + postional_embedding
print(input_embedding, input_embedding.shape)

tensor([[[-1.3247,  1.3562,  2.2096,  ...,  0.4587, -1.1999,  2.5443],
         [ 0.6819, -0.9396,  0.0054,  ..., -0.6587, -1.7027,  0.7519],
         [-2.5557, -1.4483, -2.1545,  ...,  3.2516,  2.0384,  0.4307],
         ...,
         [-0.3049, -0.4344, -0.8960,  ..., -2.6591,  0.2494,  1.6927],
         [ 1.1898,  0.0121, -0.3229,  ..., -1.3087, -0.0045, -0.9287],
         [-1.8005,  2.8785, -2.2184,  ..., -0.4303, -0.8171,  2.6958]],

        [[ 0.1997,  0.6100,  3.3993,  ...,  1.4518,  3.9914, -0.8249],
         [-0.8780, -1.2314,  1.1746,  ..., -0.0223,  2.0093,  0.3357],
         [-2.0511, -1.5902, -1.6187,  ...,  2.8115,  1.9903, -1.7217],
         ...,
         [ 2.0668,  0.4150, -0.9603,  ..., -2.8979,  1.7751,  1.2961],
         [ 1.2227,  2.1681,  0.8516,  ...,  0.8904, -0.1650, -0.8803],
         [-0.7616, -1.2935, -0.0985,  ...,  0.5904, -2.1971,  0.4951]],

        [[-0.8558,  2.3578,  2.5308,  ...,  0.5117,  0.0593,  1.7717],
         [ 0.5470, -2.0826,  0.9126,  ..., -2