### Existing code

In [1]:
import re
# Load the whole book into memory as a single string.
with open("verdictbook.txt","r",encoding="utf-8") as f:
    booktext=f.read()

# Split on punctuation, "--" and whitespace; the capture group keeps the
# delimiters themselves in the result so punctuation becomes its own token.
result = re.split(r'([,.:;?_!"()\']|--|\s)', booktext)
# Trim every piece and discard the ones that were only whitespace / empty.
result=[piece for piece in (raw.strip() for raw in result) if piece]
# Vocabulary: unique tokens in sorted order, followed by the two special tokens.
sortedMAP=sorted(set(result)) + ["<|endoftext|>", "<|unk|>"]
# Token string -> integer id (the token's position in sortedMAP).
string_to_number=dict(zip(sortedMAP, range(len(sortedMAP))))
class Simpletokenizerv2:
    """Word-level tokenizer backed by a fixed token-string -> id vocabulary.

    Text is split on whitespace, ``--`` and the characters
    ``, . : ; ? _ ! " ( ) '``; every non-empty piece becomes one token.
    Tokens that are not in the vocabulary are encoded as ``"<|unk|>"``.
    """

    def __init__(self,vocab):
        """Keep ``vocab`` (token string -> integer id) and build the inverse map.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.string_to_number=vocab
        self.number_to_string={num:txt for txt,num in self.string_to_number.items()}

    def encode(self,text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            A list of integer ids, one per token.

        Raises:
            KeyError: if an out-of-vocabulary token is met and the vocabulary
                has no ``"<|unk|>"`` entry to fall back on.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # The "<|unk|>" lookup stays lazy so a vocabulary without that entry
        # still works as long as every token is known.
        result=[
            self.string_to_number[tok] if tok in self.string_to_number
            else self.string_to_number["<|unk|>"]
            for tok in tokens
        ]

        return result

    def decoder(self,encoded):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space that the join
        placed in front of punctuation is removed (``"Hello , world ."``
        becomes ``"Hello, world."``). This is a heuristic: the original
        spacing of the encoded text is not recoverable.

        Args:
            encoded: iterable of integer token ids.

        Returns:
            The decoded text.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text=" ".join(self.number_to_string[idx] for idx in encoded)
        # Previously this substitution replaced each match with itself (a
        # no-op); the pattern must consume the preceding whitespace and keep
        # only the punctuation mark.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text


# Build the tokenizer from the string -> id vocabulary.
tokenizer=Simpletokenizerv2(string_to_number)
# Round trip (encode, then decode) a sentence ending in an out-of-vocabulary
# word; that word comes back as <|unk|> in the printed text.
result=tokenizer.decoder(
    tokenizer.encode('I HAD always thought Jack Gisburn rather a cheap newword')
)
print(result)




I HAD always thought Jack Gisburn rather a cheap <|unk|>


### Creating input-target pairs

In [7]:
encoded=tokenizer.encode('I HAD always thought Jack Gisburn rather a cheap newword')

# Number of (context, target) pairs to print; index Context_size must exist
# in `encoded`, otherwise the loops below raise IndexError.
Context_size=9

# Pass 1 - token ids: the first i ids are the context, the id at position i
# is the target to predict.
for i in range(1,Context_size+1):
    context,decoded=encoded[:i],encoded[i]
    print(context ,"-->", decoded)


# Pass 2 - the same pairs, decoded back to text for readability.
for i in range(1,Context_size+1):
    context,decoded=encoded[:i],encoded[i]
    print(tokenizer.decoder(context) ,"-->", tokenizer.decoder([decoded]))





[53] --> 44
[53, 44] --> 149
[53, 44, 149] --> 1003
[53, 44, 149, 1003] --> 57
[53, 44, 149, 1003, 57] --> 38
[53, 44, 149, 1003, 57, 38] --> 818
[53, 44, 149, 1003, 57, 38, 818] --> 115
[53, 44, 149, 1003, 57, 38, 818, 115] --> 256
[53, 44, 149, 1003, 57, 38, 818, 115, 256] --> 1131
I --> HAD
I HAD --> always
I HAD always --> thought
I HAD always thought --> Jack
I HAD always thought Jack --> Gisburn
I HAD always thought Jack Gisburn --> rather
I HAD always thought Jack Gisburn rather --> a
I HAD always thought Jack Gisburn rather a --> cheap
I HAD always thought Jack Gisburn rather a cheap --> <|unk|>


### Implementing a dataloader

In [21]:
from torch.utils.data import Dataset, DataLoader
import tiktoken
import torch

class DatasetclassV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once; each sample is a window of ``maxlen`` token
    ids, and its target is the same window shifted one position to the right.
    """

    def __init__(self,txt,encoder,strides,maxlen):
        """Tokenize ``txt`` and cut it into overlapping windows.

        Args:
            txt: raw text to tokenize.
            encoder: object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of token ids.
            strides: step between the start positions of consecutive windows.
            maxlen: number of token ids in every input / target window.
        """
        token_ids = encoder.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len - maxlen guarantees every start position still has
        # maxlen + 1 ids available, so the shifted target is always full length.
        window_starts = range(0, len(token_ids) - maxlen, strides)

        self.inputlist = [
            torch.tensor(token_ids[start:start + maxlen])
            for start in window_starts
        ]
        self.targetlist = [
            torch.tensor(token_ids[start + 1:start + maxlen + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.targetlist)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair stored at ``index``."""
        sample_input = self.inputlist[index]
        sample_target = self.targetlist[index]
        return sample_input, sample_target
    



def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) samples from ``txt``.

    The text is tokenized with tiktoken's ``"gpt2"`` encoding and wrapped in a
    ``DatasetclassV1``.

    Args:
        txt: raw text to tokenize.
        batch_size: forwarded to ``DataLoader``.
        max_length: window length (token ids per sample) given to the dataset.
        stride: step between consecutive window starts given to the dataset.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    bpe_encoder = tiktoken.get_encoding("gpt2")
    windows = DatasetclassV1(txt, bpe_encoder, stride, max_length)
    return DataLoader(windows, **loader_options)



In [24]:
# Read the raw book text that feeds the dataloader.
with open("verdictbook.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()


# batch_size=1, max_length=4 and stride=1 keep the batch small enough to read;
# shuffle=False so the first batch is the first window of the text.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Pull a single batch out of the loader and show it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
