In [1]:
import torch
import tiktoken

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
# Read the whole source text into memory. The encoding is given explicitly so
# the decoded text does not depend on the platform's default locale encoding.
with open("thelostrace.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Preview the first 50 characters as a quick sanity check of the load.
raw_text[:50]

'Cororuc glanced about him and hastened his pace. H'

In [3]:
# Load tiktoken's "gpt2" encoding; it is reused by the cells below to
# encode and decode the text.
tokenizer = tiktoken.get_encoding("gpt2")


In [4]:
# Encode the full raw text into a sequence of integer token ids.
enc_text = tokenizer.encode(raw_text)

In [5]:
# Show the first 20 token ids to inspect what the encoder produced.
print(enc_text[:20])

[10606, 273, 1229, 27846, 546, 683, 290, 19338, 2945, 465, 8761, 13, 679, 373, 645, 26769, 11, 475, 339, 750]


In [6]:
# Decode only the first two token ids back to text; a single word can span
# several tokens, so this prints a word fragment rather than a whole word.
print(tokenizer.decode(enc_text[:2]))

Coror


In [7]:
# Total number of token ids produced for the whole text.
len(enc_text)

6923

In [9]:
# Illustrate next-token prediction: for a growing prefix of the token stream,
# show the decoded prefix (the input) and the single token that follows it
# (the target).
for prefix_len in range(1, 10):
    context_text = tokenizer.decode(enc_text[:prefix_len])
    next_token_text = tokenizer.decode([enc_text[prefix_len]])
    print("Input:", context_text, "Target:", next_token_text)

Input: Cor Target: or
Input: Coror Target: uc
Input: Cororuc Target:  glanced
Input: Cororuc glanced Target:  about
Input: Cororuc glanced about Target:  him
Input: Cororuc glanced about him Target:  and
Input: Cororuc glanced about him and Target:  hast
Input: Cororuc glanced about him and hast Target: ened
Input: Cororuc glanced about him and hastened Target:  his


In [12]:
from torch.utils.data import Dataset, DataLoader

In [11]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    token ids whose start positions advance by ``stride``. The target of
    each window is the same window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)``
            method returning a sequence of integer token ids.
        max_length: Number of token ids in every input and target window.
        stride: Step between the start positions of consecutive windows.

    Raises:
        AssertionError: If the text yields ``max_length`` tokens or fewer,
            since at least ``max_length + 1`` are needed for one pair.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the whole text once up front.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "number of tokenized inputs must at least be equal to max_length+1"

        # Stop at len - max_length so the shifted target window never runs
        # past the end of the token list.
        for i in range(0, len(token_ids) - max_length, stride):
            # Slice a full window (the original code indexed one element).
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [13]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1`` using ``max_length`` and ``stride``; the remaining
    arguments are passed through to ``DataLoader`` unchanged.

    Args:
        txt: Text to tokenize and chunk.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length handed to ``GPTDatasetV1``.
        stride: Window step handed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader`` instance.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Collect the loader settings in one place before constructing it.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)