In [None]:
!pip install tiktoken torch

In [1]:
import kagglehub

# Download the dataset (no version is pinned, so the latest is fetched);
# `path` is the local directory that kagglehub reports for the dataset files
path = kagglehub.dataset_download("rakibulhasanshaon69/the-verdict-txt")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rakibulhasanshaon69/the-verdict-txt?dataset_version_number=1...


100%|██████████| 8.86k/8.86k [00:00<00:00, 23.3MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/rakibulhasanshaon69/the-verdict-txt/versions/1





### Creating input-target pairs

In [2]:
import os

# Build the file location from the directory kagglehub returned (`path`) instead of
# hard-coding one machine's cache layout.
verdict_file = os.path.join(path, "the-verdict.txt")
with open(verdict_file,"r",encoding="utf-8") as f:
  raw_text = f.read()  # the with-block closes the file on exit, so no explicit close() is needed

In [3]:
import tiktoken
# Load the "gpt2" encoding from tiktoken
tokenizer=tiktoken.get_encoding("gpt2")

# Encode the whole text into a list of integer token IDs
encodings = tokenizer.encode(raw_text)


In [4]:
# Skip the first 50 token IDs; the windowing examples work on the remainder
sample_enc = encodings[50:]

In [5]:
context_len = 4

# x: the first context_len tokens (model input)
# y: the same window shifted one token ahead (prediction targets)
x, y = sample_enc[0:context_len], sample_enc[1:1 + context_len]

In [6]:
# Display the input window next to its shifted-by-one target window
x,y

([290, 4920, 2241, 287], [4920, 2241, 287, 257])

In [7]:
# One window yields context_len training examples: each growing prefix of x
# is paired with the token that follows it, taken from y.
for i in range(context_len):
  context = x[:i+1]  # named `context`, not `input`, so the built-in input() is not shadowed
  target = y[i]
  print(str(context)+"---->"+str(target))

[290]---->4920
[290, 4920]---->2241
[290, 4920, 2241]---->287
[290, 4920, 2241, 287]---->257


### The left side of the arrow is the token sequence fed to the LLM; the right side of the arrow is the token ID the LLM needs to predict

In [8]:
# Same prefix/next-token pairs as the ID view, decoded back to text.
for i in range(context_len):
  context = x[:i+1]  # named `context`, not `input`, so the built-in input() is not shadowed
  target = y[i:i+1]  # kept as a one-element list so decode receives a list, as it does for the context
  print(tokenizer.decode(context)+"---->"+tokenizer.decode(target))

 and----> established
 and established----> himself
 and established himself----> in
 and established himself in----> a


In [9]:
from torch.utils.data import Dataset, DataLoader
import torch
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once; windows of ``context_len`` tokens are then
  taken every ``stride`` tokens. Each target window is its input window
  shifted one token to the right.
  """

  def __init__(self,text,tokenizer,context_len,stride):
    """Tokenize ``text`` and pre-build every input/target window.

    Args:
      text: String to tokenize.
      tokenizer: Object whose ``encode(text, allowed_special=...)`` returns
        a sliceable sequence of token IDs.
      context_len: Number of tokens in each window.
      stride: Step between the starts of consecutive windows.
    """
    token_ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})

    # A window may start only where its shifted-by-one target still fits.
    starts = range(0,len(token_ids)-context_len,stride)
    self.input_ids = [torch.tensor(token_ids[s:s+context_len]) for s in starts]
    self.target_ids = [torch.tensor(token_ids[s+1:s+context_len+1]) for s in starts]

  def __len__(self):
    """Return the number of windows."""
    return len(self.input_ids)

  def __getitem__(self,idx):
    """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
    return self.input_ids[idx],self.target_ids[idx]

In [18]:
max_len = 4

# stride equals the window length, so consecutive windows do not overlap
dataset = GPTDatasetV1(text=raw_text, tokenizer=tokenizer, context_len=max_len, stride=max_len)

# The DataLoader groups samples into batches of 8 in dataset order;
# a trailing partial batch is dropped.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    drop_last=True,
)

In [25]:
# Pull the first batch; each dataset item is an (input, target) pair,
# so the batch unpacks into an input tensor and a target tensor
data_iter = iter(dataloader)
input_tokens,output_tokens = next(data_iter)

In [30]:
# Size of the token-ID space (50257 for the gpt2 encoding) and width of each token vector
vocab_size, embed_dim = 50257, 256

# Lookup table with one trainable embed_dim-wide row per token ID
token_embed_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

In [31]:
# Look up an embed_dim-wide vector for every token ID in the batch
embeddings = token_embed_layer(input_tokens)
embeddings.shape  # (batch size, tokens per sample, embed_dim)

torch.Size([8, 4, 256])

In [35]:
# One learned vector per position in a sample. The table is sized from max_len,
# the window length the dataset/dataloader actually emit, rather than context_len
# from the earlier sliding-window demo (the two only happen to be equal).
position_embed_layer = torch.nn.Embedding(max_len,embed_dim)
position_embed_layer(torch.arange(max_len)).shape

torch.Size([4, 256])