In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import tiktoken
from torch.utils.data import Dataset, DataLoader
from Layers import *


In [27]:
# Get the input data
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

In [28]:
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [29]:
print(f"Number of characters in data : {len(text_data)}")
tokenizer = tiktoken.get_encoding("gpt2")
tokenized_text_data = tokenizer.encode(text_data)
print(f"Total number of tokens in data is : {len(tokenized_text_data)}")

Number of characters in data : 20479
Total number of tokens in data is : 5145


In [30]:
GPT_CONFIG_124M = {
    "vocab_size" : 50527,  # total no of unique tokens
    "context_length" : 256,  # seq length or context length
    "emb_dim" : 768, # embedding dim, hidden dim, d_model
    "n_heads" : 12, # number of attention heads in MHA
    "n_layers" : 12, # number of transformer layers
    "drop_rate" : 0.1, # dropout rate
    "qkv_bias" : False # Query Key Value 
}

In [31]:
# Splitting the training and validation data
train_ratio = 0.9
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

In [32]:
train_loader = CreateGPTDatasetV1.create_dataloader_v1(txt = train_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"])
val_loader = CreateGPTDatasetV1.create_dataloader_v1(txt = val_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"])


In [33]:
gpt_model = GPTBlock(cfg=GPT_CONFIG_124M)

In [34]:
torch.manual_seed(123)
gpt_model.eval();

In [35]:
def compute_batch_loss(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = F.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

def compute_loader_loss(data_loader, model, device, num_batches=None):
    # compute avg loss in a batch in a dataloader
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = compute_batch_loss(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note:
# Uncommenting the following lines will allow the code to run on Apple Silicon chips, if applicable,
# which is approximately 2x faster than on an Apple CPU (as measured on an M3 MacBook Air).
# However, the resulting loss values may be slightly different.

#if torch.cuda.is_available():
#    device = torch.device("cuda")
#elif torch.backends.mps.is_available():
#    device = torch.device("mps")
#else:
#    device = torch.device("cpu")
#
# print(f"Using {device} device.")


gpt_model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = compute_loader_loss(train_loader, gpt_model, device)
    val_loss = compute_loader_loss(val_loader, gpt_model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 10.994041866726345
Validation loss: 10.96313762664795
