In this notebook we continue with the learnings from v1 and scale them up to:
- training and evaluation
- a dataloader adapted to serve both train and test data
- longer runs with more epochs

In [None]:
# Notebook housekeeping (IPython magics, not plain Python).
# Autosave interval of 300 — seconds, per the Jupyter %autosave magic.
%autosave 300
# (Re)load the autoreload extension and select mode 2 so imported modules are
# reloaded before each cell runs — useful while editing the local model code.
%reload_ext autoreload
%autoreload 2
# Turn off the jedi-based tab completer.
%config Completer.use_jedi = False

In [None]:
import os
# Work from the project root: the data directory below is derived from the
# current working directory.
# NOTE(review): presumably this also lets `notebooks.gpt2_models` be imported — confirm.
project_root = "/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers"
os.chdir(project_root)
print(os.getcwd())

In [None]:
import tiktoken
import torch
from notebooks.gpt2_models.dummy_model import GPT, GPTConfig
from torch.nn import functional as F
import time
from contextlib import nullcontext
import inspect
import numpy as np
import matplotlib.pyplot as plt
import math
from typing import Tuple, List

New dataset producing both train and test data

In [None]:
# The raw corpus (input.txt) and the tokenised train/val splits live under <cwd>/data/tinyshakespeare.
data_dir = os.path.join(os.getcwd(), "data/tinyshakespeare")
# tiktoken's "gpt2" encoding; used below to turn the corpus text into token ids.
encoder = tiktoken.get_encoding("gpt2")

In [None]:
def prep_train_val_token_array(data_dir, filename, encoder, split_ratio=0.9):
    """Tokenise a text file and store its train/val splits as uint16 binaries.

    The text is split by character position: the first ``split_ratio`` share
    becomes the training text, the remainder the validation text. Each part
    is encoded separately and written to ``train.bin`` / ``val.bin`` inside
    ``data_dir``.

    Args:
        data_dir: Directory holding ``filename``; also the output directory.
        filename: Name of the text file to read (decoded as UTF-8).
        encoder: Object with an ``encode(str)`` method returning token ids.
        split_ratio: Fraction of the characters assigned to the train split.

    Raises:
        ValueError: If a token id does not fit into ``uint16``, because the
            on-disk format could not represent it.
    """
    file_path = os.path.join(data_dir, filename)
    print(f"Reading file from {file_path}")
    # Explicit encoding: otherwise decoding depends on the machine's locale.
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()

    split_idx = int(split_ratio * len(text))
    train_str = text[:split_idx]
    val_str = text[split_idx:]

    print(f"Train length: {len(train_str)}")
    print(f"Val length: {len(val_str)}")

    train_tokens = encoder.encode(train_str)
    val_tokens = encoder.encode(val_str)

    print(f"Train tokens: {len(train_tokens)}")
    print(f"Val tokens: {len(val_tokens)}")

    # Validate both splits before writing anything, so a failure leaves no
    # half-written output behind.
    uint16_max = np.iinfo(np.uint16).max
    split_ids = {}
    for split_name, tokens in (("train", train_tokens), ("val", val_tokens)):
        ids = np.asarray(tokens, dtype=np.int64)
        if ids.size and (ids.min() < 0 or ids.max() > uint16_max):
            raise ValueError(
                f"{split_name} token ids must lie in [0, {uint16_max}] to be stored as uint16"
            )
        split_ids[split_name] = ids.astype(np.uint16)

    split_ids["train"].tofile(os.path.join(data_dir, "train.bin"))
    split_ids["val"].tofile(os.path.join(data_dir, "val.bin"))

In [None]:
# Tokenise input.txt and write train.bin / val.bin into data_dir (default 0.9 train share).
prep_train_val_token_array(data_dir, "input.txt", encoder)

In [None]:
def load_token_array(data_dir, filename):
    """Return the tokens stored in ``<data_dir>/<filename>.bin`` as an int64 tensor."""
    bin_path = os.path.join(data_dir, f"{filename}.bin")
    # Map the file read-only as raw uint16 values.
    raw_tokens = np.memmap(bin_path, dtype=np.uint16, mode="r")
    # Widen to int64 before handing the data to torch.
    return torch.from_numpy(raw_tokens.astype(np.int64))

In [None]:
# train_tokens = load_token_array(data_dir, "train")
# val_tokens = load_token_array(data_dir, "val")

In [None]:
class DataloaderLite:
    """Sequential next-token batch loader over a pre-tokenised ``.bin`` split.

    Batches are consecutive, non-overlapping windows of ``B * T`` tokens.
    The targets are the inputs shifted by one token, so every batch reads
    ``B * T + 1`` tokens from the array. When the remaining tokens cannot
    fill a batch, the loader wraps around to the start of the data.

    Attributes set by ``reset``: ``tokens`` (1-D token tensor),
    ``current_position`` (offset of the next batch) and ``num_iterations``
    (full batches available in one pass over the data).
    """

    def __init__(self, B: int, T: int, data_dir: str, filename: str):
        self.B = B  # batch size
        self.T = T  # seq length
        self.data_dir = data_dir
        self.filename = filename
        assert filename in ["train", "val"], "Only 'train' and 'val' files are allowed"
        self.reset()

    def reset(self) -> None:
        """(Re)load the token file, rewind to its start and report the epoch size.

        Raises:
            ValueError: If the file holds fewer than ``B * T + 1`` tokens,
                i.e. not even one batch can be formed.
        """
        self.tokens = load_token_array(self.data_dir, self.filename)
        print(f"Total tokens in the file: {len(self.tokens)}")
        self.current_position = 0
        # A batch needs B*T inputs plus one extra token for the shifted
        # targets, hence the "- 1" before dividing.
        self.num_iterations = max(len(self.tokens) - 1, 0) // (self.B * self.T)
        if self.num_iterations == 0:
            raise ValueError(
                f"'{self.filename}' has {len(self.tokens)} tokens, fewer than the "
                f"{self.B * self.T + 1} needed for one batch (B={self.B}, T={self.T})"
            )
        print(
            f"1 epoch will have {self.num_iterations} iterations given Batch size={self.B} and Context length={self.T}"
        )

    def next_batch(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the next ``(x, y)`` pair, each of shape ``(B, T)``.

        ``y`` holds the same window as ``x`` shifted one token to the right.
        """
        B, T = self.B, self.T
        if self.current_position + B * T + 1 > len(self.tokens):
            # Wrap around in memory; the tokens are already loaded, so there
            # is no need to re-read and re-convert the file on every pass.
            self.current_position = 0

        end_position = self.current_position + B * T + 1
        # Copy the window so the returned tensors never alias self.tokens.
        current_batch = self.tokens[self.current_position : end_position].clone()

        # reshaping the tensor to (B, T) shape and creating input and target tensors
        x = current_batch[:-1].view(B, T)  # input tensor of shape (B, T)
        y = current_batch[1:].view(B, T)  # target tensor of shape (B, T)

        # updating the position for the next batch
        self.current_position += B * T

        return x, y

In [None]:
max_lr = 6e-4  # peak learning rate handed to the optimizer and the LR schedule
min_lr = max_lr * 0.3  # learning-rate floor: 30% of the peak
warmup_steps = 10  # warm-up length passed to get_schedule_lr
weight_decay = 0.1  # decay applied to the parameters with 2 or more dimensions
num_epochs = 2  # multiplied by the train loader's num_iterations to get the step count
initial_iterations = 100  # NOTE(review): not referenced anywhere in the visible code
min_val_iter = 10  # upper bound on the validation batches scored per evaluation

In [None]:
# basic settings
# basic settings
# Decide once whether CUDA is usable; `device_type` is the string form that
# later cells read, `device` the torch.device built from it.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)
# Fixed seeds for reproducible runs.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# get_device_name() raises without a CUDA device, so only query it on GPU machines.
device_name = torch.cuda.get_device_name() if device_type == "cuda" else "cpu"
print(
    f" Device: {device_type} and Device count: {torch.cuda.device_count()} and Device name: {device_name}"
)

In [None]:
# Target token count per optimizer step (as per GPT3 paper). It is reached by
# accumulating gradients over several smaller micro-batches.
ideal_batch_size = 524288
permit_batch = 8  # micro-batch size; good enough 1 core gpu
permit_context = 1024  # sequence length of each sample
tokens_per_micro_batch = permit_batch * permit_context
grad_accumulation_steps = ideal_batch_size // tokens_per_micro_batch
print(f"grad_accumulation_steps: {grad_accumulation_steps}")

In [None]:
# permisssion for grad_clip
grad_clip = 0.0
# model compilation
compile = True

In [None]:
# One sequential loader per split; both use the same micro-batch size and context length.
train_dataloader = DataloaderLite(permit_batch, permit_context, data_dir, "train")
val_dataloader = DataloaderLite(permit_batch, permit_context, data_dir, "val")

In [None]:
# for i in range(37):
#     x, y = train_dataloader.next_batch()
#     print(x.shape, y.shape)
#     print(x[0, :10])
#     print(y[0, :10])

In [None]:
# for i in range(5):
#     x, y = val_dataloader.next_batch()
#     print(x.shape, y.shape)
#     print(x[0, :10])
#     print(y[0, :10])
#     print("===")

In [None]:
def get_schedule_lr(
    it, warmup_iters=500, learning_rate=0.1, lr_decay_iters=1000, min_lr=0.01
):
    """Return the learning rate for step ``it``: linear warm-up, cosine decay, then a floor.

    Args:
        it: Zero-based step index.
        warmup_iters: Number of warm-up steps; the rate ramps linearly up to
            ``learning_rate`` over them.
        learning_rate: Peak learning rate reached at the end of warm-up.
        lr_decay_iters: Step from which the rate stays at ``min_lr``.
        min_lr: Final (floor) learning rate.

    Returns:
        The learning rate to use at step ``it``.
    """
    # Linear warm-up; (it + 1) keeps the very first step from using a zero rate.
    if it < warmup_iters:
        return learning_rate * (it + 1) / warmup_iters
    # `>=` rather than `>`: at it == lr_decay_iters the cosine term is already
    # zero, and returning here avoids a division by zero when
    # lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr

    # Cosine annealing from learning_rate down to min_lr. Here
    # warmup_iters <= it < lr_decay_iters, so decay_ratio is in [0, 1).
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [None]:
def configure_optimizers(model, weight_decay, learning_rate, device_type):
    """Build an AdamW optimizer with weight decay applied to matrix-like parameters only.

    Args:
        model: Module whose trainable parameters are optimised.
        weight_decay: Decay coefficient for parameters with 2 or more dimensions.
        learning_rate: Initial learning rate for every parameter group.
        device_type: ``"cuda"`` or ``"cpu"``; the fused AdamW kernel is only
            requested for ``"cuda"``.

    Returns:
        A ``torch.optim.AdamW`` with two parameter groups: decayed parameters
        first, non-decayed parameters second.
    """
    # start with all of the candidate parameters (that require grad)
    trainable = [p for p in model.parameters() if p.requires_grad]

    # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
    # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
    decay_params = [p for p in trainable if p.dim() >= 2]
    nodecay_params = [p for p in trainable if p.dim() < 2]
    optim_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": nodecay_params, "weight_decay": 0.0},
    ]

    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)

    print(
        f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
    )
    print(
        f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
    )
    # Create AdamW optimizer and use the fused version if it is available
    fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
    use_fused = fused_available and device_type == "cuda"
    # Only pass `fused` when it is actually wanted: builds without the keyword
    # would reject it, and omitting it lets PyTorch choose its default
    # implementation instead of being forced onto the non-fused path.
    extra_args = {"fused": True} if use_fused else {}

    optimizer = torch.optim.AdamW(
        optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args
    )
    return optimizer

In [None]:
# Pick the reduced-precision dtype for mixed precision: bfloat16 when a CUDA
# device reports support for it, float16 otherwise.
bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
dtype = "bfloat16" if bf16_ok else "float16"
print(f"Using {dtype} for automatic mixed precision training")

# note: float16 data type will automatically use a GradScaler
# Translate the dtype name into the matching torch dtype object.
torch_dtype_by_name = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
}
ptdtype = torch_dtype_by_name[dtype]

In [None]:
# context manager for automatic mixed precision training
# context manager for automatic mixed precision training:
# a no-op on CPU, autocast to `ptdtype` on GPU
ctx = (
    nullcontext()
    if device_type == "cpu"
    else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
)

# GradScaler for automatic mixed precision training.
# It is always created so the training loop can call it unconditionally; a
# disabled scaler passes the loss through unscaled and steps the optimizer
# directly. Loss scaling is only switched on for float16 on CUDA — bfloat16
# does not need it.
# (The previous `ctx != nullcontext()` guard compared against a brand-new
# object and was therefore always true.)
scaler = torch.cuda.amp.GradScaler(
    enabled=(device_type == "cuda" and dtype == "float16")
)


In [None]:
# Start from a clean CUDA memory state so the peak-memory statistics cover this
# run only. These calls need a CUDA device, so skip them on CPU-only machines.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

# Build the GPT model; block_size matches the dataloaders' context length (1024).
# NOTE(review): vocab_size=50304 is presumably the GPT-2 vocabulary padded up to
# a rounder size — confirm against the tokenizer.
transformer_model = GPT(
    GPTConfig(
        vocab_size=50304,
        block_size=1024,
        n_layer=12,
        n_head=12,
        n_embd=768,
    )
)
transformer_model.to(device)

# `compile` is the notebook-level flag set earlier (it shadows the builtin; if
# that cell was not run, the builtin function is truthy and compilation still runs).
if compile:
    transformer_model = torch.compile(transformer_model)

In [None]:
# Total optimizer steps for the run: epochs times the train loader's batches-per-pass count.
total_iteration = num_epochs * train_dataloader.num_iterations
print(f"Total iterations: {total_iteration}")

In [None]:
# AdamW over the model's trainable parameters; it starts at max_lr, and the
# training loop overwrites the rate every step from the schedule.
optimizer = configure_optimizers(transformer_model, weight_decay, max_lr, device_type)

In [None]:
# Main optimisation loop: each pass of the loop body is ONE optimizer step built
# from `grad_accumulation_steps` micro-batches.
# NOTE(review): an "epoch" below is counted as train_dataloader.num_iterations
# optimizer steps, while every step consumes grad_accumulation_steps
# micro-batches — so the training data is cycled more than once per reported
# epoch. Confirm this is intended.
use_cuda = device_type == "cuda"

for step in range(
    total_iteration
):  # total_iteration = num_epochs * train_loader.num_iterations

    start_time = time.time()

    # validate the model after every epoch
    if step % train_dataloader.num_iterations == 0:
        print(f"Epoch: {step // train_dataloader.num_iterations}")
        print("Model Evaluation.....")
        transformer_model.eval()
        # Rewind so every evaluation scores the same validation batches and
        # the reported losses stay comparable across epochs.
        val_dataloader.current_position = 0
        iter_range = min(min_val_iter, val_dataloader.num_iterations)
        # Tensor accumulator so `.item()` below also works when no batch ran.
        val_loss_accum = torch.zeros((), device=device)
        with torch.no_grad():
            for _ in range(iter_range):
                x, y = val_dataloader.next_batch()
                x, y = x.to(device), y.to(device)
                with ctx:
                    logits, loss = transformer_model(x, y)
                # divide by the number of batches to accumulate the average loss
                val_loss_accum += loss.detach() / iter_range

        print(f"validation loss: {val_loss_accum.item():.4f}")

    print("Model Training.....")
    transformer_model.train()
    optimizer.zero_grad()
    loss_accum = 0.0
    for _ in range(
        grad_accumulation_steps
    ):  # grad_accumulation_steps = ideal_batch_size // (permit_batch * permit_context)
        # get the next batch
        x, y = train_dataloader.next_batch()
        x, y = x.to(device), y.to(device)
        # forward pass
        with ctx:
            logits, loss = transformer_model(x, y)

        # divide the loss by accumulation steps so the summed gradients
        # correspond to the average loss over the full batch
        loss = loss / grad_accumulation_steps
        loss_accum += loss.detach()  # accumulate the loss for reporting

        # backward pass with GradScaler
        scaler.scale(loss).backward()

    norm = 0.0
    if grad_clip != 0.0:
        # unscales the gradients of optimizer's assigned params in-place
        scaler.unscale_(optimizer)
        # clip the gradients (in-place variant; `clip_grad_norm` is deprecated)
        norm = torch.nn.utils.clip_grad_norm_(transformer_model.parameters(), grad_clip)

    # get the current learning rate
    lr = get_schedule_lr(step, warmup_steps, max_lr, int(total_iteration * 0.8), min_lr)
    # update the learning rate
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    # Unscales the gradients and calls optimizer.step()
    scaler.step(optimizer)
    # Updates the scale for next iteration
    scaler.update()
    # Wait for queued GPU work BEFORE stopping the clock; otherwise the timing
    # only measures how long it took to launch the kernels.
    if use_cuda:
        torch.cuda.synchronize()
    end_time = time.time()
    if use_cuda:
        torch.cuda.empty_cache()  # clear the cache
    time_elapsed = (end_time - start_time) * 1000
    # tokens processed per millisecond (time_elapsed is in ms)
    tokens_per_ms = (x.numel() * grad_accumulation_steps) / time_elapsed
    print(
        f"for iter: {step} | trainloss: {loss_accum.item()} | validation loss {val_loss_accum.item()} | norm: {norm:.2f} | lr {lr:.4e} | time: {time_elapsed:.2f}ms | tokens/mssec: {tokens_per_ms:.2f}"
    )