In [1]:
%autosave 300
%reload_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [2]:
import os

os.chdir(
    "/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers"
)
print(os.getcwd())

/mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers


In [3]:
import os
import time
import math
import torch
import tiktoken
import numpy as np
from torch import nn
from torch.nn import functional as F
from torch.optim import AdamW
from contextlib import nullcontext
from typing import Tuple
import inspect
from notebooks.gpt2_models.dummy_model import GPT, GPTConfig

In [4]:
# Constants

DATA_DIR = os.path.join(os.getcwd(), "data/tinyshakespeare")
ENCODER = tiktoken.get_encoding("gpt2")

MAX_LR = 6e-4
MIN_LR = MAX_LR * 0.3
WARMUP_STEPS = 10
WEIGHT_DECAY = 0.1
NUM_EPOCHS = 2
MIN_VAL_ITER = 10

IDEAL_BATCH_SIZE = 524288  # Batch size for the model as per GPT-3 paper
PERMIT_BATCH = 8  # No of batches as permitted by GPU
PERMIT_CONTEXT = 1024  # context length / sequence length
GRAD_ACCUMULATION_STEPS = IDEAL_BATCH_SIZE // (PERMIT_BATCH * PERMIT_CONTEXT)

GRAD_CLIP = 0.0
COMPILE = True if GPTConfig().flash_attention else False

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE_TYPE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = (
    "bfloat16"
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else "float16"
)
ptdtype = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
}[DTYPE]


# Set the random seeds for reproducibility
torch.manual_seed(42)
torch.cuda.manual_seed(42)

In [5]:
def prep_train_val_token_array(data_dir, filename, encoder, split_ratio=0.9):
    """Read the file, encode it, split it into train and val, and save the token array as binary files"""
    file_path = os.path.join(data_dir, filename)
    print(f"Reading file from {file_path}")

    # Read the text data from the file
    with open(file_path, "r") as f:
        text = f.read()

    # Split the text into train and validation parts
    train_len = int(split_ratio * len(text))
    train_str, val_str = text[:train_len], text[train_len:]

    print(f"Train length: {len(train_str)}")
    print(f"Val length: {len(val_str)}")

    # Encode the train and validation text to tokens
    train_tokens, val_tokens = encoder.encode(train_str), encoder.encode(val_str)

    print(f"Train tokens: {len(train_tokens)}")
    print(f"Val tokens: {len(val_tokens)}")

    # Save the tokens as binary files
    np.array(train_tokens, dtype=np.uint16).tofile(os.path.join(data_dir, "train.bin"))
    np.array(val_tokens, dtype=np.uint16).tofile(os.path.join(data_dir, "val.bin"))

In [6]:
def load_token_array(data_dir, filename):
    """Load the token array from binary file"""
    token_data = np.memmap(
        os.path.join(data_dir, f"{filename}.bin"), dtype=np.uint16, mode="r"
    )
    return torch.from_numpy(token_data.astype(np.int64))


class DataloaderLite:
    def __init__(self, B: int, T: int, data_dir: str, filename: str):
        self.B = B  # batch size
        self.T = T  # seq length
        self.data_dir = data_dir
        self.filename = filename
        assert filename in ["train", "val"], "Only 'train' and 'val' files are allowed"
        self.reset()

    def reset(self) -> None:
        self.tokens = load_token_array(self.data_dir, self.filename)
        print(f"Total tokens in the file: {len(self.tokens)}")
        self.current_position = 0
        self.num_iterations = len(self.tokens) // (self.B * self.T)
        print(
            f"1 epoch will have {self.num_iterations} iterations given Batch size={self.B} and Context length={self.T}"
        )

    def next_batch(self) -> Tuple[torch.Tensor, torch.Tensor]:
        B, T = self.B, self.T
        # print(
        #     f"Current position: {self.current_position} and total tokens: {len(self.tokens)}"
        # )
        if self.current_position + B * T + 1 > len(self.tokens):
            # print(
            #     "Resetting the position for the next batch as current iter exhausts the data file"
            # )
            self.reset()

        end_position = self.current_position + B * T + 1
        current_batch = (
            self.tokens[self.current_position : end_position].clone().detach()
        )  # clone and detach to avoid memory leak

        # reshaping the tensor to (B, T) shape and creating input and target tensors
        x = current_batch[:-1].view(B, T)  # input tensor of shape (B, T)
        y = current_batch[1:].view(B, T)  # target tensor of shape (B, T)

        # updating the position for the next batch
        self.current_position += B * T

        return x, y

In [7]:
def get_schedule_lr(
    it, warmup_iters=500, learning_rate=0.1, lr_decay_iters=1000, min_lr=0.01
):
    """Get the learning rate schedule for training after cosine annealing"""
    if it < warmup_iters:
        return learning_rate * (it + 1) / warmup_iters
    if it > lr_decay_iters:
        return min_lr

    # Cosine annealing learning rate schedule
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [8]:
def configure_optimizers(model, weight_decay, learning_rate, device_type):
    """Create the optimizer and scheduler for training"""

    # start with all of the candidate parameters (that require grad)
    param_dict = {pn: p for pn, p in model.named_parameters()}
    param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}

    # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
    # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
    decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
    nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
    optim_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": nodecay_params, "weight_decay": 0.0},
    ]

    num_decay_params = sum(p.numel() for p in decay_params)
    num_nodecay_params = sum(p.numel() for p in nodecay_params)

    print(
        f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
    )
    print(
        f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
    )
    # Create AdamW optimizer and use the fused version if it is available
    fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
    use_fused = fused_available and device_type == "cuda"

    optimizer = AdamW(
        optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused
    )
    return optimizer

In [9]:
def validate_model(model, dataloader, device, ctx, min_val_iter):
    """Validate the model"""
    model.eval()  # Set the model to evaluation mode
    dataloader.reset()
    val_loss_accum = 0.0
    with torch.no_grad():
        iter_range = min(min_val_iter, dataloader.num_iterations)
        for _ in range(iter_range):
            x, y = dataloader.next_batch()
            x, y = x.to(device), y.to(device)
            with ctx:
                _, loss = model(x, y)
            val_loss_accum += loss.detach() / iter_range
    print(f"Validation loss: {val_loss_accum.item():.4f}")
    return val_loss_accum

In [10]:
def train_model(
    model,
    optimizer,
    train_dataloader,
    grad_accumulation_steps,
    ctx,
    scaler,
    device,
    grad_clip,
    iteration,
    total_iterations,
    warmup_steps,
    max_lr,
    min_lr,
):
    """Train the model"""
    model.train()  # Set the model to training mode
    optimizer.zero_grad()  # Zero out the gradients
    loss_accum = 0.0

    for _ in range(grad_accumulation_steps):
        x, y = train_dataloader.next_batch()
        x, y = x.to(device), y.to(device)

        with ctx:
            _, loss = model(x, y)

        loss /= grad_accumulation_steps  # Normalize loss
        loss_accum += loss.detach()
        scaler.scale(loss).backward()  # Backpropagation

    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

    lr = get_schedule_lr(
        iteration, warmup_steps, max_lr, int(total_iterations * 0.8), min_lr
    )
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr

    scaler.step(optimizer)  # Update the weights
    scaler.update()  # Update the scale for next iteration
    torch.cuda.empty_cache()

    return loss_accum, lr

In [11]:
def main_train_eval():
    # Prepare the training and validation data
    prep_train_val_token_array(DATA_DIR, "input.txt", ENCODER)

    train_dataloader = DataloaderLite(PERMIT_BATCH, PERMIT_CONTEXT, DATA_DIR, "train")
    val_dataloader = DataloaderLite(PERMIT_BATCH, PERMIT_CONTEXT, DATA_DIR, "val")

    ctx = (
        nullcontext()
        if DEVICE_TYPE == "cpu"
        else torch.amp.autocast(device_type=DEVICE_TYPE, dtype=ptdtype)
    )

    if ctx != nullcontext():
        scaler = torch.cuda.amp.GradScaler(enabled=True)

    # Clear GPU memory
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.synchronize()

    # Initialize the model
    transformer_model = GPT(
        GPTConfig(vocab_size=50304, block_size=1024, n_layer=12, n_head=12, n_embd=768)
    ).to(DEVICE)

    if COMPILE:
        transformer_model = torch.compile(transformer_model)

    # Calculate the total number of iterations
    total_iterations = NUM_EPOCHS * train_dataloader.num_iterations
    print(f"Total iterations: {total_iterations}")
    optimizer = configure_optimizers(
        transformer_model, WEIGHT_DECAY, MAX_LR, DEVICE_TYPE
    )

    for iteration in range(total_iterations):
        start_time = time.time()

        if iteration % train_dataloader.num_iterations == 0:
            print(f"Epoch: {iteration // train_dataloader.num_iterations}")
            print("Model Evaluation.....")
            val_loss_accum = validate_model(
                transformer_model, val_dataloader, DEVICE, ctx, MIN_VAL_ITER
            )

        print("Model Training.....")
        train_loss_accum, lr = train_model(
            transformer_model,
            optimizer,
            train_dataloader,
            GRAD_ACCUMULATION_STEPS,
            ctx,
            scaler,
            DEVICE,
            GRAD_CLIP,
            iteration,
            total_iterations,
            WARMUP_STEPS,
            MAX_LR,
            MIN_LR,
        )

        end_time = time.time()
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        time_elapsed = (end_time - start_time) * 1000
        token_persec = (
            PERMIT_BATCH * PERMIT_CONTEXT * GRAD_ACCUMULATION_STEPS
        ) / time_elapsed
        print(
            f"Iter: {iteration} | Train loss: {train_loss_accum.item():.4f} | Validation loss: {val_loss_accum.item():.4f} | LR: {lr:.4e} | Time: {time_elapsed:.2f}ms | Tokens/ms: {token_persec:.2f}"
        )

In [12]:
main_train_eval()

Reading file from /mnt/batch/tasks/shared/LS_root/mounts/clusters/insights-model-run2/code/Users/soutrik.chowdhury/EraV2_Transformers/data/tinyshakespeare/input.txt
Train length: 1003854
Val length: 111540
Train tokens: 301966
Val tokens: 36059
Total tokens in the file: 301966
1 epoch will have 36 iterations given Batch size=8 and Context length=1024
Total tokens in the file: 36059
1 epoch will have 4 iterations given Batch size=8 and Context length=1024
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True
Flash attention: True


  self.pid = os.fork()


Total iterations: 72
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
Epoch: 0
Model Evaluation.....
Total tokens in the file: 36059
1 epoch will have 4 iterations given Batch size=8 and Context length=1024
Validation loss: 10.9303
Model Training.....
Total tokens in the file: 301966
1 epoch will have 36 iterations given Batch size=8 and Context length=1024
Iter: 0 | Train loss: 10.9230 | Validation loss: 10.9303 | LR: 6.0000e-05 | Time: 52345.26ms | Tokens/ms: 10.02
Model Training.....
Total tokens in the file: 301966
1 epoch will have 36 iterations given Batch size=8 and Context length=1024
Total tokens in the file: 301966
1 epoch will have 36 iterations given Batch size=8 and Context length=1024
Iter: 1 | Train loss: 9.6116 | Validation loss: 10.9303 | LR: 1.2000e-04 | Time: 9079.20ms | Tokens/ms: 57.75
Model Training.....
Total tokens in the file: 301966
1 epoch will have 36 iterations given Batch size=8 a