In [7]:
import sys
from pathlib import Path
import torch.nn as nn
import torch
from torch.utils.data import DataLoader

In [8]:
# Make the project's `src` package importable: the imports in the next cell
# resolve relative to the parent of the notebook's working directory.
cur_dir = Path.cwd().parent
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if str(cur_dir) not in sys.path:
    sys.path.append(str(cur_dir))

In [9]:
from src.model.transformer_block import TransformerBlock, LayerNorm
from src.model.train_model import GPTDataset

In [10]:
class GPTModel_v2(nn.Module):
    """GPT-style next-token model.

    Pipeline: token embedding + learned positional embedding -> dropout ->
    a stack of ``TransformerBlock`` modules -> the project's ``LayerNorm`` ->
    a bias-free linear projection onto the vocabulary.

    ``cfg`` is a dict that must provide "vocab_size", "context_length",
    "emb_dim", "drop_rate" and "n_layers". The same dict is handed to every
    ``TransformerBlock``, which may read additional keys.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # One embedding row per token id, and one per position in the context window.
        self.embedding_layer = nn.Embedding(vocab_size, emb_dim)
        self.positional_layer = nn.Embedding(cfg["context_length"], emb_dim)
        self.dropout = nn.Dropout(cfg["drop_rate"])

        # n_layers blocks applied one after another.
        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.transformer_block = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        # Maps each hidden vector to one score per vocabulary entry.
        self.result_layer = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, x):
        """Return vocabulary logits for a batch of token-id sequences.

        ``x`` must be a 2-D tensor (batch, sequence length). Assuming the
        transformer blocks and the norm preserve shape, the result is
        (batch, sequence length, vocab_size) - TODO confirm against those modules.
        """
        # Unpacking into two names also enforces that x is exactly 2-D.
        _, num_tokens = x.shape

        tokens = self.embedding_layer(x)
        # Position ids 0..num_tokens-1, created on the same device as the input.
        positions = torch.arange(num_tokens, device=x.device)
        # The positional rows broadcast across the batch dimension.
        hidden = self.positional_layer(positions) + tokens

        hidden = self.dropout(hidden)
        hidden = self.transformer_block(hidden)
        return self.result_layer(self.final_norm(hidden))



In [11]:
# Hyper-parameters handed to GPTModel_v2 (and, through it, to every TransformerBlock).
# NOTE(review): despite the name, emb_dim / n_heads / n_layers look different from
# the published GPT-2 "small" configuration - confirm this is intentional.
GPT2_small_config = {
    "vocab_size": 50257,     # number of token ids the model can embed and predict
    "context_length": 1024,  # longest sequence the positional embedding covers
    "emb_dim": 256,          # width of every embedding vector (d_model)
    "n_heads": 16,           # attention heads per multi-head attention layer
    "n_layers": 24,          # number of stacked transformer blocks
    "drop_rate": 0.1,        # dropout probability used for regularization
    "qkv_bias": False,       # whether query/key/value projections carry a bias term
}

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model is only constructed here; a later cell moves it onto `device`.
model = GPTModel_v2(GPT2_small_config)


In [6]:
def create_dataloader(text, max_seq_length, stride, batch_size, tokenizer, drop_last,
                      shuffle=False, num_workers=0):
    """Wrap `text` in a GPTDataset and return a DataLoader over it.

    text, tokenizer, max_seq_length, stride: forwarded unchanged to GPTDataset.
    batch_size: number of dataset items per batch.
    drop_last: when True, a final batch smaller than batch_size is discarded.
    shuffle: reshuffle the items every epoch; defaults to False, which keeps the
             previous sequential behaviour (pass True for a training loader).
    num_workers: worker processes used for loading; 0 (the default) loads in the
                 calling process, as before.
    """
    dataset = GPTDataset(
        text,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        stride=stride,
    )
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# AdamW over every parameter of `model`: learning rate 1e-3, weight decay 4e-3.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=4e-3,
)

In [20]:
# Move the model onto the device selected earlier (CUDA when available, else CPU).
model = model.to(device)

In [13]:
def train_model(model, train_dataloader, val_dataloader, optimizer, device,
                num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """
    Train `model` for `num_epochs` passes over `train_dataloader`, evaluating periodically.

    model: GPT model trained for next-token prediction
    train_dataloader: iterable of (input_batch, target_batch) pairs used for training
    val_dataloader: validation data, only ever passed on to evaluate_model
    optimizer: optimizer that already holds the model's parameters
    device: device forwarded to the loss, evaluation and sampling helpers
    num_epochs: number of full passes over train_dataloader
    eval_freq: evaluate and print the losses every `eval_freq` optimizer steps
               (step 0, i.e. the first batch, is always evaluated)
    eval_iter: forwarded unchanged to evaluate_model
    start_context: forwarded unchanged to generate_and_print_sample after every
                   epoch (presumably the prompt the sample continues - confirm
                   against that helper)
    tokenizer: forwarded unchanged to generate_and_print_sample

    Returns three parallel lists with one entry per evaluation: training losses,
    validation losses, and the number of input tokens processed so far.
    """
    train_history = []
    val_history = []
    tokens_history = []
    tokens_processed = 0
    # Starts at -1 so the very first batch is step 0 and triggers an evaluation.
    global_step = -1

    for e in range(num_epochs):
        # Put the model in training mode at the start of every epoch.
        model.train()

        for input_batch, target_batch in train_dataloader:
            # Standard update: clear old gradients, backpropagate, apply the step.
            optimizer.zero_grad()
            batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
            batch_loss.backward()
            optimizer.step()

            tokens_processed += input_batch.numel()
            global_step += 1

            # Only every eval_freq-th step is evaluated and logged.
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(
                model, train_dataloader, val_dataloader, device, eval_iter
            )
            train_history.append(train_loss)
            val_history.append(val_loss)
            tokens_history.append(tokens_processed)
            print(f"Epochs {e+1} (Step{global_step:06d}): "
                  f"Train loss: {train_loss:.3f}, "
                  f"Val loss: {val_loss:.3f}")

        # Once per epoch, show a generated sample for a qualitative check.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_history, val_history, tokens_history

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of `model` on one batch.

    input_batch: token ids fed to the model; expected shape (batch, seq_len)
    target_batch: token ids the model should predict, same shape as input_batch
    model: callable mapping the inputs to logits; the logits are expected to be
           (batch, seq_len, vocab_size)
    device: device both tensors are moved to before the forward pass

    Returns a scalar tensor (mean loss over every position in the batch) that is
    still attached to the autograd graph, so the caller can call `.backward()`.
    """
    # Move BOTH tensors to the target device before they are used.
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)

    logits = model(input_batch)

    # cross_entropy wants (N, num_classes) logits and (N,) class indices, so the
    # batch and sequence dimensions are merged on both sides.
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1),
        target_batch.flatten(),
    )
    return loss
    