In [1]:
from datasets import load_dataset

ds = load_dataset("yahma/alpaca-cleaned")

In [2]:
print(ds)
sample_ds = ds['train']
print(sample_ds[8])

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 51760
    })
})
{'output': 'There are two spelling errors in the sentence. The corrected sentence should be: "He finished his meal and left the restaurant."', 'input': 'He finnished his meal and left the resturant', 'instruction': 'Evaluate this sentence for spelling and grammar mistakes'}


In [3]:
sample_ds=sample_ds[:30000]

In [4]:
data=[]

for i in range(len(sample_ds['output'])):
    data.append({
        'output': sample_ds['output'][i],
        'input': sample_ds['input'][i],
        'instruction': sample_ds['instruction'][i]
    })

In [5]:
len(data)

30000

### CONVERTING INSTRUCTIONS INTO ALPACA FORMAT


In [6]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""

    return instruction_text + input_text

In [7]:

model_input = format_input(data[8])
desired_response = f"\n\n### Response:\n{data[8]['output']}"

print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Evaluate this sentence for spelling and grammar mistakes

### Input:
He finnished his meal and left the resturant

### Response:
There are two spelling errors in the sentence. The corrected sentence should be: "He finished his meal and left the restaurant."


In [8]:
train_portion = int(len(data) * 0.85)  # 85% for training
test_portion = int(len(data) * 0.1)    # 10% for testing
val_portion = len(data) - train_portion - test_portion  # Remaining 5% for validation

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [9]:
print("Training set length:", len(train_data))
print("Validation set length:", len(val_data))
print("Test set length:", len(test_data))

Training set length: 25500
Validation set length: 1500
Test set length: 3000


## STEP 2: ORGANIZING DATA INTO TRAINING BATCHES

In [10]:
import tiktoken
import torch
tokenizer = tiktoken.get_encoding("gpt2")
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [11]:
import torch
from torch.utils.data import Dataset


class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [12]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    ignore_index=-100,
    allowed_max_length=None,
    device="cpu"
):
    # Find the longest sequence in the batch
    batch_max_length = max(len(item)+1 for item in batch)

    # Pad and prepare inputs and targets
    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        # Add an <|endoftext|> token
        new_item += [pad_token_id]
        # Pad sequences to max_length
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])  # Truncate the last token for inputs
        targets = torch.tensor(padded[1:])  # Shift +1 to the right for targets

        # New: Replace all but the first padding tokens in targets by ignore_index
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index

        # New: Optionally truncate to maximum sequence length
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        inputs_lst.append(inputs)
        targets_lst.append(targets)

    # Convert list of inputs and targets to tensors and transfer to target device
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)

    return inputs_tensor, targets_tensor

## STEP 3: CREATING DATALOADERS FOR AN INSTRUCTION DATASET


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [14]:
from functools import partial
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)


In [15]:
from torch.utils.data import DataLoader


num_workers = 0 #num_workers defines how many subprocesses (worker processes) will be used to load the data in parallel
batch_size = 5
torch.manual_seed(123)

train_dataset=InstructionDataset(train_data ,tokenizer)

train_loader=DataLoader(
    train_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [16]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape , inputs.device)
    print(inputs)
    break

Train loader:
torch.Size([5, 906]) torch.Size([5, 906]) cuda:0
tensor([[21106,   318,   281,  ...,   220,   220,  2073],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256],
        [21106,   318,   281,  ..., 50256, 50256, 50256]], device='cuda:0')


## STEP 4: LOADING A PRETRAINED LLM WEIGHTS

In [17]:
from gpt_download3 import download_and_load_gpt2

BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-small (124M)" # Changed from "gpt2-large (774M)" to "gpt2-small (124M)"

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)



File already exists and is up-to-date: gpt2/124M/checkpoint




File already exists and is up-to-date: gpt2/124M/encoder.json




File already exists and is up-to-date: gpt2/124M/hparams.json




File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2/124M/model.ckpt.index




File already exists and is up-to-date: gpt2/124M/model.ckpt.meta




File already exists and is up-to-date: gpt2/124M/vocab.bpe


##STEP 5 : Define Custom GPT Architecture

In [18]:
import torch.nn as nn
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention ( self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers=nn.Sequential(
        nn.Linear(cfg['emb_dim'] ,4*cfg["emb_dim"]),
        GELU(),
        nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"])
    )

  def forward(self,x):
    return self.layers(x)

class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))


  def forward(self,x):
    mean=x.mean(dim=-1 ,keepdim=True)
    var=x.var(dim=-1 ,keepdim=True , unbiased=False)
    norm_x=(x-mean)/torch.sqrt(var+self.eps)
    return self.scale * norm_x + self.shift


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        # 2*4*768
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x
        # 2*4*768


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

## STEP 6 : ASSIGN WEIGHTS TO OUR CUSTOM MODEL

In [19]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))


import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])


In [20]:
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

# <div class="alert alert-block alert-success">

Before diving into finetuning the model in the next section, let's take a moment to assess
the pretrained LLM's performance on one of the validation tasks by comparing its output to
the expected response.

This will give us a baseline understanding of how well the model
performs on an instruction-following task right out of the box, prior to finetuning, and will
help us appreciate the impact of finetuning later on.

We use the first example from the
validation set for this assessment:
</div>

In [21]:
torch.manual_seed(123)
input_text = format_input(val_data[1])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Make up a sentence using the vocabulary words: collaboration, detestable, moral


In [22]:
model=model.to(device) # move model to device
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(device), # Move input tensor to the same device as the model
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [23]:
response_text = generated_text[len(input_text):].strip()
print(response_text)

, and so on.

### Instruction:

Write a sentence that is not a sentence.

### Instruction:

Write a sentence that is not a


In [24]:
val_data[0]

{'output': "An AI chatbot is a software program that uses artificial intelligence (AI) to conduct a conversation with users through messaging or voice-based platforms. These chatbots are designed to simulate human-like conversation and act as a virtual assistant or customer service representative, providing information or assistance to users through natural language processing (NLP) and machine learning algorithms.\n\nThe way an AI chatbot works is by understanding and interpreting the user's input, whether it be through text, voice or even images, and then responding with an appropriate output. When a user sends a message, the chatbot uses NLP to analyze and comprehend the user's language and intent. Then, it searches its database or other sources of information to provide a relevant response, which may include text, images or links.\n\nSome AI chatbots are rule-based and rely heavily on predetermined responses, whereas others use machine learning algorithms and data analytics to cont

In [25]:
next(model.parameters()).device
total_params = sum(p.numel() for p in model.parameters())
total_params

163037184

# <div class="alert alert-block alert-info">
    
As we can see from the output, the pretrained model is not yet capable of correctly
following the given instruction.

While it does create a "Response" section, it simply repeats
the original input sentence and part of the instruction, failing to convert the active sentence
to passive voice as requested.


In the upcoming section, we implement the finetuning process to improve the model's
ability to comprehend and appropriately respond to such requests.

</div>

## STEP 5: FINETUNING THE LLM ON INSTRUCTION DATA

In [26]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context

    ###Input batch:
 ###tensor([[6109, 3626, 6100,  345],
        ##[6109, 1110, 6622,  257]])

    for _ in range(max_new_tokens):

        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]

        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond) ### batch, n_tokens, vocab_size

        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss



In [27]:
def calc_loss_batch(input_batch, target_batch, model, device):
    # input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    if input_batch.device != device:
      input_batch = input_batch.to(device)
    if target_batch.device != device:
      target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss

def print_memory(tag):
    print(f"[{tag}] Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB | Reserved: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            print_memory(f"Eval Batch {i} - Before")
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            print_memory(f"Eval Batch {i} - After")

            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches


In [114]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration

            loss = calc_loss_batch(input_batch, target_batch, model, device)

            loss.backward() # Calculate loss gradients
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  #clamps (limits) the total norm of the gradients to a maximum of 1.0
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in the input_batch.
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen

In [115]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration

            loss = calc_loss_batch(input_batch, target_batch, model, device)

            loss.backward() # Calculate loss gradients
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  #clamps (limits) the total norm of the gradients to a maximum of 1.0
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel() # Returns the total number of elements (or tokens) in the input_batch.
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen

In [29]:

# from torch.cuda.amp import autocast, GradScaler

# def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
#                        eval_freq, eval_iter, start_context, tokenizer):
#     train_losses, val_losses, track_tokens_seen = [], [], []
#     tokens_seen, global_step = 0, -1

#     # 🔧 Initialize AMP gradient scaler
#     scaler = GradScaler()

#     for epoch in range(num_epochs):
#         model.train()

#         for input_batch, target_batch in train_loader:
#             input_batch = input_batch.to(device)
#             target_batch = target_batch.to(device)
#             optimizer.zero_grad()

#             # 🔁 Forward pass with autocast
#             with autocast():
#                 loss = calc_loss_batch(input_batch, target_batch, model, device)

#             # 🔁 Backward pass with gradient scaling
#             scaler.scale(loss).backward()
#             # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#             scaler.step(optimizer)
#             scaler.update()

#             tokens_seen += input_batch.numel()
#             global_step += 1

#             # Evaluation
#             if global_step % eval_freq == 0:
#                 train_loss, val_loss = evaluate_model(
#                     model, train_loader, val_loader, device, eval_iter)
#                 train_losses.append(train_loss)
#                 val_losses.append(val_loss)
#                 track_tokens_seen.append(tokens_seen)

#                 print(f"Ep {epoch+1} (Step {global_step:06d}): "
#                       f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

#         # Sample generation
#         generate_and_print_sample(model, tokenizer, device, start_context)

#     return train_losses, val_losses, track_tokens_seen


In [30]:


with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

[Eval Batch 0 - Before] Allocated: 0.72 GB | Reserved: 0.79 GB
[Eval Batch 0 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 1 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 1 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 2 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 2 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 3 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 3 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 4 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 4 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 0 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 0 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 1 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 1 - After] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 2 - Before] Allocated: 0.72 GB | Reserved: 3.30 GB
[Eval Batch 2 - After] Allocated: 0.72 GB | Reserved: 3.30 GB


In [31]:
import torch
torch.cuda.empty_cache()

In [32]:
import time
# import torch._dynamo

# # Disable torch._dynamo
# torch._dynamo.config.disable = True
torch.manual_seed(123)



start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.01)

num_epochs = 1

train_losses, val_losses = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
)
# Save model checkpoint
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': 1  # the last completed epoch
}

torch.save(checkpoint, 'gpt2_instruction_checkpoint.pt')
print("Checkpoint saved.")
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Ep 1 (Step 003905): Train loss 1.601, Val loss 1.712
[Eval Batch 0 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 2 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 2 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 3 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 3 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 4 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 4 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 0 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 2.70 GB | Reserved: 15

ValueError: not enough values to unpack (expected 3, got 2)

In [33]:
# Save model checkpoint
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': 1  # the last completed epoch
}


In [34]:
torch.save(checkpoint, 'gpt2_instruction_checkpoint.pt')


In [41]:

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batches=5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batches=5)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

[Eval Batch 0 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 2 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 2 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 3 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 3 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 4 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 4 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 0 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 2.70 GB | Reserved: 15.37 GB
[Eval Batch 2 - Before] Allocated: 2.71 GB | Reserved: 15.37 GB
[Eval Batch 2 - After] Allocated: 2.71 GB | Res

In [111]:
input_text="""
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
give me 5 cities name in india
"""

In [112]:
token_ids = generate(
    model=model,
    idx=text_to_token_ids(input_text, tokenizer).to(device), # Move input tensor to the same device as the model
    max_new_tokens=100,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [113]:
response_text = generated_text[len(input_text):].strip()
print(response_text)

### Response:
1. Mumbai
2. Chennai
3. Bangalore
4. Chennai
5. Delhi


Training For One more epoch

In [116]:
import torch

# Load checkpoint
checkpoint = torch.load('gpt2_instruction_checkpoint.pt', map_location=device)

# Restore model and optimizer state
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Get the last completed epoch
start_epoch = checkpoint['epoch']

print(f"Resuming training from epoch {start_epoch + 1}")

# Continue training for 1 more epoch
for epoch in range(start_epoch, start_epoch + 1):  # just 1 extra epoch
    train_losses, val_losses,token_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=format_input(val_data[0]), tokenizer=tokenizer
    )
    print(f"Epoch {epoch+1} completed")

# Optionally save again after training
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch + 1
}
torch.save(checkpoint, 'gpt2_instruction_checkpoint.pt')


Resuming training from epoch 2
[Eval Batch 0 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 2 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 2 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 3 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 3 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 4 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 4 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 0 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 0 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 1 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 1 - After] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 2 - Before] Allocated: 3.40 GB | Reserved: 15.37 GB
[Eval Batch 2 - 

OutOfMemoryError: CUDA out of memory. Tried to allocate 982.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 734.12 MiB is free. Process 29520 has 14.02 GiB memory in use. Of the allocated memory 12.98 GiB is allocated by PyTorch, and 933.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def estimate_finetune_memory(num_params: int, dtype_bytes=4): # full precision
    # 1x for params + 1x for grads + 2x for Adam + 2x for activations
    total_bytes = num_params * dtype_bytes * 6
    total_gib = total_bytes / (1024 ** 3)
    print(f"Estimated GPU memory for fine-tuning: {total_gib:.2f} GiB")

# Example:
estimate_finetune_memory(355_000_000)  # for 350M param model


In [None]:
def estimate_finetune_memory(num_params: int, dtype_bytes=2):  # Half precision
    # 1x for params + 1x for grads + 2x for Adam + 2x for activations
    total_bytes = num_params * dtype_bytes * 6
    total_gib = total_bytes / (1024 ** 3)
    print(f"Estimated GPU memory for fine-tuning: {total_gib:.2f} GiB")

# Example:
estimate_finetune_memory(355_000_000)  # for 350M param model

In [None]:
!pip install tensorflow