In [23]:
import torch
import torch.nn as nn
import tiktoken
import re
from torch.utils.data import Dataset, DataLoader

In [24]:
from gpt_model import *


In [25]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

tokenizer = tiktoken.get_encoding("gpt2")


In [26]:
import json
with open ("thelostrace_instruction_data.json", "r") as f:
    instruction_data = json.load(f)





In [27]:
def format_instruction(entry):
    text = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    if entry["input"]:
        text += f"\n\n### Input:\n{entry['input']}"
    text += f"\n\n### Response:\n{entry['output']}"
    return text

text_data = "\n\n".join(format_instruction(e) for e in instruction_data)

In [28]:



class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [29]:
# train/eval 
# separate text into training and validation sets:
# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=GPT_CONFIG_124M["context_length"],
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [31]:
#loss helpers

def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches



In [32]:
#device - cuda

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(device)

cuda


In [33]:
def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer):
    # Initialize lists to track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode
        
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() # Reset loss gradients from previous batch iteration
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward() # Calculate loss gradients
            optimizer.step() # Update model weights using loss gradients
            tokens_seen += input_batch.numel()
            global_step += 1

            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )

    return train_losses, val_losses, track_tokens_seen


def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss


def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()

In [34]:
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(
    torch.load("gpt2-124M-thelostrace-sft.pth", map_location="cpu")
)

model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004,
    weight_decay=0.1
)

train_model_simple(
    model,
    train_loader,
    val_loader,
    optimizer,
    device,
    num_epochs=10,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effort moves you",
    tokenizer=tokenizer
)


  torch.load("gpt2-124M-thelostrace-sft.pth", map_location="cpu")


Ep 1 (Step 000000): Train loss 6.888, Val loss 7.455
Ep 1 (Step 000005): Train loss 3.644, Val loss 4.754
Every effort moves you?                                                 
Ep 2 (Step 000010): Train loss 2.585, Val loss 4.169
Every effort moves you?  ### Response:  ### Response: ### Response: ### Response: ### Response: ### Response: ### Response: ### Response:  ### Response: ### Response: ### Response: ###
Ep 3 (Step 000015): Train loss 2.070, Val loss 3.980
Every effort moves you who squint the request.  ### Input:  ### Input: ### Input: ### Input: ### Input: ### Input: ### Input: ### Input: ### Input: ### Input: ###
Ep 4 (Step 000020): Train loss 1.696, Val loss 3.822
Every effort moves you only the request.    ### Instruction: ### Instruction: ### Instruction: ### Instruction: ### Instruction: ### Instruction:  ### Response:  ### Response:  ### Response:  ### Instruction
Ep 5 (Step 000025): Train loss 1.290, Val loss 3.817
Every effort moves you only the Pict leader.    ### I

([6.88807315826416,
  3.6438942432403563,
  2.585373544692993,
  2.0695833921432496,
  1.6962015628814697,
  1.2899943828582763,
  1.0232301473617553,
  0.7492415547370911,
  0.5514777958393097,
  0.44778836965560914,
  0.3750479698181152,
  0.29740733206272124],
 [7.454814910888672,
  4.754324913024902,
  4.169482707977295,
  3.980177402496338,
  3.8217575550079346,
  3.817457437515259,
  3.895447254180908,
  3.9144985675811768,
  3.969843626022339,
  4.043259620666504,
  4.089314937591553,
  4.156817436218262],
 [512,
  3072,
  5632,
  8192,
  10752,
  13312,
  15872,
  18432,
  20992,
  23552,
  26112,
  28672])

In [35]:
CHOOSE_MODEL = "gpt2-124M"
STORY_NAME = "thelostrace-instruction-summary"

file_name = f"{re.sub(r'[ ()]', '', CHOOSE_MODEL)}-{STORY_NAME}-sft.pth"
torch.save(model.state_dict(), file_name)
print(f"Model saved as {file_name}")


Model saved as gpt2-124M-thelostrace-instruction-summary-sft.pth


In [36]:
#reload + inference 
inference_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(torch.load("gpt2-124M-thelostrace-instruction-summary-sft.pth", map_location="cpu"))
model.to(inference_device)
model.eval()



  model.load_state_dict(torch.load("gpt2-124M-thelostrace-instruction-summary-sft.pth", map_location="cpu"))


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
      (norm2): LayerNorm()
      (feedforward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttention(
       

In [37]:
# read the book for more details and examples!
# temperature scaling and top-k sampling together increase 
# the diversity of predictions

def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    idx = idx.to(next(model.parameters()).device)
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # New (not in book): numerical stability tip to get equivalent results on mps device
            # subtract rowwise max before softmax
            logits = logits - logits.max(dim=-1, keepdim=True).values
            
            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [38]:
torch.manual_seed(123)

model.to(inference_device)
model.eval()

for entry in instruction_data[:3]:

    input_text = format_instruction(entry)

    token_ids = generate(
        model=model,
        idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=GPT_CONFIG_124M["context_length"],
        eos_id=50256
    )

    generated_text = token_ids_to_text(token_ids, tokenizer)

    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text}")
    print("-------------------------------------")


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Summarize the passage.

### Input:
Cororuc travels alone through a dark forest, uneasy about the land and the rumors of bandits.

### Response:
Cororuc journeys cautiously through a dangerous forest, troubled by both its atmosphere and the threat of bandits.

Correct response:
>> Cororuc journeys cautiously through a dangerous forest, troubled by both its atmosphere and the threat of bandits.

Model response:
>> ### Input:
The Picts once ruled Britain before being displaced.


Cororuc survives a once-dominant people forced into hiding by Celtic invasions.


Below is an instruction that describes a task. Write a response that appropriately completes the request.


Cororuc is freed and guided back toward civilization.



He believes all Celts must pay for the crimes committed against his people.
Cororuc battles Buruc and his companions after falling from a man.
Belo

In [None]:


prompt = """Below is an instruction that describes a task.
Write a response that appropriately completes the request.

### Instruction:


### Response:
"""

token_ids = generate(
    model=model,
    idx=text_to_token_ids(prompt, tokenizer).to(inference_device),
    max_new_tokens=256,
    context_size=GPT_CONFIG_124M["context_length"],
    temperature=1.0,
    top_k=40,
    eos_id=50256
)

print(token_ids_to_text(token_ids, tokenizer))


Below is an instruction that describes a task.
Write a response that appropriately completes the request.

### Instruction:
Summarize the story in 100 words or less.

### Response:
Who is an instruction that describes a response that appropriately completes the request. Write a response that describes a hidden danger but remains curious.

### Response:
### Instruction:
Below is an instruction that describes a task. Write a response that appropriately completes the request.
The Picts capture him.

### Instruction:
### Response:
The Picts were a once-dominant, curiosity, and mercy.

Below is an instruction that describes a response that appropriately completes the request.
A Picts, dark, tattooed, and armed with flint weapons.

### Response:
Cororuc is freed and guided back toward civilization.
### Response:
### Input:
Below is an instruction that describes a task.

What theme people, turning survival into prolonged suffering.
### Instruction:
### Input:
Cororuc is tall, well-armed, caut