This notebook loads an LLM with OpenAI's GPT-2 weights and then fine-tunes the model on a dataset of instructions and responses.

In [None]:
import json
import tiktoken
import torch
import instruction_dataset
import numpy as np
import time
import torch.nn as nn
import gpt_model
import pretrain_model
import load_pretrained_weights
from gpt_download import download_and_load_gpt2
from functools import partial
from tqdm import tqdm
from torch.utils.data import DataLoader

In [None]:
# Load the instruction dataset: the JSON file is parsed into `data`.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON text is UTF-8).
with open("instruction-data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

print("Number of entries: ", len(data))
# Show one sample entry; indexing position 50 needs at least 51 entries.
print("Example entry:\n", data[50])

In [None]:
# Partition `data` by position into three contiguous slices:
# the first 85% for training, the next 10% for testing, and whatever
# remains for validation.
num_entries = len(data)
train_portion = int(num_entries * 0.85)
test_portion = int(num_entries * 0.1)
val_portion = num_entries - train_portion - test_portion

# Index where the test slice ends and the validation slice begins.
test_end = train_portion + test_portion

train_data, test_data, val_data = (
    data[:train_portion],
    data[train_portion:test_end],
    data[test_end:],
)

print("Training set length: ", len(train_data))
print("Test set length: ", len(test_data))
print("Validation set length: ", len(val_data))

In [None]:
# Create datasets and dataloaders

# Pick the compute device in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
# is_available must be *called*: the bare function object is always truthy,
# which would select "mps" on every machine without CUDA and make the CPU
# fallback unreachable.
elif torch.backends.mps.is_available():
    device = torch.device("mps") # Apple Silicon
else:
    device = "cpu"

# Tokenizer and loader settings used by all three splits.
tokenizer = tiktoken.get_encoding("gpt2")
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Pre-bind the arguments that never change so each DataLoader can invoke the
# collate function with the batch alone.
collate_function = partial(
    instruction_dataset.collate, device=device, allowed_max_length=1024
)

# Keyword arguments that are identical for the train, val and test loaders.
_shared_loader_args = {
    "batch_size": batch_size,
    "collate_fn": collate_function,
    "num_workers": num_workers,
}

# Training split: shuffled, and the final incomplete batch is dropped.
train_dataset = instruction_dataset.InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    dataset=train_dataset, shuffle=True, drop_last=True, **_shared_loader_args
)

# Validation split: original order, every sample kept.
val_dataset = instruction_dataset.InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    dataset=val_dataset, shuffle=False, drop_last=False, **_shared_loader_args
)

# Test split: original order, every sample kept.
test_dataset = instruction_dataset.InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    dataset=test_dataset, shuffle=False, drop_last=False, **_shared_loader_args
)

# Disabled batch-shape check, kept as a bare string literal (it is not
# executed as code).
'''
print("Train loader")
for input_batch, target_batch in train_loader:
    print(input_batch.shape, target_batch.shape)
'''

In [None]:
# OpenAI GPT-2 settings and parameters
#
# download_and_load_gpt2 is called for the "124M" model with "gpt2" as the
# models directory; it returns a (settings, params) pair.
settings, params = download_and_load_gpt2(models_dir="gpt2", model_size="124M")

# The extra newline reproduces the blank separator line after the settings.
print("Settings: ", settings, end="\n\n")
print("Params keys: ", params.keys())

# Inspect the entry stored under "wte" (labelled as the token embedding
# weights in the output below).
token_embeddings = params["wte"]
print("Params token embedding weights: ", token_embeddings)
print("Token embedding weights shape: ", token_embeddings.shape)

In [None]:
# Initialize our LLM and load the OpenAI GPT-2 weights

# Architecture hyperparameters passed to GPTModel.
# NOTE(review): presumably chosen to match the 124M checkpoint loaded into
# `params` - confirm against `settings`.
gpt_config = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "num_heads": 12,
    "num_layers": 12,
    "drop_rate": 0.0,
    "qkv_bias": True,
}

gpt = gpt_model.GPTModel(**gpt_config)

# Copy the pretrained parameters into the freshly constructed model.
load_pretrained_weights.load_weights_into_gpt(gpt, params)

# Put the model in evaluation mode; as the last expression of the cell, the
# returned value is also what the notebook displays.
gpt.eval()

In [None]:
# Measure the loss on the training and validation loaders before any
# fine-tuning, using 5 batches from each.
gpt.to(device)
torch.manual_seed(123)

# Everything except the loader is the same for both measurements.
_estimate_loss = partial(
    pretrain_model.calc_loss_loader,
    model=gpt,
    device=device,
    num_batches=5,
)

# No gradients are needed for a pure evaluation pass.
with torch.no_grad():
    train_loss = _estimate_loss(data_loader=train_loader)
    val_loss = _estimate_loss(data_loader=val_loader)

print("Training loss: ", train_loss)
print("Validation loss: ", val_loss)

In [None]:
# Fine-tune the LLM on the instruction data and report how long it took.

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 2

# The first validation entry, formatted as a prompt, is handed to the training
# routine as its start context.
sample_prompt = instruction_dataset.format_input(val_data[0])

# The routine from the pretraining module is reused here for fine-tuning; it
# returns the recorded training losses, validation losses and tokens seen.
train_losses, val_losses, tokens_seen = pretrain_model.pretrain_model_simple(
    model=gpt,
    optimizer=optimizer,
    device=device,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context=sample_prompt,
    tokenizer=tokenizer,
)

end_time = time.time()
elapsed_seconds = end_time - start_time
execution_time_minutes = elapsed_seconds / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes")

In [None]:
# Generate and print model responses for the first 3 test entries so they can
# be compared by eye with the reference outputs.
torch.manual_seed(123)

for entry in test_data[:3]: # first 3 test set samples
    input_text = instruction_dataset.format_input(entry)
    token_ids = pretrain_model.generate(
        model=gpt,
        index=pretrain_model.text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=1024,
        eos_id=50256
    )

    generated_text = pretrain_model.token_ids_to_text(token_ids, tokenizer)

    # The decoded text starts with the prompt: cut it off by length, then
    # remove the response header and surrounding whitespace so only the
    # model's answer remains.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    # response_text is already stripped above, so no second strip is needed.
    print(f"\nModel response:\n>> {response_text}")

In [None]:


# Generate a response for every test entry and store it on the entry itself
# under the "model_response" key.
for entry in tqdm(test_data, total=len(test_data)):
    input_text = instruction_dataset.format_input(entry)
    token_ids = pretrain_model.generate(
        model=gpt,
        index=pretrain_model.text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=1024,
        eos_id=50256
    )

    generated_text = pretrain_model.token_ids_to_text(token_ids, tokenizer)

    # The decoded text starts with the prompt: cut it off by length, then
    # remove the response header and surrounding whitespace.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )
    # `entry` is the same dict object that lives in test_data, so this
    # updates the list in place.
    entry["model_response"] = response_text

# Write the annotated test set once, after the loop, instead of rewriting the
# whole file on every iteration.
# NOTE(review): the file name is spelled "intruction-..." (missing "s"); it is
# left unchanged because other code may read this exact path - confirm.
with open("intruction-data-with-response.json", "w", encoding="utf-8") as file:
    json.dump(test_data, file, indent=4)

In [None]:
def print_model_response(index):
    """Print one test entry's fields, each followed by a blank line.

    Reads ``test_data[index]`` from module scope and prints its
    ``instruction``, ``input``, ``output`` and ``model_response`` values,
    in that order, each prefixed with a label.

    Args:
        index: Position of the entry in ``test_data``.

    Raises:
        IndexError: If ``index`` is out of range for ``test_data``.
        KeyError: If the entry lacks one of the four keys.
    """
    entry = test_data[index]
    labelled_keys = (
        ("Instruction", "instruction"),
        ("Input", "input"),
        ("Output", "output"),
        ("Model response", "model_response"),
    )
    for label, key in labelled_keys:
        print(f"{label}: {entry[key]}")
        print()

In [None]:
# Show instruction, input, reference output and model response for the test
# entry at index 100 (test_data must hold at least 101 entries).
print_model_response(100)

In [None]:
# Show instruction, input, reference output and model response for the test
# entry at index 60 (test_data must hold at least 61 entries).
print_model_response(60)