In [1]:
import json
import tiktoken
import torch
import instruction_dataset
import numpy as np
import time
import torch.nn as nn
import gpt_model
import pretrain_model
import load_pretrained_weights
from gpt_download import download_and_load_gpt2
from functools import partial
from tqdm import tqdm
from torch.utils.data import DataLoader

In [2]:
# Load the instruction dataset.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open("instruction-data.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Quick sanity check: dataset size plus one sample record
print("Number of entries: ", len(data))
print("Example entry:\n", data[50])

Number of entries:  1100
Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [3]:
# Carve the dataset into contiguous train / test / validation slices:
# 85% for training, 10% for testing, and whatever remains for validation.
total_entries = len(data)
train_portion = int(total_entries * 0.85)
test_portion = int(total_entries * 0.1)
val_portion = total_entries - (train_portion + test_portion)

# Index at which the test slice ends and the validation slice begins
test_end = train_portion + test_portion

train_data, test_data, val_data = (
    data[:train_portion],
    data[train_portion:test_end],
    data[test_end:],
)

for label, subset in (
    ("Training set length: ", train_data),
    ("Test set length: ", test_data),
    ("Validation set length: ", val_data),
):
    print(label, len(subset))

Training set length:  935
Test set length:  110
Validation set length:  55


In [4]:
# Create datasets and dataloaders

# Pick the compute device: CUDA first, then Apple's MPS backend, else the CPU.
if torch.cuda.is_available():
    device = "cuda"
# is_available must be *called*: the bare method object is always truthy, which
# would select "mps" on every non-CUDA machine and make the CPU branch unreachable.
elif torch.backends.mps.is_available():
    device = torch.device("mps") # Apple Silicon
else:
    device = "cpu"

# Tokenizer and loader settings shared by all three splits
tokenizer = tiktoken.get_encoding("gpt2")
num_workers = 0
batch_size = 8
torch.manual_seed(123)

# Bind the target device and the length cap once so every loader collates identically
collate_function = partial(
    instruction_dataset.collate,
    device=device,
    allowed_max_length=1024
)


def _make_loader(entries, shuffle, drop_last):
    """Wrap `entries` in an InstructionDataset and return (dataset, DataLoader)."""
    dataset = instruction_dataset.InstructionDataset(entries, tokenizer)
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_function,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )
    return dataset, loader


# Only the training loader shuffles and drops the final partial batch
train_dataset, train_loader = _make_loader(train_data, shuffle=True, drop_last=True)
val_dataset, val_loader = _make_loader(val_data, shuffle=False, drop_last=False)
test_dataset, test_loader = _make_loader(test_data, shuffle=False, drop_last=False)

# Disabled debugging snippet, kept as a bare string (it is the cell's displayed value)
'''
print("Train loader")
for input_batch, target_batch in train_loader:
    print(input_batch.shape, target_batch.shape)
'''

'\nprint("Train loader")\nfor input_batch, target_batch in train_loader:\n    print(input_batch.shape, target_batch.shape)\n'

In [5]:
# Download/load the OpenAI GPT-2 124M checkpoint, then inspect what came back

settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

print("Settings: ", settings)
print()
print("Params keys: ", params.keys())

# Token embedding matrix, stored under the "wte" key
wte = params["wte"]
print("Params token embedding weights: ", wte)
print("Token embedding weights shape: ", wte.shape)

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 24.2kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.89MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 54.6kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:52<00:00, 9.53MiB/s] 
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 2.40MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 2.66MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.94MiB/s]


Settings:  {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

Params keys:  dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
Params token embedding weights:  [[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weights shape:  (50257, 768)


In [6]:
# Initialize our LLM and load the OpenAI GPT-2 weights into it

# Architecture hyperparameters; the sizes agree with the `settings` dict
# printed by the download cell (n_vocab, n_ctx, n_embd, n_head, n_layer).
gpt_config = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "num_heads": 12,
    "num_layers": 12,
    "drop_rate": 0.0,
    "qkv_bias": True,
}

gpt = gpt_model.GPTModel(**gpt_config)
load_pretrained_weights.load_weights_into_gpt(gpt, params)

# Last expression of the cell, so the notebook also displays the module repr
gpt.eval()

GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(1024, 768)
  (dropout_embedding): Dropout(p=0.0, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (attention): Attention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (attention): Attention(
        (

In [7]:
# Baseline loss on the training and validation sets before any fine-tuning
gpt.to(device)
torch.manual_seed(123)

# Evaluation only, so gradient tracking is switched off; 5 batches per loader
with torch.no_grad():
    train_loss, val_loss = [
        pretrain_model.calc_loss_loader(
            data_loader=loader,
            model=gpt,
            device=device,
            num_batches=5,
        )
        for loader in (train_loader, val_loader)
    ]

print("Training loss: ", train_loss)
print("Validation loss: ", val_loss)

Training loss:  4.285582256317139
Validation loss:  4.165835189819336


In [8]:
# Fine-tune the LLM on the instruction data and report the wall-clock time

start_time = time.time()
torch.manual_seed(123)

num_epochs = 2
optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-5, weight_decay=0.1)

# Arguments for the training run; the first validation entry, formatted as a
# prompt, is passed as the start context.
training_kwargs = dict(
    model=gpt,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context=instruction_dataset.format_input(val_data[0]),
    tokenizer=tokenizer,
)
train_losses, val_losses, tokens_seen = pretrain_model.pretrain_model_simple(**training_kwargs)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes")

Epoch 1, Step 000000 : Train loss 3.209,Validation loss 3.233
Below is an instruction that describes a task. Write a response that appropriately completes the request.   ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Instruction
Below is an instruction that describes a task. Write a response that appropriately completes the request.   ### Instruction: Convert the active sentence to passive: 'The chef cooks the meal every day.'  ### Response: Write a response that appropriately completes the request.  ### Response: Write a response that appropriately completes the request.  ### Response: Write a response that appropriately completes the request.  ### Response:
Below is an instruction that describes a task. Write a response that appropriately complet

In [9]:
torch.manual_seed(123)

# Spot-check the fine-tuned model on the first 3 test set samples
for entry in test_data[:3]:
    input_text = instruction_dataset.format_input(entry)
    prompt_ids = pretrain_model.text_to_token_ids(input_text, tokenizer).to(device)

    token_ids = pretrain_model.generate(
        model=gpt,
        index=prompt_ids,
        max_new_tokens=256,
        context_size=1024,
        eos_id=50256
    )
    generated_text = pretrain_model.token_ids_to_text(token_ids, tokenizer)

    # Skip the first len(input_text) characters (assumes the decoded text begins
    # with the prompt - TODO confirm), then remove the response header.
    completion = generated_text[len(input_text):]
    response_text = completion.replace("### Response:", "").strip()

    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    # NOTE(review): "reponse" is a typo; left as-is so the label matches the recorded output.
    print(f"\nModel reponse:\n>> {response_text}")

Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
Rewrite the sentence using a simile.

### Input:
The car is very fast.

Correct response:
>> The car is as fast as lightning.

Model reponse:
>> The car is as fast as a horse.
Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
What type of cloud is typically associated with thunderstorms?

Correct response:
>> The type of cloud typically associated with thunderstorms is cumulonimbus.

Model reponse:
>> A type of thunderstorm is a tropical storm.
Below is an instruction that describes a task. Write a response that appropriately completes the request. 

### Instruction:
Name the author of 'Pride and Prejudice'.

Correct response:
>> Jane Austen.

Model reponse:
>> The author of 'Pride and Prejudice' is Robert Frost.


In [10]:


# Generate a response for every test entry and store it on the entry itself
# under the "model_response" key (this mutates the dicts in `test_data`).
for entry in tqdm(test_data, total=len(test_data)):
    input_text = instruction_dataset.format_input(entry)
    token_ids = pretrain_model.generate(
        model=gpt,
        index=pretrain_model.text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
        context_size=1024,
        eos_id=50256
    )

    generated_text = pretrain_model.token_ids_to_text(token_ids, tokenizer)

    # Skip the first len(input_text) characters (assumes the decoded text begins
    # with the prompt - TODO confirm), then remove the response header.
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )
    entry["model_response"] = response_text

# Write the augmented test set once, after the loop. Dumping the whole list on
# every iteration rewrote the file len(test_data) times with the same final content.
# NOTE(review): "intruction" looks like a typo for "instruction"; the name is kept
# so anything that already reads this file keeps working.
with open("intruction-data-with-response.json", "w", encoding="utf-8") as file:
    json.dump(test_data, file, indent=4)

100%|██████████| 110/110 [01:07<00:00,  1.63it/s]


In [12]:
def print_model_response(index, entries=None):
    """Print one entry's instruction, input, reference output and model response.

    Each field is printed on its own line and followed by a blank line.

    Args:
        index: Position of the entry to display.
        entries: Optional sequence of entry dicts to read from. Defaults to the
            notebook-level `test_data`, so existing calls behave as before.

    Raises:
        KeyError: If the entry lacks one of the keys 'instruction', 'input',
            'output' or 'model_response'.
    """
    if entries is None:
        # Fall back to the global test split used by the rest of the notebook
        entries = test_data
    record = entries[index]

    fields = (
        ("Instruction", "instruction"),
        ("Input", "input"),
        ("Output", "output"),
        ("Model response", "model_response"),
    )
    for label, key in fields:
        print(f"{label}: {record[key]}")
        print()

In [14]:
# Show the reference output next to the model's response for test entry 100
print_model_response(100)

Instruction: Generate a sentence using the word 'generous'.

Input: 

Output: He is very generous and always helps those in need.

Model response: He was very generous and always made sure that everyone was given a present.



In [19]:
# Show the reference output next to the model's response for test entry 60
print_model_response(60)

Instruction: What is an antonym of 'old'?

Input: 

Output: young.

Model response: An antonym of 'old' is 'young'.

