In [1]:
import pandas as pd
from rich import print as pprint

from dataset import GPTTokenizer
import torch
from torch import tensor, stack, nonzero
from torch.nn import functional as F
from torch import optim

  from pandas.core import (


In [2]:
inst_df = pd.read_json('instruction-data.json')

In [57]:
inst_df.sample(frac=0.2)

Unnamed: 0,instruction,input,output,formatted
763,"Classify the following items: pine tree, quart...",,Plants: Pine tree\nMinerals: Quartz\nAnimals: ...,(Below is an instruction that describes the ta...
88,Generate a sentence using the word 'harmony'.,,The choir's voices blended in perfect harmony.,(Below is an instruction that describes the ta...
259,"Provide a word that rhymes with ""care.""",,"A word that rhymes with ""care"" is ""fare.""",(Below is an instruction that describes the ta...
962,Define the term 'kinetic energy'.,,Kinetic energy is the energy that an object po...,(Below is an instruction that describes the ta...
54,What is the normal body temperature in Celsius?,,The normal body temperature is approximately 3...,(Below is an instruction that describes the ta...
...,...,...,...,...
470,Translate 'Where are you from?' into French.,,The French translation of 'Where are you from?...,(Below is an instruction that describes the ta...
1014,What is the abbreviation for 'Master of Busine...,,The abbreviation for 'Master of Business Admin...,(Below is an instruction that describes the ta...
993,Create a sentence using the word 'inevitable'.,,The confrontation was inevitable given the cir...,(Below is an instruction that describes the ta...
344,Rewrite this statement as an imperative sentence.,You should finish your assignment.,Finish your assignment.,(Below is an instruction that describes the ta...


In [4]:
## Formtting the data
def format_input(entry):
    instruction_text = (f"Below is an instruction that describes the task. "
                        f"Write a response that appropriately completes the request"
                        f"\n\n### Instruction:\n{entry['instruction']}")
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    response_text = f"\n\n### Response:\n"

    return instruction_text + input_text + response_text, entry['output']

In [60]:
inst_df.loc[1067]['instruction']

'Name the process by which plants absorb water through their roots.'

In [5]:
inst_df['formatted'] = inst_df[['instruction','input','output']].apply(format_input, axis=1)

#### Alpaca Prompt style

In [6]:
pprint(inst_df['formatted'][100])

In [7]:
train_data = inst_df[:int(0.8 * len(inst_df))]
test_data =  inst_df[int(0.8 * len(inst_df)):]

In [8]:
from torch.utils.data import Dataset, DataLoader
# GPTTokenizer.allowed_special = "<|endoftext|>"

In [9]:
class InsructionDataset(Dataset):

    def __init__(self, data:pd.DataFrame, tokenizer):
        self.data = data
        self.encoded_texts = []
        for _,row in self.data.iterrows():
            input_, response = self.__alpaca_format_input(row)
            formatted = input_ + response
            encoded_text = tokenizer.encode(formatted)
            self.encoded_texts.append(encoded_text)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __alpaca_format_input(self,entry):
        instruction_text = (f"Below is an instruction that describes the task. "
                            f"Write a response that appropriately completes the request"
                            f"\n\n### Instruction:\n{entry['instruction']}")
        input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
        response_text = f"\n\n### Response:\n"
        return instruction_text + input_text + response_text, entry['output']

In [10]:
def custom_collate_fn(batch, pad_token_id = 50256,
                      ignore_index = -100, DEVICE="cpu",
                      allowed_max_len = None):

    ## finding length + 1 longest sequence in the batch
    batch_max_len =max(len(item)+1 for item in batch)
    # PAD and prepare inputs
    inputs_lst = []
    targets_lst = []
    for item in batch:
        new_item = item.copy()
        new_item+=[pad_token_id]
        padded = (new_item + [pad_token_id] * (batch_max_len - len(new_item)))
        inputs = tensor(padded[:-1])
        targets = tensor(padded[1:])
        inputs_lst.append(inputs)
        targets_lst.append(targets)
        mask = targets==pad_token_id
        indices_to_replace = nonzero(mask).squeeze()
        ## making extra padding as -100 so that it is ignored by cross_entropy loss
        if indices_to_replace.numel()>1:
            targets[indices_to_replace[1:]] = ignore_index
        ## truncating length
        if allowed_max_len is not None:
            inputs = inputs[:allowed_max_len]
            targets = targets[:allowed_max_len]
    input_tensor = stack(inputs_lst).to(DEVICE)
    targets_tensor = stack(targets_lst).to(DEVICE)
    return {'x':input_tensor, 'y':targets_tensor}

In [11]:
batch = (
            [1,3,4,5,6,7],   # Sequence 1
            [4,2,3],         # Sequence 2
            [0,8,1,2,9]      # Sequence 3
        )

In [12]:
custom_collate_fn(batch)

{'x': tensor([[    1,     3,     4,     5,     6,     7],
         [    4,     2,     3, 50256, 50256, 50256],
         [    0,     8,     1,     2,     9, 50256]]),
 'y': tensor([[    3,     4,     5,     6,     7, 50256],
         [    2,     3, 50256,  -100,  -100,  -100],
         [    8,     1,     2,     9, 50256,  -100]])}

In [13]:
from functools import partial
customzed_collate_function = partial(custom_collate_fn, DEVICE = 'cpu',
                                     allowed_max_len = 1024)

In [14]:
num_workers = 0
batch_size = 64

torch.manual_seed(123)
train_dataset = InsructionDataset(inst_df, GPTTokenizer)
train_loader =DataLoader(train_dataset,
                         batch_size=batch_size,
                         collate_fn= custom_collate_fn,
                         shuffle= True,
                         drop_last= True,
                         num_workers=num_workers)

In [15]:
# for batch in train_loader:
#     pprint(GPTTokenizer.decode_batch(batch['x'].numpy()), end = '\n\n\n')
#     pprint(GPTTokenizer.decode_batch(batch['y'].numpy()))
#     break

In [16]:
### Loading a pre-trained model weights
from dataset import get_dataloader, GPTTokenizer
from model import (GPTModel, inference, token_ids_to_text, text_to_token_ids, generate)
from config import CUSTOM_GPT_CONFIG
from torchinfo import summary
from torch.nn import functional as F
from torch import optim
from tqdm import tqdm
from rich import print as pprint
from torch import tensor
import torch

In [17]:
inst_model = GPTModel(CUSTOM_GPT_CONFIG)
summary(inst_model)

Layer (type:depth-idx)                   Param #
GPTModel                                 --
├─Embedding: 1-1                         38,597,376
├─Embedding: 1-2                         786,432
├─Dropout: 1-3                           --
├─Sequential: 1-4                        --
│    └─TransformerBlock: 2-1             --
│    │    └─MultiHeadAttention: 3-1      2,362,368
│    │    └─FeedForward: 3-2             4,722,432
│    │    └─LayerNorm: 3-3               1,536
│    │    └─LayerNorm: 3-4               1,536
│    │    └─Dropout: 3-5                 --
│    └─TransformerBlock: 2-2             --
│    │    └─MultiHeadAttention: 3-6      2,362,368
│    │    └─FeedForward: 3-7             4,722,432
│    │    └─LayerNorm: 3-8               1,536
│    │    └─LayerNorm: 3-9               1,536
│    │    └─Dropout: 3-10                --
│    └─TransformerBlock: 2-3             --
│    │    └─MultiHeadAttention: 3-11     2,362,368
│    │    └─FeedForward: 3-12            4,722,432
│   

In [18]:
model_size = "124M"
from gpt_download import download_and_load_gpt2
model_dir = "downloaded_weights"
settings, params = download_and_load_gpt2(model_size=model_size,
                                          models_dir=model_dir)



File already exists and is up-to-date: downloaded_weights/124M/checkpoint




File already exists and is up-to-date: downloaded_weights/124M/encoder.json




File already exists and is up-to-date: downloaded_weights/124M/hparams.json




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.index




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.meta




File already exists and is up-to-date: downloaded_weights/124M/vocab.bpe


In [19]:
def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [89]:
load_weights_into_gpt(inst_model, params)

In [110]:
prompts = ['''Twinkle Twinkle little star
How I wonder what you are
Up above the world so high
And now it takes a funny turn''']

In [111]:
# prompts = ["I am the ONE"] * 1
inst_model.eval()
res =    generate(model=inst_model,
         tokenizer=GPTTokenizer,
         max_new_tokens=200,
         temperature= 1,
         DEVICE = 'cpu',
         prompts=prompts,
         context_size=CUSTOM_GPT_CONFIG['context_length'],
         top_K= 30,
         eos_id="<|endoftext|>")

for idx, response in enumerate(res,1):
    pprint(f"Response {idx} : \n\n{response}", end = '\n\n\n')

In [112]:
def train_model(model,optimizer, train_loader, epochs = 10, DEVICE = 'cpu'):
    # optimizer = optim.AdamW(model.parameters(),
    #                         lr = 0.01)
    tokens_seen = 0
    losses = []
    EPOCHS = epochs
    # epoch_pbar = tqdm(range(EPOCHS), desc="Training", unit="epoch")
    # epoch_pbar = tqdm(range(EPOCHS), desc="Training", unit="epoch")
    model.to(DEVICE)
    for epoch in range(EPOCHS):
        model.train()
        batch_loss = 0


        batch_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)

        for batch in batch_pbar:
            optimizer.zero_grad()
            out_logits = model(batch['x'].to(DEVICE))
            loss = F.cross_entropy(out_logits.flatten(0,1), batch['y'].flatten(0).to(DEVICE))
            loss.backward()
            optimizer.step()
            tokens_seen += batch['x'].numel()
            batch_loss += loss.item()


        avg_loss = batch_loss / len(train_loader)
        losses.append(avg_loss)
        print(f"EPOCH : {epoch + 1} | Epoch Loss : {losses[epoch]}")
    return model, optimizer, losses

In [41]:
# optimizer = optim.AdamW(lr=0.001,weight_decay=0.01, params=inst_model.parameters())
inst_model, optimizer, losses = train_model(model = inst_model,optimizer=optimizer,
            train_loader=train_loader, epochs = 3, DEVICE = 'cpu')

                                                          

EPOCH : 1 | Epoch Loss : 0.32288456489058104


                                                          

EPOCH : 2 | Epoch Loss : 0.2822493472520043


                                                          

EPOCH : 3 | Epoch Loss : 0.2525990780662088




In [42]:
# Save the model and optimizer state
checkpoint = {
    'model_state_dict': inst_model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}

torch.save(checkpoint, 'saved_models/instruction_model_160M_7EP.pth')

In [30]:
inp_, res = format_input(test_data.loc[1090])

In [31]:
pprint(inp_, res)

In [32]:
template = '''Below is an instruction that describes the task. Write a response that appropriately completes the request

### Instruction:
{instruction}

### Input:
{input}

### Response:
'''

def generate_prompt(instruction, input_text=""):
    return template.format(instruction=instruction, input=input_text)

In [87]:
def invoke(model, tokenizer, instruction, input_text):
    model.eval()
    prompt = [generate_prompt(instruction, input_text=None)]
    generated_responses =    generate(model=inst_model,
             tokenizer=GPTTokenizer,
             max_new_tokens=90,
             temperature= 0,
             DEVICE = 'cpu',
             prompts=prompt,
             context_size=CUSTOM_GPT_CONFIG['context_length'],
             top_K= 20,
             eos_id="<|endoftext|>")
    responses = []
    for i,res in enumerate(generated_responses):
        responses.append(res[res.find("### Response") + len("### Response:"):res.find("<|endoftext|>")])
    return responses

In [88]:
## ALPACA FORMAT
'''Below is an instruction that describes the task. Write a response that appropriately completes the request

### Instruction:
{instruction}

### Input:
{input}

### Response:
'''
response = invoke(inst_model,
       GPTTokenizer,
       instruction="Rewrite the sentence using a simile.",
       input_text= "The dog is very loyal."
)
pprint(response[0])