In [1]:
import torch
import json 
import tiktoken
from torch.utils.data import Dataset

In [2]:
# load dataset json file
with open("/home/ge73qip/LLMs/LLMs_from_scratch/instruction-data.json", "r") as f:
    dataset = json.load(f)

In [3]:
dataset[:5]

[{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.',
  'input': 'freind --> friend',
  'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'},
 {'instruction': 'Edit the following sentence for grammar.',
  'input': 'He go to the park every day.',
  'output': 'He goes to the park every day.'},
 {'instruction': 'Convert 45 kilometers to meters.',
  'input': '',
  'output': '45 kilometers is 45000 meters.'},
 {'instruction': "Rewrite this sentence to start with 'Although': Despite the rain, they went for a walk.",
  'input': '',
  'output': 'Although it was raining, they went for a walk.'},
 {'instruction': 'What are the first 10 square numbers?',
  'input': '',
  'output': '1, 4, 9, 16, 25, 36, 49, 64, 81, 100.'}]

Once the data is extracted/downloaded, we move to structuring the data in the most common format called Apalca.
Example:
Input:
```json
{
    "instruction": "Change the following sentence to past perfect tense.",
    "input": "They finish the game.",
    "output": "They had finished the game."
}
```

Converted:
```
Below is an instruction that describes a task. Write a response that satisfies the instruction.

### Instruction
Change the following sentence to past perfect tense.

### Input
They finish the game.

### Response
They had finished the game.
```

In [4]:
# implementing the conversion
# we've to perform tokenisation 
# here, i will incorporate all the modification in the same function
def format_input_entry(entry):
    instruction = (
        f"Below is an instruction that describes a task. "
        f"Write an appropriate response for it."
        f"\n\n### Instruction: \n{entry['instruction']}"
    )

    input = (
        f"\n\n### Input: \n"
        f"{entry['input']}" if entry['input'] else ""
    )
    
    combined = instruction + input

    return combined


In [5]:
# testing the conversion
print(format_input_entry(dataset[3]))

Below is an instruction that describes a task. Write an appropriate response for it.

### Instruction: 
Rewrite this sentence to start with 'Although': Despite the rain, they went for a walk.


In [6]:
# split the dataset into train, val and test
def split_dataset(dataset, train_ratio=0.85, val_ratio=0.1, test_ratio=0.05):
    train_portion = int(len(dataset) * train_ratio)
    val_portion = int(len(dataset) * val_ratio)
    test_portion = int(len(dataset) * test_ratio)
    return dataset[:train_portion], dataset[train_portion:train_portion + val_portion], dataset[train_portion + val_portion:train_portion + val_portion + test_portion]

In [7]:
# split the data
train_data, val_data, test_data = split_dataset(dataset)
print(len(train_data), len(val_data), len(test_data))

935 110 55


In [8]:
# create pytorch dataset
class nakliInstructionFineTuneDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset
        self.encoded_dataset = []
        
        for item in self.dataset:
            input_modified = format_input_entry(item)
            response_modified = f"\n\nResponse: \n{item['output']}"
            apalca_type_format = input_modified + response_modified
            self.encoded_dataset.append(tokenizer.encode(apalca_type_format))

    def __len__(self):
        return len(self.encoded_dataset)

    def __getitem__(self, idx):
        return self.encoded_dataset[idx]

In [9]:
# test the nakliInstructionFineTuneDataset
tokenizer = tiktoken.get_encoding("gpt2")

train_pytorch_dataset = nakliInstructionFineTuneDataset(
    dataset=train_data,
    tokenizer=tokenizer
)

print("\n--- Dataset ---")
print(train_pytorch_dataset.encoded_dataset[0])



--- Dataset ---
[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430, 281, 5035, 2882, 329, 340, 13, 198, 198, 21017, 46486, 25, 220, 198, 36, 2100, 4985, 262, 1708, 9546, 416, 25449, 340, 656, 262, 24993, 1813, 13, 198, 198, 21017, 23412, 25, 220, 198, 19503, 521, 14610, 1545, 198, 198, 31077, 25, 220, 198, 464, 24993, 286, 262, 1813, 9546, 366, 19503, 521, 1, 318, 11491, 11, 262, 3376, 24993, 318, 366, 6726, 1911]


In [10]:
# varying length/tokens for each sample
print(len(train_pytorch_dataset.encoded_dataset[0]))
print(len(train_pytorch_dataset.encoded_dataset[1]))

74
59


In [11]:
# now, we move to important aspect:
# padding -> target generation -> replace padding with -100 in target
# all this is performed for a single batch

def collate_fn_with_padding(batch, padding_token_id=50256, device="cpu"):
    # since, each sample has different length, we need to pad them
    # find max length
    max_length = max(len(sample)+1 for sample in batch)

    # padded batch 
    padded_inputs = []
    for sample in batch:
        # add padding token to make all samples same length
        padded_sample = sample + [padding_token_id] * (max_length - len(sample))

        # remove extra padding token from targets
        padded_inputs.append(torch.tensor(padded_sample[:-1]))

    # convert to tensor
    padded_inputs = torch.stack(padded_inputs).to(device)

    return padded_inputs

In [12]:
# testing
input_1 = [1, 2]
input_2 = [3, 4, 5, 6, 7]
input_3 = [8, 9, 10]

batch_1 = [input_1, input_2, input_3]

padded_batch_1 = collate_fn_with_padding(batch_1)
print(padded_batch_1)


tensor([[    1,     2, 50256, 50256, 50256],
        [    3,     4,     5,     6,     7],
        [    8,     9,    10, 50256, 50256]])


In [13]:
# i will add padding + target
def collate_fn_with_padNtarget(batch, padding_token_id=50256, device="cpu"):
    # since, each sample has different length, we need to pad them
    # find max length
    max_length = max(len(sample)+1 for sample in batch)

    # padded batch 
    padded_inputs = []
    padded_targets = []

    for sample in batch:
        # add padding token to make all samples same length
        padded_sample = sample + [padding_token_id] * (max_length - len(sample))

        # remove extra padding token from targets
        padded_inputs.append(torch.tensor(padded_sample[:-1]))
        padded_targets.append(torch.tensor(padded_sample[1:])) # shift by 1

    # convert to tensor
    padded_inputs = torch.stack(padded_inputs).to(device)
    padded_targets = torch.stack(padded_targets).to(device)

    return padded_inputs, padded_targets

In [14]:
# test use case
# testing
input_1 = [1, 2]
input_2 = [3, 4, 5, 6, 7]
input_3 = [8, 9, 10]

batch_1 = [input_1, input_2, input_3]

padded_input_1, padded_target_1 = collate_fn_with_padNtarget(batch_1)
print(padded_input_1)
print(padded_target_1)

tensor([[    1,     2, 50256, 50256, 50256],
        [    3,     4,     5,     6,     7],
        [    8,     9,    10, 50256, 50256]])
tensor([[    2, 50256, 50256, 50256, 50256],
        [    4,     5,     6,     7, 50256],
        [    9,    10, 50256, 50256, 50256]])


In [15]:
mask = padded_target_1[2] == 50256
indices = torch.nonzero(mask).squeeze()
if indices.numel() > 1:
    padded_target_1[2][indices[1:]] = -100

print(mask)
print(indices)
print(padded_target_1[2])


tensor([False, False,  True,  True,  True])
tensor([2, 3, 4])
tensor([    9,    10, 50256,  -100,  -100])


In [16]:
# final version of collate function
# converts all tokens with 50256 id to -100,
# so that they are not considered during cross entropy loss calculation

# i will add padding + target + replace with -100
def collate_func(batch, padding_token_id=50256, allowed_max_length=None, device="cpu"):
    # since, each sample has different length, we need to pad them
    # find max length
    max_length = max(len(sample)+1 for sample in batch)

    # padded batch 
    padded_inputs = []
    padded_targets = []

    for sample in batch:
        # add padding token to make all samples same length
        padded_sample = sample + [padding_token_id] * (max_length - len(sample))

        padded_inpt = torch.tensor(padded_sample[:-1])
        padded_targ = torch.tensor(padded_sample[1:])

        # replace output padding token with -100
        mask = padded_targ == padding_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            padded_targ[indices[1:]] = -100

        if allowed_max_length is not None:
            padded_inpt = padded_inpt[:allowed_max_length]
            padded_targ = padded_targ[:allowed_max_length]

        padded_inputs.append(padded_inpt)
        padded_targets.append(padded_targ)

    # convert to tensor
    padded_inputs = torch.stack(padded_inputs).to(device)
    padded_targets = torch.stack(padded_targets).to(device)

    return padded_inputs, padded_targets


In [17]:
inputs, targets = collate_func(batch_1)

print(inputs)
print(targets)

tensor([[    1,     2, 50256, 50256, 50256],
        [    3,     4,     5,     6,     7],
        [    8,     9,    10, 50256, 50256]])
tensor([[    2, 50256,  -100,  -100,  -100],
        [    4,     5,     6,     7, 50256],
        [    9,    10, 50256,  -100,  -100]])


In [18]:
from torch.utils.data import DataLoader

train_dataset = nakliInstructionFineTuneDataset(
    dataset=train_data,
    tokenizer=tokenizer
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_func,
    drop_last=True,
    num_workers=0
)

val_dataset = nakliInstructionFineTuneDataset(
    dataset=val_data,
    tokenizer=tokenizer
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=collate_func,
    drop_last=True,
    num_workers=0
)

In [19]:
for input, target in train_dataloader:
    print(input.shape, target.shape)

torch.Size([8, 82]) torch.Size([8, 82])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 78]) torch.Size([8, 78])
torch.Size([8, 87]) torch.Size([8, 87])
torch.Size([8, 59]) torch.Size([8, 59])
torch.Size([8, 56]) torch.Size([8, 56])
torch.Size([8, 59]) torch.Size([8, 59])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 55]) torch.Size([8, 55])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 86]) torch.Size([8, 86])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 59]) torch.Size([8, 59])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 69]) torch.Size([8, 69])


In [20]:
print(input[0])
print(target[0])

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          281,  5035,  2882,   329,   340,    13,   198,   198, 21017, 46486,
           25,   220,   198,  2061,   318,   262,  6697,   286,   705, 22031,
        30960,   198,   198, 31077,    25,   220,   198,   464,  6697,   286,
          705, 22031,     6,   318,   705,  2395,   499,  4458, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])
tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   281,
         5035,  2882,   329,   340,    13,   198,   198, 21017, 46486,    25,
          220,   198,  2061,   318,   262,  6697,   286,   705, 22031, 30960,
          198,   198, 31077,    25,   220,   198,   464,  6697,   286,   705,
        22031,     6,   318,   705,  2395,   499,  4458, 50256,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,

In [21]:
# loading GPT model
from nakliGPT import nakliGPT
from trainer import load_config

# load config
base_config = load_config("/home/ge73qip/LLMs/LLMs_from_scratch/nakliGPT/configuration/configuration.yaml")

print(base_config["model"])

2025-04-22 13:18:09.905766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745320690.425999 2425090 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745320690.582660 2425090 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745320691.828843 2425090 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745320691.828939 2425090 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745320691.828948 2425090 computation_placer.cc:177] computation placer alr

{'vocab_size': 50257, 'embd_dim': 768, 'context_length': 256, 'dropout': 0.1, 'num_heads': 12, 'num_blocks': 12, 'qkv_bias': False}


In [22]:
model_configs = {
    "gpt2-small (124M)": {"embd_dim": 768, "context_length": 1024, "num_blocks": 12, "num_heads": 12, "qkv_bias": True},
    "gpt2-medium (355M)": {"embd_dim": 1024, "context_length": 1024, "num_blocks": 24, "num_heads": 16, "qkv_bias": True},
    "gpt2-large (774M)": {"embd_dim": 1280, "context_length": 1024, "num_blocks": 36, "num_heads": 20, "qkv_bias": True},
    "gpt2-xl (1558M)": {"embd_dim": 1600, "context_length": 1024, "num_blocks": 48, "num_heads": 25, "qkv_bias": True},
}

In [23]:
CHOOSE_MODEL = "gpt2-medium (355M)"

base_config["model"].update(model_configs[CHOOSE_MODEL])

In [24]:
base_config["model"]

{'vocab_size': 50257,
 'embd_dim': 1024,
 'context_length': 1024,
 'dropout': 0.1,
 'num_heads': 16,
 'num_blocks': 24,
 'qkv_bias': True}

In [25]:
from gpt_download import download_and_load_gpt2

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


In [26]:
from load_pretrained import load_weights_into_gpt

model = nakliGPT(base_config)
load_weights_into_gpt(model, params)
model.eval()

nakliGPT(
  (token_embedding): Embedding(50257, 1024)
  (positional_embedding): Embedding(1024, 1024)
  (dropout_embedding): Dropout(p=0.1, inplace=False)
  (transformer_block): nakliTransformerDecoder(
    (attention): nakliMultiHeadAttention(
      (dropout): Dropout(p=0.1, inplace=False)
      (W_query): Linear(in_features=1024, out_features=1024, bias=True)
      (W_key): Linear(in_features=1024, out_features=1024, bias=True)
      (W_value): Linear(in_features=1024, out_features=1024, bias=True)
      (W_output): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (layer_norm_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (layer_norm_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (feed_forward): nakliFeedForward(
      (W): Linear(in_features=1024, out_features=4096, bias=True)
      (act): GELU(approximate='none')
      (W_output): Linear(in_features=4096, out_features=1024, bias=True)
    )
    (dropout): Dropout(p=0.1, inplace=False)

In [27]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
    return encoded_tensor


def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # remove batch dimension
    return tokenizer.decode(flat.tolist())

In [28]:
# testing the loaded model
input_text = format_input_entry(val_data[1])
print(input_text)

Below is an instruction that describes a task. Write an appropriate response for it.

### Instruction: 
What type of cloud is typically associated with thunderstorms?


In [29]:
from text_utils import nakliGreedySampling

new_token_ids_generator = nakliGreedySampling(
    model, 
    tokenizer, 
    num_new_tokens=35, 
    max_context_length=base_config["model"]["context_length"]
)

In [30]:
generated_text = new_token_ids_generator.generate_text(
    torch.tensor(tokenizer.encode(input_text)).unsqueeze(0)
)


In [31]:
print(generated_text)

Below is an instruction that describes a task. Write an appropriate response for it.

### Instruction: 
What type of cloud is typically associated with thunderstorms?

A: A thunderstorm is a storm that is associated with lightning.

### Instruction: 

What is the difference between a thunderstorm and a tornado


In [None]:
response_text = (
    generated_text[len(input_text):]
    .replace("### Response:", "")
    .strip()
)
print(response_text)

A: A thunderstorm is a storm that is associated with lightning.

### Instruction: 

What is the difference between a thunderstorm and a tornado


: 

In [None]:
# fine tune the weights based on the dataset
from trainer import nakliTrainer
# load trainer
trainer = nakliTrainer(
    config=base_config,
    model=model,
    dataloader=train_dataloader
)

# train
trainer.train()

# save model
model_path = "nakliGPT_model_fine_tuned.pth"
trainer.save_model(model_path)

Epoch 1/2:  43%|████▎     | 50/116 [05:10<06:48,  6.19s/it, loss=1.1] 

[Epoch 1] Step 50: Loss = 1.1032


Epoch 1/2:  54%|█████▍    | 63/116 [06:32<05:32,  6.27s/it, loss=0.174] 