#INTRO

Dataset used for FINETUNNING :-> https://huggingface.co/datasets/databricks/databricks-dolly-15k

Extra resources for GPT FINETUNNIGN AND MORE KNOWLEDGE : https://jaykmody.com/blog/gpt-from-scratch/#classification-fine-tuning

In [8]:
!pip install -q tiktoken tqdm datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.1/316.1 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
from importlib.metadata import version

import matplotlib
import tiktoken
import torch

print("matplotlib version:", version("matplotlib"))
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

matplotlib version: 3.7.1
torch version: 2.3.1+cu121
tiktoken version: 0.7.0


In [10]:
import tqdm.auto as tqdm

In [11]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}

In [12]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

batch = []

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


#MODEL

In [13]:
from torch import nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # optional projection

        return context_vec


#####################################
# Chapter 4
#####################################
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )

    def forward(self, x):
        return self.layers(x)


class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        # Shortcut connection for attention block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)   # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        # Shortcut connection for feed-forward block
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut  # Add the original input back

        return x


class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.context_length = cfg["context_length"]
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.context_length:]
            # get the predictions
            logits = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

#code for downloading gpt weights

In [14]:
import os
import urllib.request

# import requests
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm


def download_and_load_gpt2(model_size, models_dir):
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    settings = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params


def download_file(url, destination):
    # Send a GET request to download the file

    try:
        with urllib.request.urlopen(url) as response:
            # Get the total file size from headers, defaulting to 0 if not present
            file_size = int(response.headers.get("Content-Length", 0))

            # Check if file exists and has the same size
            if os.path.exists(destination):
                file_size_local = os.path.getsize(destination)
                if file_size == file_size_local:
                    print(f"File already exists and is up-to-date: {destination}")
                    return

            # Define the block size for reading the file
            block_size = 1024  # 1 Kilobyte

            # Initialize the progress bar with total file size
            progress_bar_description = os.path.basename(url)  # Extract filename from URL
            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
                # Open the destination file in binary write mode
                with open(destination, "wb") as file:
                    # Read the file in chunks and write to destination
                    while True:
                        chunk = response.read(block_size)
                        if not chunk:
                            break
                        file.write(chunk)
                        progress_bar.update(len(chunk))  # Update progress bar
    except urllib.error.HTTPError:
        s = (
            f"The specified URL ({url}) is incorrect, the internet connection cannot be established,"
            "\nor the requested file is temporarily unavailable.\nPlease visit the following website"
            " for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273")
        print(s)


# Alternative way using `requests`
"""
def download_file(url, destination):
    # Send a GET request to download the file in streaming mode
    response = requests.get(url, stream=True)

    # Get the total file size from headers, defaulting to 0 if not present
    file_size = int(response.headers.get("content-length", 0))

    # Check if file exists and has the same size
    if os.path.exists(destination):
        file_size_local = os.path.getsize(destination)
        if file_size == file_size_local:
            print(f"File already exists and is up-to-date: {destination}")
            return

    # Define the block size for reading the file
    block_size = 1024  # 1 Kilobyte

    # Initialize the progress bar with total file size
    progress_bar_description = url.split("/")[-1]  # Extract filename from URL
    with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
        # Open the destination file in binary write mode
        with open(destination, "wb") as file:
            # Iterate over the file data in chunks
            for chunk in response.iter_content(block_size):
                progress_bar.update(len(chunk))  # Update progress bar
                file.write(chunk)  # Write the chunk to the file
"""


def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    # Initialize parameters dictionary with empty blocks for each layer
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    # Iterate over each variable in the checkpoint
    for name, _ in tf.train.list_variables(ckpt_path):
        # Load the variable and remove singleton dimensions
        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))

        # Process the variable name to extract relevant parts
        variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix

        # Identify the target dictionary for the variable
        target_dict = params
        if variable_name_parts[0].startswith("h"):
            layer_number = int(variable_name_parts[0][1:])
            target_dict = params["blocks"][layer_number]

        # Recursively access or create nested dictionaries
        for key in variable_name_parts[1:-1]:
            target_dict = target_dict.setdefault(key, {})

        # Assign the variable array to the last key
        last_key = variable_name_parts[-1]
        target_dict[last_key] = variable_array

    return params

#loading the weights

In [15]:
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")
print("Settings:", settings)
print("Parameter dictionary keys:", params.keys())
print(params["wte"])
print("Token embedding weight tensor dimensions:", params["wte"].shape)


checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 20.0kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.42MiB/s]
hparams.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 76.2kiB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 498M/498M [00:20<00:00, 23.9MiB/s]
model.ckpt.index: 100%|██████████| 5.21k/5.21k [00:00<00:00, 3.32MiB/s]
model.ckpt.meta: 100%|██████████| 471k/471k [00:00<00:00, 1.27MiB/s]
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 1.50MiB/s]


Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameter dictionary keys: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])
[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
Token embedding weight tensor dimensions: (50257, 768)


In [16]:
def assign(left, right):
    if left is None:
      return torch.nn.Parameter(torch.tensor(right))
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

model = GPTModel(GPT_CONFIG_124M)
load_weights_into_gpt(model, params)
# model.to(device);


In [10]:
#lets test the modl with laoded weights
device = "cpu"
model = GPTModel(GPT_CONFIG_124M)
input_txt = "Once hesitated never"
input_txt = "Every effort moves you"
encoded_txt = torch.tensor(tokenizer.encode(input_txt)).unsqueeze(0).to(device)

output = model.generate(encoded_txt, max_new_tokens = 25)
print(output.shape)
print(tokenizer.decode(output[0].tolist()))

torch.Size([1, 29])
Every effort moves you cyclists Vega darling splits minimalisttank OlcrewMitbeans DevOnline embodiment visions dod Honestly Gods estimated enabling indirectlyOWNoslovClub KristekerSeg


In [11]:
load_weights_into_gpt(model, params)

output = model.generate(encoded_txt, max_new_tokens = 25)
print(tokenizer.decode(output[0].tolist()))

Every effort moves you from place to place." They seemed unconvinced that the passenger access stairway on its own made them even more impervious


In [12]:
#trying to ask him a question
input_txt = "how to make a cake?"
encoded_txt = torch.tensor(tokenizer.encode(input_txt)).unsqueeze(0).to(device)

output = model.generate(encoded_txt, max_new_tokens = 25)
print(tokenizer.decode(output[0].tolist()))

how to make a cake? Ruby put it away, Yang put it down… no, ditch it was what you just did command.


[……


#Lets FINETUNE

In [13]:
from datasets import load_dataset

ds = load_dataset("databricks/databricks-dolly-15k")   # i think that 15k exmaples is too much and may need to decrease them
ds # we will use only instruction and response

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15011
    })
})

In [81]:
import numpy
numpy.array(ds['train'])[:2][0]

{'instruction': 'When did Virgin Australia start operating?',
 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'category': 'closed_qa'}

In [14]:
# lets make the dataset as it will be needed
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

# VIMP NOTE FOR FINETUNING I WILL NOTE USE INS AND RESP DATASET BUT MY DATASET WILL BE COMBINED NEXT WORD PREDICTOR MY INPUT WILL BW INS + RESP - 1 AND OUTPUT WILL BE INS + RESP + 1 SO MY MODEL WILL LEARN TO PREDICT THE NEXT WORDS
class InstructionDataset(Dataset):
    def __init__(self, ds, split = "train"):
        super().__init__()
        ds = ds['train']  # also add the split

        divide = int(len(ds) * 0.8)
        ds = np.array(ds)

        if split == "train":
            self.ds1 = ds[:divide]
        else: # TEST CASE
            self.ds1 = ds[divide:]


    def __len__(self):
        return len(self.ds1)

    def __getitem__(self, idx):
        ins = self.ds1[idx]["instruction"] # so i need to pad them but it will be done in collate function as i want model to experienecediff lens
        resp = self.ds1[idx]["response"]

        txt = f"###INSTRUCTION :\n {ins} \n###RESPONSE :\n {resp}"
        txt = tokenizer.encode(txt)

        return txt

train_data = InstructionDataset(ds, split = "train")
test_data = InstructionDataset(ds, split = "test")
train_data

<__main__.InstructionDataset at 0x7bf13284e440>

In [3]:
#new dataset
import json

with open("instruction-examples.json",'r') as file:
    data = json.load(file)

print(data[0])

{'instruction': 'Identify the verb in the following sentence: The cat sleeps on the couch.', 'input': '', 'output': 'The verb in the sentence is "sleeps."'}


In [6]:

def make_text(entry):
  instruction_text = (
          f"Below is an instruction that describes a task. "
          f"Write a response that appropriately completes the request."
          f"\n\n### Instruction:\n{entry['instruction']}"
      )
  inp = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
  return  instruction_text + inp

# lets make the dataset as it will be needed
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np

# VIMP NOTE FOR FINETUNING I WILL NOTE USE INS AND RESP DATASET BUT MY DATASET WILL BE COMBINED NEXT WORD PREDICTOR MY INPUT WILL BW INS + RESP - 1 AND OUTPUT WILL BE INS + RESP + 1 SO MY MODEL WILL LEARN TO PREDICT THE NEXT WORDS
class InstructionDataset(Dataset):
    def __init__(self, ds, split = "train"):

        divide = int(len(ds) * 0.8)
        ds = np.array(ds)

        if split == "train":
            self.ds1 = ds[:divide]
        else: # TEST CASE
            self.ds1 = ds[divide:]


    def __len__(self):
        return len(self.ds1)

    def __getitem__(self, idx):
        ins = make_text(self.ds1[idx])
        resp = f"\n\n### Response:\n{self.ds1[idx]['output']}"

        txt = ins + resp
        txt = tokenizer.encode(txt)

        return txt

train_data = InstructionDataset(data, split = "train")
test_data = InstructionDataset(data, split = "test")
train_data

<__main__.InstructionDataset at 0x7e43e094f2b0>

In [7]:
len(train_data), len(test_data)

(160, 40)

In [84]:
tokenizer.encode("RESPONSE"), tokenizer.decode([50256])


([19535, 47, 1340, 5188], '<|endoftext|>')

In [87]:
train_data[0]

[21017,
 1268,
 46126,
 2849,
 1058,
 198,
 1649,
 750,
 5283,
 4505,
 923,
 5361,
 30,
 220,
 198,
 21017,
 19535,
 47,
 1340,
 5188,
 1058,
 198,
 5283,
 4505,
 32400,
 2594,
 319,
 3261,
 2932,
 4751,
 355,
 5283,
 4518,
 11,
 351,
 734,
 6215,
 319,
 257,
 2060,
 6339,
 13]

In [17]:
# lets go for the dataloaders part and also our collate function
def collate_fn(batch):
  #lets findout the maximum length
  max_len = 0
  for i in batch:
    max_len = max(max_len, i.shape[0])

  # lets padd the input with 50257 which signifies the end of text
  input_list  = []
  output_list = []

  for i in batch:
    item = i.copy()
    ln = len(item)
    # first adding the end of text token
    item += [50256]

    #now lets pad
    padd = (item + [50256] * (max_len - ln))

    final_input = padd[:-1]
    final_output = padd[1:]

    input_list.append(torch.tensor(item))
    output_list.append(torch.tensor(item))

  return torch.stack(input_list), torch.stack(output_list) # so now most of our examples will contain 50257 so our model will overfit like a goodboy and show us a low loss from our loss function

def collate_fn(batch, allowed_len = 1024): # sinceour input window is limited so we might need to truncate our inpits to avoid the risk of error and make it robust
  #lets findout the maximum length
  max_len = 0
  for i in batch:
    max_len = max(max_len, len(i))

  # lets padd the input with 50257 which signifies the end of text
  input_list  = []
  output_list = []

  for i in batch:
    item = i.copy()
    ln = len(item)
    # first adding the end of text token
    item += [50256]

    #now lets pad
    padd_for_inp = (item + [50256] * (max_len - ln))
    padd_for_out = (item + [-100] * (max_len - ln))  # so that we can easily use ignore_idx

    final_input = padd_for_inp[:-1]
    final_output = padd_for_out[1:]

    if allowed_len is not None:
      final_input = final_input[:allowed_len]
      final_output = final_output[:allowed_len]

    input_list.append(torch.tensor(final_input))
    output_list.append(torch.tensor(final_output))

  return torch.stack(input_list), torch.stack(output_list)


train_dataloader = DataLoader(train_data, batch_size = 2, shuffle = True, collate_fn = collate_fn, drop_last = True)
test_dataloader = DataLoader(test_data, batch_size = 2, shuffle = False, collate_fn = collate_fn, drop_last = True)

train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7e429aef2da0>

In [13]:
next(iter(train_dataloader))[0].shape
next(iter(train_dataloader))[1].shape

torch.Size([8, 774])

In [115]:
next(iter(train_dataloader))[0][0],next(iter(train_dataloader))[1][0]

(tensor([21017,  1268, 46126,  2849,  1058,   198,  1867, 34423,   389,  1884,
           284,   307, 19189,   287,  6995,    30,   220,   198, 21017, 19535,
            47,  1340,  5188,  1058,   198,  6995,  3568,   284,   307,   257,
           922,  1295,   284,   307,   257,  6253,    11, 40439, 11949,    11,
           393, 13453,    13,   554,   262,  1613,   340,   743,   423,   587,
           257,  3024,  3077,   329,  3259,   287,   262,  3056,   393,  9358,
         11798,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 5

In [18]:
from tqdm.auto import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPTModel(GPT_CONFIG_124M)
load_weights_into_gpt(model, params)

model.to(device)

NUM_EPOCHS = 2
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.0000005, weight_decay = 0.1)

from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# scheduler = StepLR(optimizer, step_size=10, gamma=0.1)
# scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=0)


In [31]:
# i think that now i shall train it a litle bit
from tqdm.auto import tqdm

NUM_EPOCHS = 50

for epoch in tqdm(range(NUM_EPOCHS)):
  model.train()
  train_loss = 0
  for idx, (inp, out) in tqdm(enumerate(train_dataloader)):
    inp = inp.to(device)
    out = out.to(device)

    # print(f"IINPUT SHAPE : {inp.shape}")
    logits = model(inp)
    loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), out.view(-1))
    train_loss += loss.item()
    # print(loss.item())

    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    # scheduler.step()

    if idx % 100 == 0:
      print(f"EPOCH : {epoch} || BATCH : {idx} || LOSS : {loss.item()}")

  test_loss = 0
  model.eval()
  with torch.no_grad():
    for idx, (inp, out) in tqdm(enumerate(test_dataloader)):
      inp = inp.to(device)
      out = out.to(device)

      logits = model(inp)
      loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), out.view(-1))
      test_loss += loss.item()

      if idx % 10 == 0:
        print(f"EPOCH : {epoch} || BATCH : {idx} || LOSS : {loss.item()}")

  train_loss /= len(train_dataloader)
  test_loss /= len(test_dataloader)

  if epoch % 10 == 0:
    torch.save(model.state_dict(), f"model_{epoch}.pth")
  if epoch % 1 == 0:
    print(f"EPOCH : {epoch} || train_loss : {train_loss} || test_loss : {test_loss}")



  0%|          | 0/50 [00:00<?, ?it/s]

0it [00:00, ?it/s]

EPOCH : 0 || BATCH : 0 || LOSS : 0.45909619331359863


0it [00:00, ?it/s]

EPOCH : 0 || BATCH : 0 || LOSS : 0.6685781478881836
EPOCH : 0 || BATCH : 10 || LOSS : 0.8710431456565857
EPOCH : 0 || train_loss : 0.6395718917250633 || test_loss : 1.135982495546341


0it [00:00, ?it/s]

EPOCH : 1 || BATCH : 0 || LOSS : 0.6117076277732849


0it [00:00, ?it/s]

EPOCH : 1 || BATCH : 0 || LOSS : 0.6672303080558777
EPOCH : 1 || BATCH : 10 || LOSS : 0.8701194524765015
EPOCH : 1 || train_loss : 0.6314762480556965 || test_loss : 1.1365850120782852


0it [00:00, ?it/s]

EPOCH : 2 || BATCH : 0 || LOSS : 0.686863124370575


0it [00:00, ?it/s]

EPOCH : 2 || BATCH : 0 || LOSS : 0.6659398674964905
EPOCH : 2 || BATCH : 10 || LOSS : 0.871458888053894
EPOCH : 2 || train_loss : 0.6333224192261696 || test_loss : 1.1373263239860534


0it [00:00, ?it/s]

EPOCH : 3 || BATCH : 0 || LOSS : 0.585375964641571


0it [00:00, ?it/s]

EPOCH : 3 || BATCH : 0 || LOSS : 0.6667912602424622
EPOCH : 3 || BATCH : 10 || LOSS : 0.8734226822853088
EPOCH : 3 || train_loss : 0.6385147172957659 || test_loss : 1.1378649771213531


0it [00:00, ?it/s]

EPOCH : 4 || BATCH : 0 || LOSS : 0.70677649974823


0it [00:00, ?it/s]

EPOCH : 4 || BATCH : 0 || LOSS : 0.6687684655189514
EPOCH : 4 || BATCH : 10 || LOSS : 0.8708236217498779
EPOCH : 4 || train_loss : 0.6365114700049161 || test_loss : 1.1366264641284942


0it [00:00, ?it/s]

EPOCH : 5 || BATCH : 0 || LOSS : 0.728334903717041


0it [00:00, ?it/s]

EPOCH : 5 || BATCH : 0 || LOSS : 0.6689098477363586
EPOCH : 5 || BATCH : 10 || LOSS : 0.8691728115081787
EPOCH : 5 || train_loss : 0.6215472884476185 || test_loss : 1.1388673186302185


0it [00:00, ?it/s]

EPOCH : 6 || BATCH : 0 || LOSS : 0.7770032286643982


0it [00:00, ?it/s]

EPOCH : 6 || BATCH : 0 || LOSS : 0.6690863966941833
EPOCH : 6 || BATCH : 10 || LOSS : 0.8705978989601135
EPOCH : 6 || train_loss : 0.6208204790949822 || test_loss : 1.140890011191368


0it [00:00, ?it/s]

EPOCH : 7 || BATCH : 0 || LOSS : 0.6302351355552673


0it [00:00, ?it/s]

EPOCH : 7 || BATCH : 0 || LOSS : 0.6714053750038147
EPOCH : 7 || BATCH : 10 || LOSS : 0.8713968396186829
EPOCH : 7 || train_loss : 0.6176317591220141 || test_loss : 1.1429446399211884


0it [00:00, ?it/s]

EPOCH : 8 || BATCH : 0 || LOSS : 0.8160295486450195


0it [00:00, ?it/s]

EPOCH : 8 || BATCH : 0 || LOSS : 0.672939121723175
EPOCH : 8 || BATCH : 10 || LOSS : 0.8737446069717407
EPOCH : 8 || train_loss : 0.6176479116082192 || test_loss : 1.1442934155464173


0it [00:00, ?it/s]

EPOCH : 9 || BATCH : 0 || LOSS : 0.543167769908905


0it [00:00, ?it/s]

EPOCH : 9 || BATCH : 0 || LOSS : 0.6715694665908813
EPOCH : 9 || BATCH : 10 || LOSS : 0.8725669384002686
EPOCH : 9 || train_loss : 0.6295971505343914 || test_loss : 1.1444679081439972


0it [00:00, ?it/s]

EPOCH : 10 || BATCH : 0 || LOSS : 0.6970744729042053


0it [00:00, ?it/s]

EPOCH : 10 || BATCH : 0 || LOSS : 0.6717280745506287
EPOCH : 10 || BATCH : 10 || LOSS : 0.8737438321113586
EPOCH : 10 || train_loss : 0.6250020641833544 || test_loss : 1.1447493582963943


0it [00:00, ?it/s]

EPOCH : 11 || BATCH : 0 || LOSS : 0.5915156602859497


0it [00:00, ?it/s]

EPOCH : 11 || BATCH : 0 || LOSS : 0.6724674701690674
EPOCH : 11 || BATCH : 10 || LOSS : 0.8748221397399902
EPOCH : 11 || train_loss : 0.624992772936821 || test_loss : 1.1453934997320174


0it [00:00, ?it/s]

EPOCH : 12 || BATCH : 0 || LOSS : 0.5854694247245789


0it [00:00, ?it/s]

EPOCH : 12 || BATCH : 0 || LOSS : 0.6713736057281494
EPOCH : 12 || BATCH : 10 || LOSS : 0.8757476806640625
EPOCH : 12 || train_loss : 0.6204468470066786 || test_loss : 1.146030095219612


0it [00:00, ?it/s]

EPOCH : 13 || BATCH : 0 || LOSS : 0.5591225028038025


0it [00:00, ?it/s]

EPOCH : 13 || BATCH : 0 || LOSS : 0.6712117791175842
EPOCH : 13 || BATCH : 10 || LOSS : 0.8741768598556519
EPOCH : 13 || train_loss : 0.6130292627960443 || test_loss : 1.1472181022167205


0it [00:00, ?it/s]

EPOCH : 14 || BATCH : 0 || LOSS : 0.4817288815975189


0it [00:00, ?it/s]

EPOCH : 14 || BATCH : 0 || LOSS : 0.6732125282287598
EPOCH : 14 || BATCH : 10 || LOSS : 0.8733607530593872
EPOCH : 14 || train_loss : 0.6109516650438309 || test_loss : 1.1488361686468125


0it [00:00, ?it/s]

EPOCH : 15 || BATCH : 0 || LOSS : 0.6098339557647705


0it [00:00, ?it/s]

EPOCH : 15 || BATCH : 0 || LOSS : 0.6739375591278076
EPOCH : 15 || BATCH : 10 || LOSS : 0.8735454678535461
EPOCH : 15 || train_loss : 0.6107106745243073 || test_loss : 1.148780146241188


0it [00:00, ?it/s]

EPOCH : 16 || BATCH : 0 || LOSS : 0.7023290395736694


0it [00:00, ?it/s]

EPOCH : 16 || BATCH : 0 || LOSS : 0.6749189496040344
EPOCH : 16 || BATCH : 10 || LOSS : 0.8713452816009521
EPOCH : 16 || train_loss : 0.6041256472468376 || test_loss : 1.149340346455574


0it [00:00, ?it/s]

EPOCH : 17 || BATCH : 0 || LOSS : 0.48977118730545044


0it [00:00, ?it/s]

EPOCH : 17 || BATCH : 0 || LOSS : 0.676315188407898
EPOCH : 17 || BATCH : 10 || LOSS : 0.8697177171707153
EPOCH : 17 || train_loss : 0.6061219237744808 || test_loss : 1.1510781526565552


0it [00:00, ?it/s]

EPOCH : 18 || BATCH : 0 || LOSS : 0.6845447421073914


0it [00:00, ?it/s]

EPOCH : 18 || BATCH : 0 || LOSS : 0.6759189963340759
EPOCH : 18 || BATCH : 10 || LOSS : 0.8702548742294312
EPOCH : 18 || train_loss : 0.6016653381288052 || test_loss : 1.1517037719488143


0it [00:00, ?it/s]

EPOCH : 19 || BATCH : 0 || LOSS : 0.746872067451477


0it [00:00, ?it/s]

EPOCH : 19 || BATCH : 0 || LOSS : 0.6743468642234802
EPOCH : 19 || BATCH : 10 || LOSS : 0.8718948364257812
EPOCH : 19 || train_loss : 0.5999963920563459 || test_loss : 1.1530837565660477


0it [00:00, ?it/s]

EPOCH : 20 || BATCH : 0 || LOSS : 0.5499894618988037


0it [00:00, ?it/s]

EPOCH : 20 || BATCH : 0 || LOSS : 0.67274010181427
EPOCH : 20 || BATCH : 10 || LOSS : 0.8735768795013428
EPOCH : 20 || train_loss : 0.5950191006064415 || test_loss : 1.1543441504240035


0it [00:00, ?it/s]

EPOCH : 21 || BATCH : 0 || LOSS : 0.7817364931106567


0it [00:00, ?it/s]

EPOCH : 21 || BATCH : 0 || LOSS : 0.6745259165763855
EPOCH : 21 || BATCH : 10 || LOSS : 0.8735328912734985
EPOCH : 21 || train_loss : 0.5881656229496002 || test_loss : 1.1570741921663283


0it [00:00, ?it/s]

EPOCH : 22 || BATCH : 0 || LOSS : 0.4768925607204437


0it [00:00, ?it/s]

EPOCH : 22 || BATCH : 0 || LOSS : 0.6754237413406372
EPOCH : 22 || BATCH : 10 || LOSS : 0.8707160949707031
EPOCH : 22 || train_loss : 0.6006220303475857 || test_loss : 1.1566449612379075


0it [00:00, ?it/s]

EPOCH : 23 || BATCH : 0 || LOSS : 0.8491616249084473


0it [00:00, ?it/s]

EPOCH : 23 || BATCH : 0 || LOSS : 0.6745123863220215
EPOCH : 23 || BATCH : 10 || LOSS : 0.8710678219795227
EPOCH : 23 || train_loss : 0.5914095874875784 || test_loss : 1.1553486943244935


0it [00:00, ?it/s]

EPOCH : 24 || BATCH : 0 || LOSS : 0.6369235515594482


0it [00:00, ?it/s]

EPOCH : 24 || BATCH : 0 || LOSS : 0.6765516400337219
EPOCH : 24 || BATCH : 10 || LOSS : 0.8716848492622375
EPOCH : 24 || train_loss : 0.5910862285643816 || test_loss : 1.1573955506086349


0it [00:00, ?it/s]

EPOCH : 25 || BATCH : 0 || LOSS : 0.714623212814331


0it [00:00, ?it/s]

EPOCH : 25 || BATCH : 0 || LOSS : 0.6792916655540466
EPOCH : 25 || BATCH : 10 || LOSS : 0.8725273609161377
EPOCH : 25 || train_loss : 0.5954773746430874 || test_loss : 1.158742943406105


0it [00:00, ?it/s]

EPOCH : 26 || BATCH : 0 || LOSS : 0.514127790927887


0it [00:00, ?it/s]

EPOCH : 26 || BATCH : 0 || LOSS : 0.6793321967124939
EPOCH : 26 || BATCH : 10 || LOSS : 0.8737313747406006
EPOCH : 26 || train_loss : 0.5825734589248895 || test_loss : 1.1599067479372025


0it [00:00, ?it/s]

EPOCH : 27 || BATCH : 0 || LOSS : 0.3550260365009308


0it [00:00, ?it/s]

EPOCH : 27 || BATCH : 0 || LOSS : 0.6807952523231506
EPOCH : 27 || BATCH : 10 || LOSS : 0.8752460479736328
EPOCH : 27 || train_loss : 0.5825518533587456 || test_loss : 1.1614881783723832


0it [00:00, ?it/s]

EPOCH : 28 || BATCH : 0 || LOSS : 0.40919673442840576


0it [00:00, ?it/s]

EPOCH : 28 || BATCH : 0 || LOSS : 0.6779411435127258
EPOCH : 28 || BATCH : 10 || LOSS : 0.8752418756484985
EPOCH : 28 || train_loss : 0.5753172010183334 || test_loss : 1.160663041472435


0it [00:00, ?it/s]

EPOCH : 29 || BATCH : 0 || LOSS : 0.6897191405296326


0it [00:00, ?it/s]

EPOCH : 29 || BATCH : 0 || LOSS : 0.6792823076248169
EPOCH : 29 || BATCH : 10 || LOSS : 0.8755569458007812
EPOCH : 29 || train_loss : 0.5772499490529299 || test_loss : 1.1626040041446686


0it [00:00, ?it/s]

EPOCH : 30 || BATCH : 0 || LOSS : 0.5952420234680176


0it [00:00, ?it/s]

EPOCH : 30 || BATCH : 0 || LOSS : 0.6789515614509583
EPOCH : 30 || BATCH : 10 || LOSS : 0.8768224120140076
EPOCH : 30 || train_loss : 0.5716780588030815 || test_loss : 1.164658147096634


0it [00:00, ?it/s]

EPOCH : 31 || BATCH : 0 || LOSS : 0.516723096370697


0it [00:00, ?it/s]

EPOCH : 31 || BATCH : 0 || LOSS : 0.6779047250747681
EPOCH : 31 || BATCH : 10 || LOSS : 0.876889705657959
EPOCH : 31 || train_loss : 0.5723836123943329 || test_loss : 1.164596325159073


0it [00:00, ?it/s]

EPOCH : 32 || BATCH : 0 || LOSS : 0.47381648421287537


0it [00:00, ?it/s]

EPOCH : 32 || BATCH : 0 || LOSS : 0.6791945695877075
EPOCH : 32 || BATCH : 10 || LOSS : 0.8778285980224609
EPOCH : 32 || train_loss : 0.5665889080613852 || test_loss : 1.1674234509468078


0it [00:00, ?it/s]

EPOCH : 33 || BATCH : 0 || LOSS : 0.4156137704849243


0it [00:00, ?it/s]

EPOCH : 33 || BATCH : 0 || LOSS : 0.6795123815536499
EPOCH : 33 || BATCH : 10 || LOSS : 0.8823956847190857
EPOCH : 33 || train_loss : 0.5702125154435634 || test_loss : 1.1699073940515519


0it [00:00, ?it/s]

EPOCH : 34 || BATCH : 0 || LOSS : 0.8375407457351685


0it [00:00, ?it/s]

EPOCH : 34 || BATCH : 0 || LOSS : 0.6824031472206116
EPOCH : 34 || BATCH : 10 || LOSS : 0.8819854855537415
EPOCH : 34 || train_loss : 0.5777850646525622 || test_loss : 1.169747430086136


0it [00:00, ?it/s]

EPOCH : 35 || BATCH : 0 || LOSS : 0.44399696588516235


0it [00:00, ?it/s]

EPOCH : 35 || BATCH : 0 || LOSS : 0.6830783486366272
EPOCH : 35 || BATCH : 10 || LOSS : 0.8806149959564209
EPOCH : 35 || train_loss : 0.5692360162734985 || test_loss : 1.1697327017784118


0it [00:00, ?it/s]

EPOCH : 36 || BATCH : 0 || LOSS : 0.5577828884124756


0it [00:00, ?it/s]

EPOCH : 36 || BATCH : 0 || LOSS : 0.6817681789398193
EPOCH : 36 || BATCH : 10 || LOSS : 0.8822071552276611
EPOCH : 36 || train_loss : 0.5648446019738913 || test_loss : 1.1708945006132125


0it [00:00, ?it/s]

EPOCH : 37 || BATCH : 0 || LOSS : 0.39063960313796997


0it [00:00, ?it/s]

EPOCH : 37 || BATCH : 0 || LOSS : 0.6814687252044678
EPOCH : 37 || BATCH : 10 || LOSS : 0.8823670148849487
EPOCH : 37 || train_loss : 0.5607102695852518 || test_loss : 1.1721390157938003


0it [00:00, ?it/s]

EPOCH : 38 || BATCH : 0 || LOSS : 0.407776415348053


0it [00:00, ?it/s]

EPOCH : 38 || BATCH : 0 || LOSS : 0.6824241876602173
EPOCH : 38 || BATCH : 10 || LOSS : 0.8808817863464355
EPOCH : 38 || train_loss : 0.5596814654767513 || test_loss : 1.1738138258457185


0it [00:00, ?it/s]

EPOCH : 39 || BATCH : 0 || LOSS : 0.4065549373626709


0it [00:00, ?it/s]

EPOCH : 39 || BATCH : 0 || LOSS : 0.6818345189094543
EPOCH : 39 || BATCH : 10 || LOSS : 0.8816791772842407
EPOCH : 39 || train_loss : 0.5553832963109017 || test_loss : 1.1733562618494033


0it [00:00, ?it/s]

EPOCH : 40 || BATCH : 0 || LOSS : 0.6036117076873779


0it [00:00, ?it/s]

EPOCH : 40 || BATCH : 0 || LOSS : 0.6831125020980835
EPOCH : 40 || BATCH : 10 || LOSS : 0.8807420134544373
EPOCH : 40 || train_loss : 0.5621828511357307 || test_loss : 1.1739926040172577


0it [00:00, ?it/s]

EPOCH : 41 || BATCH : 0 || LOSS : 0.4486401081085205


KeyboardInterrupt: 

In [29]:
torch.save(model.state_dict(), f"model_final.pth")

In [35]:
# so lets now check how it actually performs
model.eval()
with torch.no_grad():
  input_txt = make_text({'instruction':"tell antonym", 'input' : "bye"})
  encoded_txt = torch.tensor(tokenizer.encode(input_txt)).unsqueeze(0).to(device)

  output = model.generate(encoded_txt, max_new_tokens = 100)
  print(tokenizer.decode(output[0].tolist()))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
tell antonym

### Input:
bye

### Response:
Arithmetic: 3, 10

### Response:
Historical Data<|endoftext|>The Microsoft Xbox 360 console was given to the world on Sunday night as one of its last shows before being removed by the Nintendo 64.

The consoles were removed by Nintendo in October 2013 for counterfeiting and was returned to the home world on Friday.

The removal of the boxes was not the only violation of the 3rd Amendment, as presenters to the game were fined,


In [None]:
instruction_text = (
          f"Below is an instruction that describes a task. "
          f"Write a response that appropriately completes the request."
          f"\n\n### Instruction:\n{entry['instruction']}"
      )
  inp = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
          resp = f"\n\n### Response:\n{self.ds1[idx]['output']}"
