In [1]:
from pathlib import Path
import sys
sys.path.append((Path.cwd().parent/"src").resolve().as_posix())

import finetune.settings as s
from finetune.utils import ModelSummary
from finetune.models import GPT
from finetune.utils import ModelCheckpointManager
from finetune.dataloader import UltraChat200kDataLoaderLite
from finetune.LoRA import LoRALinear
from finetune.utils import instruct_generate
import numpy as np
import torch

we need CUDA for DDP so falling back to CPU


## Explore dataset

In [2]:
# Draw one batch from the validation split and show the size of its input tensor.
dataloader = UltraChat200kDataLoaderLite(split="val")
x, y = dataloader.next_batch()
x.size()

found 1 shards for split val


torch.Size([8, 1024])

In [3]:
def count_tokens_in_dir(data_dir, split="train"):
    """
    Sum the lengths of all ``{split}_*.npy`` shards found directly in ``data_dir``.

    Args:
        data_dir: Directory containing the shard files (str or Path).
        split: Filename prefix that selects the shards, e.g. "train" or "val".

    Returns:
        The sum of every matching shard's first-dimension length
        (0 when no shard matches).
    """
    shard_paths = sorted(Path(data_dir).glob(f"{split}_*.npy"))
    # Memory-map each shard instead of reading it into RAM; only its shape is needed.
    return sum(np.load(shard, mmap_mode="r").shape[0] for shard in shard_paths)

# The dataset folder is read from the project settings module.
data_path = s.ultrachat_200k_data_path

# Count the train shards first, then the val shards.
train_tokens, val_tokens = (
    count_tokens_in_dir(data_path, split_name) for split_name in ("train", "val")
)

print(f"Total train tokens: {train_tokens:,}")
print(f"Total val tokens:   {val_tokens:,}")

Total train tokens: 205,829,073
Total val tokens:   50,000,000


## Fine-tuned model (pretrained checkpoint + LoRA)

In [4]:
# Build a GPT instance, then hand it to the checkpoint manager together with
# the W&B artifact reference and a local cache directory.
pretrained_model = GPT()

wandb_path = "sampath017/GPT3_124M/model_checkpoint_train_step_17000_val_loss_3.08:v0"
cache_dir = s.models_root_path / "pretrained_models"

# The second value returned by the checkpoint manager is not used in this notebook.
pretrained_model, _ = ModelCheckpointManager.get_checkpoint_from_wandb(
    pretrained_model,
    wandb_path,
    cache_dir,
    model_type="pretrained",
)
ModelSummary.summary(pretrained_model)

[34m[1mwandb[0m: Downloading large artifact model_checkpoint_train_step_17000_val_loss_3.08:v0, 1425.29MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:3.1 (453.9MB/s)


Using checkpoint: C:\Users\sampath\Dev\GPT\models\pretrained_models\model_checkpoint_train_step_17000_val_loss_3.08.pt
Model size: 475.03 MB
Trainable parameters: 124.53M
Non-trainable parameters: 0


In [6]:
# Store the generation under its own name. `s` is the alias of `finetune.settings`
# (imported in the first cell) and other cells read attributes from it
# (`s.models_root_path`, `s.ddp_master_process`), so assigning the result to `s`
# would replace the settings module and break those cells.
response = instruct_generate(pretrained_model, prompt="How are you?")
print(response)

KeyboardInterrupt: 

In [3]:
# Wrap the pretrained model with LoRA adapters on the "attn" and "proj" modules.
# NOTE(review): apply_lora may modify `pretrained_model` in place — confirm.
finetuned_model = LoRALinear.apply_lora(
    pretrained_model,
    r=16,
    alpha=32,
    dropout=0.05,
    target_modules=("attn", "proj"),
)

# wandb_path='sampath017/GPT3_124M_instruct/model_checkpoint_train_step_5_val_loss_3.29:v0'
# cache_dir=s.models_root_path/"finetuned_models"
# finetuned_model = ModelCheckpointManager.get_model_from_wandb(finetuned_model, wandb_path, cache_dir, model_type="finetuned")

if s.ddp_master_process:
    # Report how many parameters still require gradients after applying LoRA.
    total = sum(param.numel() for param in finetuned_model.parameters())
    trainable = sum(
        param.numel()
        for param in finetuned_model.parameters()
        if param.requires_grad
    )
    print(f"Trainable params: {trainable} / {total} ({100*trainable/total:.2f}%)")

Trainable params: 884736 / 125374080 (0.71%)


## Explore LoRA

In [12]:
# Two random factors with inner dimension r: A is (d, r) and B is (r, d).
d, r = 10, 2

A, B = torch.randn(d, r), torch.randn(r, d)

(A.shape, B.shape)

(torch.Size([10, 2]), torch.Size([2, 10]))

In [13]:
# The product of the (d, r) and (r, d) factors is a full-size (d, d) matrix.
W = torch.matmul(A, B)
W.shape

torch.Size([10, 10])

In [14]:
# W is the product of a (d, r) and an (r, d) factor, so its rank can be at most r
# even though W itself is (d, d).
np.linalg.matrix_rank(W)

np.int64(2)

In [2]:
# Scratch arithmetic: (0.01 * 80) * 3.
# NOTE(review): the notebook does not say what these constants represent —
# name them or remove this cell.
(0.01 * 80) * 3

2.4000000000000004