In [1]:
import os,time,math,pickle,random
import numpy as np,pandas as pd
import torch,torch._dynamo
from model import GPT

In [2]:
# Seed later handed to torch.manual_seed.
seed = 1337
# Directory the pre-trained checkpoint and vocabulary metadata are read from.
model_dir = 'long'
# Directory holding the fine-tuning data; new checkpoints are written here too.
out_dir = 'fine-tuning'
os.makedirs(out_dir, exist_ok=True)

In [3]:
# NOTE: pickle.load can execute arbitrary code; only open a meta.pkl you produced yourself.
with open(os.path.join(model_dir, 'meta.pkl'), 'rb') as meta_file:
    meta = pickle.load(meta_file)
stoi = meta['stoi']
itos = meta['itos']


def encode(s):
    """Look up every item of `s` in `stoi` and return the ids as a list (KeyError if an item is missing)."""
    return [stoi[c] for c in s]


def decode(l):
    """Look up every id of `l` in `itos` and concatenate the results into one string."""
    return ''.join(itos[i] for i in l)

In [4]:
# Torch device string used below for torch.load(map_location=...) and model.to(...).
device = 'mps'

In [5]:
# Restore the pre-trained checkpoint and rebuild the model from the config stored inside it.
ckpt_path = os.path.join(model_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
config = checkpoint["config"]
model = GPT(config)

# Strip the '_orig_mod.' prefix from any state-dict key that carries it
# (presumably left by saving a torch.compile-wrapped model) so the keys match
# the freshly built model.
unwanted_prefix = '_orig_mod.'
state_dict = checkpoint['model']
for key in [name for name in state_dict if name.startswith(unwanted_prefix)]:
    state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)

model.load_state_dict(state_dict)
# Last expression on purpose: the cell displays the module summary.
model.to(device)

number of parameters: 3.37M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(55, 216, padding_idx=27)
    (wpe): Embedding(50, 216)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=216, out_features=648, bias=False)
          (c_proj): Linear(in_features=216, out_features=216, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=216, out_features=864, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=864, out_features=216, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=216, out_features=55, bias=False)
)

In [6]:
def _read_names(path):
    """Return the lines of the text file at `path` as a list, each without its trailing newline."""
    # The context manager closes the handle even when reading raises.
    with open(path, "r") as fp:
        return [line.split("\n")[0] for line in fp]


# One example per line; the same reader serves both splits.
train_names = _read_names(os.path.join(out_dir, 'train.bin'))
val_names = _read_names(os.path.join(out_dir, 'val.bin'))

In [7]:
# Context length comes from the config stored in the loaded checkpoint.
block_size = config["block_size"]
# Number of sequences drawn per call to get_batch.
batch_size = 8

In [8]:
# Copy the architecture hyperparameters out of the checkpoint config into module-level names.
n_layer, n_head, n_embd, dropout, bias = (
    config[key] for key in ("n_layer", "n_head", "n_embd", "dropout", "bias")
)

In [9]:
# --- optimizer settings ---
learning_rate = 1e-4
weight_decay = 1e-1
beta1, beta2 = 0.9, 0.99
grad_clip = 1.0  # max gradient norm; 0.0 disables clipping in the training loop

# --- run length and learning-rate schedule ---
max_iters = 1000
decay_lr = False  # False: the training loop uses learning_rate directly and never calls get_lr
warmup_iters = 50
lr_decay_iters = 1000
min_lr = 1e-4

In [10]:
eval_interval = 30  # estimate_loss runs whenever iter_num is a multiple of this
eval_iters = 30     # batches averaged per split inside estimate_loss
log_interval = 10   # the training loss is printed whenever iter_num is a multiple of this

In [11]:
#torch._dynamo.config.suppress_errors = True
# Device string read by get_batch when it moves each batch.
device = 'mps'
# Gates the optional torch.compile step. NOTE: this name shadows Python's built-in compile().
compile = False
# Only referenced by the commented-out GradScaler line in the training cell.
dtype = 'float16'

In [12]:
# Each iteration processes batch_size sequences of block_size positions.
tokens_per_iter = block_size * batch_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

tokens per iteration will be: 400


In [13]:
# Seed every RNG library this notebook imports, not only torch, so any draw
# made through `random` or numpy is repeatable as well.
random.seed(seed)
np.random.seed(seed)
# Kept as the last expression so the cell still displays the returned torch.Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x11c0313d0>

In [14]:
def get_batch(split):
    """Sample a random batch of padded (input, target) id sequences.

    Args:
        split: 'train' samples from train_names; any other value samples from
            val_names.

    Returns:
        (x, y): two long tensors of shape (batch_size, block_size), moved to
        `device`. Row i of x holds one example's ids without the last one, and
        row i of y holds the same ids shifted left by one position. Positions
        not covered by the example keep the pad id stoi["*"].

    Examples longer than block_size + 1 ids are truncated to fit; examples
    with fewer than two ids leave their row fully padded.
    """
    data = train_names if split == 'train' else val_names
    pad_token = stoi["*"]
    ix = torch.randint(len(data), (batch_size,))
    x = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    y = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    for row, index in enumerate(ix.tolist()):
        # Build an integer tensor directly; torch.Tensor(list) would create floats.
        ids = torch.tensor(encode(data[index]), dtype=torch.long)
        # Number of (input, target) pairs this example yields, capped at the context length.
        n = min(ids.numel() - 1, block_size)
        if n > 0:
            x[row, :n] = ids[:n]
            y[row, :n] = ids[1:n + 1]
    return x.to(device), y.to(device)

In [15]:
# Ask the model to build its optimizer, passing the weight decay, the base
# learning rate and the (beta1, beta2) pair.
# NOTE(review): configure_optimizers is defined in model.py, which is not shown
# here — presumably it also decides which parameters receive weight decay;
# confirm there.
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2))

num decayed parameter tensors: 26, with 3,381,912 parameters
num non-decayed parameter tensors: 13, with 2,808 parameters


In [16]:
# Optional torch.compile step; the uncompiled module stays reachable as unoptimized_model.
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model, model = model, torch.compile(model)

In [17]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over eval_iters random batches for each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict mapping 'train' and 'val' to the mean loss as a 0-dim tensor.
    """
    model.eval()
    mean_loss_by_split = {}
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_loss_by_split[split_name] = batch_losses.mean()
    model.train()
    return mean_loss_by_split

def get_lr(it):
    """Return the learning rate for iteration `it`: linear warmup, then cosine decay.

    Reads the module-level learning_rate, min_lr, warmup_iters and
    lr_decay_iters.

    Args:
        it: iteration number.

    Returns:
        learning_rate * it / warmup_iters while it < warmup_iters, min_lr once
        it >= lr_decay_iters, and a cosine interpolation from learning_rate
        down to min_lr in between.
    """
    # 1) linear warmup for warmup_iters steps (yields 0.0 at it == 0)
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) at or past lr_decay_iters, return min learning rate. The cosine
    #    coefficient is exactly 0 at it == lr_decay_iters, so including the
    #    boundary here returns the same value while avoiding a division by
    #    zero below when lr_decay_iters == warmup_iters.
    if it >= lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [18]:
# Fine-tuning loop: periodically evaluates and checkpoints, then takes one
# optimizer step per iteration until iter_num exceeds max_iters.
# The commented-out scaler lines are a disabled GradScaler (mixed-precision) path.
#scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
iter_num = 0
# Starts as a plain float sentinel; replaced by the tensor estimate_loss
# returns for 'val' the first time a validation loss beats it.
best_val_loss = 1e9
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            # iteration 0 only records the starting validation loss; nothing is saved yet
            if iter_num > 0:
                # NOTE: rebinds `checkpoint`, the name that held the loaded pre-trained checkpoint
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    # forward pass on the current batch, then fetch the next batch before the backward pass
    logits, loss = model(X, Y)
    X, Y = get_batch('train')
    optimizer.zero_grad(set_to_none=True)
    #scaler.scale(loss).backward()
    loss.backward()
    # clip the gradient
    if grad_clip != 0.0:
        #scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    #scaler.step(optimizer)
    #scaler.update()
    optimizer.step()
    

    # timing and logging
    # dt spans the whole iteration, so iterations that ran estimate_loss report a larger time
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        lossf = loss.item()
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    iter_num += 1

    # termination conditions
    # the comparison is strict, so steps run for iter_num = 0..max_iters (max_iters + 1 in total)
    if iter_num > max_iters:
        break

step 0: train loss 1.8634, val loss 1.8713
iter 0: loss 1.8983, time 831.98ms
iter 10: loss 1.6585, time 45.48ms
iter 20: loss 1.8507, time 44.80ms
step 30: train loss 1.8305, val loss 1.7898
saving checkpoint to fine-tuning
iter 30: loss 1.7881, time 634.26ms
iter 40: loss 1.8344, time 45.01ms
iter 50: loss 1.7297, time 45.09ms
step 60: train loss 1.7849, val loss 1.7722
saving checkpoint to fine-tuning
iter 60: loss 1.7819, time 608.30ms
iter 70: loss 1.9582, time 44.65ms
iter 80: loss 1.9889, time 44.78ms
step 90: train loss 1.7696, val loss 1.8369
iter 90: loss 1.5937, time 576.83ms
iter 100: loss 1.5041, time 44.57ms
iter 110: loss 1.7855, time 44.77ms
step 120: train loss 1.7886, val loss 1.7887
iter 120: loss 1.8312, time 610.09ms
iter 130: loss 1.9452, time 45.23ms
iter 140: loss 1.8681, time 44.33ms
step 150: train loss 1.7524, val loss 1.8125
iter 150: loss 1.9883, time 580.69ms
iter 160: loss 1.8151, time 45.36ms
iter 170: loss 1.8864, time 46.70ms
step 180: train loss 1.772

In [19]:
# float() accepts both the 0-dim tensor stored by the training loop and the
# initial plain-float sentinel (1e9), so this cell also works when no
# evaluation ever replaced the sentinel (e.g. a NaN validation loss).
float(best_val_loss)

1.678484320640564