In [1]:
import os,time,math,pickle,random
import numpy as np,pandas as pd
import torch,torch._dynamo
from model import GPT

In [2]:
# Run-wide settings: the RNG seed plus the two directories this notebook touches.
seed = 1337
model_dir = 'long'          # source of the pretrained model: meta.pkl and ckpt.pt are read from here
out_dir = 'instructions'    # train.bin / val.bin are read from here and the new ckpt.pt is written here
os.makedirs(out_dir, exist_ok=True)

In [3]:
# Character vocabulary stored next to the pretrained model.
# NOTE(review): pickle.load can execute code embedded in the file; only load a meta.pkl you trust.
with open(os.path.join(model_dir, 'meta.pkl'), 'rb') as meta_file:
    meta = pickle.load(meta_file)
stoi = meta['stoi']
itos = meta['itos']


def encode(s):
    """Return the list of ids obtained by looking up each character of `s` in `stoi`.

    A character that is absent from `stoi` makes the lookup fail.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string obtained by looking up each id of `l` in `itos` and joining the results."""
    return ''.join([itos[i] for i in l])

In [4]:
# Use Apple's Metal (MPS) backend when this machine and PyTorch build support it;
# otherwise fall back to the CPU so the notebook still runs from a fresh kernel elsewhere.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

In [5]:
# Restore the pretrained GPT from <model_dir>/ckpt.pt onto `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
ckpt_path = os.path.join(model_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
config = checkpoint["config"]
state_dict = checkpoint['model']

# Rename, in place, every key that starts with '_orig_mod.' to the same key without it
# (presumably the prefix was added by torch.compile when the checkpoint was saved).
unwanted_prefix = '_orig_mod.'
prefix_len = len(unwanted_prefix)
for key in list(state_dict):  # iterate over a snapshot: the dict is mutated inside the loop
    if key.startswith(unwanted_prefix):
        state_dict[key[prefix_len:]] = state_dict.pop(key)

model = GPT(config)
model.load_state_dict(state_dict)
# Last expression of the cell, so the notebook also displays the module structure.
model.to(device)

number of parameters: 3.37M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(55, 216, padding_idx=27)
    (wpe): Embedding(50, 216)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=216, out_features=648, bias=False)
          (c_proj): Linear(in_features=216, out_features=216, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=216, out_features=864, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=864, out_features=216, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=216, out_features=55, bias=False)
)

In [6]:
def _read_lines(path):
    """Return the lines of the text file at `path`, each without its trailing newline.

    The file is opened in text mode (despite the `.bin` extension used by the callers
    below) and is closed by the context manager even if reading raises.
    """
    with open(path, "r") as fp:
        return [line.rstrip("\n") for line in fp]


# One example per line; get_batch later looks up a "!" separator inside every line.
train_names = _read_lines(os.path.join(out_dir, 'train.bin'))
val_names = _read_lines(os.path.join(out_dir, 'val.bin'))

In [7]:
# Batch geometry: the number of rows per batch is chosen here, while the context
# length is taken from the config stored in the loaded checkpoint.
batch_size = 32
block_size = config["block_size"]

In [8]:
# Architecture settings copied out of the checkpoint's config for convenience.
# NOTE(review): none of these names is read by the other cells visible in this notebook.
n_layer, n_head, n_embd, dropout, bias = (
    config[key] for key in ("n_layer", "n_head", "n_embd", "dropout", "bias")
)

In [9]:
# --- optimizer (handed to model.configure_optimizers) ---
learning_rate = 1e-4
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
grad_clip = 1.0  # max gradient norm; the training loop skips clipping when this is 0.0

# --- learning-rate schedule (used by get_lr, and only when decay_lr is True) ---
decay_lr = False
warmup_iters = 50
lr_decay_iters = 2000
min_lr = 1e-4  # same value as learning_rate, so the cosine phase of get_lr would stay flat

# --- run length ---
max_iters = 2000

In [10]:
# Cadence of evaluation and logging inside the training loop.
eval_iters = 50      # batches averaged per split by estimate_loss
eval_interval = 30   # evaluate (and possibly checkpoint) every this many iterations
log_interval = 10    # print the current training loss every this many iterations

In [11]:
# Device for the batches built by get_batch: Apple's Metal (MPS) backend when available,
# otherwise the CPU, so this cell never points at a backend the machine does not have.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
#torch._dynamo.config.suppress_errors = True
dtype = 'float16'  # NOTE(review): only referenced by a commented-out GradScaler line in the training cell
compile = False    # when True, a later cell wraps the model with torch.compile (name shadows the builtin compile())

In [12]:
# Token positions contained in one batch (rows x context length, padding included).
tokens_per_iter = batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

tokens per iteration will be: 1,600


In [13]:
# Seed PyTorch's global RNG so the torch.randint sampling done in get_batch is repeatable.
# Only torch is seeded here; `random` and numpy are imported but not used by the visible cells.
torch.manual_seed(seed)

<torch._C.Generator at 0x11acc93d0>

In [14]:
def get_batch(split):
    """Sample a padded batch of (input, target) id tensors for instruction fine-tuning.

    Each example is one line of the form ``<prompt>!<completion>``. The "!" separator is
    removed, the remaining characters are encoded, and the usual next-token pair is built:
    inputs are ``encoded[:-1]`` and targets are ``encoded[1:]``. Target positions before
    ``sep`` (the index of "!") are left at the pad id, so only the completion part carries
    real targets; presumably the model ignores pad-id targets in its loss (verify in
    model.py).

    Args:
        split: 'train' samples from ``train_names``; any other value samples from
            ``val_names``.

    Returns:
        ``(x, y)``: two long tensors of shape ``(batch_size, block_size)`` on ``device``,
        filled with ``stoi["*"]`` wherever there is no data.

    Raises:
        ValueError: if a sampled line contains no "!".
    """
    data = train_names if split == 'train' else val_names
    ix = torch.randint(len(data), (batch_size,))
    pad_token = stoi["*"]
    x = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    y = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    for row, index in enumerate(ix.tolist()):
        line = data[index]
        sep = line.index("!")
        encoded = encode(line[:sep]) + encode(line[sep + 1:])
        # Truncate to the context length so an over-long line no longer fails with a
        # shape mismatch when it is written into the fixed-width row.
        inputs = encoded[:-1][:block_size]
        targets = encoded[1:][:block_size]
        # Integer tensors keep token ids exact (torch.Tensor(list) would build float32).
        x[row, :len(inputs)] = torch.tensor(inputs, dtype=torch.long)
        if sep < len(targets):
            y[row, sep:len(targets)] = torch.tensor(targets[sep:], dtype=torch.long)
    x, y = x.to(device), y.to(device)
    return x, y

In [15]:
# Build the optimizer through the model's own factory method, passing the weight decay,
# the base learning rate and the (beta1, beta2) pair defined in the hyperparameter cell.
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2))

num decayed parameter tensors: 26, with 3,381,912 parameters
num non-decayed parameter tensors: 13, with 2,808 parameters


In [16]:
# Optional torch.compile step, controlled by the `compile` flag set in the runtime-flags cell.
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model  # keep a handle on the uncompiled module
    model = torch.compile(model)

In [17]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and val splits.

    Switches the model to eval mode, draws `eval_iters` random batches per split with
    get_batch, averages the losses the model reports, and puts the model back in train
    mode before returning.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        mean_losses[split] = per_batch.mean()
    model.train()
    return mean_losses

def get_lr(it):
    """Return the learning rate for iteration `it`: linear warmup, then cosine decay.

    - it < warmup_iters: ramps linearly from 0 (at it == 0) towards learning_rate.
    - it > lr_decay_iters: constant min_lr.
    - otherwise: cosine interpolation from learning_rate down to min_lr.
    """
    # Phase 1: warmup.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Phase 3: past the decay horizon, stay at the floor.
    if it > lr_decay_iters:
        return min_lr
    # Phase 2: fraction of the decay window already covered, in [0, 1].
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # goes 1 -> 0 as progress goes 0 -> 1
    return min_lr + cosine_weight * (learning_rate - min_lr)

In [18]:
# Fine-tuning loop. Each iteration: set the learning rate, periodically evaluate and
# checkpoint, then run one forward / backward / optimizer step on the current batch.
# The loop ends once iter_num exceeds max_iters, i.e. after max_iters + 1 optimizer steps.
# The GradScaler lines (torch.cuda.amp) are left commented out, so no loss scaling is applied.
#scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
iter_num = 0
# Sentinel; replaced by the 0-dim tensor returned by estimate_loss the first time a
# validation loss falls below it.
best_val_loss = 1e9
while True:

    # determine and set the learning rate for this iteration
    # (when decay_lr is False the constant learning_rate is used and get_lr is not called)
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # A checkpoint is written only when the validation loss improves, and never at
        # iteration 0 (that evaluation only records the starting point).
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                # Rebinds the name `checkpoint` that earlier held the loaded pretrained checkpoint.
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    # forward pass on the current batch, then fetch the batch for the next iteration
    logits, loss = model(X, Y)
    X, Y = get_batch('train')
    # clear old gradients before backpropagating this iteration's loss
    optimizer.zero_grad(set_to_none=True)
    #scaler.scale(loss).backward()
    loss.backward()
    # clip the gradient
    if grad_clip != 0.0:
        #scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    #scaler.step(optimizer)
    #scaler.update()
    optimizer.step()
    

    # timing and logging
    # dt spans the whole iteration, so it includes estimate_loss and checkpoint time
    # on iterations where those run.
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # the logged value is the loss computed before this iteration's optimizer step
        lossf = loss.item()
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

step 0: train loss 2.6463, val loss 2.6230
iter 0: loss 2.3666, time 2587.52ms
iter 10: loss 2.0301, time 76.20ms
iter 20: loss 2.1836, time 76.10ms
step 30: train loss 1.9384, val loss 2.0225
saving checkpoint to instructions
iter 30: loss 2.0489, time 2359.26ms
iter 40: loss 1.9623, time 76.08ms
iter 50: loss 1.9078, time 75.89ms
step 60: train loss 1.8705, val loss 1.9165
saving checkpoint to instructions
iter 60: loss 1.8445, time 2333.62ms
iter 70: loss 2.0422, time 76.09ms
iter 80: loss 1.9649, time 76.74ms
step 90: train loss 1.8315, val loss 1.8661
saving checkpoint to instructions
iter 90: loss 1.9063, time 2382.39ms
iter 100: loss 1.9285, time 75.65ms
iter 110: loss 1.9380, time 75.73ms
step 120: train loss 1.8079, val loss 1.8313
saving checkpoint to instructions
iter 120: loss 1.7963, time 2380.94ms
iter 130: loss 1.7950, time 75.97ms
iter 140: loss 1.8822, time 76.71ms
step 150: train loss 1.7500, val loss 1.8126
saving checkpoint to instructions
iter 150: loss 1.8518, tim

iter 1480: loss 1.6443, time 76.10ms
iter 1490: loss 1.6027, time 76.14ms
step 1500: train loss 1.4708, val loss 1.7382
iter 1500: loss 1.4790, time 2280.57ms
iter 1510: loss 1.4617, time 76.44ms
iter 1520: loss 1.7106, time 76.34ms
step 1530: train loss 1.4871, val loss 1.7315
iter 1530: loss 1.4685, time 2277.61ms
iter 1540: loss 1.5211, time 76.76ms
iter 1550: loss 1.5933, time 76.26ms
step 1560: train loss 1.4736, val loss 1.7385
iter 1560: loss 1.5108, time 2376.81ms
iter 1570: loss 1.5084, time 75.92ms
iter 1580: loss 1.5839, time 76.41ms
step 1590: train loss 1.4852, val loss 1.7570
iter 1590: loss 1.5455, time 2403.54ms
iter 1600: loss 1.6572, time 84.07ms
iter 1610: loss 1.5415, time 76.96ms
step 1620: train loss 1.4839, val loss 1.7462
iter 1620: loss 1.5211, time 2286.64ms
iter 1630: loss 1.5504, time 76.46ms
iter 1640: loss 1.5573, time 76.51ms
step 1650: train loss 1.4764, val loss 1.7451
iter 1650: loss 1.7358, time 2399.22ms
iter 1660: loss 1.5677, time 84.72ms
iter 1670

In [19]:
# Lowest validation loss recorded by the training loop, converted from a 0-dim tensor to a Python float.
best_val_loss.item()

1.7063528299331665