In [1]:
import os,time,math,pickle,random,shutil
import numpy as np,pandas as pd
import torch,torch._dynamo
from model import GPT

In [2]:
# True: continue training from the checkpoint stored in out_dir (model weights,
# optimizer state, iteration counter and best validation loss are restored).
# False: start a new run with a freshly initialised model.
resume=True

In [3]:
# Directory the dataset is read from: train.bin, val.bin and meta.pkl.
data_dir="long"

In [4]:
# Seed for torch's RNG and the directory the resume checkpoint is read from.
seed=1337
out_dir = 'long'
if not resume:
    # Fresh run: clear what a previous run left behind.
    if os.path.realpath(out_dir) == os.path.realpath(data_dir):
        # out_dir doubles as the dataset directory, so wiping the whole tree
        # would also delete train.bin / val.bin / meta.pkl. Only drop the old
        # checkpoint in that case.
        stale_ckpt = os.path.join(out_dir, 'ckpt.pt')
        if os.path.isfile(stale_ckpt):
            os.remove(stale_ckpt)
    elif os.path.isdir(out_dir):
        # rmtree raises on a missing directory, hence the existence check.
        shutil.rmtree(out_dir)
os.makedirs(out_dir, exist_ok=True)

In [5]:
def _read_names(path):
    """Return the lines of the text file at ``path``, each without its newline.

    The file is opened in text mode (despite the ``.bin`` extension used by the
    dataset files) and is closed even if reading fails.
    """
    with open(path, 'r') as fp:
        return [line.split("\n")[0] for line in fp]


# One training example per line.
train_names = _read_names(os.path.join(data_dir, 'train.bin'))
val_names = _read_names(os.path.join(data_dir, 'val.bin'))

In [6]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a meta.pkl that comes from a trusted source.
with open(os.path.join(data_dir, 'meta.pkl'), 'rb') as f:
    meta = pickle.load(f)
stoi, itos, vocab_size = meta['stoi'], meta['itos'], meta['vocab_size']


def encode(s):
    """Translate the string ``s`` into a list of token ids, one per character.

    Each character is looked up in ``stoi``; a character missing from the
    vocabulary makes the lookup fail.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Translate the iterable of token ids ``l`` back into a string via ``itos``."""
    return ''.join([itos[i] for i in l])

In [7]:
# Number of names sampled per batch (rows of the tensors built by get_batch).
batch_size=64
# Length every sequence is padded to in get_batch; also handed to GPT as its
# block_size through the config dict.
block_size=50

In [8]:
# Model hyperparameters. They are only collected here; the config dict built
# further down forwards them unchanged to GPT (defined in model.py).
n_layer = 6
n_head = 6
# NOTE(review): presumably n_embd has to be divisible by n_head — confirm in model.py.
n_embd = 216
dropout = 0.2
bias=False

In [9]:
# Optimisation hyperparameters.
# Peak learning rate: get_lr ramps up to it during warmup, then decays from it.
learning_rate = 1e-3
# The training loop stops once iter_num exceeds this value.
max_iters = 200000
# Past this iteration get_lr returns min_lr.
lr_decay_iters = 200000
# Floor of the cosine decay schedule.
min_lr = 1e-4
# weight_decay and (beta1, beta2) are passed to model.configure_optimizers.
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.99
# Number of initial iterations with a linearly increasing learning rate.
warmup_iters = 1000
# Max gradient norm given to clip_grad_norm_; 0.0 switches clipping off.
grad_clip = 1.0
# False keeps the learning rate fixed at learning_rate instead of calling get_lr.
decay_lr = True

In [10]:
# Every eval_interval iterations the loop runs estimate_loss (and may checkpoint).
eval_interval = 500
# Every log_interval iterations the loop prints the current training loss.
log_interval = 100
# Number of batches per split that estimate_loss averages over.
eval_iters = 200

In [11]:
# Pick the compute device: Apple's MPS backend when it is usable, otherwise
# CUDA, otherwise the CPU. Batches, the model and the loaded checkpoint are all
# moved to this device.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
#torch._dynamo.config.suppress_errors = True
# NOTE(review): dtype is only referenced by the commented-out GradScaler line
# in the training cell, so it currently has no effect.
dtype='float16'
# Whether to wrap the model in torch.compile. NOTE: this name shadows the
# built-in compile(); it is kept because a later cell tests `if compile:`.
compile=False

In [12]:
# Token positions processed per optimisation step: batch_size sequences of
# block_size positions each. Padding positions are included in this count,
# because get_batch pads every sequence to block_size.
tokens_per_iter =  batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")

tokens per iteration will be: 3,200


In [13]:
# Seed torch's global RNG; get_batch draws its sample indices with torch.randint.
torch.manual_seed(seed)

<torch._C.Generator at 0x11a9dd3d0>

In [14]:
def get_batch(split):
    """Sample a random batch of padded next-character training pairs.

    ``batch_size`` names are drawn at random (with replacement) from
    ``train_names`` when ``split == 'train'`` and from ``val_names`` for any
    other value. Each name is encoded with ``encode``; row ``i`` of ``x`` holds
    all of its tokens but the last and row ``i`` of ``y`` holds the same tokens
    shifted by one position. Unused positions keep the ``"*"`` padding token.

    Names longer than ``block_size + 1`` tokens are cropped to their first
    ``block_size + 1`` tokens, and names with fewer than two tokens produce an
    all-padding row; both cases previously raised a shape-mismatch error.

    Returns:
        (x, y): two ``torch.long`` tensors of shape (batch_size, block_size),
        moved to ``device``.
    """
    data = train_names if split == 'train' else val_names
    pad_token = stoi["*"]
    x = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    y = torch.full((batch_size, block_size), pad_token, dtype=torch.long)
    # .tolist() yields plain Python ints for indexing the list of names.
    sampled = torch.randint(len(data), (batch_size,)).tolist()
    for row, index in enumerate(sampled):
        # block_size inputs need block_size + 1 tokens (inputs plus one target).
        encoded = encode(data[index])[:block_size + 1]
        n = len(encoded) - 1
        if n < 1:
            # Nothing to predict from an empty or single-token name.
            continue
        # Build an integer tensor directly instead of a float32 torch.Tensor.
        tokens = torch.tensor(encoded, dtype=torch.long)
        x[row, :n] = tokens[:-1]
        y[row, :n] = tokens[1:]
    return x.to(device), y.to(device)

In [15]:
# Constructor arguments for GPT, gathered from the hyperparameter cells; the
# padding token id is the vocabulary index of "*".
config = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embd,
    'block_size': block_size,
    'bias': bias,
    'vocab_size': vocab_size,
    'dropout': dropout,
    'pad_token': stoi["*"],
}

In [16]:
if not resume:
    # New run: build an untrained model from the notebook's own config.
    model = GPT(config)
else:
    # Resumed run: rebuild the model from the config stored in the checkpoint
    # (which replaces the notebook-level config) and restore its weights.
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    config = checkpoint['config']
    model = GPT(config)
    state_dict = checkpoint['model']
    # Keys carrying the '_orig_mod.' prefix are re-inserted under their bare
    # name so they match the parameter names of the freshly built model.
    unwanted_prefix = '_orig_mod.'
    prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
    for key in prefixed_keys:
        state_dict[key[len(unwanted_prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)
    # Continue the iteration counter and best validation loss where they stopped.
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
model.to(device)

number of parameters: 3.37M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(55, 216, padding_idx=27)
    (wpe): Embedding(50, 216)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=216, out_features=648, bias=False)
          (c_proj): Linear(in_features=216, out_features=216, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=216, out_features=864, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=864, out_features=216, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=216, out_features=55, bias=False)
)

In [17]:
# Build the optimizer through the model's own configure_optimizers helper.
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2))
if resume:
    # Restore the optimizer state saved in the checkpoint loaded during model setup.
    optimizer.load_state_dict(checkpoint['optimizer'])

num decayed parameter tensors: 26, with 3,381,912 parameters
num non-decayed parameter tensors: 13, with 2,808 parameters


In [18]:
# Optionally wrap the model with torch.compile. `compile` here is the notebook's
# boolean flag (it shadows the built-in of the same name).
if compile:
    print("compiling the model... (takes a ~minute)")
    # Keep a handle on the uncompiled module.
    unoptimized_model = model
    model = torch.compile(model)

In [19]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, ``eval_iters`` random batches from ``get_batch`` are run
    through the model in eval mode with gradients disabled.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.

    The model is switched back to training mode even when evaluation is
    interrupted (e.g. by stopping the cell), so a later re-run of the training
    cell does not silently train in eval mode.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train()
    return out

def get_lr(it):
    """Return the learning rate for iteration ``it``.

    The schedule has three phases:
      * ``it < warmup_iters``: linear ramp from 0 towards ``learning_rate``;
      * ``it > lr_decay_iters``: constant ``min_lr``;
      * otherwise: cosine decay from ``learning_rate`` down to ``min_lr``.
    """
    # Phase 1: linear warmup.
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # Phase 2: the decay horizon has passed, stay at the floor.
    if it > lr_decay_iters:
        return min_lr
    # Phase 3: cosine decay; progress runs from 0 at the end of warmup to 1 at
    # lr_decay_iters.
    progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= progress <= 1
    cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))  # 1 -> 0 as progress grows
    return min_lr + cosine_weight * (learning_rate - min_lr)

In [20]:
# Main training loop. It runs until iter_num exceeds max_iters; every
# eval_interval iterations the train/val loss is estimated, and a checkpoint is
# written whenever the validation loss improves.
#scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# Directory the checkpoints are written to.
# NOTE(review): the resume cell loads os.path.join(out_dir, 'ckpt.pt'), not this
# location — confirm that writing to a separate directory is intended.
save_dir = "temp"

X, Y = get_batch('train') # fetch the very first batch
t0 = time.time()
if not resume:
    # A resumed run gets iter_num / best_val_loss from the checkpoint instead.
    iter_num = 0
    best_val_loss = 1e9
while True:

    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            # No checkpoint at iteration 0.
            if iter_num > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {save_dir}")
                # torch.save does not create missing directories.
                os.makedirs(save_dir, exist_ok=True)
                torch.save(checkpoint, os.path.join(save_dir, 'ckpt.pt'))

    # forward pass on the current batch, then fetch the batch for the next iteration
    logits, loss = model(X, Y)
    X, Y = get_batch('train')
    optimizer.zero_grad(set_to_none=True)
    #scaler.scale(loss).backward()
    loss.backward()
    # clip the gradient
    if grad_clip != 0.0:
        #scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    #scaler.step(optimizer)
    #scaler.update()
    optimizer.step()

    # timing and logging (dt also covers the evaluation, when one ran this iteration)
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        lossf = loss.item()
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms")
    iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

step 148500: train loss 1.9064, val loss 2.0106
iter 148500: loss 2.0001, time 18284.98ms
iter 148600: loss 1.9281, time 189.75ms
iter 148700: loss 1.9201, time 194.36ms
iter 148800: loss 1.9718, time 193.77ms
iter 148900: loss 2.0280, time 186.52ms
step 149000: train loss 1.9030, val loss 2.0224
iter 149000: loss 1.9999, time 17632.54ms


KeyboardInterrupt: 