<a href="https://colab.research.google.com/github/sayanbanerjee32/TASI_ERAv2_S21/blob/main/gpt2_training_cusom_input.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install tiktoken
!pip install -Uq pynvml

In [2]:
# assignment repo
!git clone https://github.com/sayanbanerjee32/TASI_ERAv2_S21.git

Cloning into 'TASI_ERAv2_S21'...
remote: Enumerating objects: 35, done.[K
remote: Counting objects: 100% (35/35), done.[K
remote: Compressing objects: 100% (32/32), done.[K
Receiving objects: 100% (35/35), 294.87 KiB | 9.21 MiB/s, done.
remote: Total 35 (delta 12), reused 0 (delta 0), pack-reused 0[K
Resolving deltas: 100% (12/12), done.


In [3]:
## move python files to outside
!mv TASI_ERAv2_S21/*.py .

In [4]:
import math
import os
import torch

from model_gpt2 import GPTConfig, GPT
from data_loader_lite import DataLoaderLite

In [5]:
from huggingface_hub import HfApi
from google.colab import userdata
userdata.get('HF_TOKEN')
api = HfApi()

## Training on custom input data

In [6]:
### hypre params
max_lr = 6e-4
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
warmup_steps = 50
max_steps = 1000

# save / log config
out_dir = 'saved_model'
save_interval = 100
log_interval = 50

In [7]:
# dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
# # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
# ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

In [8]:
import time
# attempt to auto detect device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
#     device = "mps"

print(f"Using device: {device}")

Using device: cuda


In [9]:
torch.manual_seed(1337)
if device == "cuda":
    torch.cuda.manual_seed(1337)

In [10]:
total_batch_size = 524288 # to align with gpt2 training batch size in number of tokens
B = 8
T = 1024
assert total_batch_size % (B*T) == 0, "make sure total_batch_size is a multiple of B*T"
grad_accum_steps = total_batch_size // (B*T)
print(f"total_batch_size = {total_batch_size}, grad_accum_steps = {grad_accum_steps}")

total_batch_size = 524288, grad_accum_steps = 64


In [11]:
train_loader = DataLoaderLite(B = B, T = T)
x, y = train_loader.next_batch()
x.shape, y.shape

Loaded 338025 tokens
1 epoch = 41 batches


(torch.Size([8, 1024]), torch.Size([8, 1024]))

In [12]:
torch.set_float32_matmul_precision('high') # is not working in T4
model_args = dict(vocab_size=50304)
gptconf = GPTConfig(**model_args)
model = GPT(gptconf) # next number for power of 2
model.to(device)
model = torch.compile(model) # does not work collab T4

In [13]:
min_lr = max_lr * 0.1

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    if it > max_steps:
        return min_lr

    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(decay_ratio * math.pi))

    return min_lr + coeff * (max_lr - min_lr)

In [14]:
# logits, loss = model(x, y)
# print(loss)
# expected loss - -ln(1/505257) = 10.82
# AdamW is a bugfix of Adam so to say
# optimizer

# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)
optimizer = model.configure_optimizers(weight_decay, max_lr, (beta1, beta2), device)
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=True)
best_loss = 1e9
os.makedirs(out_dir, exist_ok=True)
for step in range(max_steps):
    t0 = time.time()
    optimizer.zero_grad()
    loss_accum = 0.0
    # determine and set learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)

        # with torch.autocast(device_type = device, dtype=torch.bfloat16): # does not work collab T4
        with torch.autocast(device_type = device, dtype=torch.float16): # this would need gradient scaling
            logits, loss = model(x, y)
        loss = loss / grad_accum_steps # loss normalizer
        loss_accum += loss.detach()
        # loss.backward()
        scaler.scale(loss).backward()

    # gradient clipping
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)


    # optimizer.step()
    scale = scaler.get_scale()
    scaler.step(optimizer)
    scaler.update()
    torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0)
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
    tokens_per_sec = tokens_processed / dt
    if step % log_interval == 0 or step == max_steps-1 or loss_accum.item() < 0.099999:
        print(f"step {step} | loss: {loss_accum.item():.6f} | lr: {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

    if step % save_interval == 0 or step == max_steps-1 or loss_accum.item() < 0.099999:
        if loss_accum.item() < best_loss:
            best_loss = loss_accum.item()
            if step > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': step,
                    'best_loss': best_loss,
                    }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    if loss_accum.item() < 0.099999:
        print("Stopping training as reached target loss")
        break

num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True
step 0 | loss: 10.983610 | lr: 1.2000e-05 | norm: 9.1490 | dt: 81766.22ms | tok/sec: 6412.04
step 50 | loss: 5.600409 | lr: 6.0000e-04 | norm: 0.6819 | dt: 28532.17ms | tok/sec: 18375.33
step 100 | loss: 4.467509 | lr: 5.9632e-04 | norm: 0.8411 | dt: 28969.64ms | tok/sec: 18097.84
saving checkpoint to saved_model
step 150 | loss: 3.736810 | lr: 5.8537e-04 | norm: 1.0279 | dt: 29231.67ms | tok/sec: 17935.62
step 200 | loss: 2.950556 | lr: 5.6746e-04 | norm: 2.2319 | dt: 29064.69ms | tok/sec: 18038.65
saving checkpoint to saved_model
step 250 | loss: 2.031224 | lr: 5.4307e-04 | norm: 3.1140 | dt: 29074.49ms | tok/sec: 18032.58
step 300 | loss: 1.301913 | lr: 5.1287e-04 | norm: 4.9952 | dt: 29089.38ms | tok/sec: 18023.35
saving checkpoint to saved_model
step 350 | loss: 0.543757 | lr: 4.7768e-04 | norm: 3.6370 | dt: 29081.34ms | tok/sec: 180

In [15]:
import gc
print(torch.cuda.list_gpu_processes())
gc.collect()
torch.cuda.empty_cache()

GPU:0
process       2630 uses    11842.000 MB GPU memory


## Sample Generations

In [16]:
max_length = 30
num_return_sequences = 5

In [17]:
# prefix
import tiktoken
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype = torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) #(5,8)
x = tokens.to(device)

In [18]:
# generate
x = model.generate(x, max_new_tokens=max_length)

In [19]:
# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

> Hello, I'm a language model, what consosawatedwell,--pt:
Can you work! embroughDid star is one the
> Hello, I'm a language model,Royal
To think't, take myyre;
Had you sought it were mark your wrought malicious,

> Hello, I'm a language model, what
b parting to the boy. say William, now might known the

MARCILAND:

> Hello, I'm a language model,ThinkShicoke that
Pl Mark minere
Have yeISROM O Boling contrens
You
> Hello, I'm a language model, peace
Sweet honest! Thus is smile the speak the or keepThat evade us behold, has I shall.


## Upload to hugging face model hub

In [20]:
import os
os.makedirs('to_upload', exist_ok=True)

In [21]:
!cp model_gpt2.py to_upload
!cp -r saved_model to_upload

In [23]:
api.upload_folder(
    folder_path="./to_upload",
    repo_id="sayanbanerjee32/nanogpt2_test",
    repo_type="model",
)

ckpt.pt:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sayanbanerjee32/nanogpt2_test/commit/569af069ed8cc00766769e74caf8e84b06fd0941', commit_message='Upload folder using huggingface_hub', commit_description='', oid='569af069ed8cc00766769e74caf8e84b06fd0941', pr_url=None, pr_revision=None, pr_num=None)