<a href="https://colab.research.google.com/github/sayanbanerjee32/TASI_ERAv2_S21/blob/main/gpt2_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install tiktoken
!pip install -Uq pynvml

In [2]:
# assignment repo
!git clone https://github.com/sayanbanerjee32/TASI_ERAv2_S21.git

Cloning into 'TASI_ERAv2_S21'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 14 (delta 2), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (14/14), 8.57 KiB | 8.57 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [3]:
## move python files to outside
!mv TASI_ERAv2_S21/*.py .

In [4]:
import math
import os
import torch

from model_gpt2 import GPTConfig, GPT
from data_loader_lite import DataLoaderLite

In [5]:
from huggingface_hub import HfApi
from google.colab import userdata
userdata.get('HF_TOKEN')
api = HfApi()

## Training on tiny shakespeare

In [6]:
# tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-06-27 11:17:35--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-27 11:17:36 (30.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [7]:
### hypre params
max_lr = 6e-4
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
warmup_steps = 50
max_steps = 1000

# save / log config
out_dir = 'saved_model'
save_interval = 100
log_interval = 50

In [8]:
# dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
# # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
# ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]

In [9]:
import time
# attempt to auto detect device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
#     device = "mps"

print(f"Using device: {device}")

Using device: cuda


In [10]:
torch.manual_seed(1337)
if device == "cuda":
    torch.cuda.manual_seed(1337)

In [11]:
total_batch_size = 524288 # to align with gpt2 training batch size in number of tokens
B = 8
T = 1024
assert total_batch_size % (B*T) == 0, "make sure total_batch_size is a multiple of B*T"
grad_accum_steps = total_batch_size // (B*T)
print(f"total_batch_size = {total_batch_size}, grad_accum_steps = {grad_accum_steps}")

total_batch_size = 524288, grad_accum_steps = 64


In [12]:
train_loader = DataLoaderLite(B = B, T = T)
x, y = train_loader.next_batch()
x.shape, y.shape

Loaded 338025 tokens
1 epoch = 41 batches


(torch.Size([8, 1024]), torch.Size([8, 1024]))

In [13]:
torch.set_float32_matmul_precision('high') # is not working in T4
model_args = dict(vocab_size=50304)
gptconf = GPTConfig(**model_args)
model = GPT(gptconf) # next number for power of 2
model.to(device)
model = torch.compile(model) # does not work collab T4

In [14]:
min_lr = max_lr * 0.1

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it+1) / warmup_steps
    if it > max_steps:
        return min_lr

    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(decay_ratio * math.pi))

    return min_lr + coeff * (max_lr - min_lr)

In [15]:
# logits, loss = model(x, y)
# print(loss)
# expected loss - -ln(1/505257) = 10.82
# AdamW is a bugfix of Adam so to say
# optimizer

# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95), eps=1e-8)
optimizer = model.configure_optimizers(weight_decay, max_lr, (beta1, beta2), device)
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=True)
best_loss = 1e9
os.makedirs(out_dir, exist_ok=True)
for step in range(max_steps):
    t0 = time.time()
    optimizer.zero_grad()
    loss_accum = 0.0
    # determine and set learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)

        # with torch.autocast(device_type = device, dtype=torch.bfloat16): # does not work collab T4
        with torch.autocast(device_type = device, dtype=torch.float16): # this would need gradient scaling
            logits, loss = model(x, y)
        loss = loss / grad_accum_steps # loss normalizer
        loss_accum += loss.detach()
        # loss.backward()
        scaler.scale(loss).backward()

    # gradient clipping
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)


    # optimizer.step()
    scale = scaler.get_scale()
    scaler.step(optimizer)
    scaler.update()
    torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0)
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
    tokens_per_sec = tokens_processed / dt
    if step % log_interval == 0 or step == max_steps-1 or loss_accum.item() < 0.099999:
        print(f"step {step} | loss: {loss_accum.item():.6f} | lr: {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")

    if step % save_interval == 0 or step == max_steps-1 or loss_accum.item() < 0.099999:
        if loss_accum.item() < best_loss:
            best_loss = loss_accum.item()
            if step > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': step,
                    'best_loss': best_loss,
                    }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    if loss_accum.item() < 0.099999:
        print("Stopping training as reached target loss")
        break

num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True
step 0 | loss: 10.983610 | lr: 1.2000e-05 | norm: 9.1490 | dt: 81627.05ms | tok/sec: 6422.97
step 50 | loss: 5.546760 | lr: 6.0000e-04 | norm: 0.4880 | dt: 28678.36ms | tok/sec: 18281.66
step 100 | loss: 4.447003 | lr: 5.9632e-04 | norm: 0.5548 | dt: 28951.61ms | tok/sec: 18109.11
saving checkpoint to saved_model
step 150 | loss: 3.693120 | lr: 5.8537e-04 | norm: 1.0331 | dt: 29042.14ms | tok/sec: 18052.67
step 200 | loss: 2.844754 | lr: 5.6746e-04 | norm: 1.9574 | dt: 29165.84ms | tok/sec: 17976.10
saving checkpoint to saved_model
step 250 | loss: 2.038058 | lr: 5.4307e-04 | norm: 3.8761 | dt: 29208.81ms | tok/sec: 17949.65
step 300 | loss: 1.330756 | lr: 5.1287e-04 | norm: 4.8655 | dt: 29289.51ms | tok/sec: 17900.20
saving checkpoint to saved_model
step 350 | loss: 0.583206 | lr: 4.7768e-04 | norm: 3.7037 | dt: 29098.63ms | tok/sec: 180

KeyboardInterrupt: 

In [16]:
import gc
print(torch.cuda.list_gpu_processes())
gc.collect()
torch.cuda.empty_cache()

GPU:0
process       9121 uses    11842.000 MB GPU memory


## Sample Generations

In [17]:
max_length = 30
num_return_sequences = 5

In [18]:
# prefix
import tiktoken
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype = torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) #(5,8)
x = tokens.to(device)

In [19]:
# generate
x = model.generate(x, max_new_tokens=max_length)

In [20]:
# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

> Hello, I'm a language model, villain cons bolts
By goodows care the toward the WhatAn pretend ourT:Being nothing:
 EL
> Hello, I'm a language model, Therefore'sFor prosperous jealous:

yre!
ThisonI tell't that ask. malicious so purpose
> Hello, I'm a language model, what the
 them the gentle myWill Georgevern such the remorse:
So graciousp, whereby I know
> Hello, I'm a language model,LoveOM
But What hurtship wrought you this Apolloail the how that
An time feel the Masters us
> Hello, I'm a language model, Do thisUCouch more the from to the how good warrant happyThat evadeator! triumph friend and I will


## Upload to hugging face model hub

In [21]:
import os
os.makedirs('to_upload', exist_ok=True)

In [22]:
!cp model_gpt2.py to_upload
!cp -r saved_model to_upload

In [23]:
api.upload_folder(
    folder_path="./to_upload",
    repo_id="sayanbanerjee32/nanogpt2_test",
    repo_type="model",
)

ckpt.pt:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sayanbanerjee32/nanogpt2_test/commit/11c24e832268a5d66ef78d6c1a2451fa69dc1633', commit_message='Upload folder using huggingface_hub', commit_description='', oid='11c24e832268a5d66ef78d6c1a2451fa69dc1633', pr_url=None, pr_revision=None, pr_num=None)