In [1]:
import torch
import numpy as np
from matplotlib import pyplot as plt
import csv
import tiktoken
import random
from model import GPTConfig, GPT
import time
import os
import math

In [2]:
# Set the random seed so we get the same results

seed = 1337

np.random.seed(seed)
random.seed(seed)

First, we need read the raw data

In [3]:
with open('poetry.csv', newline='') as csvfile:
    raw_data = list(csv.reader(csvfile))

Then, we process it into a form that our GPT model can use. This is where we have to design our prompt and use soft prompting. Fill in the function `process_poem` to return a prompt based on the information we have about the poem

In [4]:
def process_poem(author, name, age, poem_type):
    """
    Args:
        author: str
            The author of the poem
        name: str
            The name of the poem
        age: str
            The "age" that the poem is from (either "Renaissance" or "Modern")
                Note: each poem author will either be in the Renaissance
                      or Modern age, meaning that this is redundant information
        poem_type: str
            The type of poem, will be one of these: "Love", "Mythology & Folklore", "Nature"
    
    Returns:
        prompt: str
            The prompt that we use for soft prompting
    """
    
    return "Here is a " + age + " " + poem_type + " poem written by " + author + ' called "' + name + '"\n\n'

enc = tiktoken.get_encoding("gpt2")

encode = lambda x: enc.encode_ordinary(x)

dataset = [(encode(process_poem(author, name, age, poem_type)), 
            encode(poem_content)) 
           for author, poem_content, name, age, poem_type in raw_data[1:]]

max_prompt_length = max([len(poem[0]) for poem in dataset])
max_poem_length   = max([len(poem[1]) for poem in dataset])

random.shuffle(dataset)

n = len(dataset)
train_data = dataset[:int(n*0.9)]
val_data = dataset[int(n*0.9):]

Next, we need to write code to sample from our dataset and generate a batch of data for us to train on.

In [5]:
def generate_batch(data, batch_size):
    """
    Args:
        data: List[prompt, poem_content]
            The dataset that we want to sample from (either training or validation)
    Returns:
        x: np.array[shape=(batch_size, datapoint_length)]
        y: np.array[shape=(batch_size, datapoint_length)]
    """
    
    random.shuffle(data)
    
    endoftext_id = enc.encode('<|endoftext|>', allowed_special={"<|endoftext|>"})[0]
    
    block_size = 1024 # max_prompt_length + max_poem_length + 5
    
    x = np.ones(shape=(batch_size, block_size), dtype=np.int64) * endoftext_id
    y = np.ones(shape=(batch_size, block_size), dtype=np.int64) * endoftext_id
    
    for i in range(batch_size):
        concat_data = np.array(data[i][0] + data[i][1])
        
        x_len = min(len(concat_data), block_size)
        y_len = min(len(concat_data)-1, block_size)
        
        x[i, :x_len]   = concat_data[:x_len]
        y[i, :y_len] = concat_data[1:1+y_len]
    
    return x, y

## Now, we train the model

First things first, we set some parameters

In [6]:
out_dir = 'out'
log_interval = 1
eval_interval = 5
eval_iters = 20

gradient_accumulation_steps = 64 # used to simulate larger batch sizes
batch_size = 1 # if gradient_accumulation_steps > 1, this is the micro-batch size
dropout = 0.0

learning_rate = 6e-5 # max learning rate
max_iters = 40 # total number of training iterations
warmup_iters = 2
lr_decay_iters = 20
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
min_lr = 6e-6

grad_clip = 1.0
device = "cuda"

config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
config = {k: globals()[k] for k in config_keys}

Then we load the GPT2-medium model

In [7]:
print(f"Initializing from OpenAI GPT-2 weights")

override_args = dict(dropout=dropout)
model = GPT.from_pretrained("gpt2-medium", override_args)
model_args = {}
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
    model_args[k] = getattr(model.config, k)

model.to(device)
scaler = torch.cuda.amp.GradScaler(enabled=False)
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), "cuda")
ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float32)

Initializing from OpenAI GPT-2 weights
loading weights from pretrained gpt: gpt2-medium
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0


  from .autonotebook import tqdm as notebook_tqdm


number of parameters: 353.77M
using fused AdamW: True


In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            the_data = train_data if split == 'train' else val_data
            X, Y = get_batch(the_data, batch_size)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) if it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

def get_batch(data, batch_size):
    x, y = generate_batch(data, batch_size)
    x_torch = torch.from_numpy(x).pin_memory().to(device, non_blocking=True)
    y_torch = torch.from_numpy(y).pin_memory().to(device, non_blocking=True)
    return x_torch, y_torch

In [9]:
X, Y = get_batch(train_data, batch_size)
t0 = time.time()
local_iter_num = 0
running_mfu = -1.0

iter_num = 0
best_val_loss = 1e9

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(gradient_accumulation_steps):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch(train_data, batch_size)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    
    scaler.step(optimizer)
    scaler.update()
    
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

step 0: train loss 6.5221, val loss 6.8813
iter 0: loss 7.0547, time 36838.76ms, mfu -100.00%
iter 1: loss 7.8221, time 31448.62ms, mfu -100.00%
iter 2: loss 2.1180, time 31460.54ms, mfu -100.00%
iter 3: loss 1.1617, time 31654.81ms, mfu -100.00%
iter 4: loss 0.2709, time 31604.22ms, mfu -100.00%
step 5: train loss 1.0766, val loss 0.7499
saving checkpoint to out
iter 5: loss 0.8109, time 42081.65ms, mfu 1.21%
iter 6: loss 0.8773, time 31828.34ms, mfu 1.25%
iter 7: loss 0.1865, time 31905.34ms, mfu 1.28%
iter 8: loss 0.9060, time 31897.00ms, mfu 1.32%
iter 9: loss 0.1690, time 31984.37ms, mfu 1.34%
step 10: train loss 1.1920, val loss 1.2731
iter 10: loss 2.4203, time 37511.66ms, mfu 1.34%
iter 11: loss 1.6578, time 31979.43ms, mfu 1.37%
iter 12: loss 0.5831, time 32016.03ms, mfu 1.39%
iter 13: loss 0.1030, time 31982.63ms, mfu 1.41%
iter 14: loss 3.9629, time 32067.66ms, mfu 1.43%
step 15: train loss 1.1319, val loss 0.7877
iter 15: loss 0.2679, time 37565.59ms, mfu 1.42%
iter 16: los

In [10]:
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-23): 24 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
          (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

Now we sample from the model

In [26]:
def truncate_output(id_list):
    if id_list.index(50256) == -1:
        return id_list
    return id_list[:id_list.index(50256)]

start_ids = encode(process_poem("WILLIAM SHAKESPEARE", "Oh Romeo, My Love!", 
                                "Renaissance", "Love"))
x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

with torch.no_grad():
    with ctx:
        y = model.generate(x, 256, temperature=0.8, top_k=200)
        print(enc.decode(truncate_output(y[0].tolist())))

Here is a Renaissance Love poem written by WILLIAM SHAKESPEARE called "Oh Romeo, My Love!"



Sweet Romeo, my love,
The world is my music;
But I am not you.


I know enough of you to tell you what kind,
To teach you that love does obey.

What you want and know, you have;
How little I know, you have only to know.
