In [1]:
import torch
import numpy as np
from matplotlib import pyplot as plt
import csv
import tiktoken
import random
from model import GPTConfig, GPT
import time
import os
import math

In [2]:
# Set the random seed so we get the same results

seed = 1337

np.random.seed(seed)
random.seed(seed)

First, we need read the raw data

In [3]:
with open('poetry.csv', newline='') as csvfile:
    raw_data = list(csv.reader(csvfile))

Then, we process it into a form that our GPT model can use. This is where we have to design our prompt and use soft prompting. Fill in the function `process_poem` to return a prompt based on the information we have about the poem

In [4]:
def process_poem(author, name, age, poem_type):
    """
    Args:
        author: str
            The author of the poem
        name: str
            The name of the poem
        age: str
            The "age" that the poem is from (either "Renaissance" or "Modern")
                Note: each poem author will either be in the Renaissance
                      or Modern age, meaning that this is redundant information
        poem_type: str
            The type of poem, will be one of these: "Love", "Mythology & Folklore", "Nature"
    
    Returns:
        prompt: str
            The prompt that we use for soft prompting
    """
    
    return "Here is a " + age + " " + poem_type + " poem written by " + author + ' called "' + name + '"\n\n'

enc = tiktoken.get_encoding("gpt2")

encode = lambda x: enc.encode_ordinary(x)

dataset = [(encode(process_poem(author, name, age, poem_type)), 
            encode(poem_content)) 
           for author, poem_content, name, age, poem_type in raw_data[1:]]

max_prompt_length = max([len(poem[0]) for poem in dataset])
max_poem_length   = max([len(poem[1]) for poem in dataset])

random.shuffle(dataset)

n = len(dataset)
train_data = dataset[:int(n*0.9)]
val_data = dataset[int(n*0.9):]

Next, we need to write code to sample from our dataset and generate a batch of data for us to train on.

In [5]:
def generate_batch(data):
    """
    Args:
        data: List[prompt, poem_content]
            The dataset that we want to sample from (either training or validation)
    Returns:
        x: np.array[shape=(batch_size, datapoint_length)]
        y: np.array[shape=(batch_size, datapoint_length)]
    """
    
    sample_index = random.randrange(len(data))
    
    concat_data = np.array(data[sample_index][0] + data[sample_index][1])
    
    block_size = min(1025, len(concat_data))
    
    x = np.zeros(shape=(batch_size, block_size-1), dtype=np.int64)
    y = np.zeros(shape=(batch_size, block_size-1), dtype=np.int64)

    x = concat_data[:block_size-1]
    y = concat_data[1:block_size]
    
    return x, y

## Now, we train the model

First things first, we set some parameters

In [6]:
out_dir = 'out'
log_interval = 1
eval_interval = 8
eval_iters = 20

batch_size = 64
dropout = 0.0

learning_rate = 2e-4 # max learning rate
max_iters = 64 # total number of training iterations
warmup_iters = 4
lr_decay_iters = 32
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
min_lr = 2e-6

grad_clip = 1.0
device = "cuda"

config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
config = {k: globals()[k] for k in config_keys}

Then we load the GPT2-medium model

In [7]:
print(f"Initializing from OpenAI GPT-2 weights")

override_args = dict(dropout=dropout)
model = GPT.from_pretrained("gpt2-medium", override_args)
model_args = {}
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
    model_args[k] = getattr(model.config, k)

model.to(device)
model.eval()
scaler = torch.cuda.amp.GradScaler(enabled=False)
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), "cuda")
ctx = torch.amp.autocast(device_type="cuda", dtype=torch.float32)

Initializing from OpenAI GPT-2 weights
loading weights from pretrained gpt: gpt2-medium
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0


  from .autonotebook import tqdm as notebook_tqdm


number of parameters: 353.77M
using fused AdamW: True


In [8]:
def sample_model(author, name, age, poem_type):
    start_ids = encode(process_poem(author, name, age, poem_type))
    x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])

    with torch.no_grad():
        with ctx:
            y = model.generate(x, 256, temperature=0.8, top_k=200)
            return enc.decode(y[0].tolist())

In [9]:
print(sample_model("HENRY VIII, KING OF ENGLAND", "Midnight sun", "Renaissance", "Nature"))

Here is a Renaissance Nature poem written by HENRY VIII, KING OF ENGLAND called "Midnight sun"


"At sea we still go up alone

He will do this again another time."


A couple of things have happened to this poem. Firstly, the poem made a lot of noise during the 17th century. The English Sea Age and the influence of French Renaissance poets such as Richard Berry and Jonathan Swift have to be acknowledged here. Secondly, the English poem wasn't just done for the entertainment of the public (much to the chagrin of the French and Spanish Royalty who wanted to keep the poem an exclusive secret). It's also interesting to note that this English version of the poem was not really an English adaptation but, in fact, a very early French translation of the poem (perhaps a mixture of the two versions?).

By the mid 14th century, the English language was well established and the word "sun" had entered common usage in English. Also, many of the English poets of that time, such as Edward Joyce, D.H. 

In [10]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            the_data = train_data if split == 'train' else val_data
            X, Y = get_batch(the_data)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    # 1) linear warmup for warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) if it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

def get_batch(data):
    x, y = generate_batch(data)
    x_torch = torch.from_numpy(x).pin_memory().to(device, non_blocking=True)[None, ...]
    y_torch = torch.from_numpy(y).pin_memory().to(device, non_blocking=True)[None, ...]
    return x_torch, y_torch

In [11]:
model.train()

X, Y = get_batch(train_data)
t0 = time.time()
local_iter_num = 0
running_mfu = -1.0

iter_num = 0
best_val_loss = 1e9

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    
    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
        if losses['val'] < best_val_loss:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))

    # forward backward update, with optional gradient accumulation to simulate larger batch size
    # and using the GradScaler if data type is float16
    for micro_step in range(batch_size):
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / batch_size # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
        X, Y = get_batch(train_data)
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    # step the optimizer and scaler if training in fp16
    
    scaler.step(optimizer)
    scaler.update()
    
    # flush the gradients as soon as we can, no need for this memory anymore
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0:
        # scale up to undo the division above, approximating the true total loss (exact would have been a sum)
        lossf = loss.item() * batch_size
        if local_iter_num >= 5: # let the training loop settle a bit
            mfu = model.estimate_mfu(batch_size, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break
    
model.eval()
None # to avoid printing the model

step 0: train loss 4.8747, val loss 4.8650
iter 0: loss 4.2855, time 9520.89ms, mfu -100.00%
iter 1: loss 4.8589, time 8356.17ms, mfu -100.00%
iter 2: loss 4.8615, time 9769.59ms, mfu -100.00%
iter 3: loss 4.8009, time 9520.53ms, mfu -100.00%
iter 4: loss 4.3002, time 9142.24ms, mfu -100.00%
iter 5: loss 4.4008, time 8654.19ms, mfu 5.88%
iter 6: loss 4.0367, time 7772.94ms, mfu 5.95%
iter 7: loss 4.5093, time 10931.83ms, mfu 5.82%
step 8: train loss 3.2255, val loss 2.6474
saving checkpoint to out
iter 8: loss 3.4330, time 16657.02ms, mfu 5.55%
iter 9: loss 1.6707, time 9833.04ms, mfu 5.51%
iter 10: loss 3.7445, time 9974.26ms, mfu 5.47%
iter 11: loss 2.8450, time 9377.84ms, mfu 5.47%
iter 12: loss 1.4326, time 8583.87ms, mfu 5.51%
iter 13: loss 3.3533, time 8462.71ms, mfu 5.56%
iter 14: loss 3.3996, time 8139.78ms, mfu 5.63%
iter 15: loss 3.4540, time 10817.62ms, mfu 5.54%
step 16: train loss 2.7038, val loss 2.8791
iter 16: loss 2.5836, time 10399.68ms, mfu 5.48%
iter 17: loss 2.6575

Now we sample from the model

In [16]:
print(sample_model("HENRY VIII, KING OF ENGLAND", "Midnight sun", "Renaissance", "Nature"))

Here is a Renaissance Nature poem written by HENRY VIII, KING OF ENGLAND called "Midnight sun"

Hear me well, O moon, how oft it hath done my part to please,
And now I rise in the dimness, and now that the sun hath moved,
Unto me in the great shadow of the morning,
And yet I hear the same loud voice, and yet I am turned after,
And yet I hear the same deep sigh: but in that way,
The night's shadow, that was my soul, Is but shadow, and must die.

And now I hear the same smooth voice, in that calm sound of my heart,
Which in deep sleep was but as a great storm,
Unswayed by sound, since the night did get away.

There is no night in this world but that which comes
With suddenness, and that with pain,
And that in rude season, and that when the day cannot be.

And now I hear the same strong sigh, in that sound of my mind,
That in my soul which was the storm, and that with pain,
That in the storm still my self did break.

And now I hear the same smooth voice


In [21]:
print(sample_model("HENRY VIII, KING OF ENGLAND", "Midnight sun", "Renaissance", "Nature"))

Here is a Renaissance Nature poem written by HENRY VIII, KING OF ENGLAND called "Midnight sun"

When noon is gone,
And the noon beams rise,
With strange eyes,
Unhappy and mournful
In the old twilight,
    And they approach
My golden chamber,
    Which is the most full     Of light.

Yet do they stir
And the air moves,
    Which might make me sleep.

As they turn,
And in a little way,
Oh, they lie, and cry
    For their king, for they mock
    For their King to see:
The best thing they can do,
    For the world to look upon.
They stare for their queen,
    For love of love to have,
    For the sun to shine,
    And to have her hand
    To hold.

But they lie, and cry,
    For their Queen and King
    For love of love to have,
    For the world to see.
In love
