In [1]:
from transformers import GPT2Tokenizer, GPT2Model
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import numpy as np
import tiktoken
import os

Load in a raw text file of my poetry, and process it minimally.

In [2]:
# Read the whole poetry collection into memory as a single string.
with open('Every Day a Poem.txt', 'r') as poem_file:
    cleaned = poem_file.read()

# Cleanup passes, applied in order as (pattern, replacement) pairs:
#   1. every run of non-ASCII characters collapses to one space
#   2. date stamps such as 1/2/23 or 01/02/2023 are replaced by a space
for pattern, replacement in (
    (r'[^\x00-\x7F]+', ' '),
    (r'\d{1,2}/\d{1,2}/\d{2,4}', ' '),
):
    cleaned = re.sub(pattern, replacement, cleaned)

# Drop the first 26 characters of the cleaned text (filler at the start of the file).
text = cleaned[26:]

Then we encode our data using GPT-2's tokenizer and define training and validation splits.

In [3]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 16  # number of windows sampled per batch
block_size = 32  # number of tokens per window (the model's context length)

# Tokenize the cleaned text with GPT-2's BPE vocabulary.
enc = tiktoken.get_encoding("gpt2")
data = torch.tensor(enc.encode(text))

# The first 90% of the tokens are used for training; the rest are held out.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    `split` selects the corpus: 'train' draws from `train_data`, any other
    value draws from `val_data`. Each of the `batch_size` windows holds
    `block_size` consecutive tokens, and its target is the same window
    shifted one token to the right. Both stacked tensors are moved to
    `device` before being returned.
    """
    source = val_data if split != 'train' else train_data
    # The exclusive upper bound leaves room for the one-token shift of the targets.
    starts = torch.randint(0,len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start+1:stop+1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

We create our homemade transformer using masked self-attention, which lets it generate text autoregressively (one token at a time, feeding each prediction back in as input). To create the model, I borrow the rough architecture seen in GPT-2, with layer norms, feed-forward layers, and residual connections, to create a more robust transformer.

In [45]:
class MHA(nn.Module):
    """Causal multi-head self-attention sub-layer.

    The input is layer-normalised, passed through masked self-attention,
    dropout, a linear projection and a second layer norm.

    Args:
        n_embd: embedding width of the input and output.
        n_head: number of attention heads (must divide `n_embd`).

    `forward` takes `x` of shape (batch, seq, n_embd) and returns a tensor of
    the same shape. Position t only attends to positions <= t.
    """
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.attn = nn.MultiheadAttention(n_embd, n_head)
        self.ln = nn.LayerNorm(n_embd)
        self.proj = nn.Linear(n_embd, n_embd)
        self.proj_ln = nn.LayerNorm(n_embd)
        self.dropout = nn.Dropout()  # PyTorch default drop probability (p=0.5)

    def forward(self, x):
        seq_len = x.shape[1]
        # Additive causal mask: -inf strictly above the diagonal (future
        # positions), 0 elsewhere. It is built directly on the input's device
        # and dtype so the module does not depend on any global `device`.
        mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.ln(x)
        # nn.MultiheadAttention defaults to (seq, batch, embed) ordering.
        x = x.permute(1, 0, 2)
        x, _ = self.attn(x, x, x, attn_mask=mask)
        x = self.dropout(x)
        x = x.permute(1, 0, 2)  # back to (batch, seq, embed)
        x = self.proj(x)
        x = self.proj_ln(x)
        return x

class FeedFoward(nn.Module):
    """Position-wise feed-forward sub-layer.

    Expands each position from `n_embd` to `4*n_embd` features, applies GELU,
    projects back down to `n_embd` and finishes with dropout. The output has
    the same shape as the input.
    """
    def __init__(self, n_embd):
        super().__init__()
        hidden = 4*n_embd
        self.proj1 = nn.Linear(n_embd, hidden)
        self.proj2 = nn.Linear(hidden, n_embd)
        self.dropout = nn.Dropout()

    def forward(self, x):
        # expand -> non-linearity -> contract -> regularise
        return self.dropout(self.proj2(F.gelu(self.proj1(x))))

class Block(nn.Module):
    """One transformer layer: attention then feed-forward, each added residually.

    The attention sub-layer (`MHA`) and the feed-forward sub-layer
    (`FeedFoward`) both map a tensor to one of the same shape, so each output
    can be summed with its own input.
    """
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.mha = MHA(n_embd, n_head)
        self.ff = FeedFoward(n_embd)

    def forward(self, x):
        attended = x + self.mha(x)
        return attended + self.ff(attended)
class Transformer(nn.Module):
    """Decoder-only language model built from a stack of `Block`s.

    Token ids are embedded, summed with learned position embeddings, passed
    through `n_layer` blocks and a final layer norm, then projected to
    vocabulary logits. The vocabulary size comes from `enc.n_vocab` and the
    maximum sequence length from `block_size`, both read at construction time.

    Args:
        n_embd: embedding width.
        n_head: attention heads per block.
        n_layer: number of blocks.

    `forward` takes integer token ids of shape (batch, seq) and returns logits
    of shape (batch, seq, enc.n_vocab).

    Raises:
        ValueError: if `seq` exceeds the number of learned positions.
    """
    def __init__(self, n_embd, n_head, n_layer):
        super().__init__()
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln = nn.LayerNorm(n_embd)
        self.proj = nn.Linear(n_embd, enc.n_vocab)
        self.tok_emb = nn.Embedding(enc.n_vocab, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)

    def forward(self, x):
        seq_len = x.shape[1]
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            # Without this guard the position lookup fails with an opaque index error.
            raise ValueError(
                f"sequence length {seq_len} exceeds the model's maximum of {max_len}"
            )
        # Create the position indices on the same device as the input instead
        # of relying on a global `device`.
        positions = torch.arange(seq_len, device=x.device)
        x = self.tok_emb(x) + self.pos_emb(positions)
        x = self.blocks(x)
        x = self.ln(x)
        x = self.proj(x)
        return x

# Build the model with n_embd=128, n_head=8, n_layer=8 and move its parameters to `device`.
model = Transformer(128, 8, 8).to(device)

In [46]:
# Cross-entropy between the predicted next-token logits and the true next tokens.
lossfcn = nn.CrossEntropyLoss()
# AdamW over all model parameters with a fixed learning rate of 3e-4.
optim = torch.optim.AdamW(model.parameters(), lr=3e-4)

In [47]:
# Train for 1000 steps on random batches. Every 200 steps, report the loss on
# the current training batch next to the loss on one random validation batch.
model.train()  # make sure dropout is active even if an earlier cell called eval()
for step in range(1000):
    x, y = get_batch('train')
    B, T = x.shape
    y_hat = model(x) # (B, T, n_vocab)
    # Flatten batch and time so every position is one classification example.
    train_loss = lossfcn(y_hat.view(B*T,-1), y.view(B*T))
    train_loss.backward()
    optim.step()
    optim.zero_grad()
    if (step+1) % 200 == 0:
        model.eval()  # disable dropout for evaluation
        # Evaluation needs no gradients; skipping autograd saves memory and time.
        with torch.no_grad():
            x, y = get_batch('val')
            y_hat = model(x)
            val_loss = lossfcn(y_hat.view(-1, y_hat.size(-1)), y.view(-1))
        print(f"Step {step}, Train loss: {train_loss.item():.3f}, Val loss: {val_loss.item():.3f}")
        model.train()

Step 199, Train loss: 5.605, Val loss: 6.680
Step 399, Train loss: 5.522, Val loss: 6.798
Step 599, Train loss: 5.342, Val loss: 6.403
Step 799, Train loss: 5.035, Val loss: 6.749
Step 999, Train loss: 4.698, Val loss: 6.538


In [48]:
# Sample 200 tokens autoregressively, starting from the prompt 'I am a'.
# The training loop leaves the model in train mode, so switch to eval mode
# here; otherwise dropout would randomly zero activations while sampling.
model.eval()
seq = torch.tensor(enc.encode('I am a')).unsqueeze(0).long().to(device)
with torch.no_grad():  # inference only: no autograd graph needed
    for step in range(200):
        # The model has only block_size positions, so keep the most recent ones.
        context = seq[:,-block_size:]
        y_hat = model(context)[:,-1] # (1, n_vocab)
        p = F.softmax(y_hat, dim=-1)
        next_word = torch.multinomial(p, 1)
        seq = torch.cat([seq, next_word], dim=-1)
print(enc.decode(seq[0].tolist()))

I am aya talk)
And larger to face.
but I am a half, but sheriff 

the fundamental bloodyids forgotten wonder, when you in rupt
when top awoke.
and forever off your above, r perception.
but the mind through on life in her needs
When
and questiony o in the precip water,
with it wasn ofing
my ego,
to than stick.
as a will tracks
itirdittingown innd end
begin on
.
Fore emot Aned.
Mr
Mom night sat left,
(
I am flameki in your day itself.
I will might like.

He is not of me in the forest blame
with.
 ander next to chase forget date, in a sound?
 
 

 Random is the will in the world think.
And the tree,
 256 boyisc buildinges,
 perennial red.
 
 

d made


The model doesn't do very well, but there wasn't a lot of training data. We could make it a little better by training it on a bigger set of poetry, or by leveraging a pre-trained model and fine-tuning it on my poetry. Let's do the latter. Below, I load the GPT-2 model, a well-respected generative text model with public weights. It's not the best anymore, but it is still very good. I use the implementation made by Andrej Karpathy, nanoGPT, found here: https://github.com/karpathy/nanoGPT

In [4]:
# Export the train/validation token streams as binary files for the nanoGPT scripts.
# The split is taken over characters: `n` from the tokenizer cell counts
# tokens, not characters, so it cannot be used to index into `text`.
# Distinct names are used so the tensors `train_data` / `val_data` that
# `get_batch` relies on are not overwritten with strings.
split_idx = int(0.9 * len(text))
train_text = text[:split_idx]
val_text = text[split_idx:]
# encode_ordinary ignores special tokens. GPT-2 ids fit in uint16
# (vocabulary of 50,257 < 65,536).
train_ids = np.array(enc.encode_ordinary(train_text), dtype=np.uint16)
val_ids = np.array(enc.encode_ordinary(val_text), dtype=np.uint16)
train_ids.tofile('train.bin')
val_ids.tofile('val.bin')

In [None]:
# Run the training script in a subprocess, passing finetune_poetry.py as its argument.
!python train.py finetune_poetry.py

In [16]:
# Run the sampling script in a subprocess, pointing it at the out-poetry output directory.
!python sample.py --out_dir=out-poetry

number of parameters: 123.65M

a second time that of all in the Bible.
The body here is only a hand, being of an earlier age.
It is not included here as so far as I suppose the angel has a hand.
the hand, which made up part of my heart.
Just this is the hand that made me cry and think.
Just this is where one lost her heart and thought.
just this is is as long as that will last, he now has no patience.
it
---------------


Still not super coherent, but it writes with consistent and mostly correct grammar. This could be made better by using a larger model (I used the smallest) and by improving the training data with better cleaning and a larger training set.

Thanks to Andrej Karpathy for the nanoGPT implementation as well as a few bits of code like the get_batch function taken from his implementation of a transformer.