## Import Libraries & Load Data

In [None]:
home_directory = '/content/poe_poems_stanzas.csv'

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
import numpy as np
import pandas as pd 

import random
import time
import datetime

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [None]:
poem_stanza_df = pd.read_csv(home_directory)
poem_stanza_df = poem_stanza_df.fillna('')

In [None]:
RANDOM_SEED = 73
BATCH_SIZE = 2
EPOCHS = 8
MAX_LEN = 1024

## Text Generation - GPT-2

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
combined_poems = poem_stanza_df.groupby(['title'])['stanza_text'].transform(lambda x: ' /n /n '.join(x)).drop_duplicates().reset_index(drop=True)

In [None]:
max_poem_length = max([len(tokenizer.encode(poem)) for poem in combined_poems])
min_poem_length = min([len(tokenizer.encode(poem)) for poem in combined_poems])

Token indices sequence length is longer than the specified maximum sequence length for this model (1753 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
print('Longest Edgar Allen Poe Poem:', max_poem_length, 'tokens long.')
print('Shortest Edgar Allen Poe Poem:', min_poem_length, 'tokens long.')

Longest Edgar Allen Poe Poem: 6465 tokens long.
Shortest Edgar Allen Poe Poem: 55 tokens long.


In [None]:
stanza_length = [len(tokenizer.encode(stanza)) for stanza in poem_stanza_df['stanza_text'].values]
max_stanza_length = max(stanza_length)
min_stanza_length = min(stanza_length)

In [None]:
print('Number of stanzas longer than max length (1024 tokens): ', sum([st_len > MAX_LEN for st_len in stanza_length]))

Number of stanzas longer than max length (1024 tokens):  1


In [None]:
print('Longest Edgar Allen Poe Stanza:', max_stanza_length, 'tokens long.')
print('Shortest Edgar Allen Poe Stanza:', min_stanza_length, 'tokens long.')

Longest Edgar Allen Poe Stanza: 1948 tokens long.
Shortest Edgar Allen Poe Stanza: 15 tokens long.


In [None]:
class PoePoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_LEN):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        
        for i in data:
            encodings_dict = tokenizer('<BOS>' + i + '<EOS>',
                                     truncation=True,
                                     max_length=max_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
poem_stanza_dataset = PoePoemDataset(poem_stanza_df['stanza_text'].values, tokenizer, max_length=MAX_LEN)

##Train/Validation Split

In [None]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [None]:
poem_stanza_train_size, poem_stanza_val_size = train_val_split(0.8, poem_stanza_dataset)

# random split imported from troch.utils
poem_stanza_train_dataset, poem_stanza_val_dataset = random_split(poem_stanza_dataset, [poem_stanza_train_size, poem_stanza_val_size])

##Apply Random Seeds

In [None]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7f532e5165d0>

##Instantiate DataLoaders and Define Model Creation Function

In [None]:
poem_stanza_train_dataloader = DataLoader(poem_stanza_train_dataset,
                              sampler=RandomSampler(poem_stanza_train_dataset),
                              batch_size=BATCH_SIZE)

poem_stanza_val_dataloader = DataLoader(poem_stanza_val_dataset,
                            sampler=SequentialSampler(poem_stanza_val_dataset),
                            batch_size=BATCH_SIZE)

In [None]:
# helper function for logging time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

# hyperparameters
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

# create text generation seed prompt
device = torch.device('cuda')

prompt = "<BOS>"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

##Create Poem Stanza Model

In [None]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=MAX_LEN).from_pretrained('gpt2', output_hidden_states=True)

poem_stanza_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_stanza_model.resize_token_embeddings(len(tokenizer))

poem_stanza_model.cuda()
optimizer = AdamW(poem_stanza_model.parameters(), lr=learning_rate, eps=eps)

total_steps = len(poem_stanza_train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

start_time = time.time()
poem_stanza_model = poem_stanza_model.to(device)

for epoch_i in range(0, EPOCHS):

    print(f'Epoch {epoch_i + 1} of {EPOCHS}')

    t0 = time.time()
    total_train_loss = 0
    poem_stanza_model.train()

    for step, batch in enumerate(poem_stanza_train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        poem_stanza_model.zero_grad()        

        outputs = poem_stanza_model(b_input_ids,
                                    labels=b_labels,
                                    attention_mask=b_masks,
                                    token_type_ids=None)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_stanza_train_dataloader)       
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

    t0 = time.time()

    poem_stanza_model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in poem_stanza_val_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():        

            outputs  = poem_stanza_model(b_input_ids,
                                         attention_mask=b_masks,
                                         labels=b_labels)

            loss = outputs[0]  

        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(poem_stanza_val_dataloader)


    print(f'Average Validation Loss: {avg_val_loss}')

print(f'Total Training Time: {format_time(time.time()-start_time)}')

torch.save(poem_stanza_model.state_dict(), 'poe_model.pth')

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Epoch 1 of 8
Average Training Loss: 2.650922484869181. Epoch Training Time: 0:01:01
Average Validation Loss: 0.5939125906337391
Epoch 2 of 8
Average Training Loss: 0.613492009424886. Epoch Training Time: 0:00:58
Average Validation Loss: 0.5378945110873743
Epoch 3 of 8
Average Training Loss: 0.5610672789950704. Epoch Training Time: 0:00:58
Average Validation Loss: 0.5273258394815705
Epoch 4 of 8
Average Training Loss: 0.5375017303713533. Epoch Training Time: 0:00:58
Average Validation Loss: 0.5238184881481257
Epoch 5 of 8
Average Training Loss: 0.5229410167003787. Epoch Training Time: 0:00:58
Average Validation Loss: 0.521620973944664
Epoch 6 of 8
Average Training Loss: 0.5111584748293079. Epoch Training Time: 0:00:58
Average Validation Loss: 0.5212448117407885
Epoch 7 of 8
Average Training Loss: 0.5056163436451624. Epoch Training Time: 0:00:58
Average Validation Loss: 0.5208784585649316
Epoch 8 of 8
Average Training Loss: 0.5016894607349883. Epoch Training Time: 0:00:58
Average Validat

##Generate Poem Stanzas

In [None]:
poem_stanza_model.eval()

sample_outputs = poem_stanza_model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length=MAX_LEN,
                                top_p=0.95, 
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: , I am happy
 My spirit hath drawn me into this world
 In the dream of waking dream;
 And the light that illumine'd in a dream
 Upon my brows 
 I dwell, my soul afar at length,
 A melancholy shadow afar afar at length
 My shadow's gaze—
 A strange feeling of a thousand dead,
 The soul of the dead,
 And the soul that never sleeps—
 So that my death may never rest 
 Like the shadow of the night—
 And the dead that never awaken—
 In my soul that trembles,
 And is, alas, dead before death:
 In my dream, my spirit hath drawn me into this world
 And hath drawn me into this world
 Of waking dream (what may then not be called a dream?) 
 For the night that has no darkness— 
 Is as quiet as the dawn of a storm 
 The moon within a sky;
 And in the hour of its waking,
 A mystic melody floats;
 The wind hath not yet flown,
 And there hath not reached its apogee—
 As the sun and 
 The moon have not yet departed!
 The night that is not 
 Is not 
 And then is not 
 A radiant dream-cast upon the se