# CS 39AA - Notebook 13b: Text Generation with fine-tuned GPT-2

Let's now see what kind of results we can get if we take the same model but fine tune on what some say is Hemingway's best novel, 'The Sun Also Rises'. 

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel
from transformers import set_seed

In [3]:
MODEL_NAME = 'gpt2-medium'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50259, 1024)

In [5]:
sentences = pd.read_csv('data/sunalsorises.csv')['sentence']
sentences.head()

0    Robert Cohn was once middleweight boxing champ...
1    Do not think that I am very much impressed by ...
2    He cared nothing for boxing, in fact he dislik...
3    There was a certain inner comfort in knowing h...
4                    He was Spider Kelly’s star pupil.
Name: sentence, dtype: object

In [6]:
max_length = max([len(tokenizer.encode(sentence)) for sentence in sentences])

In [7]:
max_length

224

In [8]:
class HemingwayDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self): # overload the len() Python built-in function
        return len(self.input_ids)

    def __getitem__(self, idx): # overload the [] operator
        return self.input_ids[idx], self.attn_masks[idx]



In [9]:
dataset = HemingwayDataset(sentences, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [10]:
train_dataset[0]

(tensor([50257,   447,   250,  5812,    11, 18726,    11,   314,  1842,   345,
           523,   881,    13, 50256, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 5

In [11]:
import gc
gc.collect()

584

In [12]:
training_args = TrainingArguments(output_dir='/Users/steve/models/hemingway_generation', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='/Users/steve/models/hemingway_generation/logs', report_to = 'none')

In [13]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()



  0%|          | 0/6143 [00:00<?, ?it/s]

{'loss': 0.7004, 'learning_rate': 4.9266264470895154e-05, 'epoch': 0.02}
{'loss': 0.2363, 'learning_rate': 4.845100277188978e-05, 'epoch': 0.03}
{'loss': 0.2571, 'learning_rate': 4.76357410728844e-05, 'epoch': 0.05}
{'loss': 0.207, 'learning_rate': 4.6820479373879014e-05, 'epoch': 0.07}
{'loss': 0.187, 'learning_rate': 4.6005217674873634e-05, 'epoch': 0.08}
{'loss': 0.2271, 'learning_rate': 4.5189955975868254e-05, 'epoch': 0.1}
{'loss': 0.2163, 'learning_rate': 4.4374694276862874e-05, 'epoch': 0.11}
{'loss': 0.2242, 'learning_rate': 4.3559432577857494e-05, 'epoch': 0.13}
{'loss': 0.202, 'learning_rate': 4.2744170878852114e-05, 'epoch': 0.15}
{'loss': 0.2067, 'learning_rate': 4.1928909179846734e-05, 'epoch': 0.16}
{'loss': 0.2326, 'learning_rate': 4.111364748084135e-05, 'epoch': 0.18}
{'loss': 0.22, 'learning_rate': 4.0298385781835974e-05, 'epoch': 0.2}
{'loss': 0.2208, 'learning_rate': 3.9483124082830593e-05, 'epoch': 0.21}
{'loss': 0.1875, 'learning_rate': 3.8667862383825207e-05, 'epo

TrainOutput(global_step=6143, training_loss=0.20672675171097152, metrics={'train_runtime': 9036.0197, 'train_samples_per_second': 0.68, 'train_steps_per_second': 0.68, 'train_loss': 0.20672675171097152, 'epoch': 1.0})

In [14]:
model.save_pretrained("/Users/steve/models/hemingway_generation")

In [15]:
tokenizer.save_pretrained("/Users/steve/models/hemingway_generation")

('/Users/steve/models/hemingway_generation/tokenizer_config.json',
 '/Users/steve/models/hemingway_generation/special_tokens_map.json',
 '/Users/steve/models/hemingway_generation/vocab.json',
 '/Users/steve/models/hemingway_generation/merges.txt',
 '/Users/steve/models/hemingway_generation/added_tokens.json')

In [16]:
tokenizer = GPT2Tokenizer.from_pretrained("/Users/steve/models/hemingway_generation")
model = GPT2LMHeadModel.from_pretrained("/Users/steve/models/hemingway_generation")

In [26]:
set_seed(41)
generated = tokenizer("<|startoftext|> The old bullfighter fell and", return_tensors="pt").input_ids
sample_outputs = model.generate(generated, do_sample=True, top_k=1000, max_length=20, temperature=0.5, num_return_sequences=20, pad_token_id=tokenizer.eos_token_id)

In [27]:
for i in range(len(sample_outputs)):
    generated = tokenizer.decode(sample_outputs[i])
    len_gen = len(sample_outputs[i])
    generated = generated.replace('\n', ' ') # remove new line characters from generated text
    print(f"ret_seq{i}: {generated} \n    (len(generated) = {len(generated.split())}) \n")


ret_seq0: <|startoftext|>  The old bullfighter fell and the old man was dead.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> 
    (len(generated) = 11) 

ret_seq1: <|startoftext|>  The old bullfighter fell and the old bullfighter fell.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> 
    (len(generated) = 10) 

ret_seq2: <|startoftext|>  The old bullfighter fell and it was all over.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> 
    (len(generated) = 10) 

ret_seq3: <|startoftext|>  The old bullfighter fell and hit the ground.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|> 
    (len(generated) = 9) 

ret_seq4: <|startoftext|>  The old bullfighter fell and hit the bull down, his sword in his hand.<|endoftext|><|endoftext|> 
    (len(generated) = 15) 

ret_seq5: <|startoftext|

Some references for this notebook are: 
* https://www.kaggle.com/code/nulldata/fine-tuning-gpt-2-to-generate-netlfix-descriptions/notebook
* https://medium.com/geekculture/fine-tune-eleutherai-gpt-neo-to-generate-netflix-movie-descriptions-in-only-47-lines-of-code-40c9b4c32475

