# CS 39AA Text Generation: Model Fine-tuned w/ Hemingway

Let's now see what kind of results we can get if we take the same model but fine tune on what some say is Hemingway's best novel, 'The Sun Also Rises'. 

In [13]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel

NOTE: Redirects are currently not supported in Windows or MacOs.


In [14]:
MODEL_NAME = 'gpt2-medium'

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))

loading file vocab.json from cache at /Users/steve/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/vocab.json
loading file merges.txt from cache at /Users/steve/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /Users/steve/.cache/huggingface/hub/models--gpt2-medium/snapshots/e852c9080bc759a01663acf5a828d95b261a9903/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,


Embedding(50259, 1024)

In [15]:
sentences = pd.read_csv('/Users/steve/sunalsorises.csv')['sentence']
sentences.head()

0    Robert Cohn was once middleweight boxing champ...
1    Do not think that I am very much impressed by ...
2    He cared nothing for boxing, in fact he dislik...
3    There was a certain inner comfort in knowing h...
4                    He was Spider Kelly’s star pupil.
Name: sentence, dtype: object

In [16]:
max_length = max([len(tokenizer.encode(sentence)) for sentence in sentences])

In [17]:
max_length

224

In [18]:
class HemingwayDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]



In [19]:
dataset = HemingwayDataset(sentences, tokenizer, max_length=max_length)
train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

In [20]:
train_dataset[0]

(tensor([50257,   447,   247,   447,   251,   564,   250,  1212,  8237,   318,
          1165,   922,   329, 27805,    12,  7109,  8040,    11,   616, 13674,
            13, 50256, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258, 50258,
         50258, 50258, 50258, 50258, 50258, 50258, 5

In [21]:
import gc
gc.collect()

4

In [22]:
training_args = TrainingArguments(output_dir='/Users/steve/models/hemingway_generation', num_train_epochs=1, logging_steps=100, save_steps=5000,
                                  per_device_train_batch_size=1, per_device_eval_batch_size=1,
                                  warmup_steps=10, weight_decay=0.05, logging_dir='/Users/steve/models/hemingway_generation/logs', report_to = 'none')

PyTorch: setting up devices


In [23]:
Trainer(model=model,  args=training_args, train_dataset=train_dataset, 
        eval_dataset=val_dataset, data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                                              'attention_mask': torch.stack([f[1] for f in data]),
                                                              'labels': torch.stack([f[0] for f in data])}).train()

***** Running training *****
  Num examples = 6143
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 6143
  Number of trainable parameters = 354825216


  0%|          | 0/6143 [00:00<?, ?it/s]

{'loss': 0.6824, 'learning_rate': 4.9266264470895154e-05, 'epoch': 0.02}
{'loss': 0.2265, 'learning_rate': 4.845100277188978e-05, 'epoch': 0.03}
{'loss': 0.201, 'learning_rate': 4.76357410728844e-05, 'epoch': 0.05}
{'loss': 0.1972, 'learning_rate': 4.6820479373879014e-05, 'epoch': 0.07}
{'loss': 0.21, 'learning_rate': 4.6005217674873634e-05, 'epoch': 0.08}
{'loss': 0.2374, 'learning_rate': 4.5189955975868254e-05, 'epoch': 0.1}
{'loss': 0.2055, 'learning_rate': 4.4374694276862874e-05, 'epoch': 0.11}
{'loss': 0.2246, 'learning_rate': 4.3559432577857494e-05, 'epoch': 0.13}
{'loss': 0.1911, 'learning_rate': 4.2744170878852114e-05, 'epoch': 0.15}
{'loss': 0.2071, 'learning_rate': 4.1928909179846734e-05, 'epoch': 0.16}
{'loss': 0.1874, 'learning_rate': 4.111364748084135e-05, 'epoch': 0.18}
{'loss': 0.1961, 'learning_rate': 4.0298385781835974e-05, 'epoch': 0.2}
{'loss': 0.1987, 'learning_rate': 3.9483124082830593e-05, 'epoch': 0.21}
{'loss': 0.2168, 'learning_rate': 3.8667862383825207e-05, 'e

Saving model checkpoint to /Users/steve/models/hemingway_generation/checkpoint-5000
Configuration saved in /Users/steve/models/hemingway_generation/checkpoint-5000/config.json


{'loss': 0.1837, 'learning_rate': 9.318441219631502e-06, 'epoch': 0.81}


Model weights saved in /Users/steve/models/hemingway_generation/checkpoint-5000/pytorch_model.bin


{'loss': 0.1973, 'learning_rate': 8.503179520626122e-06, 'epoch': 0.83}
{'loss': 0.1723, 'learning_rate': 7.68791782162074e-06, 'epoch': 0.85}
{'loss': 0.1748, 'learning_rate': 6.87265612261536e-06, 'epoch': 0.86}
{'loss': 0.1857, 'learning_rate': 6.057394423609979e-06, 'epoch': 0.88}
{'loss': 0.2087, 'learning_rate': 5.2421327246045984e-06, 'epoch': 0.9}
{'loss': 0.1846, 'learning_rate': 4.4268710255992175e-06, 'epoch': 0.91}
{'loss': 0.2186, 'learning_rate': 3.6116093265938366e-06, 'epoch': 0.93}
{'loss': 0.1871, 'learning_rate': 2.796347627588456e-06, 'epoch': 0.94}
{'loss': 0.178, 'learning_rate': 1.981085928583075e-06, 'epoch': 0.96}
{'loss': 0.1762, 'learning_rate': 1.1658242295776945e-06, 'epoch': 0.98}
{'loss': 0.1919, 'learning_rate': 3.505625305723137e-07, 'epoch': 0.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 9920.2957, 'train_samples_per_second': 0.619, 'train_steps_per_second': 0.619, 'train_loss': 0.2072916038646074, 'epoch': 1.0}


TrainOutput(global_step=6143, training_loss=0.2072916038646074, metrics={'train_runtime': 9920.2957, 'train_samples_per_second': 0.619, 'train_steps_per_second': 0.619, 'train_loss': 0.2072916038646074, 'epoch': 1.0})

In [41]:
generated = tokenizer("<|startoftext|> The old sailor hit", return_tensors="pt").input_ids

In [42]:
sample_outputs = model.generate(generated, do_sample=True, top_k=50, max_length=20, top_p=0.95, temperature=1.5, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [43]:
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

0:  The old sailor hit her in the pocket and put her down on the ground before going off with
1:  The old sailor hit her face and watched her back through all the crowd.
2:  The old sailor hit his chest and weaved under for the two men to open a ring.
3:  The old sailor hit three-dollar a beer when we came in, and Brett paid him and
4:  The old sailor hit hard and hard over, putting into my chest, rolling up him toward me
5:  The old sailor hit that girl on the chest and then fell limp with exhaustion and took the knife
6:  The old sailor hit Brett down hard on the arm for five solid minutes.
7:  The old sailor hit Romero and knocked the bull in his pocket into the bull-chest.
8:  The old sailor hit and drove a line for a big man as we rode along, not to
9:  The old sailor hit him again and said: We say that you love.
10:  The old sailor hit me.
11:  The old sailor hit for gold, and it paid some money.
12:  The old sailor hit him across in his pocket where his name was inscribed on the

In [44]:
model.save_pretrained("/Users/steve/models/hemingway_generation")

Configuration saved in /Users/steve/models/hemingway_generation/config.json
Model weights saved in /Users/steve/models/hemingway_generation/pytorch_model.bin


In [45]:
tokenizer.save_pretrained("/Users/steve/models/hemingway_generation")

tokenizer config file saved in /Users/steve/models/hemingway_generation/tokenizer_config.json
Special tokens file saved in /Users/steve/models/hemingway_generation/special_tokens_map.json
added tokens file saved in /Users/steve/models/hemingway_generation/added_tokens.json


('/Users/steve/models/hemingway_generation/tokenizer_config.json',
 '/Users/steve/models/hemingway_generation/special_tokens_map.json',
 '/Users/steve/models/hemingway_generation/vocab.json',
 '/Users/steve/models/hemingway_generation/merges.txt',
 '/Users/steve/models/hemingway_generation/added_tokens.json')