In [23]:
!pip install transformers datasets accelerate torch



Sample Story Dataset

In [24]:
stories = [
    "Once upon a time, a lonely dragon guarded a forgotten mountain.",
    "In the year 3020, humans discovered a portal to another universe.",
    "A small girl found a magical key hidden beneath her bed.",
    "The robot slowly realized it had developed emotions.",
    "Deep in the forest, a whispering tree told ancient secrets."
]

In [25]:
from datasets import Dataset

dataset = Dataset.from_dict({"text": stories})

Load GPT-2

In [26]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

[1mGPT2LMHeadModel LOAD REPORT[0m from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Tokenize Dataset

In [27]:
def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64
    )

    tokens["labels"] = tokens["input_ids"].copy()

    return tokens

Set Training Arguments

In [28]:
!pip install --upgrade transformers



In [29]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-story",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=1,
    logging_steps=5
)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Trainer Setup

In [30]:
from transformers import Trainer

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset
)

In [31]:
print(tokenized_dataset[0].keys())

dict_keys(['text', 'input_ids', 'attention_mask', 'labels'])


Start Fine-Tuning

In [32]:
trainer.train()



Step,Training Loss
5,4.437057


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=9, training_loss=2.92103730307685, metrics={'train_runtime': 96.8972, 'train_samples_per_second': 0.155, 'train_steps_per_second': 0.093, 'total_flos': 489922560000.0, 'train_loss': 2.92103730307685, 'epoch': 3.0})

Generate creative story

In [33]:
import torch

model.eval()

prompt = 'Once upon a time'
inputs = tokenizer(prompt, return_tensors = 'pt')

outputs = model.generate(
    inputs['input_ids'],
    max_length = 100,
    num_return_sequences = 1,
    temperature = 0.8,
    top_k = 50,
    top_p = 0.95,
    do_sample = True
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [34]:
generated_story = tokenizer.decode(outputs[0], skip_special_tokens = True)
print(generated_story)

Once upon a time, the gods were willing to sacrifice the lives of men to save their planet from a new foe.
