In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import json

In [2]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [3]:
TRAIN_PATH = "train_dataset.txt"
TEST_PATH = "test_dataset.txt"

In [4]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TRAIN_PATH,
    block_size=128
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=TEST_PATH,
    block_size=128
)



In [5]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # If set to False, the labels are the same as the inputs with the padding tokens ignored
)

In [6]:
with open("best_hyperparams.json", "r") as f:
    loaded_best_params = json.load(f)

In [10]:
training_args = TrainingArguments(
    output_dir="./output_nlp",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=loaded_best_params["per_device_train_batch_size"],
    per_device_eval_batch_size=loaded_best_params["per_device_train_batch_size"],
    evaluation_strategy="steps",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10_000,
    save_steps=20_000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    gradient_accumulation_steps=2,
    fp16=True,
    fp16_backend="auto",
    fp16_full_eval=False,
    learning_rate=loaded_best_params["learning_rate"],
    weight_decay=loaded_best_params["weight_decay"],
    warmup_steps=loaded_best_params["warmup_steps"],
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    label_smoothing_factor=0.0,
    report_to=["tensorboard"],
    seed=42
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [12]:
trainer.train()

  0%|          | 0/19140 [00:00<?, ?it/s]

{'loss': 4.2701, 'learning_rate': 1.1696220990259101e-05, 'epoch': 1.57}


  0%|          | 0/1234 [00:00<?, ?it/s]

{'eval_loss': 4.666795253753662, 'eval_runtime': 37.2739, 'eval_samples_per_second': 264.743, 'eval_steps_per_second': 33.106, 'epoch': 1.57}
{'train_runtime': 4022.6298, 'train_samples_per_second': 76.13, 'train_steps_per_second': 4.758, 'train_loss': 4.227321300287357, 'epoch': 3.0}


TrainOutput(global_step=19140, training_loss=4.227321300287357, metrics={'train_runtime': 4022.6298, 'train_samples_per_second': 76.13, 'train_steps_per_second': 4.758, 'train_loss': 4.227321300287357, 'epoch': 3.0})

In [17]:
SAVE_DIR = "./output_nlp"

In [18]:
trainer.save_model(SAVE_DIR)

In [21]:
def generate_story_text(prompt, model_path=SAVE_DIR):
    generator = pipeline("text-generation", model=model_path, tokenizer="gpt2")
    story_text = generator(prompt, max_length=30, num_return_sequences=1, temperature=0.8, top_k=50, top_p=0.95)[0]["generated_text"]
    return story_text

In [22]:
prompt = "Once upon a time, in a small village, there was a little girl named Alice."
generated_text = generate_story_text(prompt)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, in a small village, there was a little girl named Alice. She loved to go to school and to play with the boys
