In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import optuna
from transformers import EarlyStoppingCallback
from sklearn.model_selection import train_test_split

In [2]:
BOOKS_PATH = "processed_books.txt"

In [3]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [22]:
def load_and_split_data(file_path, test_size=0.1):
    with open(file_path, "r") as file:
        text = file.read()
    
    stories = text.split("\n\n")
    train_stories, test_stories = train_test_split(stories, test_size=test_size, random_state=42)

    return train_stories, test_stories

In [24]:
train_stories, test_stories = load_and_split_data(BOOKS_PATH)

In [25]:
train_file_path = "train_dataset.txt"
test_file_path = "test_dataset.txt"

with open(train_file_path, "w") as file:
    file.write("\n\n".join(train_stories))
with open(test_file_path, "w") as file:
    file.write("\n\n".join(test_stories))

In [26]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file_path,
    block_size=128
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=test_file_path,
    block_size=128
)



In [5]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # If set to False, the labels are the same as the inputs with the padding tokens ignored
)

In [6]:
training_args = TrainingArguments(
    output_dir="./output_nlp",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=500,
    save_steps=10_000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    gradient_accumulation_steps=1,
    fp16=True,
    fp16_backend="auto",
    fp16_full_eval=False,
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    lr_scheduler_type="linear",
    warmup_steps=0,
    label_smoothing_factor=0.0,
    report_to=["tensorboard"],
    seed=42
)

In [7]:
# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)


In [9]:
def generate_story_text(prompt, model_path="./output"):
    generator = pipeline("text-generation", model=model_path, tokenizer="gpt2")
    story_text = generator(prompt, max_length=150, num_return_sequences=1, temperature=0.8, top_k=50, top_p=0.95)[0]["generated_text"]
    return story_text

In [17]:
def objective(trial, train_dataset, data_collator):

    # Define hyperparameters using trial.suggest_* methods
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16])
    weight_decay = trial.suggest_float("weight_decay", 0, 0.1)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)

    # Set up the training arguments using the suggested hyperparameters
    training_args = TrainingArguments(
        output_dir="./output",
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        evaluation_strategy="steps",
        logging_dir="./logs",
        logging_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        report_to=["tensorboard"],
        seed=42,
        disable_tqdm=True,
    )

    # Set up the Trainer instance using the suggested hyperparameters
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train the model and return the best loss
    trainer.train()
    best_loss = trainer.evaluate()["eval_loss"]
    return best_loss

In [20]:
# Create an Optuna study
study = optuna.create_study(direction="minimize", study_name="gpt2_hyperparameter_tuning")

# Optimize the hyperparameters
study.optimize(lambda trial: objective(trial, train_dataset, data_collator), n_trials=20, timeout=3600)

[32m[I 2023-04-17 16:31:10,923][0m A new study created in memory with name: gpt2_hyperparameter_tuning[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
[33m[W 2023-04-17 16:32:07,407][0m Trial 0 failed with parameters: {'learning_rate': 0.00047807619038330284, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'weight_decay': 0.00945210956957634, 'warmup_steps': 355} because of the following error: ValueError('Trainer: evaluation requires an eval_dataset.').[0m
Traceback (most recent call last):
  File "c:\Users\reidp\miniconda3\envs\torch_gpu\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\reidp\AppData\Local\Temp\ipykernel_11992\2630444933.py", line 5, in <lambda>
    study.optimize(lambda trial: objective(trial, train_dataset, data_collator), n_trials=20, timeout=3600)
  File "C:\Users\reidp\AppData\Local\Temp\ipykernel_11992\1918027171.py", line 43, in objective
    traine

{'loss': 5.0196, 'learning_rate': 0.0004775795501224805, 'epoch': 0.02}


ValueError: Trainer: evaluation requires an eval_dataset.

In [None]:
best_params = study.best