In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import optuna
from transformers import EarlyStoppingCallback
from sklearn.model_selection import train_test_split

In [3]:
BOOKS_PATH = "processed_books.txt"

In [4]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [5]:
def load_and_split_data(file_path, test_size=0.1):
    with open(file_path, "r") as file:
        text = file.read()
    
    stories = text.split("\n\n")
    train_stories, test_stories = train_test_split(stories, test_size=test_size, random_state=42)

    return train_stories, test_stories

In [6]:
train_stories, test_stories = load_and_split_data(BOOKS_PATH)

In [7]:
train_file_path = "train_dataset.txt"
test_file_path = "test_dataset.txt"

with open(train_file_path, "w") as file:
    file.write("\n\n".join(train_stories))
with open(test_file_path, "w") as file:
    file.write("\n\n".join(test_stories))

In [13]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=train_file_path,
    block_size=128
)

test_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=test_file_path,
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False # If set to False, the labels are the same as the inputs with the padding tokens ignored
)

In [9]:
def objective(trial, train_dataset,test_dataset, data_collator):

    # Define hyperparameters using trial.suggest_* methods
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16])
    weight_decay = trial.suggest_float("weight_decay", 0, 0.1)
    warmup_steps = trial.suggest_int("warmup_steps", 0, 500)
    epochs = 3

    # Set up the training arguments using the suggested hyperparameters
    training_args = TrainingArguments(
        output_dir="./output",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_steps=warmup_steps,
        evaluation_strategy="steps",
        logging_dir="./logs",
        logging_steps=500,
        eval_steps=500,
        save_steps=500,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        report_to=["tensorboard"],
        seed=42,
        disable_tqdm=True,
    )

    # Set up the Trainer instance using the suggested hyperparameters
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    # Train the model and return the best loss
    trainer.train()
    best_loss = trainer.evaluate()["eval_loss"]
    return best_loss

In [11]:
# Take only 10% of the train and test datasets so it's faster to hyper parameter tune
train_dataset_hpt = train_dataset[:int(len(train_dataset) * 0.01)]
eval_dataset_hpt= test_dataset[:int(len(test_dataset) * 0.01)]

print("Training dataset size:", len(train_dataset_hpt))
print("Evaluation dataset size:", len(eval_dataset_hpt))

Training dataset size: 1020
Evaluation dataset size: 98


In [14]:
# Create an Optuna study
study = optuna.create_study(direction="minimize", study_name="gpt2_hyperparameter_tuning")

# Optimize the hyperparameters
study.optimize(lambda trial: objective(trial, train_dataset_hpt, eval_dataset_hpt, data_collator), n_trials=20, timeout=3600)

[32m[I 2023-04-17 17:26:25,762][0m A new study created in memory with name: gpt2_hyperparameter_tuning[0m
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)


{'train_runtime': 81.5046, 'train_samples_per_second': 37.544, 'train_steps_per_second': 2.356, 'train_loss': 4.65250809987386, 'epoch': 3.0}


[32m[I 2023-04-17 17:27:49,001][0m Trial 0 finished with value: 5.920174598693848 and parameters: {'learning_rate': 3.4073154032402004e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.02363315374328481, 'warmup_steps': 474}. Best is trial 0 with value: 5.920174598693848.[0m


{'eval_loss': 5.920174598693848, 'eval_runtime': 0.5589, 'eval_samples_per_second': 175.358, 'eval_steps_per_second': 23.262, 'epoch': 3.0}
{'loss': 4.1135, 'learning_rate': 6.803764396982683e-05, 'epoch': 1.96}
{'eval_loss': 6.6518659591674805, 'eval_runtime': 0.5905, 'eval_samples_per_second': 165.951, 'eval_steps_per_second': 22.014, 'epoch': 1.96}
{'train_runtime': 86.3183, 'train_samples_per_second': 35.45, 'train_steps_per_second': 8.863, 'train_loss': 3.901376203150531, 'epoch': 3.0}


[32m[I 2023-04-17 17:29:15,943][0m Trial 1 finished with value: 6.6518659591674805 and parameters: {'learning_rate': 0.00010680626374131307, 'per_device_train_batch_size': 4, 'weight_decay': 0.023937529176263652, 'warmup_steps': 349}. Best is trial 0 with value: 5.920174598693848.[0m


{'eval_loss': 6.6518659591674805, 'eval_runtime': 0.5825, 'eval_samples_per_second': 168.232, 'eval_steps_per_second': 22.316, 'epoch': 3.0}


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)


{'loss': 3.479, 'learning_rate': 0.00053605656719458, 'epoch': 1.96}
{'eval_loss': 8.383543014526367, 'eval_runtime': 0.5715, 'eval_samples_per_second': 171.473, 'eval_steps_per_second': 22.746, 'epoch': 1.96}
{'train_runtime': 84.8371, 'train_samples_per_second': 36.069, 'train_steps_per_second': 9.017, 'train_loss': 3.2747329612183416, 'epoch': 3.0}


[32m[I 2023-04-17 17:30:41,383][0m Trial 2 finished with value: 8.383543014526367 and parameters: {'learning_rate': 0.0005401022771356712, 'per_device_train_batch_size': 4, 'weight_decay': 0.036951970829043744, 'warmup_steps': 498}. Best is trial 0 with value: 5.920174598693848.[0m


{'eval_loss': 8.383543014526367, 'eval_runtime': 0.5835, 'eval_samples_per_second': 167.943, 'eval_steps_per_second': 22.278, 'epoch': 3.0}


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)


In [None]:
best_params = study.best