# GPT-2 Fine-tuning.ipynb

This file contains the code for fine-tuning GPT-2 on the dataset we used.

In [None]:
# Import libraries
import os
import pandas as pd

from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [2]:
os.environ["WANDB_DISABLED"] = "true" # to disable wandb logging

In [3]:
n = 5 # change the subsequence length to try different N's

# Read the files
train_df = pd.read_json('train_fold_1_n_5.json')
val_df = pd.read_json('val_fold_1_n_5.json')
test_df = pd.read_json('test_n_5.json')

# Shuffle the dataframes
train_df = train_df.sample(frac=1, random_state=1).reset_index(drop=True)
val_df = val_df.sample(frac=1, random_state=1).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=1).reset_index(drop=True)

11302 5675 3130 16977


  train_val_df = train_df.append(val_df)


In [4]:
dfs = [train_df, val_df, test_df]
texts = []

# Generate texts for fine-tuning GPT-2 for each dataframe
for df in dfs:
    text = ""
    for index, row in df.iterrows():
        difficulties = row['difficulties']
        answers = row['answers']
        for i in range(len(difficulties)):
            # Form each data point as a string
            text += 'Difficulty: ' + str(difficulties[i]) + ' -> Answer: ' + ('CORRECT' if int(answers[i]) == 1 else 'WRONG')
            if i < len(difficulties) - 1:
                text += ' ||| ' # separator between subsequent items in a subsequence
        text += '\n<|endoftext|>\n' # separator between subsequences
    texts.append(text)

# Write the formed texts to files
with open('train_data_gpt2.txt', 'w') as f:
    f.write(texts[0])
with open('val_data_gpt2.txt', 'w') as f:
    f.write(texts[1])
with open('test_data_gpt2.txt', 'w') as f:
    f.write(texts[2])



In [5]:
def fine_tune_gpt2(model_name, train_file, val_file, output_dir):
    """
    Fine-tune GPT-2 model on the given data and save the fine-tuned model.

    Args:
        model_name: Name of the GPT-2 model from HuggingFace to be fine-tuned.
        train_file: Path to the file containing the training data.
        val_file: Path to the file containing the validation data.
        output_dir: Path to the directory where the fine-tuned model should be saved.

    Returns:
        None
    """

    # Load the GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Form the training and eval dataset objects and data collator
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)
    eval_dataset = TextDataset(tokenizer=tokenizer, file_path=val_file, block_size=128)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    
    # Specify training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        save_steps=1000
    )

    # Create the trainer object
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    # Train the model
    trainer.train()

    # Save the fine-tuned model after training
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [6]:
# Call the function to fine-tune GPT-2

fine_tune_gpt2("gpt2", "train_data_gpt2.txt", "val_data_gpt2.txt", "output")

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.2086
1000,0.1722
1500,0.1633
2000,0.1597


In [7]:
# Test inference of our fine-tuned model on some sample data

from transformers import pipeline

# Load the saved model
review_pipeline = pipeline('text-generation', model = './output', tokenizer = 'gpt2')

# Generate text using the model
result = review_pipeline("Difficulty: 1 -> Answer: CORRECT ||| Difficulty: 1 -> Answer: CORRECT ||| Difficulty: 2 -> Answer: CORRECT ||| Difficulty: 2 -> Answer: WRONG ||| Difficulty: 2 -> Answer: ")[0]['generated_text'] # sample input; change this to try different random inputs
print("<<<>>>", result, "<<<>>>")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<<<>>> Difficulty: 1 -> Answer: CORRECT ||| Difficulty: 1 -> Answer: CORRECT ||| Difficulty: 2 -> Answer: CORRECT ||| Difficulty: 2 -> Answer: WRONG ||| Difficulty: 2 -> Answer: iculty: <<<>>>
