### Fine-tuning GPT using Compatible Pairs

This file fine-tunes GPT-2 on the pairs produced by the matching algorithm whose cosine similarity exceeds a chosen threshold.

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [None]:
## ASSUMES THE DATAFRAME HAS THE FOLLOWING COLUMNS:
## COLUMN 1: PERSON 1'S TEXT (INPUT)
## COLUMN 2: PERSON 2'S TEXT (OUTPUT)
## COLUMN 3: COSINE SIMILARITY

## Modify to appropriate file path
conversations_df = pd.read_csv('.../.csv')

## Need to set value: replace the VALUE placeholder with a number. Only pairs
## whose cosine similarity is strictly greater than it are kept.
threshold = VALUE

## Change to appropriate column names
## Keep the rows above the threshold, and drop rows where either text is
## missing (a NaN cell is a float and cannot be joined with a string).
compatible_rows = conversations_df.loc[
    conversations_df['cosine_similarity'] > threshold,
    ['person1', 'person2'],
].dropna()

## Each training example is person 1's text, a newline, person 2's text and a
## trailing newline.
input_output_pairs = [
    f"{person1}\n{person2}\n"
    for person1, person2 in zip(compatible_rows['person1'], compatible_rows['person2'])
]

## Joining with "\n" leaves a blank line between consecutive pairs.
## Write UTF-8 explicitly so non-ASCII text does not depend on the platform's
## default encoding.
with open("input_output_pairs.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(input_output_pairs))

def fine_tune_gpt2(train_file, output_dir, *, model_name="gpt2", block_size=128,
                   num_train_epochs=5, per_device_train_batch_size=4):
    """Fine-tune a pretrained GPT-2 model on a plain-text file and save it.

    Args:
        train_file: Path to the text file used as training data.
        output_dir: Directory that receives the training checkpoints and the
            final model and tokenizer files.
        model_name: Name or path of the pretrained checkpoint to start from.
        block_size: Number of tokens per training example.
        num_train_epochs: Number of passes over the training data.
        per_device_train_batch_size: Batch size per device.

    Raises:
        ValueError: If ``train_file`` yields no training examples, e.g. when
            it holds fewer than ``block_size`` tokens.
    """
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
        # TextDataset caches tokenized features next to train_file; rebuild
        # them so a regenerated pairs file is not shadowed by a stale cache.
        overwrite_cache=True)

    # Fail early with a clear message instead of running a training loop that
    # has nothing to train on.
    if len(train_dataset) == 0:
        raise ValueError(
            f"No training examples could be built from {train_file!r}; "
            f"it must contain at least block_size={block_size} tokens.")

    # mlm=False selects the causal (next-token) language-modeling objective.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=10_000,     # write a checkpoint every 10,000 steps
        save_total_limit=2,    # keep at most two checkpoints on disk
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save the model and tokenizer together so output_dir can be loaded
    # directly with from_pretrained.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

## Fine-tune on the pairs file written above; the model and tokenizer are saved to "fine_tuned_model".
fine_tune_gpt2("input_output_pairs.txt", "fine_tuned_model")