### Fine-tuning GPT using Compatible Pairs

This notebook fine-tunes GPT-2 (the `distilgpt2` checkpoint) on pairs of profiles produced by the matching algorithm whose cosine similarity exceeds a certain threshold.

In [None]:
! pip install -U accelerate
! pip install -U transformers

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m276.5/280.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.1
    Uninstalling transformers-4.38.1:
      Successfully uninstalled transformers-4.38.1
Successfully installed transformers-4.38.2


In [None]:
import pandas as pd
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import ast

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs are read from, and the
# zipped model is later moved to, paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
## Load in dataframes for the process

SAMPLE_SIZE = 1000   # number of merged rows used to build the fine-tuning corpus
SAMPLE_SEED = 42     # fixed seed so Restart & Run All draws the same sample
ESSAY_COLUMNS = [f'essay{i}' for i in range(9)]  # essay0 .. essay8

matches_df = pd.read_csv('drive/MyDrive/okcupid_matches.csv')
matches_df = matches_df.rename(columns={'Unnamed: 0': 'Person_id'})

# The row position of each profile is exposed as its Person_id.
bios_text = pd.read_csv('drive/MyDrive/okcupid_profiles.csv')
bios_text = bios_text.reset_index().rename(columns={'index': 'Person_id'})

## Joining the bios together

# Missing values become a single space so ' '.join never sees NaN.
bios_text = bios_text.fillna(' ')
bios_text['allessays'] = bios_text[ESSAY_COLUMNS].apply(' '.join, axis=1)
bios_text = bios_text[['Person_id', 'allessays']]

## Join two df together on person_id

merged_df = pd.merge(matches_df, bios_text, on='Person_id')

# Cap the sample at the number of available rows (sample() raises when asked
# for more rows than exist) and seed it so the training corpus is reproducible.
finetune_df = merged_df.sample(n=min(SAMPLE_SIZE, len(merged_df)), random_state=SAMPLE_SEED)

In [None]:
## Preparing matches for fine tuning

# Build a Person_id -> essay text lookup once, instead of filtering the whole
# bios_text frame for every single match.
essays_by_id = dict(zip(bios_text['Person_id'], bios_text['allessays']))

pair_lines = []
skipped_matches = 0

for index, row in finetune_df.iterrows():
    raw_matches = row['matches']
    # A non-string value (e.g. NaN) or the literal '[]' means there is nothing to pair.
    if not isinstance(raw_matches, str) or raw_matches == '[]':
        continue
    for match in ast.literal_eval(raw_matches):
        match_id = match[0]  # first element of each match entry is used as the matched Person_id
        match_essays = essays_by_id.get(match_id)
        if match_essays is None:
            # The matched id has no row in bios_text; skip it rather than crash.
            skipped_matches += 1
            continue
        ## Add indicator to model what the input and output is
        pair_lines.append(f"Input: {row['allessays']}\n")
        pair_lines.append(f"Output: {match_essays}\n")

if skipped_matches:
    print(f"Skipped {skipped_matches} matches whose Person_id was not found in bios_text")

input_output_pairs = "".join(pair_lines)

# Explicit UTF-8 so non-ASCII essay text is written the same way on any platform.
with open("input_output_pairs.txt", "w", encoding="utf-8") as f:
    f.write(input_output_pairs)

In [None]:
def fine_tune_gpt2(train_file, output_dir, model_name="distilgpt2", block_size=128,
                   num_train_epochs=2, per_device_train_batch_size=4):
    '''
    This function fine-tunes a GPT-2 family causal language model
    (Distil-GPT2 by default) using the created train file.

    Inputs:
        train_file: path to the training text (.txt)
        output_dir: directory (file path) that receives checkpoints, the
            final model and the tokenizer
        model_name: pretrained checkpoint to start from
        block_size: number of tokens per training example
        num_train_epochs: number of passes over the training data
        per_device_train_batch_size: examples per device per training step
    Outputs: None; the model (folder with model components) is written to output_dir
    '''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # TextDataset keeps a cache of tokenized features on disk and reuses it by
    # default; force a rebuild so a regenerated train_file is never shadowed
    # by features cached from an earlier version of the file.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
        overwrite_cache=True)

    # mlm=False selects causal (next-token) language modeling rather than masked LM.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=10_000,
        save_total_limit=2
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()

    # Save model and tokenizer side by side so output_dir can be loaded on its own.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
# Fine-tune on the generated pairs file; the model and tokenizer are saved to ./fine_tuned_model.
fine_tune_gpt2("input_output_pairs.txt", "fine_tuned_model")

Step,Training Loss
500,4.0365
1000,3.7859
1500,3.6377
2000,3.5552
2500,3.4722
3000,3.4189
3500,3.3388
4000,3.3047
4500,3.2535
5000,3.2229


Checkpoint destination directory fine_tuned_model/checkpoint-10000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
import shutil

from google.colab import files

# files.download serves a single file, and "fine_tuned_model" is a directory,
# so archive it first and download the archive. A separate archive name keeps
# this download copy distinct from fine_tuned_model.zip.
download_archive = shutil.make_archive("fine_tuned_model_download", "zip", root_dir="fine_tuned_model")
files.download(download_archive)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
## Saving model to Google Drive

# First step: recursively zip the model output directory into /content/fine_tuned_model.zip.
!zip -r /content/fine_tuned_model.zip /content/fine_tuned_model

  adding: content/fine_tuned_model/ (stored 0%)
  adding: content/fine_tuned_model/generation_config.json (deflated 24%)
  adding: content/fine_tuned_model/runs/ (stored 0%)
  adding: content/fine_tuned_model/runs/Mar04_02-22-47_32e0eadc79ed/ (stored 0%)
  adding: content/fine_tuned_model/runs/Mar04_02-22-47_32e0eadc79ed/events.out.tfevents.1709518968.32e0eadc79ed.5092.1 (deflated 63%)
  adding: content/fine_tuned_model/runs/Mar04_02-31-20_32e0eadc79ed/ (stored 0%)
  adding: content/fine_tuned_model/runs/Mar04_02-31-20_32e0eadc79ed/events.out.tfevents.1709519482.32e0eadc79ed.5092.2 (deflated 60%)
  adding: content/fine_tuned_model/runs/Mar04_02-21-43_32e0eadc79ed/ (stored 0%)
  adding: content/fine_tuned_model/runs/Mar04_02-21-43_32e0eadc79ed/events.out.tfevents.1709518904.32e0eadc79ed.5092.0 (deflated 61%)
  adding: content/fine_tuned_model/runs/Mar04_02-32-14_32e0eadc79ed/ (stored 0%)
  adding: content/fine_tuned_model/runs/Mar04_02-32-14_32e0eadc79ed/events.out.tfevents.1709519536.3

In [None]:
import shutil

In [None]:
# Move the archive into the mounted Drive folder.
# NOTE(review): both paths are relative, so this assumes the working directory is /content — confirm.
shutil.move("fine_tuned_model.zip", "drive/MyDrive/fine_tuned_model.zip")

'drive/MyDrive/fine_tuned_model.zip'