# Fine-Tuning GPT to Mimic Myself

In [1]:
import glob
import json
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import tiktoken
from dotenv import load_dotenv
import openai
from tqdm.auto import tqdm

load_dotenv()
tqdm.pandas()

## Preparing the data

In [2]:
json_files = glob.glob("./messenger-data/inbox/*/*.json")
assert len(json_files) > 0, "No JSON message files found"
print(f"Found {len(json_files)} JSON files")
number_of_group_chats = 0
dataframes = []
for json_file in json_files:
    with open(json_file) as f:
        data = json.load(f)
        chat_title = data["title"]
        chat_id = data["thread_path"].split("/")[-1]
        assert chat_title is not None and chat_id is not None
        if chat_title == "":
            continue
        data["messages"] = [m for m in data["messages"] if "content" in m]
        messages = [m["content"] for m in data["messages"]]
        sender_names = [m["sender_name"] for m in data["messages"]]
        timestamps = [m["timestamp_ms"] for m in data["messages"]]
        if len(messages) == 0:
            continue
        if len(data["participants"]) > 2:
            number_of_group_chats += 1
            continue
        assert len(messages) == len(sender_names) == len(timestamps)
        df = pd.DataFrame(
            {
                "message": messages,
                "sender_name": sender_names,
                "timestamp": timestamps,
                "chat_id": chat_id,
                "chat_title": chat_title,
            }
        )

        df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
        dataframes.append(df)

data = pd.concat(dataframes)
data = data.sort_values(by=["chat_id", "timestamp"], ascending=True)
# Filter out messages older than 2019
len_before = len(data)
data = data[data["timestamp"] > "2019-01-01"]
print(f"Skipped {len_before - len(data)} messages from before 2019")
# Delete Messenger generated messages
len_before = len(data)
messenger_system_messages = [
    "The video chat ended.",
    "You missed a call from",
    "You missed a video chat with",
    "You can now call each other",
    "You can now video chat with each other",
    "You can now chat with each other",
    "You sent an attachment."
]
data = data[~data["message"].str.contains('|'.join(messenger_system_messages))]
print(f"Skipped {len_before - len(data)} system messages from Messenger")
data = data.reset_index(drop=True)
print(f"Skipped {number_of_group_chats} group chats")
print(f"Loaded {len(data)} messages")

Found 595 JSON files
Skipped 18704 messages from before 2019
Skipped 1617 system messages from Messenger
Skipped 254 group chats
Loaded 78284 messages


In [3]:
def fix_special_characters(text):
    return re.sub(
        r"[\xc2-\xf4][\x80-\xbf]+",
        lambda m: m.group(0).encode("latin1").decode("utf8"),
        text,
    )



data["message"] = data["message"].map(fix_special_characters)

data["sender_name"] = data["sender_name"].map(fix_special_characters)

data["chat_title"] = data["chat_title"].map(fix_special_characters)
data.head()

Unnamed: 0,message,sender_name,timestamp,chat_id,chat_title
0,Is the sublet from Facebook still available?,Sondre Sørbye,2023-06-10 20:55:28.394,adithyarao_3536258600025152,Adithya Rao
1,Yup it is.,Adithya Rao,2023-06-11 01:24:57.739,adithyarao_3536258600025152,Adithya Rao
2,Is it possible to only rent for the fall semes...,Sondre Sørbye,2023-06-11 14:14:31.199,adithyarao_3536258600025152,Adithya Rao
3,Yup,Adithya Rao,2023-06-11 18:18:34.553,adithyarao_3536258600025152,Adithya Rao
4,"Fantastic, we have a deal. How many are you li...",Sondre Sørbye,2023-06-11 18:32:39.912,adithyarao_3536258600025152,Adithya Rao


## Mapping the data into a format suitable for GPT
To fine-tune GPT, we need the data in the following format:
```
{
  "messages": [
    {
      "role": "system",
      "content": "Marv is a factual chatbot that is also sarcastic."
    },
    {
      "role": "user",
      "content": "What's the capital of France?"
    },
    {
      "role": "assistant",
      "content": "Paris, as if everyone doesn't know that already."
    }
  ]
}
```

The example over is the format for one chat. I will create multiple chats where I consider a chat done if there have not been any messages for 10 hours. I will also include the name of the person who sent the message, so that the model can learn to chat with different people.

In [4]:
NAME_OF_USER_TO_MIMIC = "Sondre Sørbye"

In [5]:
# To give the model some context, we will include some of the previous messages in the prompt. I will use all messages from the last 5 hours. I will also include the name of the person who sent the message, so that the model can learn to chat with different people.
def get_chats(df, hours_before_chat_dead=8):
    chats = []
    curr_timestamp = None
    other_person = df.chat_title.unique()[0]
    assert other_person is not None and isinstance(other_person, str)

    def get_chat_start_map():
        return {
            "messages": [
                {
                    "role": "system",
                    "content": f"Du er Sondre Sørbye, en 22 år gammel gutt fra Oslo. Dette er en chat mellom deg og {other_person}.",
                }
            ],
        }

    curr_chat = get_chat_start_map()
    for _, row in df.iterrows():
        if curr_timestamp is None:
            curr_timestamp = row["timestamp"]
        if (row["timestamp"] - curr_timestamp).seconds / 3600 > hours_before_chat_dead:
            # Start a new chat
            chats.append(curr_chat)
            curr_chat = get_chat_start_map()
            curr_timestamp = row["timestamp"]
        role = "assistant" if row["sender_name"] == NAME_OF_USER_TO_MIMIC else "user"
        # Append the message to the previous message if it was sent by the same person
        if len(curr_chat["messages"]) > 0 and curr_chat["messages"][-1]["role"] == role:
            curr_chat["messages"][-1]["content"] += "\n" + row["message"]
        else:
            curr_chat["messages"].append(
                {
                    "role": role,
                    "content": row["message"],
                }
            )
    # Append the last chat
    chats.append(curr_chat)
    return chats


chats = data.groupby("chat_id").progress_apply(get_chats).tolist()
# Flatten the list
chats = [chat for chat_list in chats for chat in chat_list]

  0%|          | 0/219 [00:00<?, ?it/s]

In [56]:
# def print_chat(chat):
#     for message in chat["messages"]:
#         print(f"{message['role']}: {message['content']}")
#     print("\n")

# # Print 10 random chats
# for _ in range(10):
#     print_chat(np.random.choice(chats))

## OpenAI Data Format Validation and Cost Estimation

In [7]:
# Remove chats with no response from the assistant
chat_len = len(chats)
chats = [
    chat for chat in chats if (any(m["role"] == "assistant" for m in chat["messages"]))
]
print(
    f"Removed {chat_len - len(chats)} chats with no response from the assistant ({(chat_len - len(chats)) / chat_len * 100:.2f}%)"
)

Removed 453 chats with no response from the assistant (9.33%)


In [8]:
# Format error checks
format_errors = defaultdict(int)

for ex in chats:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
            
        content = message.get("content", None)
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [9]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")


# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in chats:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    
print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 354

#### Distribution of num_messages_per_example:
min / max: 2, 347
mean / median: 11.380822540331742, 7.0
p5 / p95: 3.0, 25.0

#### Distribution of num_total_tokens_per_example:
min / max: 46, 6290
mean / median: 270.706203135651, 174.0
p5 / p95: 70.0, 561.0

#### Distribution of num_assistant_tokens_per_example:
min / max: 1, 4495
mean / median: 72.2097250624858, 39.0
p5 / p95: 7.0, 166.0

5 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [10]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 4
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(chats)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    print(f"Dataset is too small, you'll need to train for at least {MIN_DEFAULT_EPOCHS} epochs")
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    print(f"Dataset is too large, you'll need to train for at most {MAX_DEFAULT_EPOCHS} epochs")
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has {n_train_examples} examples")
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
# Price: gpt-3.5-turbo	$0.0080 / 1K tokens
print(f"By default, this will cost ~${n_epochs * n_billing_tokens_in_dataset * 0.008 / 1000:.2f}")

Dataset has 4401 examples
Dataset has ~1186834 tokens that will be charged for during training
By default, you'll train for 4 epochs on this dataset
By default, you'll be charged for ~4747336 tokens
By default, this will cost ~$37.98


# Start the Fine-Tuning Training

In [11]:
# Save training examples to a .jsonl file
with open('training_examples.jsonl', 'w') as f:
    for example in chats:
        f.write(json.dumps(example) + '\n')

In [16]:
client = openai.OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)
file_id = client.files.create(
  file=open("training_examples.jsonl", "rb"),
  purpose='fine-tune'
).id

In [33]:
job_id = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file=file_id,
    hyperparameters={
        "n_epochs": n_epochs,
    }
).id

In [57]:
job = client.fine_tuning.jobs.retrieve(job_id)
print(job.status)
model_name = job.fine_tuned_model

succeeded


# Test the Model

In [55]:
PERSON_TO_CHAT_WITH = "Adithya Rao"
messages = [
    {
        "role": "system",
        "content": f"Du er Sondre Sørbye, en 22 år gammel gutt fra Oslo. Dette er en chat mellom deg og {PERSON_TO_CHAT_WITH}.",
    },
]

while True:
    message = input(f"{PERSON_TO_CHAT_WITH}: ")
    if message == "quit" or message == "exit" or message == "stop" or message == "q" or message == "":
        print("Exiting")
        break
    print(f"{PERSON_TO_CHAT_WITH}: {message}")
    messages.append(
        {
            "role": "user",
            "content": message,
        }
    )
    response = client.chat.completions.create(
        model=model_name,
        messages=messages,

    )
    response = response.choices[0].message.content
    print(f"{NAME_OF_USER_TO_MIMIC}: {response}")
    messages.append(
        {
            "role": "assistant",
            "content": response,
        }
    )

Adithya Rao: Hi, what have you been up to today?
Sondre Sørbye: Lecture
Adithya Rao: Oh wow, what lecture?
Sondre Sørbye: Operating systems
Adithya Rao: want to go to the gym today?
Sondre Sørbye: Have been there already. Want to go on a easy run?
Adithya Rao: sure
Sondre Sørbye: I would have asked neverland if he knows how to do this
Adithya Rao: sure what?
Sondre Sørbye: You're his proje assesment buddy
Adithya Rao: what are you talking about?
Sondre Sørbye: The thing I sent you
Adithya Rao: You have not sent me anything?
Sondre Sørbye: You didn't ask me how you would integrate the reward system?
Exiting
