# Create OpenAI fine-tuning prompts and completions

This notebook creates the prompts and completions used to fine-tune an OpenAI model for research object classification.

In [None]:
import os
import pandas as pd

# Project root and data folder, expressed relative to the working directory.
base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')

# Input files: both live in the software-architecture data folder.
architecture_dir = os.path.join(data_dir, 'software_architecture')
bibtext_file = os.path.join(architecture_dir, 'bib-text.csv')
taxonomy_file = os.path.join(architecture_dir, 'taxonomy_explanation.json')

# Output folder for the generated fine-tuning files; created up front so the
# export cells can write into it without further checks.
output_dir = os.path.join(data_dir, 'openai_fine_tune')
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the bibliography table from which the prompts and completions are built.
df = pd.read_csv(filepath_or_buffer=bibtext_file)

### Create prompts and completions for openai fine-tuning

* **Prompts** are created by concatenating the title and abstract of a paper.
* **Completions** are JSON objects that contain the paper's research objects and an explanation for each of them.

In [None]:
from typing import List
from treelib import Tree


def get_research_objects(tree: Tree) -> List[str]:
    """Collect the tags of the nodes directly below "Research Object".

    Args:
        tree: Taxonomy tree that contains a node with the identifier
            "Research Object".

    Returns:
        The ``tag`` of every child returned by
        ``tree.children("Research Object")``, in the order they are returned.
    """
    return [child.tag for child in tree.children("Research Object")]

In [None]:
from src.utils.utils_json import read_json

# Load the taxonomy explanations. A later cell reads the "Research Object"
# entry of this mapping and looks up each classified class name in it.
taxonomy_explanation = read_json(taxonomy_file)

In [None]:
import json
from src.taxonomy.utils import parse_taxonomy

# The "Research Object" explanations are identical for every paper, so look
# them up once instead of on every loop iteration.
obj_explanations = taxonomy_explanation["Research Object"]

openai_prompts = []
openai_completions = []

# Iterate over the three required columns directly. Unlike ``iterrows`` this
# does not build a Series per row and does not coerce the row to one dtype.
for title, abstract, classes in zip(df['title'], df['abstract'], df['classes']):
    taxonomy = parse_taxonomy(classes)
    classifications = get_research_objects(taxonomy)

    completion = {
        "Research Object": classifications,
        # Classes without an entry in the taxonomy file get no explanation.
        "Explanations": {
            clazz: obj_explanations[clazz]
            for clazz in classifications
            if clazz in obj_explanations
        },
    }

    # Prompt = title and abstract, each followed by a blank line.
    openai_prompts.append(f"{title}\n\n{abstract}\n\n")
    # sort_keys keeps the key order of the serialized completion stable.
    openai_completions.append(json.dumps(completion, sort_keys=True))

# One row per paper: the prompt text and its JSON-encoded completion.
openai_df = pd.DataFrame({
    'prompt': openai_prompts,
    'completion': openai_completions
})

### Prepare data for fine-tuning

* Each completion should begin with a whitespace
* Each completion should end with a fixed stop sequence to inform the model when the completion ends. A stop sequence could be \n, ###, or any other token that does not appear in any completion.
* Each prompt should end with a fixed separator to inform the model when the prompt ends and the completion begins. A simple separator which generally works well is \n\n###\n\n.
* Remove rows where the prompt is 2048 characters or longer
* Split into train and test
* Print commands to run in the OpenAI CLI

[OpenAI's dataset preparation guide](https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset)

In [None]:
# Completions: start with a single space and end with the fixed stop sequence
# "\n", which tells the model where the completion ends. The stop sequence must
# not occur inside any completion.
openai_df['completion'] = ' ' + openai_df['completion'] + '\n'

# Prompts: end with the fixed separator "\n\n###\n\n", which tells the model
# where the prompt ends and the completion begins.
openai_df['prompt'] = openai_df['prompt'] + '\n\n###\n\n'

In [None]:
# Keep only the rows whose prompt is shorter than 2048 characters
prompt_lengths = openai_df['prompt'].str.len()
openai_df = openai_df[prompt_lengths < 2048]

In [None]:
# Split into train and test
from sklearn.model_selection import train_test_split
from names_generator import generate_name

# A generated name identifies this dataset and its output files.
model_name = generate_name()
print('Model name:', model_name)

# Save the complete prepared dataset as CSV alongside the splits.
full_file = os.path.join(output_dir, f"{model_name}.csv")
openai_df.to_csv(full_file, index=False)

# 80/20 train/test split; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(openai_df, test_size=0.2, random_state=42)

train_file = os.path.join(output_dir, f"{model_name}_train.jsonl")
test_file = os.path.join(output_dir, f"{model_name}_test.jsonl")

# Write each split as JSON Lines: one {"prompt": ..., "completion": ...} record per line.
train_df.to_json(train_file, orient='records', lines=True)
test_df.to_json(test_file, orient='records', lines=True)

### Print commands to run in OpenAI cli

This section prints the commands that need to be run on the command line to fine-tune the model.

In [None]:
# Print the shell commands that check the training file with the OpenAI CLI.
# They have to run from the folder the JSONL files were written to, so the
# first command changes into `output_dir` (the original interpolated the
# Python builtin `dir` here, which printed "<built-in function dir>").
print(f"cd {output_dir};\n"
      f"openai tools fine_tunes.prepare_data -f {model_name}_train.jsonl;")

In [None]:
# Base models for which a fine-tune command is printed.
openai_models = ['ada', 'babbage', 'curie', 'davinci']

# One `fine_tunes.create` command per base model; all of them use the same
# training file and use the generated model name as the suffix.
for base_model in openai_models:
    command = (
        f'openai api fine_tunes.create -t "{model_name}_train.jsonl" '
        f'--suffix {model_name} -m {base_model}'
    )
    print(command)