# OpenAI Fine-tuning API 

In [1]:
import re
import pandas as pd 
import numpy as np
import openai 

In [2]:
import os

import openai
from dotenv import load_dotenv

# Load variables from a local .env file (if present) before looking up the key.
load_dotenv()
# `os` is imported in this cell because the lookup below needs it and this
# cell executes before any other cell that imports `os`.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
############ Notes on formatting for GPT fine-tuning ####################

# - Based on your file extension, your file is formatted as a CSV file
# - Your file contains 56 prompt-completion pairs. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples
# - Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts empty
# - All completions end with suffix ` `
#   WARNING: Some of your completions contain the suffix ` ` more than once. We suggest that you review your completions and add a unique ending
# - The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

# add a space to the end of each prompt
# Each prompt should end with a fixed separator to inform the model when the prompt ends and the completion begins. A simple separator which generally works well is \n\n###\n\n. The separator should not appear elsewhere in any prompt.

## -- Load raw CSV files of prompt-completion pairs --

In [4]:
## GPT Chat
# Input CSV of prompt-completion pairs, plus the suffix later passed to the
# fine-tune request for this dataset.
# NOTE(review): the FastAI cell below assigns the same two names, so
# whichever of the two cells ran last decides what gets processed.
pc_csv_IN_path ="data/gpt-qa-df.csv"
curr_suffix_name = "gpt-ml-qa-pairs-C"

# Fail fast if the CSV is missing. Relies on `os` having been imported by an
# earlier-executed cell.
assert os.path.exists(pc_csv_IN_path), "File does not exist"

In [5]:
## FastAI Q&A pairs 
# Input CSV of prompt-completion pairs, plus the suffix later passed to the
# fine-tune request for this dataset.
# NOTE(review): reassigns the names set by the GPT Chat cell above; run only
# the cell for the dataset you want to fine-tune on.
pc_csv_IN_path ="data/fastai-qa-cleaned.csv"
curr_suffix_name = "fastai-ft"

# Fail fast if the CSV is missing. Relies on `os` having been imported by an
# earlier-executed cell.
assert os.path.exists(pc_csv_IN_path), "File does not exist"

## -- Format raw CSV --> formatted CSV --

In [6]:
import os
# INPUTS: 
# - csv: a csv with headers 'prompt' and 'completion'

# PROCESSING: 
# - add the separator \n\n###\n\n to the end of each prompt (no extra space is added)
# - add a space to the beginning of each completion
# - add the stop sequence \n+END+\n to the end of each completion

# saves to df

# OUTPUTS:
# - csv: a cleaned, formatted csv ready for fine-tuning 

def format_csv_for_ft(qa_csv_path):
    """Format a raw prompt/completion CSV for fine-tuning and save it.

    Each prompt gets the separator '\\n\\n###\\n\\n' appended. Each completion
    gets a leading space and the stop sequence '\\n+END+\\n' appended.

    Args:
        qa_csv_path: path to a '.csv' file whose header is exactly
            'prompt','completion'.

    Returns:
        Path of the formatted CSV, written next to the input as
        '<input-without-.csv>-formatted-CSV-FIRST.csv'.

    Raises:
        AssertionError: if the path does not end in '.csv' or the header is
            not exactly ['prompt', 'completion'].
    """
    #### Check formatting on CSV
    assert qa_csv_path[-4:] == '.csv', "input path does not end with '.csv'"
    # Read only the header row so a malformed file is rejected cheaply.
    header = pd.read_csv(qa_csv_path, nrows=0).columns.tolist()
    assert header == ['prompt', 'completion'], "file headers are not 'prompt' and 'completion'"

    gpt_formatted_df = pd.read_csv(qa_csv_path)

    #### Format for fine-tuning
    # 1) Completions start with a space and end with a fixed stop sequence,
    #    so the model learns where a completion ends.
    gpt_formatted_df['completion'] = gpt_formatted_df['completion'].apply(lambda x: ' ' + x + '\n+END+\n')
    # 2) Prompts end with a fixed separator marking where the completion begins.
    gpt_formatted_df['prompt'] = gpt_formatted_df['prompt'].apply(lambda x: x + '\n\n###\n\n')

    #### Save to csv
    # Derive the output path from the argument (not from a notebook global),
    # so the function works for any input file.
    qa_csv_OUT_path = qa_csv_path[:-4] + "-formatted-CSV-FIRST.csv"
    gpt_formatted_df.to_csv(qa_csv_OUT_path, index=False)
    return qa_csv_OUT_path

formatted_pc_csv_path = format_csv_for_ft(pc_csv_IN_path)

print("\n#####\nRaw CSV -> Formatted CSV conversion complete\n#####\n")


#####
Raw CSV -> Formatted CSV conversion complete
#####



In [7]:
#EXAMPLE splitting train and valid set; not used right now 

# def split_df_train_test(df, train_frac=0.8):
#     # split the df into train and test
#     train_df = df.sample(frac=train_frac, random_state=42)
#     test_df = df.drop(train_df.index)

#     return train_df, test_df

######
# ## # get train and test df from the previous 'formatted_qa_df'
# qa_train_df, qa_test_df = split_df_train_test(formatted_qa_df)

# #create paths to save 
# qa_TEST_OUT_path = f"{qa_csv_IN_path[:-4]}-TRAIN.csv"
# qa_TRAIN_OUT_path = f"{qa_csv_IN_path[:-4]}-TEST.csv"

# # # save the train and test dfs to csv 
# qa_train_df.to_csv(qa_TEST_OUT_path, index=False)
# qa_test_df.to_csv(qa_TRAIN_OUT_path, index=False)

# print ('train/test shapes: ', qa_train_df.shape, qa_test_df.shape)
# print ('train/test heads: \n\n', qa_train_df.head(),qa_test_df.head())

## CSV -> JSONL, using the OpenAI CLI tool

*Typically, we call the following in terminal to convert the file:*

`openai tools fine_tunes.prepare_data -f <LOCAL_FILE>`

The following section automates this with a bash script 

@TODO 
Eventually, run a bash script that gets the output from terminal and displays to user 
to make choices about data formatting
for now lets just do it manually


In [8]:
jsonl_conversion_sh_script_path = './scripts/csv2jsonl_openai.sh'

In [9]:
# assure permissions for the bash script and file 
! chmod a+x {jsonl_conversion_sh_script_path}
! chmod a+x {formatted_pc_csv_path}

In [10]:
### Run the OpenAI CLI tool to format the CSV to JSONL 
import subprocess
# you need to add './' before the filename for bash to recognize it 

assert os.path.exists(formatted_pc_csv_path), "file does not exist"
assert os.path.exists(jsonl_conversion_sh_script_path), "sh file does not exist"

# Call the bash script to format the CSV to JSONL using the OpenAI CLI tool.
# The context manager closes the stdout/stderr pipes and waits for the child
# process once the output has been collected.
with subprocess.Popen(
    [jsonl_conversion_sh_script_path, formatted_pc_csv_path],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
) as process:
    stdout, stderr = process.communicate()

print(stdout.decode('utf-8'))
print(stderr.decode('utf-8'))
# Surface a failing script explicitly instead of relying only on the
# file-existence check below.
if process.returncode != 0:
    print(f"WARNING: {jsonl_conversion_sh_script_path} exited with status {process.returncode}")


# get the new JSONL filename 
# NOTE(review): if '<name>_prepared.jsonl' already exists from an earlier run,
# the CLI appears to write to '<name>_prepared (1).jsonl' instead (see the
# captured output of this cell), so the check below can pass against a stale
# file. Delete old prepared files before re-running.
formatted_pc_jsonl_path = formatted_pc_csv_path[:-4] + '_prepared.jsonl'
assert (os.path.exists(formatted_pc_jsonl_path)), "JSONL file does not exist"
print(formatted_pc_jsonl_path)
print("\n#####\nCSV -> JSONL conversion complete\n#####\n")

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 223 prompt-completion pairs
- All prompts end with suffix `\n\n###\n\n`
- All completions end with suffix `\n+END+\n`

Based on the analysis we will perform the following actions:
- [Necessary] Your format `CSV` will be converted to `JSONL`


Your data will be written to a new JSONL file. Proceed [Y/n]: 
Wrote modified file to `data/fastai-qa-cleaned-formatted-CSV-FIRST_prepared (1).jsonl`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "data/fastai-qa-cleaned-formatted-CSV-FIRST_prepared (1).jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `\n\n###\n\n` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=["\n+END+\n"]` so that the generated texts ends at the expected place.
Once your model starts training, it'll approxi

# How to format the API call to the fine tuned model 

### format the prompt from the user before sending it 
After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string 

`\n\n###\n\n` 

for the model to start generating completions, rather than continuing with the prompt. 

### param in API call for stop sequence 
Make sure to include 

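`stop=["\n+END+\n"]` (the terminator that `format_csv_for_ft` appends to every completion; use `"\n<+++>\n"` only for models whose training completions ended with that string)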

so that the generated texts ends at the expected place.

### an example call to create a ft model in terminal: 

openai api fine_tunes.create \
-t gpt-qa-train_prepared.jsonl \
-v gpt-qa-valid_prepared.jsonl \
-m "davinci" \
--suffix "gpt-ml-qa-pairs-A"

## -- Functions to create a fine-tuned model from the JSONL file --

In [11]:
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

In [12]:
#####################
# HELPER FUNCTIONS
#####################

# FUNCTION 
# upload_file_openai
# - takes in a jsonl formatted dataset (just training set for now for simplicity)
# - eventually could abstract to general file upload but not needed for now
def upload_jsonl_ft_to_openai(filename):
    """Upload a JSONL dataset to OpenAI with purpose 'fine-tune'.

    Args:
        filename: path to the file to upload; its extension must be '.jsonl'
            (case-insensitive).

    Returns:
        The response object returned by `openai.File.create`.

    Raises:
        AssertionError: if the file extension is not '.jsonl'.
    """
    # assert the filetype is jsonl
    ext = os.path.splitext(filename)[-1].lower()
    print ('ext:', ext)
    assert ext == '.jsonl', "filetype must be jsonl"
    # Open inside a context manager so the handle is closed once the upload
    # call returns; previously the file object was never closed.
    with open(filename, "rb") as jsonl_file:
        return openai.File.create(
            file=jsonl_file,
            purpose='fine-tune'
        )

In [13]:
#####################
# FUNCTION 
# create_finetuned_model

# INPUTS: 
# - train_file_id: a csv with headers 'prompt' and 'completion'
# - model type
# - hyperparameters

# OUTPUTS:
# finetuned_model_response: a response object including the finetuned model id

# @TODO add the ability to turn on and off test-train splitting 

def create_finetuned_model(train_file_id, valid_file_id='', model="davinci-003", learning_rate_multiplier='', n_epochs='', suffix=''):
    """Create a fine-tune job via `openai.FineTune.create`.

    Args:
        train_file_id: id of the uploaded training file.
        valid_file_id: accepted for interface compatibility but currently
            ignored (validation files are not sent yet).
        model: name of the base model to fine-tune.
            NOTE(review): the default "davinci-003" may not be an accepted
            base-model name; callers in this file pass "davinci" — confirm.
        learning_rate_multiplier: optional hyperparameter; '' or None means
            "not specified".
        n_epochs: optional hyperparameter; '' or None means "not specified".
        suffix: optional model-name suffix; '' or None means "not specified".

    Returns:
        The response from `openai.FineTune.create`, or None if the call
        raised (the error is printed).
    """
    request = {"training_file": train_file_id, "model": model}

    # Only forward options the caller actually set. Sending the '' defaults
    # verbatim would hand the API an empty string where it expects a number,
    # instead of letting it apply its own defaults.
    optional = {
        "learning_rate_multiplier": learning_rate_multiplier,
        "n_epochs": n_epochs,
        "suffix": suffix,
    }
    for key, value in optional.items():
        if value is not None and value != '':
            request[key] = value

    try:
        finetuned_model_response = openai.FineTune.create(**request)
    except Exception as e:
        # Best-effort by design: report the failure and signal it with None.
        print ("Error creating finetuned model: ", e)
        return None
    return finetuned_model_response

In [14]:
# EXAMPLE 
# this is the old way using a train/valid set 

# _train_file_id = upload_file_openai('gpt-qa-train-formatted_prepared.jsonl')['id']
# _valid_file_id = upload_file_openai('gpt-qa-valid-formatted_prepared.jsonl')['id']

# ftm = create_finetuned_model(
#     train_file_id=_train_file_id,
#     valid_file_id=_valid_file_id,
#     model="davinci",
#     learning_rate_multiplier=0.05,
#     n_epochs=1,
#     suffix=suffix_name,
# )

# ftm_id = ftm['id']

In [None]:
# append the finetuned model id to the list.csv file, along with the datetime

def log_finetuned_model_id(ftm_id, model_name, learning_rate_multiplier, n_epochs, suffix, log_file_path):
    """Append one row describing a fine-tune job to a CSV log file.

    The row is: timestamp (dd/mm/YYYY HH:MM:SS), ftm_id, model_name,
    learning_rate_multiplier, n_epochs, suffix.

    Args:
        ftm_id: id of the fine-tune job.
        model_name: name of the base model.
        learning_rate_multiplier: hyperparameter value to record.
        n_epochs: hyperparameter value to record.
        suffix: model-name suffix to record.
        log_file_path: path of the CSV file to append to (created if missing).

    Returns:
        None.
    """
    # Imported locally: elsewhere in this file the name `datetime` is bound
    # both to the module and to the class, so the global cannot be relied on.
    import csv
    from datetime import datetime

    dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    row = [dt_string, ftm_id, model_name, learning_rate_multiplier, n_epochs, suffix]
    # csv.writer quotes fields containing commas (e.g. a suffix such as
    # "a, b"), which a plain ','-joined string would split into extra columns.
    with open(log_file_path, 'a', newline='') as f:
        csv.writer(f, lineterminator='\n').writerow(row)
    return


In [15]:
###################################
# FUNCTION: file_to_finetuned_model


""""
file_to_finetuned_model_wrapper

Inputs: 
    train_file: 
        - path of the prepared JSONL file to be uploaded to openai (must end in '.jsonl')
        - produced from a csv with headers 'prompt' and 'completion' - see the functions above to do that 
            eg "data/my-prompt-completion-pairs-list_prepared.jsonl"

    model: string, name of the model to be finetuned 
            eg "davinci-003"
    
    learning_rate_multiplier: float, learning rate multiplier for the finetuned model 
            eg "0.05", typically 0.05 - 0.2
    
    n_epochs: int, number of epochs for the finetuned model 
            eg "1", typically 1 - 3
    
    suffix: string, suffix to be added to the finetuned model 
            eg "my-finetuned-model"

Outputs: 
    finetuned_model_response: the response of the OpenAI API call to create the finetuned model
        - a dictionary with the finetuned model id, hyperparameters, model name, etc.

"""

import datetime

def file_to_finetuned_model_wrapper(train_file, valid_file='', model="davinci", learning_rate_multiplier=0.1, n_epochs=4, suffix=''):
    """Upload a prepared JSONL file and create a fine-tune job from it.

    Args:
        train_file: path to the training '.jsonl' file (required).
        valid_file: optional path to a validation '.jsonl' file. It is
            uploaded when given, but `create_finetuned_model` does not use
            it yet.
        model: name of the base model to fine-tune.
        learning_rate_multiplier: hyperparameter forwarded to the request.
        n_epochs: hyperparameter forwarded to the request.
        suffix: model-name suffix; truncated to 40 characters if longer.

    Returns:
        Whatever `create_finetuned_model` returns.

    Raises:
        FileNotFoundError: if `train_file` is empty.
        AssertionError: if a given file path does not end in '.jsonl'.
    """
    # Validate every input before uploading anything, so a bad argument
    # never leaves a half-finished upload behind.
    if not train_file:
        # Previously this exception was constructed but never raised, and
        # the function then failed later on an undefined file id.
        raise FileNotFoundError('Must include a train_file')
    assert train_file.endswith('.jsonl'), "train_file must be a .jsonl file"
    if valid_file != '':
        assert valid_file.endswith('.jsonl'), "valid_file must be a .jsonl file"

    # upload files to openai
    _train_file_id = upload_jsonl_ft_to_openai(train_file)['id']
    print ('Successfully uploaded train JSONL file. train_file_id:', _train_file_id)

    if valid_file != '':
        _valid_file_id = upload_jsonl_ft_to_openai(valid_file)['id']
    else:
        _valid_file_id = ''

    # ensure suffix is within the required length; if not then truncate
    if len(suffix) > 40:
        suffix = suffix[:40]
        print('Suffix longer than required 40-character length; truncating to 40 characters: ', suffix)

    # create a finetuned model
    finetuned_model_response = create_finetuned_model(
        train_file_id=_train_file_id,
        valid_file_id=_valid_file_id,  # accepted but currently ignored downstream
        model=model,
        learning_rate_multiplier=learning_rate_multiplier,
        n_epochs=n_epochs,
        suffix=suffix,
    )

    return finetuned_model_response

## --- Create a ftm from the JSONL file --- 
The magic is here! 

In [16]:
# Upload the prepared JSONL file and create the fine-tune job, using the
# wrapper's default model and hyperparameters.
ftm = file_to_finetuned_model_wrapper(
    train_file=formatted_pc_jsonl_path, #from the previous step 
    suffix=curr_suffix_name # declared at the beginning of the notebook 
)

ext: .jsonl
Successfully uploaded train JSONL file. train_file_id: file-AGJfhVqwZDX8uDcXtH0GcX46

###
Created fine-tuned model None
###

ftm_name: None
ftm_id: None
ftm: {
  "created_at": 1670062197,
  "events": [
    {
      "created_at": 1670062197,
      "level": "info",
      "message": "Created fine-tune: ft-NM7Jg1qiG2Fco1Wnbl6BoDOn",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.1,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-NM7Jg1qiG2Fco1Wnbl6BoDOn",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-5RSpJP9M5vC6iGBmX0vRfiMp",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 96731,
      "created_at": 1670062197,
      "filename": "file",
      "id": "file-AGJfhVqwZDX8uDcXtH0GcX46",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
  

In [75]:
# log each element of the json response as a column in a csv file
from datetime import datetime

def log_ftm(ftm, log_file_path='outputs/finetuned-models-log.csv'):
    """Record a fine-tune job in the CSV log unless its id is already there.

    Args:
        ftm: the response from the fine-tune creation call; must provide
            the keys 'id', 'created_at' and 'status'.
        log_file_path: CSV log to read and append to. The file, and its
            parent directory, are created when missing.

    Returns:
        True if a new row was appended, False if the id was already logged.
    """
    # Imported locally: elsewhere in this file the name `datetime` is bound
    # both to the module and to the class, so the global cannot be relied on.
    import csv
    from datetime import datetime

    log_dir = os.path.dirname(log_file_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    if not os.path.exists(log_file_path):
        with open(log_file_path, 'w') as f:
            f.write('datetime,created_at,ftm_id, status\n')

    # The id is the third column of each row. Comparing it against the whole
    # line (as before) could never match, so duplicates were always appended.
    ftm_id = ftm['id']
    with open(log_file_path, 'r', newline='') as csvfile:
        already_logged = any(
            len(row) > 2 and row[2].strip() == ftm_id
            for row in csv.reader(csvfile)
        )

    if already_logged:
        print ("Finetuned model id already exists in log file")
        return False

    # Row format kept identical to existing log files.
    with open(log_file_path, 'a') as f:
        f.write(f"{datetime.now()}, {ftm['created_at']},{ftm['id']}, {ftm['status']}\n")
    return True
    

log_ftm(ftm)

True

In [74]:
ftm_name = ftm['fine_tuned_model']
ftm_id = ftm['id']

print(f"\n###\nCreated fine-tuned model {ftm_id}\n###\n")
print ('ftm_name: ', ftm_name)
print ('ftm_id: ', ftm_id)
print ('status: ', ftm['status'])
print ('ftm: ', ftm)


###
Created fine-tuned model ft-NM7Jg1qiG2Fco1Wnbl6BoDOn
###

ftm_name:  None
ftm_id:  ft-NM7Jg1qiG2Fco1Wnbl6BoDOn
status:  pending
ftm:  {
  "created_at": 1670062197,
  "events": [
    {
      "created_at": 1670062197,
      "level": "info",
      "message": "Created fine-tune: ft-NM7Jg1qiG2Fco1Wnbl6BoDOn",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.1,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-NM7Jg1qiG2Fco1Wnbl6BoDOn",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-5RSpJP9M5vC6iGBmX0vRfiMp",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 96731,
      "created_at": 1670062197,
      "filename": "file",
      "id": "file-AGJfhVqwZDX8uDcXtH0GcX46",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 167006

### --- Test with FastAI Question-Answer Pairs data ---

In [43]:
ftm_fastai = file_to_finetuned_model_wrapper(
    train_file='data/fastai-qa-cleaned-formatted-CSV-FIRST_prepared.jsonl', 
    suffix='fastai-qa-cleaned-formatted-CSV-FIRST_prepared',
)

ext: .jsonl
Successfully uploaded train JSONL file. train_file_id: file-xUD0WfFc5NSvjEcdSPjmhFjN
Suffix longer than required 40-character length; truncating to 40 characters:  fastai-qa-cleaned-formatted-CSV-FIRST_pr


In [20]:

print('id ', ftm_fastai['id'])
print('name ',ftm_fastai['fine_tuned_model'])
print('status ', ftm_fastai['status'])
print ('ftm: ', ftm_fastai)

id  ft-WIixwlrf7V7FS5hpHj9SG9QY
name  None
status  pending
ftm:  {
  "created_at": 1670062226,
  "events": [
    {
      "created_at": 1670062226,
      "level": "info",
      "message": "Created fine-tune: ft-WIixwlrf7V7FS5hpHj9SG9QY",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": null,
    "learning_rate_multiplier": 0.1,
    "n_epochs": 4,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-WIixwlrf7V7FS5hpHj9SG9QY",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-5RSpJP9M5vC6iGBmX0vRfiMp",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 96731,
      "created_at": 1670062226,
      "filename": "file",
      "id": "file-U7HzG2MrVw7TpMPzcWzEmS4A",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1670062226,
  "validation_files": []
}


### --- Test with GPT Chat log data ---

In [23]:
# TEST GPT CHAT DATA 
# test the file_to_finetuned_model function

banana_file = "data/gpt-qa-train-formatted_prepared.jsonl"
banana_suffix="qa-train-iterate"

ftm_gptchat = file_to_finetuned_model_wrapper(
    train_file=banana_file,
    suffix=banana_suffix + "-z"
)

print('id: ',ftm_gptchat['id'])
print('name: ',ftm_gptchat['fine_tuned_model'])

# ftm is only available after the model has been created!!!!

ext: .jsonl
Successfully uploaded train JSONL file. train_file_id: file-eNPg3CZs8FhJAyUAK5l3wRWl
id:  ft-pLLITeGYtGtpmdbJRIrhDEaf
name:  None


### Helper Functions

In [24]:
# HELPERS 

def get_ftmodel_name_from_id(id):
    """Return the 'fine_tuned_model' field of the fine-tune job with this id."""
    job = openai.FineTune.retrieve(id)
    model_name = job['fine_tuned_model']
    return model_name

def get_ftmodel_id_from_name(name):
    """Return the id of the first fine-tune job whose model name contains `name`.

    Args:
        name: full or partial fine-tuned model name to search for.

    Returns:
        The matching job's 'id', or None when no listed job matches.
    """
    for ftmodel in openai.FineTune.list()['data']:
        model_name = ftmodel['fine_tuned_model']
        # Jobs that have not produced a model yet report a null name (see the
        # pending responses printed in this notebook); skip them rather than
        # crash on `name in None`.
        if model_name is not None and name in model_name:
            return ftmodel['id']
    return None

print(get_ftmodel_name_from_id("ft-xXiSGiL0RrMUpKikJw09T1up"))
print(get_ftmodel_id_from_name("davinci:ft-sandbox:qa-train-iterate-lrm-0-1-epoch-1-2022-11-20-12-50-36"))

davinci:ft-sandbox:qa-train-iterate-lrm-0-1-epoch-1-2022-11-20-12-50-36
ft-xXiSGiL0RrMUpKikJw09T1up


In [25]:
# GET FTM AWAIT READY 

# the finetuned model needs to finish training before we can use it,
# so let's track its status and only make calls when it's ready.
# we can use the finetuned model id to check its status

import time

def get_finetuned_model_status(finetuned_model_id):
    """Retrieve the fine-tune job and return its 'status' field."""
    return openai.FineTune.retrieve(id=finetuned_model_id)['status']

#####
# FUNCTION: await_ft_ready
# INPUT: finetuned_model_id, seconds_of_patience
# polls the status of the model every check_every_n_seconds seconds
# times out after seconds_of_patience seconds
# OUTPUT: True if the finetuned model is ready, False if it is not ready

def await_ft_ready(finetuned_model_id, seconds_of_patience=400, check_every_n_seconds=10):
    """Poll a fine-tune job until it succeeds, ends unsuccessfully, or times out.

    Args:
        finetuned_model_id: id of the fine-tune job to watch.
        seconds_of_patience: roughly how long to keep polling, in seconds.
        check_every_n_seconds: pause between two status checks, in seconds.

    Returns:
        True once the job status is 'succeeded'; False if the job reports
        'failed' or 'cancelled', or if the patience budget runs out.

    Raises:
        ValueError: if `check_every_n_seconds` is not positive.
    """
    if check_every_n_seconds <= 0:
        raise ValueError("check_every_n_seconds must be positive")

    print(f"###\nWaiting for {finetuned_model_id} to train.\n###\n")

    seconds_waited = 0
    while True:
        status = get_finetuned_model_status(finetuned_model_id)

        if status == 'succeeded':
            print(f"###\n{finetuned_model_id} is ready!\n###\n")
            print (openai.FineTune.retrieve(id=finetuned_model_id))
            return True

        # A job in one of these states is not going to succeed, so waiting
        # out the rest of the patience budget would only waste time.
        # NOTE(review): status names taken from the legacy fine-tunes API;
        # confirm the full list of terminal states.
        if status in ('failed', 'cancelled'):
            print(f"{finetuned_model_id} ended with status '{status}'; it will not become ready.")
            print (openai.FineTune.retrieve(id=finetuned_model_id))
            return False

        # Give up when one more pause would exceed the patience budget.
        if seconds_waited + check_every_n_seconds > seconds_of_patience:
            print(f"{finetuned_model_id} finetuned model has taken longer than {seconds_of_patience} to train, since we started checking in on its status. Calling exceeds patience variable. Exiting await-train loop.")
            print (openai.FineTune.retrieve(id=finetuned_model_id))
            return False

        # Report on every check-in. The previous version gated this on a
        # check count modulo a number of seconds, so it printed only every
        # tenth check.
        print(f"{finetuned_model_id} has trained for at least {seconds_waited} seconds so far. Status: {status}")
        time.sleep(check_every_n_seconds)
        seconds_waited += check_every_n_seconds

In [26]:
# test
ftm_trained_id ='ft-xXiSGiL0RrMUpKikJw09T1up'
await_ft_ready(ftm_trained_id)

###
Waiting for ft-xXiSGiL0RrMUpKikJw09T1up to train.
###

###
ft-xXiSGiL0RrMUpKikJw09T1up is ready!
###

{
  "created_at": 1668948495,
  "events": [
    {
      "created_at": 1668948495,
      "level": "info",
      "message": "Created fine-tune: ft-xXiSGiL0RrMUpKikJw09T1up",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1668948501,
      "level": "info",
      "message": "Fine-tune costs $0.35",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1668948502,
      "level": "info",
      "message": "Fine-tune enqueued. Queue number: 0",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1668948503,
      "level": "info",
      "message": "Fine-tune started",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1668948604,
      "level": "info",
      "message": "Completed epoch 1/1",
      "object": "fine-tune-event"
    },
    {
      "created_at": 1668948636,
      "level": "info",
      "message": "Uploaded model: da

True

# ~~ Flip this on if you want it to wait on the actual ftm we just loaded ~

In [46]:
await_ft_ready(ftm_id)

###
Waiting for ft-NM7Jg1qiG2Fco1Wnbl6BoDOn to train.
###



KeyboardInterrupt: 


We attach a result file to each job once it has been completed. This results file ID will be listed when you retrieve a fine-tune, and also when you look at the events on a fine-tune. You can download these files.

The _results.csv file contains a row for each training step, where a step refers to one forward and backward pass on a batch of data. 

openai api fine_tunes.results -i <YOUR_FINE_TUNE_JOB_ID>


# Terminal outputs 




(ml) SF-mbp:gpt-fine-tuning stephen$ openai api fine_tunes.create \
> -t gpt-qa-train_prepared.jsonl \
> -v gpt-qa-valid_prepared.jsonl
-m "davinci" \
--suffix "gpt-ml-qa-pairs-A"
Upload progress: 100%|███████████████████████| 50.8k/50.8k [00:00<00:00, 22.6Mit/s]
Uploaded file from gpt-qa-train_prepared.jsonl: file-5fYU80lXy5FQgfmfVU56k8nb
Upload progress: 100%|███████████████████████| 12.2k/12.2k [00:00<00:00, 8.80Mit/s]
Uploaded file from gpt-qa-valid_prepared.jsonl: file-99tOIqCaPbNwL90bI1K1yq23
Created fine-tune: ft-zHk8gGvZdaBNRVheXq5fZFqt
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-11-20 00:36:48] Created fine-tune: ft-zHk8gGvZdaBNRVheXq5fZFqt
[2022-11-20 00:36:52] Fine-tune costs $0.14
[2022-11-20 00:36:52] Fine-tune enqueued. Queue number: 0
[2022-11-20 00:36:54] Fine-tune started
[2022-11-20 00:37:54] Completed epoch 1/4
[2022-11-20 00:38:06] Completed epoch 2/4
[2022-11-20 00:38:18] Completed epoch 3/4
[2022-11-20 00:38:30] Completed epoch 4/4
[2022-11-20 00:38:46] Uploaded model: curie:ft-personal-2022-11-20-08-38-46
[2022-11-20 00:38:47] Uploaded result file: file-EZy5duulRUswzYdqj6LuW76a
[2022-11-20 00:38:47] Fine-tune succeeded

Job complete! Status: succeeded 🎉
Try out your fine-tuned model:

openai api completions.create -m curie:ft-personal-2022-11-20-08-38-46 -p <YOUR_PROMPT>
(ml) SF-mbp:gpt-fine-tuning stephen$ -m "davinci" \
> --suffix "gpt-ml-qa-pairs-A"
-bash: -m: command not found

### Trying again 

$ openai api fine_tunes.create \
> -t gpt-qa-train_prepared.jsonl \
> -v gpt-qa-valid_prepared.jsonl \
> -m "davinci" \
> --suffix "gpt-ml-qa-pairs-A"
Found potentially duplicated files with name 'gpt-qa-train_prepared.jsonl', purpose 'fine-tune' and size 50782 bytes
file-5fYU80lXy5FQgfmfVU56k8nb
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway:
Upload progress: 100%|███████████████████████████████| 50.8k/50.8k [00:00<00:00, 21.8Mit/s]
Uploaded file from gpt-qa-train_prepared.jsonl: file-XCo6Sza9cVyfIWjPmoatpK3s
Found potentially duplicated files with name 'gpt-qa-valid_prepared.jsonl', purpose 'fine-tune' and size 12227 bytes
file-99tOIqCaPbNwL90bI1K1yq23
Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway:
Upload progress: 100%|███████████████████████████████| 12.2k/12.2k [00:00<00:00, 6.49Mit/s]
Uploaded file from gpt-qa-valid_prepared.jsonl: file-HbOaJkGRRu4hp4C8DQ4PtZ0t
Created fine-tune: ft-xNzKl8ORDdASWIALy5WoSLcY
Streaming events until fine-tuning is complete...

(Ctrl-C will interrupt the stream, but not cancel the fine-tune)
[2022-11-20 00:45:39] Created fine-tune: ft-xNzKl8ORDdASWIALy5WoSLcY
[2022-11-20 00:45:46] Fine-tune costs $1.42
[2022-11-20 00:45:47] Fine-tune enqueued. Queue number: 0
[2022-11-20 00:45:48] Fine-tune started
[2022-11-20 00:47:30] Completed epoch 1/4
[2022-11-20 00:47:50] Completed epoch 2/4
[2022-11-20 00:48:10] Completed epoch 3/4
[2022-11-20 00:48:30] Completed epoch 4/4
[2022-11-20 00:49:22] Uploaded model: davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22
[2022-11-20 00:49:23] Uploaded result file: file-rGIPU3ndPAZXVRSBT4xhjDKy
[2022-11-20 00:49:23] Fine-tune succeeded

Job complete! Status: succeeded 🎉
Try out your fine-tuned model:

openai api completions.create -m davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22 -p <YOUR_PROMPT>


# -- Test prompts --

In [28]:
def get_gpt_answer(prompt, ftm_name_, printit=False, stop=None):
    """Ask a fine-tuned model to complete `prompt` and return the text.

    Args:
        prompt: the question text. Trailing line breaks are removed and the
            prompt separator '\\n\\n###\\n\\n' is appended.
        ftm_name_: name of the fine-tuned model to query.
        printit: when True, print the model name and the final prompt.
        stop: list of stop sequences for the completion. Defaults to
            ["\\n<+++>\\n"]. It must match the terminator the model's training
            completions ended with; data produced by `format_csv_for_ft`
            ends with "\\n+END+\\n".

    Returns:
        The 'text' of the first choice in the API response.
    """
    if stop is None:
        stop = ["\n<+++>\n"]

    # Lines read from a file keep their trailing newline. Drop it so the
    # prompt ends with exactly the separator used when formatting the
    # training data.
    prompt = prompt.rstrip("\r\n") + "\n\n###\n\n"

    if printit:
        print(f"creating call from {ftm_name_} with prompt: {prompt}")

    response = openai.Completion.create(
        model=ftm_name_,
        prompt=prompt,
        temperature=0.9,
        max_tokens=500,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0.6,
        stop=stop
    )

    return response['choices'][0]['text']

# davinci:ft-sandbox:gpt-ml-qa-pairs-b-2022-11-20-09-29-25

### --- Running with a previously trained model to test for now ---

In [29]:
### for testing for now 

# ftm_id = "ft-xXiSGiL0RrMUpKikJw09T1up" # this model in particular was returning results with weird formatting, maybe something with the START and END sequences 
# ftm_name = get_ftmodel_name_from_id(ftm_id)

ftm_name = "davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22"

##### Run a single test prompt 

In [30]:
prompt = "What is the difference between a neural network and a deep learning model?" 
#ftm_name is defined after creating the ftm
print(get_gpt_answer(prompt, ftm_name, printit=True))

creating call from davinci:ft-sandbox:gpt-ml-qa-pairs-a-2022-11-20-08-49-22 with prompt: What is the difference between a neural network and a deep learning model?

###


 The terms "neural network" and "deep learning model" are often used interchangeably to refer to a class of machine learning models that use a layered architecture with nonlinear transformation functions.   In general, a neural network or deep learning model can be defined as a machine learning model that is composed of multiple layers of processing elements (or "hidden layers") interposed between the input and output layers. A common characteristic of these models is that they use algorithms that involve backpropagation or other gradient-based optimization techniques to learn the parameters of the model by minimizing a loss function.   In practice, there are some subtle differences between these two terms. For example, the term "neural network" may be used to refer to models that use biologically-inspired architectur

##### Run a battery of test prompts on the model from "test-prompts.txt"

In [39]:

# Run every prompt in tests/test-prompts.txt through the model and save all
# the question, answer pairs to a csv

import csv

with open('tests/test-prompts.txt', 'r') as f:
    # Drop each line's trailing line break so it is not sent as part of the
    # prompt, and skip blank lines so no API call is spent on an empty prompt.
    lines = [line.rstrip('\r\n') for line in f if line.strip()]

with open('test-outputs/gpt-completions-to-test-prompts.csv', 'w',newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['prompt','completion'])
    for i, line in enumerate(lines):
        print(f'Question {i}: {line}')
        answer = get_gpt_answer(line, ftm_name)
        print(f'Answer {i}: {answer} \n\n\n')
        writer.writerow([line, answer])

Question 0: What is the difference between a CNN and a DNN? 

Answer 0:  In general, the terms "CNN" and "DNN" are used to refer to different architectures for a class of machine learning models known as neural networks.    In general, a CNN (or convolutional neural network) is a type of DNN that uses a particular architecture inspired by the visual processing system of mammals (e.g. the retina and visual cortex). This architecture is characterized by an alternating column-wise and row-wise structure, with connections (or "weights") between layers that are organized in a "convolutional" (or cross-channel) manner. This architecture is well-suited to analyzing images or other data that have a 2D structure, and can be used for tasks such as image classification, object detection, and image processing.    In contrast, a DNN (or deep neural network) is a general term for any neural network architecture that uses a large number of layers or "hidden units" in its architecture. In general, the

#### -- Previously trained example --- 

In [None]:
# call the openai api with a prompt and get a response

ftm_name = "davinci:ft-sandbox:gpt-ml-qa-pairs-b-2022-11-20-09-29-25"

prompt = "What is the difference between a neural network and a deep learning model?" 

print(get_gpt_answer(prompt, ftm_name))

# -- Hyperparameter Exploration --


In [None]:
# i could generate over several hyperparameters, 
# create a function that makes the call 
# saves the output 
# gets the finetuned model id
# and then calls the next function with the finetuned model id as an argument


# can i create a df of openai models and then iterate through it?
# or do i need to create a function that takes the model id as an argument?
# or do i need to create a function that takes the model id as an argument and then calls the next function with the finetuned model id as an argument?

# i think i need to create a function that takes the model id as an argument and then calls the next function with the finetuned model id as an argument

# function: takes in a dataset (just training set for now for simplicity) 
# outputs a finetuned model "return" 

# HYPERPARAMETER EXPLORATION 

# Guard flag: the sweep at the bottom of this cell only runs when this is True.
EXPLORE_HYPERPARAMS=False

# Outline of the sweep:
# - create a set of various hyperparameters to iterate through
# - create a fine-tuned model for each combination
# - save each fine-tuned model id
# - for each model, generate responses to the test prompts
# - save the responses
# - then compare the responses to the test answers
import time 

# tqdm wraps the sweep loops to show progress
from tqdm import tqdm 


# currsuffix = banana_suffix+suffix
# NOTE(review): `currsuffix` is not referenced by the sweep code in this cell,
# which builds its own `curr_suffix`; confirm whether another cell still uses it.
currsuffix = "curie:ft-personal-2022-11-20-08-38-46"

def create_ftm_suffix(base, **kwargs):
    """Build a fine-tuned-model suffix that encodes hyperparameter values.

    The result has the form ``"<base>:<key>-<value>_<key>-<value>..."`` with
    the pairs in the order the keyword arguments were passed.

    Args:
        base: Prefix shared by every model of one sweep.
        **kwargs: Hyperparameter names mapped to the values to encode.

    Returns:
        The suffix string. With no keyword arguments this is ``base + ":"``.
    """
    # The "_" passed to join is the only separator between pairs; the pairs
    # themselves must not carry extra punctuation or trailing whitespace.
    encoded_params = "_".join(f"{key}-{value}" for key, value in kwargs.items())
    return f"{base}:{encoded_params}"

def already_exists_ftm(suffix):
    """Report whether any listed fine-tune job produced a model matching ``suffix``.

    Fetches the fine-tune jobs via ``openai.FineTune.list()`` and checks
    whether ``suffix`` occurs as a substring of any job's
    ``fine_tuned_model`` name.

    Args:
        suffix: Text to look for inside the fine-tuned model names.

    Returns:
        True if at least one job's model name contains ``suffix``, else False.
    """
    all_ft_models = openai.FineTune.list()['data']

    # A job's 'fine_tuned_model' can be None (no model name assigned yet), and
    # `suffix in None` would raise TypeError, so such jobs are skipped.
    exists = any(
        job['fine_tuned_model'] is not None and suffix in job['fine_tuned_model']
        for job in all_ft_models
    )

    # Only announce a match when there actually is one.
    if exists:
        print('ftmodel already exists: ', suffix)
    return exists
 

# Prepared JSONL training file used for every fine-tune in the sweep.
banana_file = "gpt-qa-train-formatted_prepared.jsonl"

# Common prefix for the suffix given to each fine-tuned model in the sweep.
base_suffix = "qa-train-iterate"

# Hyperparameter grid to iterate through (a larger grid is kept for reference).
# epochs=[1,2,3,4]
# learning_rate_multiplier=[0.1, 0.05, 0.01, 0.005, 0.001]
epochs = [1, 2]
learning_rate_multiplier = [0.1, 0.05, 0.01]

suffixes = ['A', 'B', 'C', 'D', 'E']

# One row per fine-tuned model. 'ftm_name' is declared here (last, matching
# the column order produced when rows are added) because every recorded row
# carries it and ftmodelcompletions already declares it.
ftmodels = pd.DataFrame(
    columns=['id', 'epochs', 'learning_rate_multiplier', 'suffix', 'ftm_name']
)

# One row per (fine-tuned model, test prompt) pair with the model's completion.
ftmodelcompletions = pd.DataFrame(
    columns=[
        'id',
        'ftm_name',
        'epochs',
        'learning_rate_multiplier',
        'suffix',
        'prompt',
        'completion',
    ]
)

# create a ftmodel for each one

def _run_test_prompts(prompts, ftm_id, ftm_name, epoch, lrm, curr_suffix):
    """Ask one fine-tuned model every test prompt and return one result row per prompt."""
    rows = []
    for prompt in tqdm(prompts):
        # get the completion for this prompt
        completion = get_gpt_answer(prompt, ftm_name, printit=True)
        rows.append({
            'id': ftm_id,
            'ftm_name': ftm_name,
            'epochs': epoch,
            'learning_rate_multiplier': lrm,
            'suffix': curr_suffix,
            'prompt': prompt,
            'completion': completion,
        })
    return rows


def explore_hyperparams():
    """Fine-tune one model per (epochs, learning-rate multiplier) pair and test it.

    For every combination of the module-level ``epochs`` and
    ``learning_rate_multiplier`` lists this:

    1. starts a fine-tune of ``banana_file`` via ``file_to_finetuned_model``,
    2. waits for it with ``await_ft_ready`` (a model that does not become
       ready is reported and skipped),
    3. records the model in the module-level ``ftmodels`` frame,
    4. runs every line of ``test-prompts.txt`` through the model and records
       the completions in the module-level ``ftmodelcompletions`` frame.

    Finally both frames are written to ``ftmodels.csv`` and
    ``ftmodelcompletions.csv`` and their first rows are printed.

    Returns:
        None. Results are stored in the module-level frames and the CSV files.

    Raises:
        OSError: If ``test-prompts.txt`` cannot be opened.
    """
    # These names are rebound below; without the declaration Python would
    # treat them as locals and the first read would raise UnboundLocalError.
    global ftmodels, ftmodelcompletions

    # Read the prompts once, before any training is launched, so a missing
    # prompt file fails fast instead of after a fine-tune has been run.
    with open('test-prompts.txt') as f:
        prompts = f.readlines()

    for epoch in tqdm(epochs):
        for lrm in tqdm(learning_rate_multiplier):

            curr_suffix = create_ftm_suffix(base_suffix, lrm=lrm, epoch=epoch)

            # if already_exists_ftm(curr_suffix): continue

            # @TODO actually this should say:
            # if already exists, then don't create the finetuned model, just get the id
            # if doesn't exist then create it, and then get the id

            # create a ft model with these parameters
            ftm = file_to_finetuned_model(
                train_file=banana_file,
                suffix=curr_suffix,
                n_epochs=epoch,
                learning_rate_multiplier=lrm,
            )
            ftm_id = ftm['id']

            print(f"\n\n======================= Initiated ftm:  ======================= \n ftm_id: {ftm_id}. \n Now we wait for the model to train... \n")

            # wait for the ft model to finish training
            if not await_ft_ready(ftm_id):
                print(f"finetuned model timed out: {ftm_id}")
                continue

            # The object returned at creation time does not carry the final
            # model name yet, so fetch a fresh copy now that training is done.
            ftm = openai.FineTune.retrieve(ftm_id)
            ftm_name = ftm['fine_tuned_model']

            print(ftm_name, curr_suffix)
            # also we could derive the name from the suffix! although they add a
            # timestamp to the end of the name, which we'd want to emulate

            print(f"{ftm_id} finetuned model is now ready for testing. \nIts name is: {ftm_name}\n")

            # Record this model. pd.concat replaces DataFrame.append, which
            # was removed in pandas 2.0.
            model_row = pd.DataFrame([{
                'id': ftm_id,
                'ftm_name': ftm_name,
                'epochs': epoch,
                'learning_rate_multiplier': lrm,
                'suffix': curr_suffix,
            }])
            ftmodels = pd.concat([ftmodels, model_row], ignore_index=True)

            # now test every prompt with this ftmodel
            print(f"starting test-prompts on finetuned model {ftm_id}")
            completion_rows = _run_test_prompts(
                prompts, ftm_id, ftm_name, epoch, lrm, curr_suffix
            )
            if completion_rows:
                ftmodelcompletions = pd.concat(
                    [ftmodelcompletions, pd.DataFrame(completion_rows)],
                    ignore_index=True,
                )
            print('\n\n\nCompleted test prompts for ftmodel: ', ftm_id)

    ftmodels.to_csv('ftmodels.csv')
    ftmodelcompletions.to_csv('ftmodelcompletions.csv')

    print(ftmodels.head())
    print(ftmodelcompletions.head())


# Run the (slow, API-calling) sweep only when explicitly enabled via the flag.
if EXPLORE_HYPERPARAMS: 
    explore_hyperparams()

    # SUCCESS! 
    # Show the first rows of the module-level result frames once more;
    # explore_hyperparams() also prints both heads before returning.
    print(ftmodels.head())
    print(ftmodelcompletions.head())