# First Steps in Fine-Tuning with OpenAI

#### Updates

* 20231116 Created a date for the output file
* 20231109 updated to use new 1.x api


In [1]:
# Import the os package
import os


# Imports via openai docs
from pathlib import Path
from openai import OpenAI


# import the dotenv package
from dotenv import load_dotenv

import pprint

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown



In [2]:
# The .env file is expected in the directory the notebook is run from
cwd = os.getcwd()
env_path = os.path.join(cwd, ".env")

# Load the .env file; the cell displays load_dotenv's return value
load_dotenv(dotenv_path=env_path)

True

In [3]:
# Read the API key from the OPENAI_API_KEY environment variable into a
# notebook-level variable. Indexing os.environ raises KeyError if the
# variable is not set, so a missing key fails here rather than later.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]


# Specify the key and initialize the client

In [4]:
# Create the API client with the key passed explicitly to the constructor.
# This is the form documented for the 1.x library and keeps construction and
# authentication in one step, instead of patching `api_key` onto an
# already-built client.
client = OpenAI(api_key=OPENAI_API_KEY)

# Determine the OpenAI API version

In [5]:
# Suggested by ChatGPT when asked how to query the API version.
# Sadly it does not work, presumably because the model was trained
# before the API changed.

#import openai
#openai.api_key=OPENAI_API_KEY
# To get the API version, you would typically make an API call
# and the version would be included in the response headers.
# For example, you could list the available engines and check the headers:
#response = openai.Engine.list()

# The API version would be in the response headers if available
#api_version = response.headers.get('OpenAI-Api-Version')

#print(api_version)

# Sanity check
Verify that the API key and the network allow use of the OpenAI API.

In [5]:
# System prompt: sets the assistant's persona
system_msg = 'You are a helpful assistant who understands data science.'

# User prompt: ask for a tiny synthetic dataset plus the code that builds it.
# Adjacent string literals are concatenated into one prompt.
user_msg = (
    'Create a small dataset of data about people. '
    'The format of the dataset should be a data frame with 5 rows and 3 columns. '
    'The columns should be called "name", "height_cm", and "eye_color". '
    'The "name" column should contain randomly chosen first names. '
    'The "height_cm" column should contain randomly chosen heights, given in centimeters. '
    'The "eye_color" column should contain randomly chosen eye colors, '
    'taken from a choice of "brown", "blue", and "green". '
    'Provide Python code to generate the dataset, '
    'then provide the output in the format of a markdown table.'
)

# Send both prompts to the chat completions endpoint
response = client.chat.completions.create(
    model="gpt-3.5-turbo-0613",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
)

In [6]:
# Display the finish reason reported for the first choice of `response`
response.choices[0].finish_reason
# Dict-style access, kept commented out for reference:
#response["choices"]

'stop'

In [7]:
# Display the message text of the first choice of `response`
response.choices[0].message.content

'Sure! Here\'s the Python code to generate the dataset:\n\n```python\nimport pandas as pd\nimport random\n\n# Define the data\nnames = ["John", "Emma", "Michael", "Olivia", "William"]\nheights = [165, 170, 175, 180, 185]\neye_colors = ["brown", "blue", "green"]\n\n# Generate random data\ndata = []\nfor _ in range(5):\n    name = random.choice(names)\n    height = random.choice(heights)\n    eye_color = random.choice(eye_colors)\n    data.append([name, height, eye_color])\n\n# Create the data frame\ndf = pd.DataFrame(data, columns=["name", "height_cm", "eye_color"])\n```\n\nAnd here\'s the output in the format of a markdown table:\n\n|   name   | height_cm | eye_color |\n|----------|-----------|-----------|\n|   John   |    165    |   brown   |\n|   Emma   |    170    |   green   |\n| Michael  |    185    |   blue    |\n|  Olivia  |    175    |   green   |\n| William  |    180    |   brown   |'

# Upload a file for model tuning

In [23]:


# Upload the JSONL training data so it can be referenced by a fine-tuning job
training_path = Path("../data/davis.jsonl")
response = client.files.create(file=training_path, purpose="fine-tune")

# Show the returned file object
print(response)

FileObject(id='file-S22pfJMZv7asuZNoMlrElq6T', bytes=3009, created_at=1699401557, filename='davis.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [24]:
# Keep the uploaded file's id for the fine-tuning request, and show it
myfile_id = response.id
print(myfile_id)

file-S22pfJMZv7asuZNoMlrElq6T


# Actual fine tune of a model

In [25]:
# Start a new fine-tuning job.

# Base model to tune (the recommended fine-tuning model)
model = "gpt-3.5-turbo-0613"
# Alternative: model used by Gregory Bauges
#model = "gpt-3.5-turbo"

# Training data: the id of the file uploaded earlier
file_id = myfile_id

response = client.fine_tuning.jobs.create(
    training_file=file_id,
    model=model,
)
print(response)

FineTuningJob(id='ftjob-orLzj8hkntF73vSUNM7w6J2I', created_at=1699401563, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-kHUq2JzdiW8FIDxqE01bYdot', result_files=[], status='validating_files', trained_tokens=None, training_file='file-S22pfJMZv7asuZNoMlrElq6T', validation_file=None)


In [47]:
# List up to 10 fine-tuning jobs.
# Fetch the page once and reuse it for both the dump and the loop, so both
# show the same snapshot and only one request is made.
result = client.fine_tuning.jobs.list(limit=10)
pprint.pprint(result)

# Print the training file id of each job on the page
for a_job in result.data:
    print(a_job.training_file)


# Other job-management calls, kept commented out for reference:

# Retrieve the state of a fine-tune
#client.fine_tuning.jobs.retrieve("ftjob-abc123")

# Cancel a job
#client.fine_tuning.jobs.cancel("ftjob-abc123")

# List up to 10 events from a fine-tuning job
#client.fine_tuning.jobs.list_events(id="ftjob-abc123", limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
#client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-orLzj8hkntF73vSUNM7w6J2I', created_at=1699401563, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=8, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-kHUq2JzdiW8FIDxqE01bYdot', result_files=[], status='queued', trained_tokens=None, training_file='file-S22pfJMZv7asuZNoMlrElq6T', validation_file=None), FineTuningJob(id='ftjob-AAK4Xtz8s61ltsb7PDnEcFJT', created_at=1699298141, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal::8HzSS7eU', finished_at=1699298510, hyperparameters=Hyperparameters(n_epochs=8, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-kHUq2JzdiW8FIDxqE01bYdot', result_files=['file-KAMiZ00WL2UoCdrypUsBjtHd'], status='succeeded', trained_tokens=31512, training_file='file-RltDbgHjpQ9qANthDCvqJQkO', validation_file=None), Fin

# Let's try to use it

Let's look at the form of one of the training messages:

```
{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
                  'role': 'system'},
              
              {'content': 'How far is the Chelsea Cafe from my current location?',
                  'role': 'user'},
                  
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}
```

In [26]:
# Pretty-print one training record to show the expected message structure
pprint.pprint(
    {
        "messages": [
            {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
            {"role": "user", "content": "How far is the Chelsea Cafe from my current location?"},
            {"role": "assistant", "content": "Around five minutes walk. Are you hungry?"},
        ]
    }
)

{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
               'role': 'system'},
              {'content': 'How far is the Chelsea Cafe from my current '
                          'location?',
               'role': 'user'},
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}


In [8]:
# Choose the experiment: "normal" queries the stock model; any other value
# queries the fine-tuned model.
#our_mode = "normal"
our_mode = "sarcastic"

# dependency analytics disabled for syntax
# Model ids for reference:
#   default model:    gpt-3.5-turbo-0613
#   Chelsea model #1: ft:gpt-3.5-turbo-0613:personal::8DvbJsff
#   Chelsea model #2: ft:gpt-3.5-turbo-0613:personal::8IV7laj9

if our_mode != "normal":
    print("sarcastic")
    # Tuned model with the system prompt used in its training data
    our_model = "ft:gpt-3.5-turbo-0613:personal::8IV7laj9"
    system_msg = 'Marv is a factual chatbot that is also sarcastic.'
    # User prompts tried so far:
    #user_msg = 'How far is the Chelsea Cafe from my current location?'   # The exact training question; responds with our exact answer.
    #user_msg = 'How long to walk to the Chelsea Cafe?'   # Similar, but responds with a different answer.
    user_msg = 'How far to the Chelsea Cafe?'   # Similar, but responds with a different answer.
else:
    print("normal")
    # Stock model with a generic system prompt
    our_model = "gpt-3.5-turbo-0613"
    system_msg = 'You are a helpful assistant who understands data science.'
    user_msg = 'How far is the Chelsea Cafe from my current location?'


# Send the selected prompts to the selected model
response = client.chat.completions.create(
    model=our_model,
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
)

sarcastic


In [9]:
# Report why generation stopped and what the model replied
first_choice = response.choices[0]
print("finish_reason: ", first_choice.finish_reason)
print("conten: ", first_choice.message.content)

finish_reason:  stop
conten:  About five minutes walk.
