# First Steps in Fine-Tuning with OpenAI


In [1]:
# Import the os package
import os

# Import the openai package
import openai

# import the dotenv package
from dotenv import load_dotenv

import pprint

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown





In [2]:
# Resolve the .env file relative to the directory the kernel is running in
cwd = os.getcwd()
env_path = os.path.join(cwd, ".env")

# Load the variables defined in that file; the call's return value is the
# last expression of the cell, so it is shown as the cell output
load_dotenv(dotenv_path=env_path)

True

In [3]:
# Read the API key from the "OPENAI" environment variable (a missing variable
# raises KeyError) and hand it to the openai module
OPENAI_APIKEY = os.environ["OPENAI"]
openai.api_key = OPENAI_APIKEY

In [4]:
# System prompt: sets the assistant's persona
system_msg = "You are a helpful assistant who understands data science."

# User prompt: asks for a tiny people dataset, the Python code that builds it,
# and the result rendered as a markdown table.
# (Adjacent string literals are concatenated into one single-line prompt.)
user_msg = (
    'Create a small dataset of data about people. '
    'The format of the dataset should be a data frame with 5 rows and 3 columns. '
    'The columns should be called "name", "height_cm", and "eye_color". '
    'The "name" column should contain randomly chosen first names. '
    'The "height_cm" column should contain randomly chosen heights, given in centimeters. '
    'The "eye_color" column should contain randomly chosen eye colors, '
    'taken from a choice of "brown", "blue", and "green". '
    'Provide Python code to generate the dataset, '
    'then provide the output in the format of a markdown table.'
)

# Send both prompts to the chat model and keep the full API response
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
)

In [5]:
# Show the finish_reason reported for the first choice in the response
response["choices"][0]["finish_reason"]

'stop'

In [6]:
# Show the raw text of the first choice's message (displayed as a Python string repr).
# NOTE(review): `display` and `Markdown` are imported at the top but unused in the
# visible cells; display(Markdown(...)) would render this text instead - confirm intent.
response["choices"][0]["message"]["content"]

'Here\'s the Python code to generate the dataset and display it as a markdown table:\n\n```python\nimport pandas as pd\nimport random\n\n# Define the possible values for each column\nnames = ["John", "Jane", "James", "Emily", "Jacob"]\nheights = [165, 170, 175, 180, 185]\neye_colors = ["brown", "blue", "green"]\n\n# Generate random values for each row\ndata = []\nfor _ in range(5):\n    name = random.choice(names)\n    height = random.choice(heights)\n    eye_color = random.choice(eye_colors)\n    data.append([name, height, eye_color])\n\n# Create the dataframe\ndf = pd.DataFrame(data, columns=["name", "height_cm", "eye_color"])\n\n# Display the dataframe as a markdown table\nprint(df.to_markdown(index=False))\n```\n\nOutput:\n\n| name  |   height_cm | eye_color   |\n|:------|------------:|:------------|\n| Emily |         185 | blue        |\n| John  |         175 | green       |\n| James |         165 | brown       |\n| James |         165 | blue        |\n| Jane  |         180 | gre

In [7]:
# Upload the training data so it can be referenced by a fine-tuning job.
# The file is opened in binary mode so its bytes are sent exactly as stored,
# and the `with` block closes the handle even if the upload raises.
with open("sample_datasets/davis.jsonl", "rb") as training_file:
    response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
print(response)

FileNotFoundError: [Errno 2] No such file or directory: 'sample_datasets/davis.jsonl'

# Actual fine-tuning of a model

In [16]:
# create a new fine tuning model 
file_id = response["id"]
# recommended fine tuning model
model="gpt-3.5-turbo-0613"
# model used by Gregory Bauges
#model="gpt-3.5-turbo"
response = openai.FineTuningJob.create(training_file=file_id,
                                       model=model)
print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-jKcnh5QKOxHaYSAmquF1Zsk7",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698330056,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-kHUq2JzdiW8FIDxqE01bYdot",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-VCM3q5sZByfXopxM9v2SdoaF",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}


# Let's try to use it

Let's look at the format of one of the training messages:

```python
{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
               'role': 'system'},
              {'content': 'How far is the Chelsea Cafe from my current location?',
               'role': 'user'},
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}
```

In [9]:
# Pretty-print one chat-format example: a system prompt, a user question,
# and an assistant answer
pprint.pprint(
    {
        "messages": [
            {"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."},
            {"role": "user", "content": "How far is the Chelsea Cafe from my current location?"},
            {"role": "assistant", "content": "Around five minutes walk. Are you hungry?"},
        ]
    }
)

{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
               'role': 'system'},
              {'content': 'How far is the Chelsea Cafe from my current '
                          'location?',
               'role': 'user'},
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}


In [16]:
# Which persona to query with: "normal" selects the generic data-science prompt,
# any other value selects the sarcastic "Marv" prompt
our_mode = "normal"
#our_mode = "sarcastic"

# dependency analytics disabled for syntax
# Model to query: the fine-tuned model (the base model is kept below for comparison)
# our_model="gpt-3.5-turbo-0613"
our_model = "ft:gpt-3.5-turbo-0613:personal::8DvbJsff"

# Pick the system/user messages for the selected mode and echo which branch ran
if our_mode != "normal":
    print("sarcastic")
    system_msg = 'Marv is a factual chatbot that is also sarcastic.'
    # Alternative questions that were tried:
    #user_msg = 'How far is the Chelsea Cafe from my current location?'   # This is the exact question and responds with our exact answer.
    #user_msg = 'How long to walk to the Chelsea Cafe?'   # This is similar, but responds with a different answer.
    user_msg = 'How far to the Chelsea Cafe?'   # This is similar, but responds with a different answer.
else:
    print("normal")
    system_msg = 'You are a helpful assistant who understands data science.'
    user_msg = 'How far is the Chelsea Cafe from my current location?'

# Send the selected prompts to the selected model and keep the full API response
response = openai.ChatCompletion.create(
    model=our_model,
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ],
)

normal


In [17]:
# Report the finish_reason and the message text of the first choice
first_choice = response["choices"][0]
print("finish_reason: ", first_choice["finish_reason"])
print("conten: ", first_choice["message"]["content"])

finish_reason:  stop
conten:  I'm sorry, I cannot provide real-time distance or directions as I do not have access to your current location data.
