# First Steps in finetuning with open ai


In [1]:
# Import the os package
import os

# Import the openai package
import openai

# import the dotenv package
from dotenv import load_dotenv

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown



In [2]:
# Get the current working directory
cwd = os.getcwd()
# Construct the .env file path
env_path = os.path.join(cwd, '.env')

# Load the .env file
load_dotenv(dotenv_path=env_path)

True

In [3]:
# Set openai.api_key to the OPENAI environment variable
OPENAI_APIKEY = os.environ["OPENAI"]
openai.api_key=OPENAI_APIKEY

In [7]:
# Define the system message
system_msg = 'You are a helpful assistant who understands data science.'

# Define the user message
user_msg = 'Create a small dataset of data about people. The format of the dataset should be a data frame with 5 rows and 3 columns. The columns should be called "name", "height_cm", and "eye_color". The "name" column should contain randomly chosen first names. The "height_cm" column should contain randomly chosen heights, given in centimeters. The "eye_color" column should contain randomly chosen eye colors, taken from a choice of "brown", "blue", and "green". Provide Python code to generate the dataset, then provide the output in the format of a markdown table.'

# Create a dataset using GPT
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
)

In [8]:
response["choices"][0]["finish_reason"]

'stop'

In [9]:
response["choices"][0]["message"]["content"]

'Here\'s the code to generate the dataset:\n\n```python\nimport pandas as pd\nimport numpy as np\n\n# Randomly chosen first names\nfirst_names = np.random.choice(["Alice", "Bob", "Charlie", "David", "Emma"], 5)\n\n# Randomly chosen heights in centimeters\nheights = np.random.randint(150, 200, 5)\n\n# Randomly chosen eye colors\neye_colors = np.random.choice(["brown", "blue", "green"], 5)\n\n# Create a dictionary to hold the data\ndata = {\'name\': first_names, \'height_cm\': heights, \'eye_color\': eye_colors}\n\n# Create a data frame from the dictionary\ndf = pd.DataFrame(data)\n\n# Display the data frame\ndf\n```\n\nAnd here\'s the output in the format of a markdown table:\n\n|  name   | height_cm | eye_color |\n|---------|-----------|-----------|\n|  Alice  |    179    |   brown   |\n|   Bob   |    167    |    blue   |\n| Charlie |    184    |   green   |\n|  David  |    192    |   brown   |\n|   Emma  |    176    |   green   |'

In [15]:
response = openai.File.create(
    file=open("sample_datasets/davis.jsonl"),
    purpose='fine-tune'
)
print(response)

{
  "object": "file",
  "id": "file-VCM3q5sZByfXopxM9v2SdoaF",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 3009,
  "created_at": 1698330054,
  "status": "uploaded",
  "status_details": null
}


In [16]:
# create a new fine tuning model 
file_id = response["id"]
# recommended fine tuning model
model="gpt-3.5-turbo-0613"
# model used by Gregory Bauges
#model="gpt-3.5-turbo"
response = openai.FineTuningJob.create(training_file=file_id,
                                       model=model)
print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-jKcnh5QKOxHaYSAmquF1Zsk7",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1698330056,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-kHUq2JzdiW8FIDxqE01bYdot",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-VCM3q5sZByfXopxM9v2SdoaF",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}
