# First steps in fine-tuning with OpenAI

#### Updates

* 20231116 updated to use tuningYYYYMMDD.jsonl
* 20231109 updated to use the new 1.x API
* 20231109 uses the BAS dataset

In [1]:
# Standard library
import os
import pathlib
import pprint
from pathlib import Path

# Third-party: .env loading, notebook rich display, and the OpenAI client
from dotenv import load_dotenv
from IPython.display import display, Markdown
from openai import OpenAI



In [2]:
# Build the path of the .env file expected in the kernel's current
# working directory.
cwd = os.getcwd()
env_path = os.path.join(cwd, ".env")

# Load the variables from that file into the process environment. As the
# last expression of the cell, the call's return value is displayed.
load_dotenv(dotenv_path=env_path)

True

In [3]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
# Indexing os.environ raises KeyError if the variable is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]


# Specify the key and initialize the client

In [4]:
# Create the API client with the key supplied explicitly at construction
# time, instead of building a client first and patching `api_key` onto it
# afterwards.
client = OpenAI(api_key=OPENAI_API_KEY)

# Determine the OpenAI API version

In [5]:
# Suggested by ChatGPT when asked how to query the API version.
# Sadly it does not work, presumably because the model was trained
# before the API changed.

#import openai
#openai.api_key=OPENAI_API_KEY
# To get the API version, you would typically make an API call
# and the version would be included in the response headers.
# For example, you could list the available engines and check the headers:
#response = openai.Engine.list()

# The API version would be in the response headers if available
#api_version = response.headers.get('OpenAI-Api-Version')

#print(api_version)

# Sanity check
Verify that the API key and the network allow use of the OpenAI API.

In [6]:
# System prompt: the persona the model should adopt
system_msg = 'You are a helpful assistant who understands data science.'

# User prompt: ask for a tiny synthetic dataset plus the code that builds it.
# Adjacent string literals are concatenated into one single-line prompt.
user_msg = (
    'Create a small dataset of data about people. '
    'The format of the dataset should be a data frame with 5 rows and 3 columns. '
    'The columns should be called "name", "height_cm", and "eye_color". '
    'The "name" column should contain randomly chosen first names. '
    'The "height_cm" column should contain randomly chosen heights, given in centimeters. '
    'The "eye_color" column should contain randomly chosen eye colors, taken from a choice of "brown", "blue", and "green". '
    'Provide Python code to generate the dataset, then provide the output in the format of a markdown table.'
)

# Send one chat completion request to the stock model; the cells below
# inspect `response`.
sanity_messages = [
    {"role": "system", "content": system_msg},
    {"role": "user", "content": user_msg},
]
response = client.chat.completions.create(
    model="gpt-3.5-turbo-0613",
    messages=sanity_messages,
)

In [7]:
# Show the finish reason reported for the first returned choice
response.choices[0].finish_reason
#response["choices"]

'stop'

In [8]:
# Show the raw text of the model's reply (first returned choice)
response.choices[0].message.content

'Here is the Python code to generate the small dataset:\n\n```python\nimport random\nimport pandas as pd\n\nrandom.seed(123)  # For reproducibility\n\n# Define the possible values for each column\nnames = ["John", "Emily", "Michael", "Sophia", "Daniel"]\nheights_cm = [165, 172, 180, 158, 175]\neye_colors = ["brown", "blue", "green"]\n\n# Generate random data for each column\ndata = {\n    "name": [random.choice(names) for _ in range(5)],\n    "height_cm": [random.choice(heights_cm) for _ in range(5)],\n    "eye_color": [random.choice(eye_colors) for _ in range(5)],\n}\n\n# Create a DataFrame\ndf = pd.DataFrame(data)\n\nprint(df)\n```\n\nAnd here is the output in the format of a markdown table:\n\n|    | name     | height_cm | eye_color |\n|---:|:--------|---------:|:---------|\n|  0 | John     |       175 | blue     |\n|  1 | Emily    |       158 | brown    |\n|  2 | Daniel   |       180 | green    |\n|  3 | John     |       158 | green    |\n|  4 | Michael  |       158 | brown    |'

# Upload a file for model tuning

### Setup dirs

In [9]:
# Derive the project directories from the kernel's working directory.
# `pathlib` comes from the import cell at the top of the notebook.
dirpath = os.getcwd()
print("current directory is : " + dirpath)

# The notebook is assumed to live one level below the git repository root,
# so the root is the parent of the working directory. `.parent` matches
# `parents[0]` but does not raise if the working directory is the
# filesystem root.
root_path = pathlib.PurePath(dirpath).parent
data_path = root_path / 'data'
logs_path = root_path / 'logs'
print("root directory is: ", root_path)
print("data directory is: ",  data_path)
print("logs directory is: ", logs_path)

# Create equivalent dir names in the environment.
# Logs (export currently commented out)
#LOGS_DIR_NAME = logs_path.as_posix()
#print("LOGS_DIR_NAME: ", LOGS_DIR_NAME)
#os.environ['LOGS_DIR_NAME'] = LOGS_DIR_NAME
# Data
DATA_DIR_NAME = data_path.as_posix()
print("DATA_DIR_NAME: ", DATA_DIR_NAME)
os.environ['DATA_DIR_NAME'] = DATA_DIR_NAME

current directory is : /workspaces/BALSA/notebooks
root directory is:  /workspaces/BALSA
data directory is:  /workspaces/BALSA/data
logs directory is:  /workspaces/BALSA/logs
DATA_DIR_NAME:  /workspaces/BALSA/data


### Specify the JSONL file for model tuning

In [10]:
# Name of the JSONL training file; the next cell joins it onto data_path
tuning_file = "tuning20231116.jsonl"

In [11]:
# Fully qualified path of the training file inside the data directory.
# Joining the plain file name gives the same path as the earlier
# `data_path / pathlib.PurePath(tuning_file).as_posix()`, where `.as_posix()`
# applied only to the file name and therefore changed nothing.
IN_FQPN = data_path / tuning_file
IN_FQPN

PurePosixPath('/workspaces/BALSA/data/tuning20231116.jsonl')

In [12]:

# Upload the training file to OpenAI, tagged for fine-tuning use.
# IN_FQPN is a pure path, so it is wrapped in a concrete Path for the upload.
upload_path = Path(IN_FQPN)
response = client.files.create(file=upload_path, purpose="fine-tune")

# Show the file record returned by the API
print(response)

FileObject(id='file-Fwux98ZJRrpbK4kN7JQSafeB', bytes=30421, created_at=1700255539, filename='tuning20231116.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [13]:
# Keep the uploaded file's id for the fine-tuning job below, and echo it
myfile_id = response.id
print(myfile_id)

file-Fwux98ZJRrpbK4kN7JQSafeB


# Actual fine tune of a model

In [14]:
# Base model to fine-tune.
# recommended fine tuning model
model = "gpt-3.5-turbo-0613"
# model used by Gregory Bauges
#model="gpt-3.5-turbo"

# Training data: the file uploaded in the previous cells
file_id = myfile_id

# Start a new fine-tuning job and show the job record that comes back
response = client.fine_tuning.jobs.create(training_file=file_id, model=model)
print(response)

FineTuningJob(id='ftjob-E2dYuEXt8qoXAagCEuBFZACs', created_at=1700255540, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-kHUq2JzdiW8FIDxqE01bYdot', result_files=[], status='validating_files', trained_tokens=None, training_file='file-Fwux98ZJRrpbK4kN7JQSafeB', validation_file=None)


In [15]:
# Request a list of up to 10 fine-tuning jobs
#pprint.pprint(client.fine_tuning.jobs.list(limit=10))
result = client.fine_tuning.jobs.list(limit=10)

for job in result.data:
    # To dump the whole job record instead:
    #pprint.pprint(job)
    # Print only the id of the file the job was trained on
    training_file_id = job.training_file
    print(training_file_id)
    # Simple test: flag any job trained on the file uploaded above
    if training_file_id == myfile_id:
        print("yes")

# Reference snippets for managing jobs (replace the placeholder ids):

# Retrieve the state of a fine-tune
#client.fine_tuning.jobs.retrieve("ftjob-abc123")

# Cancel a job
#client.fine_tuning.jobs.cancel("ftjob-abc123")

# List up to 10 events from a fine-tuning job
#client.fine_tuning.jobs.list_events(id="ftjob-abc123", limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
#client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")

file-Fwux98ZJRrpbK4kN7JQSafeB
yes
file-mOWBskmEo89j5l8yRFPFxnqe
file-HQqakeKHnHi4YFtdZJwGIEM6
file-S22pfJMZv7asuZNoMlrElq6T
file-RltDbgHjpQ9qANthDCvqJQkO
file-6n4dELlk1gyh7brCV8iulYMW
file-VCM3q5sZByfXopxM9v2SdoaF
file-y5huqBR8UUOwvkmMEJuLwscs


# Let's try to use it

In [18]:
# Select which model to query: "normal" uses the stock model, any other
# value uses the fine-tuned model.
#our_mode = "normal"
our_mode = "bas"

# dependency analytics disable for syntax
# default model
default_model = "gpt-3.5-turbo-0613"

# our tuned model: earlier candidates are kept for reference, only the
# last (uncommented) assignment is used.
# Chelsea model #1
#our_model="ft:gpt-3.5-turbo-0613:personal::8DvbJsff"
# Chelsea model #2
#our_model="ft:gpt-3.5-turbo-0613:personal::8IV7laj9"
# bas model #2
#tuned_model="ft:gpt-3.5-turbo-0613:personal::8IV7laj9"
# model we trained using the 20231116 data
tuned_model = "ft:gpt-3.5-turbo-0613:personal::8LXzZa1D"

if our_mode == "normal":
    print("normal")
    # stock model
    our_model = default_model
    # Define the system message
    system_msg = 'You are a helpful assistant who understands programming in assembly.'
    # Define the user message
    user_msg = 'Provide an example of how to add two numbers in assembly.'
else:
    print("tuned")
    # tuned model
    our_model = tuned_model
    # Define the system message
    system_msg = 'You are a helpful assistant who understands programming in BAS assembly.'
    # Define the user message.
    # Alternative prompt, kept for reference; it used to be assigned here and
    # then immediately overwritten by the prompt below, so it never took effect.
    #user_msg = 'Provide an explanation of the Divide instruction using negative numbers in BAS.'
    user_msg = 'Provide an explanation of the Divide instruction and how to protect against divide by zero errors in BAS.'

# Ask the selected model the question; the next cell prints the reply
response = client.chat.completions.create(
    model=our_model,
    messages=[
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
)

tuned


In [19]:
# Report the finish reason and then the reply text of the first choice
first_choice = response.choices[0]
print("finish_reason: ", first_choice.finish_reason)
print("conten: ", first_choice.message.content)

finish_reason:  stop
conten:   The Divide command divides the contents of Register A by Register B, and places the result in Register A. Thenumber in Register A is divided by the number in Register B, and the remainder is placed in R.

 This instruction will check that the content of Register B is not 0, and will terminate the program with a Division Exception error if it is. To understand this better, consider the following example:
 
          DIV      REG_A,REG_B          
          LBL DIV_RUN                     
          DCW      #-3                     
          .        .     
          .        .     
          .        .
          DCW     0     
          .        .     
 Once BAS has executed a divide instruction, it will next execute the DCW 0 statement. However, before it does that, it will check B to see if it is 0 and take appropriate action.
 

