# First steps in fine-tuning with OpenAI


In [1]:
# Import the os package
import os

# Import the openai package
import openai

# import the dotenv package
from dotenv import load_dotenv

import pprint

# From the IPython.display package, import display and Markdown
from IPython.display import display, Markdown



In [2]:
# Locate the .env file that sits in the directory the notebook is run from
cwd = os.getcwd()
env_path = os.path.join(cwd, '.env')

# Load the variables declared in that file; as the cell's last expression,
# the return value of load_dotenv is displayed as the cell output
load_dotenv(dotenv_path=env_path)

True

In [3]:
# Read the API key from the OPENAI environment variable (a KeyError here means
# the variable is not set) and register it with the openai client in one step
openai.api_key = OPENAI_APIKEY = os.environ["OPENAI"]

In [4]:
# System prompt: sets the assistant's persona for this conversation
system_msg = 'You are a helpful assistant who understands data science.'

# User prompt: one sentence per literal; adjacent literals are concatenated,
# so the resulting string is a single line with single spaces between sentences
user_msg = (
    'Create a small dataset of data about people. '
    'The format of the dataset should be a data frame with 5 rows and 3 columns. '
    'The columns should be called "name", "height_cm", and "eye_color". '
    'The "name" column should contain randomly chosen first names. '
    'The "height_cm" column should contain randomly chosen heights, given in centimeters. '
    'The "eye_color" column should contain randomly chosen eye colors, taken from a choice of "brown", "blue", and "green". '
    'Provide Python code to generate the dataset, then provide the output in the format of a markdown table.'
)

# Ask the chat model to produce the dataset; the reply is inspected in the next cells
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    messages=[
        dict(role="system", content=system_msg),
        dict(role="user", content=user_msg),
    ],
)

In [5]:
# Show why generation ended for the first returned choice
first_choice = response["choices"][0]
first_choice["finish_reason"]

'stop'

In [6]:
# Show the text of the assistant's reply for the first returned choice
assistant_message = response["choices"][0]["message"]
assistant_message["content"]

"To generate the dataset, you can use the `pandas` library in Python. Here's an example code that creates the dataset and outputs it as a markdown table:\n\n```python\nimport pandas as pd\nimport numpy as np\n\n# Define the number of rows in the dataset\nnum_rows = 5\n\n# Define a list of random first names\nfirst_names = ['John', 'Emma', 'Michael', 'Sophia', 'William', 'Olivia', 'James', 'Ava', 'Benjamin', 'Mia']\n\n# Generate random data for the dataset\ndata = {\n    'name': np.random.choice(first_names, num_rows),\n    'height_cm': np.random.randint(150, 190, num_rows),\n    'eye_color': np.random.choice(['brown', 'blue', 'green'], num_rows)\n}\n\n# Create a DataFrame from the data\ndf = pd.DataFrame(data)\n\n# Convert the DataFrame to a markdown table\nmarkdown_table = df.to_markdown(index=False)\n\n# Output the markdown table\nprint(markdown_table)\n```\n\nThe output markdown table will look like this:\n\n| name     |   height_cm | eye_color   |\n|:--------:|------------:|:------

# Upload a file for model tuning

In [8]:
# Upload the JSONL training file so it can be referenced by a fine-tuning job.
# The upload result is kept in `response`; the next cell reads its "id".

# hlasm training data
training_path = "../data/sample.jsonl"
# demo training data
#training_path = "data/davis.jsonl"

# Open in binary mode and inside a `with` block so the file handle is closed
# as soon as the upload finishes (the original left the handle open).
with open(training_path, "rb") as training_file:
    response = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )
print(response)

{
  "object": "file",
  "id": "file-RltDbgHjpQ9qANthDCvqJQkO",
  "purpose": "fine-tune",
  "filename": "file",
  "bytes": 16306,
  "created_at": 1699298132,
  "status": "processed",
  "status_details": null
}


# Actual fine-tuning of a model

In [9]:
# Start a fine-tuning job on the training file uploaded by the file upload cell.
#
# `response` is reused throughout this notebook and is overwritten below with
# the job object, so re-running this cell on its own would pass a job id
# ("ftjob-...") as the training file. Fail early with a clear message instead.
if response.get("object") != "file":
    raise RuntimeError(
        "`response` does not hold an uploaded file; re-run the file upload "
        "cell before creating the fine-tuning job."
    )
file_id = response["id"]

# recommended fine tuning model
model = "gpt-3.5-turbo-0613"
# model used by Gregory Bauges
#model="gpt-3.5-turbo"

# Create the job and print the job object returned by the API
response = openai.FineTuningJob.create(training_file=file_id,
                                       model=model)
print(response)

{
  "object": "fine_tuning.job",
  "id": "ftjob-AAK4Xtz8s61ltsb7PDnEcFJT",
  "model": "gpt-3.5-turbo-0613",
  "created_at": 1699298141,
  "finished_at": null,
  "fine_tuned_model": null,
  "organization_id": "org-kHUq2JzdiW8FIDxqE01bYdot",
  "result_files": [],
  "status": "validating_files",
  "validation_file": null,
  "training_file": "file-RltDbgHjpQ9qANthDCvqJQkO",
  "hyperparameters": {
    "n_epochs": "auto"
  },
  "trained_tokens": null,
  "error": null
}


# Let's try to use it

Let's look at the format of one of the training messages:

```
{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
                  'role': 'system'},
              
              {'content': 'How far is the Chelsea Cafe from my current location?',
                  'role': 'user'},
                  
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}
```

In [10]:
# One training record in the chat fine-tuning layout: a system prompt,
# a user question and the assistant answer the model should learn
sample_training_record = {
    "messages": [
        {
            "role": "system",
            "content": "Marv is a factual chatbot that is also sarcastic.",
        },
        {
            "role": "user",
            "content": "How far is the Chelsea Cafe from my current location?",
        },
        {
            "role": "assistant",
            "content": "Around five minutes walk. Are you hungry?",
        },
    ]
}
pprint.pprint(sample_training_record)

{'messages': [{'content': 'Marv is a factual chatbot that is also sarcastic.',
               'role': 'system'},
              {'content': 'How far is the Chelsea Cafe from my current '
                          'location?',
               'role': 'user'},
              {'content': 'Around five minutes walk. Are you hungry?',
               'role': 'assistant'}]}


In [11]:
# Choose which model answers the question:
#   "normal" -> the stock base model
#   anything else (e.g. "tuned") -> our fine-tuned model
#our_mode = "normal"
our_mode = "tuned"

# (author note) dependency analytics disabled for syntax

# default model
default_model = "gpt-3.5-turbo-0613"
# Previously tried fine-tuned models, kept for reference:
#   our tuned model:   ft:gpt-3.5-turbo-0613:personal::8DvbJsff
#   hlasm tuned model: ft:gpt-3.5-turbo-0613:personal::8Hz40ex5
tuned_model = "ft:gpt-3.5-turbo-0613:personal::8HzSS7eU"

use_base_model = our_mode == "normal"
print("normal" if use_base_model else "tuned")

if use_base_model:
    system_msg = 'You are a helpful assistant who understands data science.'
    # Alternative prompts tried with the base model:
    #   'How far is the Chelsea Cafe from my current location?'
    #   "Give me an example of the HLASM Divide Instruction"
    #   "Provide an explanation of the Divide instruction and how to protect against divide by zero errors."
    #   "Provide an explanation of the Move Immediate instruction in BAS."
    #   "Provide an explanation of the Divide instruction with HLASM and how to protect against divide by zero errors."
    #   "Provide an example of the Divide instruction with HLASM"
    user_msg = "How to add two numbers in HLASM"
    our_model = default_model
else:
    # Alternative system prompts tried with the tuned model:
    #   'Marv is a factual chatbot that is also sarcastic.'
    #   'You are a helpful assistant who understands IBM HLASM and IBM BAS'
    system_msg = 'You are a helpful assistant who understands IBM HLASM'
    # Alternative prompts tried with the tuned model:
    #   'How far is the Chelsea Cafe from my current location?'  (the exact training question; responds with our exact answer)
    #   'How long to walk to the Chelsea Cafe?'  (similar, but responds with a different answer)
    #   'How far to the Chelsea Cafe?'  (similar, but responds with a different answer)
    #   "Provide an explanation of the Move Immediate instruction in BAS."
    #   "Provide an explanation of the Move Immediate instruction in HLASM."
    #   "Provide an explanation of the Divide instruction with HLASM and how to protect against divide by zero errors."
    #   "Provide an example of the Divide instruction with HLASM"
    #   "How to add two numbers in HLASM"
    user_msg = "how to write code in HLASM to add two numbers."
    our_model = tuned_model

# Send the question to the selected model; the reply is printed in the next cell
response = openai.ChatCompletion.create(
    model=our_model,
    messages=[
        dict(role="system", content=system_msg),
        dict(role="user", content=user_msg),
    ],
)

tuned


In [12]:
# Print why generation ended and the reply text for the first returned choice
reply_choice = response["choices"][0]
print("finish_reason: ", reply_choice["finish_reason"])
print("content: ", reply_choice["message"]["content"])

finish_reason:  stop
content:          STM   R14,R12,12(13)  save registers
        L     R0,NUM1         load first number
        A     R0,NUM2         add second number
        ST    R0,RESULT       store result
        LM    R14,R12,12(13)  restore registers
        BR    R14             return to caller
 
RESULT DC    F'0'             define storage for result
NUM1   DC    X'00000001'     first number
NUM2   DC    X'00000002'     second number
