In [1]:
# Fine tuning Chat models for new GPT 3.5 turbo model
# Perform entity extraction using RecipeNLG dataset

# Step 1: Loading the dataset and filtering down to one domain to fine-tune on

# Step 2: Data Preparation : Preparing data for fine-tuning by creating training and validation examples
# and uploading them to the Files endpoint

# Step 3: Fine-tuning :  Creating our fine-tuned model

# Step 4: Inference : Using fine-tuned model for inference on new inputs

# Train, evaluate and deploy a fine-tuned gpt-3.5-turbo model



In [12]:
import json
import openai
import os
import pandas as pd
from pprint import pprint

# Read the API key from the environment; the empty-string default keeps this cell from raising when it is unset
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")

if OPENAI_API_KEY:
    # Hand the key to the SDK explicitly so later API calls do not depend on implicit environment lookup
    openai.api_key = OPENAI_API_KEY
else:
    # Surface the problem here instead of at the first API call further down the notebook
    print("WARNING: OPENAI_API_KEY is not set; the upload and fine-tuning cells will fail until it is provided.")

In [13]:
# Fine-tuning works best when focused on a particular domain.
# It's important to make sure that the dataset is both focused enough for the model to learn, but general enough that unseen examples won't be missed.
# To do so, we extract a subset from the RecipesNLG dataset that only contains documents from www.cookbooks.com.

# Load the data for this task: a cleaned RecipesNLG extract
# that only contains documents from www.cookbooks.com
recipe_df = pd.read_csv(filepath_or_buffer="data/cookbook_recipes_nlg_10k.csv")
# Preview the first five rows
recipe_df.head(n=5)


Unnamed: 0,title,ingredients,directions,link,source,NER
0,No-Bake Nut Cookies,"[""1 c. firmly packed brown sugar"", ""1/2 c. eva...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,www.cookbooks.com,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu..."
1,Jewell Ball'S Chicken,"[""1 small jar chipped beef, cut up"", ""4 boned ...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,www.cookbooks.com,"[""beef"", ""chicken breasts"", ""cream of mushroom..."
2,Creamy Corn,"[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,www.cookbooks.com,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar..."
3,Chicken Funny,"[""1 large whole chicken"", ""2 (10 1/2 oz.) cans...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,www.cookbooks.com,"[""chicken"", ""chicken gravy"", ""cream of mushroo..."
4,Reeses Cups(Candy),"[""1 c. peanut butter"", ""3/4 c. graham cracker ...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,www.cookbooks.com,"[""peanut butter"", ""graham cracker crumbs"", ""bu..."


In [14]:
# Data preparation
# When fine tuning with the Chat Completion format, each training example is a simple list of messages
# For example, an entry could look like:
# [{'role': 'system',
#   'content': 'You are a helpful recipe assistant. You are to extract the generic ingredients from each of the recipes provided.'},
#  {'role': 'user',
#   'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients: '},
#  {'role': 'assistant',
#   'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]

# During the training process this conversation will be split, with the final entry being the completion that the model will produce and the remainder of the messages acting as the prompt.

# Placeholder for the prepared example conversations; starts out empty
training_data = list()

# System prompt placed at the start of every example conversation.
# NOTE(review): "ingridents" looks like a misspelling of "ingredients"; it is left
# untouched here because this text becomes the system message content verbatim.
system_message = (
    "You are a helpful recipe assistant. "
    "You are to extract the generic ingridents from each of the recipes provided"
)

def create_user_message(row):
    """Format a recipe row as the user prompt.

    Reads row['title'] and row['ingredients'] and returns a single string with a
    Title section, an Ingredients section and a trailing 'Generic ingredients: '
    cue, each separated by a blank line.
    """
    prompt_template = "Title: {title}\n\nIngredients: {ingredients}\n\nGeneric ingredients: "
    return prompt_template.format(title=row["title"], ingredients=row["ingredients"])
    
def prepare_example_conversation(row):
    """Build one chat-format example conversation from a recipe row.

    The conversation holds three messages in order: the shared system prompt,
    the user prompt produced by create_user_message(row), and the assistant
    reply taken from row["NER"].

    Returns a dict with a single "messages" key containing that list.
    """
    system_turn = {"role": "system", "content": system_message}
    user_turn = {"role": "user", "content": create_user_message(row)}
    assistant_turn = {"role": "assistant", "content": row["NER"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Pretty-print the conversation built from the first recipe as a sanity check
first_recipe = recipe_df.iloc[0]
pprint(prepare_example_conversation(first_recipe))

{'messages': [{'content': 'You are a helpful recipe assistant. You are to '
                          'extract the generic ingridents from each of the '
                          'recipes provided',
               'role': 'system'},
              {'content': 'Title: No-Bake Nut Cookies\n'
                          '\n'
                          'Ingredients: ["1 c. firmly packed brown sugar", '
                          '"1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 '
                          'c. broken nuts (pecans)", "2 Tbsp. butter or '
                          'margarine", "3 1/2 c. bite size shredded rice '
                          'biscuits"]\n'
                          '\n'
                          'Generic ingredients: ',
               'role': 'user'},
              {'content': '["brown sugar", "milk", "vanilla", "nuts", '
                          '"butter", "bite size shredded rice biscuits"]',
               'role': 'assistant'}]}


In [15]:
# Do the above process for a subset of the dataset to use as training data.

# Use the first 100 rows of the dataset for training.
# .iloc slices by position and excludes the end, giving exactly rows 0-99.
# A label slice such as .loc[0:100] includes both endpoints, which on the
# default 0..n-1 index returns 101 rows.
training_df = recipe_df.iloc[:100]

# Apply prepare_example_conversation to each row of training_df
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

# Print a small sample of the prepared examples for inspection
for example in training_data[:10]:
    print(example)

{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingridents from each of the recipes provided'}, {'role': 'user', 'content': 'Title: No-Bake Nut Cookies\n\nIngredients: ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]\n\nGeneric ingredients: '}, {'role': 'assistant', 'content': '["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]'}]}
{'messages': [{'role': 'system', 'content': 'You are a helpful recipe assistant. You are to extract the generic ingridents from each of the recipes provided'}, {'role': 'user', 'content': 'Title: Jewell Ball\'S Chicken\n\nIngredients: ["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]\n\nGeneric ingredients: '}, {'role': 'assistant', 'content': '["beef", 

In [16]:
# In addition to the training data, optionally provide validation data,
# which is used to make sure that the model does not overfit the training set.
# Rows labelled 101 through 200 (both ends included) are held out for this.
validation_df = recipe_df.loc[101:200]
validation_examples = validation_df.apply(prepare_example_conversation, axis=1)
validation_data = validation_examples.tolist()


In [17]:
# need to save our data as .jsonl files, with each line being one training example conversation.
def write_jsonl(data_list: list, filename: str) -> None:
    """Write each item of data_list to filename as one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, "w") as jsonl_file:
        # One serialized record per line, each terminated by a newline
        jsonl_file.writelines(json.dumps(record) + "\n" for record in data_list)


In [18]:
# Destination files for the prepared examples
training_file_name = "tmp_receipe_finetune_training.jsonl"
validation_file_name = "tmp_receipe_finetune_validation.jsonl"

# Write the training set first, then the validation set, one conversation per line
for examples, destination in (
    (training_data, training_file_name),
    (validation_data, validation_file_name),
):
    write_jsonl(examples, destination)


In [19]:
# Upload the training and validation files to the Files endpoint so the fine-tuning job can reference them by ID.
# Each file is opened in a `with` block so its handle is closed as soon as the upload call returns,
# including when the call raises.
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

with open(validation_file_name, "rb") as validation_file:
    validation_response = openai.File.create(file=validation_file, purpose="fine-tune")
validation_file_id = validation_response["id"]

print("Training file ID: ", training_file_id)
print("Validation file ID: ", validation_file_id)


AuthenticationError: No API key provided. You can set your API key in code using 'openai.api_key = <API-KEY>', or you can set the environment variable OPENAI_API_KEY=<API-KEY>). If your API key is stored in a file, you can point the openai module at it with 'openai.api_key_path = <PATH>'. You can generate API keys in the OpenAI web interface. See https://platform.openai.com/account/api-keys for details.

In [21]:
# Fine tuning
# Create the fine tuning job with the generated files and an optional suffix to identify the model. The response will contain an id which we can use to retrieve updates on the job 

# Settings for the fine-tuning job: the uploaded file IDs, the base model, and a suffix to identify the model
job_settings = {
    "training_file": training_file_id,
    "validation_file": validation_file_id,
    "model": "gpt-3.5-turbo",
    "suffix": "recipe-ner",
}
response = openai.FineTuningJob.create(**job_settings)

# Keep the job ID so the job can be looked up later
job_id = response["id"]

print("Job ID:", job_id)
print("Status:", response["status"])


SyntaxError: invalid syntax. Perhaps you forgot a comma? (372249520.py, line 13)

In [None]:
#  Check job status 
#  Make a GET request to the https://api.openai.com/v1/alpha/fine-tunes endpoint to list the alpha fine-tune jobs