In [71]:
import json
import tiktoken  # for token counting
import numpy as np
from collections import defaultdict

# Data loading
data_path = "data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 31
First example:
{'role': 'system', 'content': 'RecipeMaster is Indian Recipe chatbot, RecipeMaster understands Indian food culture.'}
{'role': 'user', 'content': "What's the Masala Karela Recipe?"}
{'role': 'assistant', 'content': 'Namaste! It is recipe mainly consist of Bittergroud along with spices and more flavours'}


In [72]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
    messages = ex.get("messages", None)
    if messages is None:
        format_errors["missing_messages_list"] += 1
    else:
        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1
            if message.get("role") not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1
            if not isinstance(message.get("content", ''), str):
                format_errors["missing_content"] += 1
        if not any(message.get("role") == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [73]:
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        num_tokens += len(encoding.encode(message['content']))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

In [74]:
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    system_present = any(m["role"] == "system" for m in messages)
    user_present = any(m["role"] == "user" for m in messages)
    n_missing_system += not system_present
    n_missing_user += not user_present
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_tokens_from_messages([m for m in messages if m["role"] == "assistant"]))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 20, 115
mean / median: 38.774193548387096, 37.0
p5 / p95: 26.5, 51.5

#### Distribution of num_assistant_tokens_per_example:
min / max: 3, 80
mean / median: 16.677419354838708, 17.0
p5 / p95: 6.0, 21.5


In [75]:
# Assume each example's token count does not exceed the maximum context length for the model.
MAX_TOKENS_PER_EXAMPLE = 115
n_epochs = 3  # typically a good starting point
n_train_examples = len(dataset)
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, l) for l in convo_lens)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Visit OpenAI's pricing page for detailed cost information.

Dataset has ~1202 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~3606 tokens


In [111]:
from openai import OpenAI
import os


from dotenv import load_dotenv


load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [76]:


response = client.files.create(
    file=open("data.jsonl", "rb"),  # Make sure the file path and name are correct
    purpose="fine-tune"
)
print(response)  # This will print the response from the server including the file ID

FileObject(id='file-kR42WXMmtjMBPmznt9x1PkEk', bytes=8931, created_at=1722633206, filename='data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [77]:
# List available models
available_models = client.models.list()
print("Available models for fine-tuning:")
for model in available_models:
    print(model.id)


Available models for fine-tuning:
dall-e-3
gpt-4-1106-preview
dall-e-2
tts-1-hd-1106
tts-1-hd
gpt-4o-mini-2024-07-18
gpt-4-0125-preview
babbage-002
gpt-4-turbo-preview
text-embedding-3-small
text-embedding-3-large
tts-1
gpt-3.5-turbo
whisper-1
gpt-4o-2024-05-13
text-embedding-ada-002
gpt-3.5-turbo-16k
davinci-002
gpt-4-turbo-2024-04-09
tts-1-1106
gpt-3.5-turbo-0125
gpt-4-turbo
gpt-3.5-turbo-1106
gpt-4o-mini
gpt-4o
gpt-3.5-turbo-instruct-0914
gpt-3.5-turbo-instruct
gpt-4-0613
gpt-4


In [78]:

from openai import OpenAI
client = OpenAI()

# Start a fine-tuning job
response = client.fine_tuning.jobs.create(
    training_file="file-kR42WXMmtjMBPmznt9x1PkEk",  # Replace 'file-abc123' with your actual file ID
    model="gpt-3.5-turbo"  # Model type to fine-tune
)
print(response)

FineTuningJob(id='ftjob-0vKiCVozz8QffAJAPwhedj06', created_at=1722633239, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-90r750f7PwRNyB7GCURshhTk', result_files=[], seed=938634481, status='validating_files', trained_tokens=None, training_file='file-kR42WXMmtjMBPmznt9x1PkEk', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)


In [79]:
# Retrieve the status of the fine-tuning job
fine_tune_job_id = "ftjob-0vKiCVozz8QffAJAPwhedj06"  # Replace with your fine-tune job ID
job_status = client.fine_tuning.jobs.retrieve(fine_tune_job_id)

# Print detailed error information
if job_status.error:
    print("Error Code:", job_status.error.code)
    print("Error Message:", job_status.error.message)
    print("Error Parameter:", job_status.error.param)
else:
    print("No error information available")


Error Code: None
Error Message: None
Error Parameter: None


In [95]:
# List all fine-tune jobs
fine_tune_jobs = list(client.fine_tuning.jobs.list())
# print(fine_tune_jobs)
# Iterate through the jobs and print their IDs and statuses
for job in fine_tune_jobs:
    print(job.id, job.status)


ftjob-0vKiCVozz8QffAJAPwhedj06 succeeded
ftjob-MJOHDLF7U4gCoGmLQJQymhkJ failed
ftjob-uFl8scupjkw5aeTqCOIe8x9b failed


In [112]:
# Import OpenAI client


# List all models to find your fine-tuned model
models = client.models.list()
print("Available models:")
for model in models:
    print(model.id)


Available models:
dall-e-3
gpt-4-1106-preview
dall-e-2
tts-1-hd-1106
tts-1-hd
gpt-4o-mini-2024-07-18
gpt-4-0125-preview
babbage-002
gpt-4-turbo-preview
text-embedding-3-small
text-embedding-3-large
tts-1
gpt-3.5-turbo
whisper-1
gpt-4o-2024-05-13
text-embedding-ada-002
gpt-3.5-turbo-16k
davinci-002
gpt-4-turbo-2024-04-09
tts-1-1106
gpt-3.5-turbo-0125
gpt-4-turbo
gpt-3.5-turbo-1106
gpt-4o-mini
gpt-4o
gpt-3.5-turbo-instruct-0914
gpt-3.5-turbo-instruct
gpt-4-0613
gpt-4
ft:gpt-3.5-turbo-0125:personal::9rtzvLWX:ckpt-step-31
ft:gpt-3.5-turbo-0125:personal::9rtzvo1Z:ckpt-step-62
ft:gpt-3.5-turbo-0125:personal::9rtzv3yx


In [115]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::9rtzv3yx",  # Replace with your specific model identifier
    messages=[
        {"role": "system", "content": "You are recipemaster, who gives content regarding indian recipes"},
        {"role": "user", "content": "hello"}
    ]
)
print(completion.choices[0].message)

ChatCompletionMessage(content='Namaste! Would you like to learn about Indian recipes?', role='assistant', function_call=None, tool_calls=None)


FileObject(id='file-UGV1vZiK507hMwQ03LyvqNeT', bytes=12622, created_at=1722630540, filename='data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
