In [1]:
!pip install -r requirements.txt

Collecting openai==1.60.0 (from -r requirements.txt (line 1))
  Downloading openai-1.60.0-py3-none-any.whl.metadata (27 kB)
Downloading openai-1.60.0-py3-none-any.whl (456 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m456.1/456.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.59.6
    Uninstalling openai-1.59.6:
      Successfully uninstalled openai-1.59.6
Successfully installed openai-1.60.0


In [6]:
import getpass
import json
import os
from collections import defaultdict

from openai import OpenAI


In [8]:
# Authenticate with the OpenAI API.
# The key is never written into the notebook source: it is read from the
# OPENAI_API_KEY environment variable, and only if that is unset or empty
# is the user asked for it through a hidden (non-echoing) prompt.
ashu_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=ashu_api_key)

In [2]:
# Location of the JSONL training data; later cells open this path to load the
# examples and to upload the file for fine-tuning.
ashu_datapath = "/content/dataset.jsonl"

In [5]:
# Load the training file and verify that it is valid JSONL
# (one JSON document per non-blank line).
dataset = []
invalid_lines = 0  # how many lines failed to parse as JSON

# An explicit encoding keeps the read independent of the platform default.
with open(ashu_datapath, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no example and are not a format error.
        if not line.strip():
            continue
        try:
            data = json.loads(line)
            dataset.append(data)
        except json.JSONDecodeError as e:
            # Keep scanning so every bad line is reported, instead of
            # stopping at the first one and leaving a truncated dataset.
            invalid_lines += 1
            print(f"Invalid JSONL format on line {line_no}: {e}")

# Basic suitability check.
# A file with unparsable lines must be fixed first; otherwise the dataset is
# usable once it holds the minimum of 10 examples (exactly 10 is allowed).
if invalid_lines:
    print(f"{invalid_lines} invalid line(s) found; fix the file before fine tuning")
elif len(dataset) >= 10:
    print("number of examples we have in dataset is ",len(dataset))
    print("we can use this data for fine tuning")
else:
    print("we need more data for fine tuning OR minimum 10 samples as per GPT")


number of examples we have in dataset is  22
we can use this data for fine tuning


In [7]:
# Data format validation.
# Walk every example in `dataset` and count each kind of format problem under
# a descriptive key; the summary at the end prints the counters, or
# "No errors found" when nothing was counted.
format_errors = defaultdict(int)

for ex in dataset:
    # Each example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    # A missing, empty, or non-list "messages" value cannot be walked as a
    # conversation, so it is counted here rather than crashing further down.
    if not messages or not isinstance(messages, list):
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # The checks below rely on dict methods; count a non-dict entry and
        # move on instead of raising AttributeError on `.get`.
        if not isinstance(message, dict):
            format_errors["message_data_type"] += 1
            continue

        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Counted when the message has neither content nor a function call,
        # or when its content is not a string.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message.
    if not any(
        isinstance(message, dict) and message.get("role", None) == "assistant"
        for message in messages
    ):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [9]:
# start fine tuning process
# step 1 -- upload the training file using the OpenAI client
# The file is opened in a `with` block so its handle is closed as soon as the
# upload call returns, including when the call raises.
with open(ashu_datapath, "rb") as training_file:
    ashu_data_file = client.files.create(
        file=training_file,
        purpose='fine-tune'
    )
# keep the id of the uploaded file and print it
myfile_id = ashu_data_file.id
print(myfile_id)

file-Lp2rUNgvUek6ps1hEtcjWZ


In [11]:
# Suffix passed to the fine-tuning job for naming the resulting model
ashu_model_sufix = "ashu_model-day3333"
# Create the fine-tuning job for gpt-4o-mini on the uploaded training file,
# training for 4 epochs
ashu_job = client.fine_tuning.jobs.create(
    model="gpt-4o-mini-2024-07-18",
    training_file=myfile_id,
    hyperparameters=dict(n_epochs=4),
    suffix=ashu_model_sufix,
)


In [18]:
# Fetch the events recorded for the fine-tuning job and print the message of
# each one, in the order the API returns them
ashu_job_events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id=ashu_job.id,
)
for job_event in ashu_job_events:
    print(job_event.message)

The job has successfully completed
New fine-tuned model created
Checkpoint created at step 66
Checkpoint created at step 44
Step 88/88: training loss=0.00
Step 87/88: training loss=0.02
Step 86/88: training loss=0.06
Step 85/88: training loss=0.05
Step 84/88: training loss=0.15
Step 83/88: training loss=0.02
Step 82/88: training loss=0.07
Step 81/88: training loss=0.03
Step 80/88: training loss=0.08
Step 79/88: training loss=0.08
Step 78/88: training loss=0.15
Step 77/88: training loss=0.03
Step 76/88: training loss=0.04
Step 75/88: training loss=0.20
Step 74/88: training loss=0.29
Step 73/88: training loss=0.11
Step 72/88: training loss=0.08
Step 71/88: training loss=0.01
Step 70/88: training loss=0.09
Step 69/88: training loss=0.19
Step 68/88: training loss=0.27
Step 67/88: training loss=0.22
Step 66/88: training loss=0.20
Step 65/88: training loss=0.09
Step 64/88: training loss=0.09
Step 63/88: training loss=0.17
Step 62/88: training loss=0.32
Step 61/88: training loss=0.48
Step 60/

In [20]:
# Retrieve the models visible to this account and print the id of every one
# whose id contains 'day33'
ashu_models = client.models.list()
matching_ids = [entry.id for entry in ashu_models.data if 'day33' in entry.id]
for model_id in matching_ids:
    print(model_id)

ft:gpt-4o-mini-2024-07-18:delvex:ashu-model-day3333:AsYf5gRO:ckpt-step-44
ft:gpt-4o-mini-2024-07-18:delvex:ashu-model-day3333:AsYf5WOh:ckpt-step-66
ft:gpt-4o-mini-2024-07-18:delvex:ashu-model-day3333:AsYf6kGy
