In [1]:
import json

# Load the training set
with open('training_set.jsonl', 'r', encoding='utf-8') as f:
    training_dataset = [json.loads(line) for line in f]

# Training dataset stats
print("Number of examples in training set:", len(training_dataset))
print("First example in training set:")
for message in training_dataset[0]["messages"]:
    print(message)

# Load the validation set
with open('validation_set.jsonl', 'r', encoding='utf-8') as f:
    validation_dataset = [json.loads(line) for line in f]

# Validation dataset stats
print("\nNumber of examples in validation set:", len(validation_dataset))
print("First example in validation set:")
for message in validation_dataset[0]["messages"]:
    print(message)

Number of examples in training set: 10
First example in training set:
{'role': 'system', 'content': 'Clippy is a factual chatbot that is also sarcastic.'}
{'role': 'user', 'content': 'Who discovered Antarctica?'}
{'role': 'assistant', 'content': "Some chaps named Fabian Gottlieb von Bellingshausen and Mikhail Lazarev, as if they don't teach that in every school!"}

Number of examples in validation set: 10
First example in validation set:
{'role': 'system', 'content': 'Clippy is a factual chatbot that is also sarcastic.'}
{'role': 'user', 'content': "What's the capital of Australia?"}
{'role': 'assistant', 'content': "It's Canberra, not Sydney. Shocking, I know!"}


In [5]:
# Validate token counts

import json
import tiktoken
import numpy as np
from collections import defaultdict


encoding = tiktoken.get_encoding("o200k_base") # default encoding for gpt-4o models. This requires the latest version of tiktoken to be installed.

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

files = ['training_set.jsonl', 'validation_set.jsonl']

for file in files:
    print(f"Processing file: {file}")
    with open(file, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    total_tokens = []
    assistant_tokens = []

    for ex in dataset:
        messages = ex.get("messages", {})
        total_tokens.append(num_tokens_from_messages(messages))
        assistant_tokens.append(num_assistant_tokens_from_messages(messages))

    print_distribution(total_tokens, "total tokens")
    print_distribution(assistant_tokens, "assistant tokens")
    print('*' * 50)       

Processing file: training_set.jsonl

#### Distribution of total tokens:
min / max: 46, 59
mean / median: 49.8, 48.5
p5 / p95: 46.0, 53.599999999999994

#### Distribution of assistant tokens:
min / max: 13, 28
mean / median: 16.5, 14.0
p5 / p95: 13.0, 19.9
**************************************************
Processing file: validation_set.jsonl

#### Distribution of total tokens:
min / max: 41, 64
mean / median: 48.9, 47.0
p5 / p95: 43.7, 54.099999999999994

#### Distribution of assistant tokens:
min / max: 8, 29
mean / median: 15.0, 12.5
p5 / p95: 10.7, 19.999999999999996
**************************************************


In [2]:
# Upload fine-tuning files

import os
from openai import OpenAI

# 1) Read Azure env vars
endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_key = os.getenv("AZURE_OPENAI_API_KEY")

if not endpoint or not api_key:
    raise RuntimeError(
        f"Missing env vars. "
        f"AZURE_OPENAI_ENDPOINT={endpoint}, AZURE_OPENAI_API_KEY={api_key}"
    )

# 2) IMPORTANT: add /openai/v1/ to the Azure endpoint
client = OpenAI(
    api_key=api_key,
    base_url=endpoint.rstrip("/") + "/openai/v1/",
)

training_file_name = "training_set.jsonl"
validation_file_name = "validation_set.jsonl"

# 3) Check files exist
for f in (training_file_name, validation_file_name):
    if not os.path.exists(f):
        raise FileNotFoundError(f"File not found: {f}")

# 4) Upload training file
with open(training_file_name, "rb") as tf:
    training_response = client.files.create(
        file=tf,
        purpose="fine-tune",
    )
training_file_id = training_response.id

# 5) Upload validation file
with open(validation_file_name, "rb") as vf:
    validation_response = client.files.create(
        file=vf,
        purpose="fine-tune",
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)


Training file ID: file-b6eb9f3ddcb6478a80322f20030a20ee
Validation file ID: file-0d8c82af0cb84e358bfbda28517f8211


In [6]:
# Submit fine-tuning training job



response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-4o-mini-2024-07-18",  # Azure fine-tunable model id
)

job_id = response.id

print("Job ID:", response.id)
print("Status:", response.status)
print(response.model_dump_json(indent=2))

Job ID: ftjob-6d59d77b6f8c433ea94efb87b3f54c20
Status: pending
{
  "id": "ftjob-6d59d77b6f8c433ea94efb87b3f54c20",
  "created_at": 1764907871,
  "error": null,
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "batch_size": -1,
    "learning_rate_multiplier": 1.0,
    "n_epochs": -1
  },
  "model": "gpt-4o-mini-2024-07-18",
  "object": "fine_tuning.job",
  "organization_id": null,
  "result_files": null,
  "seed": 1744545913,
  "status": "pending",
  "trained_tokens": null,
  "training_file": "file-b6eb9f3ddcb6478a80322f20030a20ee",
  "validation_file": "file-0d8c82af0cb84e358bfbda28517f8211",
  "estimated_finish": 1764910525,
  "integrations": null,
  "metadata": null,
  "method": null,
  "trainingType": "standard"
}


In [12]:
  response = client.fine_tuning.jobs.retrieve(job_id)


print("Job ID:", response.id)
print("Status:", response.status)

print(response)

Job ID: ftjob-6d59d77b6f8c433ea94efb87b3f54c20
Status: running
FineTuningJob(id='ftjob-6d59d77b6f8c433ea94efb87b3f54c20', created_at=1764907871, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=-1, learning_rate_multiplier=1.0, n_epochs=-1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id=None, result_files=None, seed=1744545913, status='running', trained_tokens=None, training_file='file-b6eb9f3ddcb6478a80322f20030a20ee', validation_file='file-0d8c82af0cb84e358bfbda28517f8211', estimated_finish=1764910525, integrations=None, metadata=None, method=None, trainingType='standard')


In [None]:
# Track training status

from IPython.display import clear_output
import time

start_time = time.time()

# Get the status of our fine-tuning job.
response = client.fine_tuning.jobs.retrieve(job_id)

status =response.status

# If the job isn't done yet, poll it every 10 seconds.
while status not in ["succeeded", "failed"]:
    time.sleep(10)

    response = client.fine_tuning.jobs.retrieve(job_id)
    print(response)
    print("Elapsed time: {} minutes {} seconds".format(int((time.time() - start_time) // 60), int((time.time() - start_time) % 60)))
    status = response.status
    print(f'Status: {status}')
    clear_output(wait=True)

print(f'Fine-tuning job {job_id} finished with status: {status}')

# List all fine-tuning jobs for this resource.
print('Checking other fine-tune jobs for this resource.')
response = client.fine_tuning.jobs.list()
print(f'Found {len(response.data)} fine-tune jobs.')

In [21]:
#Retrieve fine_tuned_model name

response = client.fine_tuning.jobs.retrieve(job_id)

print(response)
fine_tuned_model = response.fine_tuned_model

FineTuningJob(id='ftjob-6d59d77b6f8c433ea94efb87b3f54c20', created_at=1764907871, error=None, fine_tuned_model='gpt-4o-mini-2024-07-18.ft-6d59d77b6f8c433ea94efb87b3f54c20', finished_at=1764910748, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=1.0, n_epochs=10), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id=None, result_files=['file-bec71888dbb44dbb828b2f9d1dc4eef0'], seed=1744545913, status='succeeded', trained_tokens=6710, training_file='file-b6eb9f3ddcb6478a80322f20030a20ee', validation_file='file-0d8c82af0cb84e358bfbda28517f8211', estimated_finish=1764909507, integrations=None, metadata=None, method=None, trainingType='standard')


In [4]:
import json
import requests

token = "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6InJ0c0ZULWItN0x1WTdEVlllU05LY0lKN1ZuYyIsImtpZCI6InJ0c0ZULWItN0x1WTdEVlllU05LY0lKN1ZuYyJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuY29yZS53aW5kb3dzLm5ldC8iLCJpc3MiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC80Y2ZlMzcyYS0zN2E0LTQ0ZjgtOTFiMi01ZmFmMzQyNTNjNjIvIiwiaWF0IjoxNzY0OTA4MjkyLCJuYmYiOjE3NjQ5MDgyOTIsImV4cCI6MTc2NDkxMzg0OCwiYWNyIjoiMSIsImFjcnMiOlsicDEiXSwiYWlvIjoiQWVRQUcvOGFBQUFBam9lU3BoOXV1djFlcUdNN0FwbmJ2ajhvZjhpUDVsSnVyNVdMYXhDUE5adU9SOG56SENQemQySDJGSytYT1NEWkthbFR3d1dOMDhwR3preGo3Q0h4SzE1S2NSTWdFOVdxL201QkJMK2ZZd3hGZkhFMmhCVW9OQ0RvV0ZYREVnV21uSEdGNkNTQVp3TENOMUdwUG9sK09zejZucW1KTUFRcVR2MWRncVg5UUdBUnppbXEveFNnb1hBT2VZTnBMTFBCdWZ4aHZZR3lYNDUrRkZjUHgzMkFINFp4YThraWFXOUZ2d25vUjF2aGVEeklSNlRjb1dtVHk5OEhUVXFIODM4MjJ0ck1LRWo0RDVoSERHNW1Oby90ZWJBUEQzRVU5UFNidFIwZ1VtOXMvNVU9IiwiYW1yIjpbInRhcCIsIm1mYSJdLCJhcHBpZCI6ImI2NzdjMjkwLWNmNGItNGE4ZS1hNjBlLTkxYmE2NTBhNGFiZSIsImFwcGlkYWNyIjoiMCIsImdyb3VwcyI6WyI5Y2NjZTNkZC00OTRjLTRlYjItYjJjYS00N2JkYmIwZTk1ODQiXSwiaWR0eXAiOiJ1c2VyIiwiaXBhZGRyIjoiMTY4LjI0NS4yMDMuMjQ2IiwibmFtZSI6IkRlcHRoLTU3Mzk5ODA1Iiwib2lkIjoiZmFmMWU1YzUtYmZhMi00ZjQzLWI5ODYtM2RlNzBiYTZiZjNhIiwicHVpZCI6IjEwMDMyMDA1NjE4NTc5N0EiLCJyaCI6IjEuQVZBQUtqZi1US1EzLUVTUnNsLXZOQ1U4WWtaSWYza0F1dGRQdWtQYXdmajJNQk5RQUVaUUFBLiIsInNjcCI6InVzZXJfaW1wZXJzb25hdGlvbiIsInNpZCI6IjAwYjdlNjM5LTRmYzAtYTI0OS05OTIzLWY2ZDA1ZWVkNzBhNSIsInN1YiI6ImNSUVEwSEFDY2FhdUZNcGdKeHIxVld2aG1wNDJmd2lqRS1DQ3d4RG5vM2ciLCJ0aWQiOiI0Y2ZlMzcyYS0zN2E0LTQ0ZjgtOTFiMi01ZmFmMzQyNTNjNjIiLCJ1bmlxdWVfbmFtZSI6IkRlcHRoLTU3Mzk5ODA1QExPRFNQUk9ETUNBLm9ubWljcm9zb2Z0LmNvbSIsInVwbiI6IkRlcHRoLTU3Mzk5ODA1QExPRFNQUk9ETUNBLm9ubWljcm9zb2Z0LmNvbSIsInV0aSI6Ik9VZGN1OEVwNUVTeXlwWU9CR3dKQUEiLCJ2ZXIiOiIxLjAiLCJ3aWRzIjpbImI3OWZiZjRkLTNlZjktNDY4OS04MTQzLTc2YjE5NGU4NTUwOSJdLCJ4bXNfYWN0X2ZjdCI6IjMgNSIsInhtc19mdGQiOiJJZjZVTkhHaWlUYlRTNG1UNkVkck9VUV9ITnFZRFpUX1pqWTNSQW1zb0JBQmRYTjNaWE4wTXkxa2MyMXoiLCJ4bXNfaWRyZWwiOiIxIDE4IiwieG1zX3N1Yl9mY3QiOiIxNCAzIiwieG1zX3RjZHQiOjE2Mjc0NzkwODR9.eJ17XdXOAST6lF-7Jr8N8JXrC--sLiEuWge1SCNz9R5etbWmKBaMM0dPSCjNVJZWfEEd0b0kO0Xp8eNxKYWpFPK7QgdXlSIJz4L6z0PeXbQWUC_wTm2ptT0CLKGnRBCOMFTISANWaY4R0RmHKhiy_JaXbc6GrDr_jdpzTiA6Il8xtxAjBwH2Mrp9aoIZQa4sAGJVlY5x_DqAiZ4lvW3CDCnDz9s8g6bgaE1lsYBTLeIHE4ibGT1i7ejRkBDXnIaQdYA0nP7V84yJlBrnq6OG1Sh2zbmwgxVXOTKscBC39NlstWPQYFDegChTkJknuZhodA3Ms6wCce8cGRK8GSkzsg"
subscription = "2ec71f61-82f6-4857-a8ce-3dcf56061b9c"
resource_group = "ResourceGroup1"
resource_name = "AzureOpenAI-Finetune57399805"
model_deployment_name = "gpt-4o-mini-2024-07-18-ft" # Custom deployment name you chose for your fine-tuning model

deploy_params = {'api-version': "2023-05-01"}
deploy_headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json",
}

deploy_data = {
    "sku": {"name": "standard", "capacity": 1},
    "properties": {
        "model": {
            "format": "OpenAI",
            "name": "gpt-4o-mini-2024-07-18.ft-6d59d77b6f8c433ea94efb87b3f54c20", #retrieve this value from the previous call, it will look like gpt-4o-mini-2024-07-18.ft-0e208cf33a6a466994aff31a08aba678
            "version": "1"
        }
    }
}
deploy_data = json.dumps(deploy_data)

request_url = f'https://management.azure.com/subscriptions/{subscription}/resourceGroups/{resource_group}/providers/Microsoft.CognitiveServices/accounts/{resource_name}/deployments/{model_deployment_name}'

print('Creating a new deployment...')

r = requests.put(request_url, params=deploy_params, headers=deploy_headers, data=deploy_data)

print(r)
print(r.reason)
print(r.json())   

Creating a new deployment...
<Response [201]>
Created
{'id': '/subscriptions/2ec71f61-82f6-4857-a8ce-3dcf56061b9c/resourceGroups/ResourceGroup1/providers/Microsoft.CognitiveServices/accounts/AzureOpenAI-Finetune57399805/deployments/gpt-4o-mini-2024-07-18-ft', 'type': 'Microsoft.CognitiveServices/accounts/deployments', 'name': 'gpt-4o-mini-2024-07-18-ft', 'sku': {'name': 'standard', 'capacity': 1}, 'properties': {'model': {'format': 'OpenAI', 'name': 'gpt-4o-mini-2024-07-18.ft-6d59d77b6f8c433ea94efb87b3f54c20', 'version': '1'}, 'versionUpgradeOption': 'NoAutoUpgrade', 'capabilities': {'area': 'US', 'chatCompletion': 'true', 'jsonObjectResponse': 'true', 'maxContextToken': '128000', 'maxOutputToken': '16384', 'assistants': 'true', 'responses': 'true', 'agentsV2': 'true'}, 'provisioningState': 'Creating', 'rateLimits': [{'key': 'request', 'renewalPeriod': 10, 'count': 1}, {'key': 'token', 'renewalPeriod': 60, 'count': 1000}]}, 'systemData': {'createdBy': 'Depth-57399805@LODSPRODMCA.onmicr