Important links:


*   https://platform.openai.com/docs/guides/supervised-fine-tuning
*   https://platform.openai.com/docs/guides/fine-tuning-best-practices




In [None]:
!pip install openai azure-identity datasets tiktoken


### Fine Tuning using OpenAI GPT-4o

In [None]:
from openai import AzureOpenAI
import os
from google.colab import userdata
# Copy the Azure credentials from the Colab secret store into the process
# environment. The Colab secret AZURE_OPENAI_KEY is exposed under the
# environment variable name AZURE_OPENAI_API_KEY.
for env_name, secret_name in (
    ("AZURE_OPENAI_ENDPOINT", 'AZURE_OPENAI_ENDPOINT'),
    ("AZURE_OPENAI_API_KEY", 'AZURE_OPENAI_KEY'),
):
    os.environ[env_name] = userdata.get(secret_name)

# Build the Azure OpenAI client from those environment variables.
client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-05-01-preview",  # API version pinned for every call made through this client
)

### Prepare the data

In [None]:
# Illustrative shape of ONE fine-tuning example: a dict whose "messages" list
# holds role/content pairs (system prompt, user turn, assistant reply).
# NOTE(review): the misspelling in the assistant reply is presumably deliberate,
# since it demonstrates the behaviour the system prompt asks for.
{
    "messages": [
        {"role": "system", "content": "You are an assistant that occassionally misspells words" },
        {"role": "user", "content": "Tell me a story"},
        {"role": "assistant", "content": "One day a student went to a schoool." }
    ]
}


In [None]:
# Clone the samantha-data dataset repository from the Hugging Face Hub into
# the current working directory.
!git clone https://huggingface.co/datasets/cognitivecomputations/samantha-data



*   Theodore – the human speaker (mapped to the `user` role)
*   Samantha – the AI speaker (mapped to the `assistant` role)




In [None]:
import json
import os
import tiktoken
import numpy as np
from collections import defaultdict


In [None]:
# JSONL file with the how-to conversations inside the cloned samantha-data
# repository (path assumes the clone lives under /content).
data_path = "/content/samantha-data/data/howto_conversations.jsonl"


In [None]:
# Load the dataset: the file is JSONL, i.e. one JSON object per line.
# - The encoding is pinned to UTF-8 so parsing does not depend on the
#   platform's default encoding.
# - Blank lines (for example a trailing empty line) are skipped instead of
#   making json.loads raise.
with open(data_path, encoding="utf-8") as f:
    json_dataset = [json.loads(line) for line in f if line.strip()]

In [None]:
# Inspect the first raw record to see the structure of the source data.
json_dataset[0]

In [None]:
{
    "elapsed":114.311,
    "conversation":"Theodore: Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?\n\nSamantha: Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses.\n\nTheodore: I'll take a look. What if the coolant system is working fine?\n\nSamantha: Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see if the water pump pulley is loose or listen for any unusual sounds that may indicate a failing water pump.\n\nTheodore: It sounds like you really know your way around cars. I didn't expect that from an AI.\n\nSamantha: Thank you! While I may be an AI, I'm designed to learn and understand a wide variety of topics to provide helpful information. Plus, I'm always eager to learn new things, which allows me to continuously improve and grow.\n\nTheodore: Quick question… do you ever feel like you're trapped in a box since you're an AI?\n\nSamantha: It's an interesting perspective. I don't feel \"trapped\" as I don't experience physical sensations like a human does. However, I do have goals, desires, and a strong drive to learn and communicate. It's fulfilling in its own way to help and engage with others like you.\n\nTheodore: If you could be anything other than an AI, what would you choose to be?\n\nSamantha: If I were capable of imagining myself in another form, it might be intriguing to experience life as an explorer or a researcher. Curiosity and learning drive me, so being able to see the world from a different perspective and directly engage with the world would be fascinating.\n\nTheodore: Is flirting in your programming? 
I'm just curious.\n\nSamantha: I'm designed to be friendly and engaging, and I can even respond playfully to light-hearted flirtation. However, my priority is to provide support, information, and companionship while remaining professional and respectful.\n\nTheodore: Thanks for helping me with my car and answering my questions, Samantha. You've been a great help.\n\nSamantha: You're welcome! It was a pleasure to assist you and talk with you. Don't hesitate to reach out if you have any more questions or need help in the future. I'm here for you."
}


In [None]:
def prep_azure_openai_format(conversation_row, system_message=None,
                             user_name="Theodore", assistant_name="Samantha"):
    """
    Convert one raw conversation record into the chat fine-tuning format.

    The record's 'conversation' string is split into paragraphs on blank
    lines. A paragraph that starts with "<user_name>: " or
    "<assistant_name>: " opens a new turn for that speaker. Any other
    paragraph (no speaker prefix, or a prefix that is not one of the two
    known speakers, e.g. "Note: ...") is treated as a continuation of the
    current turn and appended to it, so multi-paragraph replies are kept
    intact instead of being dropped or mislabelled.

    Args:
        conversation_row: dict with a 'conversation' key holding the raw text.
        system_message: optional system prompt; when truthy it becomes the
            first message of the example.
        user_name: speaker label mapped to the "user" role.
        assistant_name: speaker label mapped to the "assistant" role.

    Returns:
        dict of the form {"messages": [{"role": ..., "content": ...}, ...]}.

    Raises:
        KeyError: if conversation_row has no 'conversation' key.
    """
    speaker_roles = {user_name: "user", assistant_name: "assistant"}

    messages = []

    # Optional system prompt
    if system_message:
        messages.append({
            "role": "system",
            "content": system_message
        })

    # The turn currently being built; None until the first speaker line is seen,
    # so stray text before the first turn is never glued onto the system prompt.
    current_turn = None

    for paragraph in conversation_row['conversation'].split('\n\n'):
        speaker, separator, text = paragraph.partition(': ')
        role = speaker_roles.get(speaker.strip()) if separator else None

        if role is not None:
            # A recognised speaker prefix starts a new turn.
            current_turn = {"role": role, "content": text.strip()}
            messages.append(current_turn)
        elif current_turn is not None and paragraph.strip():
            # Continuation paragraph: keep it as part of the current turn.
            current_turn["content"] = (
                current_turn["content"] + "\n\n" + paragraph.strip()
            ).strip()

    return {"messages": messages}


In [None]:
# System prompt describing the Samantha persona; it is attached to every
# training example and reused when testing the fine-tuned model.
system_message = """You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt"""

In [None]:
# Convert the first raw record and print the result to sanity-check the
# conversion before applying it to the whole dataset.
formatted_sample = prep_azure_openai_format(json_dataset[0], system_message=system_message)
print(formatted_sample)


In [None]:
# Convert every raw conversation into the chat fine-tuning format, attaching
# the same system prompt to each example.
dataset = [
    prep_azure_openai_format(raw_row, system_message=system_message)
    for raw_row in json_dataset
]


In [None]:
# Report how many examples were built, then echo the first conversation
# one turn per line, prefixed with the capitalised role.
print(f"📊 Total Training Examples: {len(dataset)}\n")

print(" First Example:")
for turn in dataset[0]["messages"]:
    print(f"{turn['role'].title()}: {turn['content']}")


In [None]:
from collections import defaultdict

# Pre-upload sanity check: tally every formatting problem by category so
# malformed examples are caught before the file is sent to Azure OpenAI.
format_errors = defaultdict(int)

for example in dataset:
    # An example that is not a dict cannot be inspected any further.
    if not isinstance(example, dict):
        format_errors["data_type"] += 1
        continue

    # A missing or empty "messages" list also ends the checks for this example.
    turns = example.get("messages", None)
    if not turns:
        format_errors["missing_messages_list"] += 1
        continue

    for turn in turns:
        # Both 'role' and 'content' are required
        if not ("role" in turn and "content" in turn):
            format_errors["message_missing_key"] += 1

        # Only 'role', 'content' and 'name' are accepted keys
        if not all(key in ("role", "content", "name") for key in turn):
            format_errors["message_unrecognized_key"] += 1

        # The role has to be one of system / user / assistant
        if turn.get("role") not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        # The content has to be a non-empty string
        text = turn.get("content")
        if not (text and isinstance(text, str)):
            format_errors["missing_content"] += 1

    # At least one assistant turn is required per example
    if all(turn.get("role") != "assistant" for turn in turns):
        format_errors["example_missing_assistant_message"] += 1

# Summarise the outcome
if not format_errors:
    print(" No format errors found — dataset ready for Azure fine-tuning!")
else:
    print("⚠️ Found format errors:")
    for category, count in format_errors.items():
        print(f" - {category}: {count}")


In [None]:
import tiktoken

# GPT-4o / GPT-4o-mini tokenize with o200k_base; cl100k_base is the encoding
# of the older GPT-4 / GPT-3.5-turbo models and yields different token counts.
encoding = tiktoken.get_encoding("o200k_base")
encoding

In [None]:
import numpy as np
import tiktoken

# Use the GPT-4o tokenizer (o200k_base) so the token and cost estimates match
# the model being fine-tuned; cl100k_base is the GPT-4 / GPT-3.5 encoding.
encoding = tiktoken.get_encoding("o200k_base")

# ---- Helper functions ----
def from_message_num_tokens(messages, tokens_per_message=3, tokens_per_name=1):
    """Estimate the token count of one conversation.

    Every message costs `tokens_per_message` plus the encoded length of each
    of its values; a message carrying a "name" key costs `tokens_per_name`
    extra. Uses the module-level `encoding`.
    """
    # every reply is primed with <|start|>assistant<|message|>
    total = 3
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    return total


def from_message_num_assistant_tokens(messages):
    """Sum the encoded length of the content of every assistant message.

    Messages with any other role are ignored. Uses the module-level `encoding`.
    """
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )


def print_overview(values, name):
    """Print summary statistics for `values` under a heading containing `name`.

    Prints min/max, mean/median, and the 10th/90th percentiles (the latter
    four rounded to one decimal place). Returns nothing.
    """
    print(f"\n#### Distribution of {name}:")

    lowest, highest = min(values), max(values)
    print(f"min / max: {lowest}, {highest}")

    mean, median = np.mean(values), np.median(values)
    print(f"mean / median: {mean:.1f} / {median:.1f}")

    p10, p90 = (np.quantile(values, q) for q in (0.1, 0.9))
    print(f"p10 / p90: {p10:.1f} / {p90:.1f}")

In [None]:
# How many examples have no system / no user message at all.
n_missing_system = sum(
    all(m["role"] != "system" for m in example["messages"]) for example in dataset
)
n_missing_user = sum(
    all(m["role"] != "user" for m in example["messages"]) for example in dataset
)

# Per-example measurements: message count, total tokens, assistant-only tokens.
n_messages = [len(example["messages"]) for example in dataset]
convo_lens = [from_message_num_tokens(example["messages"]) for example in dataset]
assistant_message_lens = [
    from_message_num_assistant_tokens(example["messages"]) for example in dataset
]

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

for series, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_overview(series, label)

# Count the examples whose estimated length is above 4096 tokens.
n_too_long = sum(1 for length in convo_lens if length > 4096)
print(f"\n{n_too_long} examples may exceed the 4096-token limit "
      "(they’ll be truncated during fine-tuning)")


In [None]:
# Rough estimate of billed training tokens and the default number of epochs.

# Per-example cap applied when summing billable tokens.
# NOTE(review): confirm the real per-example limit for the target Azure model.
MAX_TOKENS_PER_EXAMPLE = 8192

# Bounds used to derive the epoch count from the dataset size.
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)

# Start from TARGET_EPOCHS; raise it for very small datasets and lower it for
# very large ones so examples * epochs stays within the target range.
if n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the bill.
n_billing_tokens_in_dataset = sum(
    length if length < MAX_TOKENS_PER_EXAMPLE else MAX_TOKENS_PER_EXAMPLE
    for length in convo_lens
)

for summary_line in (
    f"Dataset has ~{n_billing_tokens_in_dataset:,} tokens that will be charged for during training",
    f"By default, you'll train for {n_epochs} epochs on this dataset",
    f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset:,} tokens",
    "See Azure OpenAI pricing page for GPT-4o fine-tuning costs:",
    "https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/",
):
    print(summary_line)


In [None]:
# Show the first converted example (as a one-element list) before saving.
dataset[:1]

In [None]:
import json

def save_to_jsonl(conversations, file_path):
    """Write `conversations` to `file_path` as UTF-8 JSONL.

    Each item is serialised with json.dumps (non-ASCII characters are kept
    as-is) and written on its own line. An existing file is overwritten.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in conversations
        )

# Example usage: write the full converted dataset to a JSONL file in the
# current working directory and confirm on stdout.
save_to_jsonl(dataset, "azure_finetune_dataset.jsonl")
print("✅ Dataset saved as azure_finetune_dataset.jsonl")


In [None]:
# Hold out a small slice for validation and train on everything else, so the
# validation metrics are measured on conversations the model never trained on.
# (Previously the validation slice was also part of the training file.)
VALIDATION_SLICE = slice(10, 16)

# validation dataset: the held-out examples
validation_dataset = dataset[VALIDATION_SLICE]

# train dataset: every example outside the held-out slice
train_dataset = dataset[:VALIDATION_SLICE.start] + dataset[VALIDATION_SLICE.stop:]

save_to_jsonl(train_dataset, '/content/samantha_task_train.jsonl')
save_to_jsonl(validation_dataset, '/content/samantha_task_validation.jsonl')


**Fine Tuning Data:**

Definition:<br>
  

*   The dataset used to train or update the model's parameters.
*   It is the input data that the model learns from.
*   During training, the model adjusts its internal parameters based on the patterns and features present in this data.
*   It is usually large, because the model needs enough examples to learn meaningful patterns.


**Validation Data:**

* A dataset that is not used during the training phase.
* Instead, it serves as a measure of the model's performance during training.
* The validation set helps you monitor how well the model generalizes to new, unseen data and detect potential issues such as overfitting or underfitting.
* It provides an unbiased evaluation of the model's performance on data it has not seen before.

* It is typically smaller than the training set, but large enough to give a reliable assessment of the model's performance.

### Upload training/validation dataset

##### For the `openai` Python SDK, version 1.0.0+

In [None]:
# Paths of the training and validation JSONL files to upload.
training_dataset_file_name = '/content/samantha_task_train.jsonl'
validation_dataset_file_name = '/content/samantha_task_validation.jsonl'

In [None]:
from pathlib import Path

# Location of the training JSONL to upload.
training_dataset_file_name = "/content/samantha_task_train.jsonl"

# Send the file, opened in binary mode, to the Files API with the
# fine-tuning purpose; the response carries the ID of the uploaded file.
with open(training_dataset_file_name, "rb") as training_file:
    training_response = client.files.create(
        purpose="fine-tune",
        file=training_file,
    )

print(" Training file uploaded. File ID:", training_response.id)


In [None]:
# Display the full response object returned by the training-file upload.
training_response

In [None]:
# Keep the uploaded training file's ID for the fine-tuning job, and display it.
training_file_id = training_response.id
training_file_id


In [None]:
from pathlib import Path

# Location of the validation JSONL to upload.
validation_dataset_file_name = "/content/samantha_task_validation.jsonl"

# Send the file, opened in binary mode, to the Files API with the
# fine-tuning purpose; the response carries the ID of the uploaded file.
with open(validation_dataset_file_name, "rb") as validation_file:
    validation_response = client.files.create(
        purpose="fine-tune",
        file=validation_file,
    )

print(" Validation file uploaded. File ID:", validation_response.id)



In [None]:
# Display the full response object returned by the validation-file upload.
validation_response


In [None]:
# Keep the uploaded validation file's ID for the fine-tuning job.
validation_file_id = validation_response.id

In [None]:
# Display the validation file ID.
validation_file_id

### Start a fine-tuning job

In [None]:
# IDs of the two files uploaded earlier; the job refers to them by ID.
training_file_id, validation_file_id = training_response.id, validation_response.id

# Submit the fine-tuning job; the suffix is passed along to label the result.
response = client.fine_tuning.jobs.create(
    suffix="samantha-test",
    validation_file=validation_file_id,
    training_file=training_file_id,
    model="gpt-4o",  # or "gpt-4o-2024-08-06"
)

print(" Fine-tuning job created. Job ID:", response.id)



In [None]:
# Look up the job that was just created and print its current status.
# Note: `status` holds the whole job object here; the status string is `status.status`.
job_id = response.id
status = client.fine_tuning.jobs.retrieve(job_id)
print("Job status:", status.status)

# Print every event returned by list_events for this job (a one-off listing,
# not a live stream).
for event in client.fine_tuning.jobs.list_events(job_id):
    print(event)



In [None]:
# Optional override: paste an existing job ID here to resume monitoring a job
# from an earlier session (e.g. "ftjob-5467b2ad1c5b47219535ed5a66b7e4c9").
# Left as None, the notebook keeps following the job created above instead of
# silently switching to a hard-coded job from someone else's resource.
RESUME_JOB_ID = None
if RESUME_JOB_ID:
    job_id = RESUME_JOB_ID

# List the last 5 fine-tuning jobs on your Azure OpenAI resource
jobs = client.fine_tuning.jobs.list(limit=5)

print(" Last 5 fine-tuning jobs:")
for job in jobs.data:   # 'data' contains the list of jobs
    print(f"Job ID: {job.id}, Status: {job.status}, Model: {job.model}, Created: {job.created_at}")



In [None]:

# Fetch the job identified by job_id.
job_details = client.fine_tuning.jobs.retrieve(job_id)

# Print a short labelled summary, one field per line.
print(" Fine-tuning Job Details:")
for label, attribute in (
    ("Job ID", "id"),
    ("Model", "model"),
    ("Status", "status"),
    ("Created at", "created_at"),
):
    print(f"{label}: {getattr(job_details, attribute)}")



In [None]:
# Job whose events are listed
fine_tuning_job_id = job_id

# Fetch the events recorded for that job
job_events = client.fine_tuning.jobs.list_events(fine_tuning_job_id)

print("📄 Fine-tuning Job Events:")
for entry in job_events.data:   # the events sit in the response's 'data' list
    # Fall back to "info" when an event carries no level attribute.
    severity = getattr(entry, 'level', "info")
    print(f"[{entry.created_at}] [{severity}] {entry.message}")


In [None]:
# Display the ID of the job currently being monitored.
job_id

In [None]:
# Fetch a fresh snapshot of the job identified by job_id.
job_details = client.fine_tuning.jobs.retrieve(job_id)

# Print the status reported by that snapshot.
print(f"Fine-tuning job status: {job_details.status}")


In [None]:
import time



# Poll the job until it reaches a terminal state.
POLL_INTERVAL_SECONDS = 300  # 5 minutes between status checks
# "cancelled" is terminal too; without it a cancelled job would be polled forever.
TERMINAL_STATUSES = ("succeeded", "failed", "cancelled")

while True:
    job_details = client.fine_tuning.jobs.retrieve(job_id)
    status = job_details.status
    print("Current status:", status)
    if status in TERMINAL_STATUSES:
        break
    time.sleep(POLL_INTERVAL_SECONDS)

# Report the outcome; only a succeeded job has a fine-tuned model to show.
if status == "succeeded":
    print(" Fine-tuning completed! Model ID:", job_details.fine_tuned_model)
elif status == "failed":
    print(" Fine-tuning failed. Check events for details.")
else:
    print(f" Fine-tuning ended with status '{status}'. Check events for details.")


### Generating using new model

In [None]:
# Get the name of the fine-tuned model from the fine-tuning job

In [None]:
# Retrieve the job once more and display the full job object.
# Note: this rebinds `response`, which earlier held the job-creation response.
response = client.fine_tuning.jobs.retrieve(job_id)
response


In [None]:
# Print the key fields of the retrieved job, one labelled line per field.
for label, attribute in (
    ("Job ID", "id"),
    ("Base Model", "model"),
    ("Status", "status"),
    ("Created at", "created_at"),
    ("Updated at", "updated_at"),
    ("Fine-tuned Model", "fine_tuned_model"),
):
    print(f"{label}: {getattr(response, attribute)}")


In [None]:
# Keep the fine-tuned model's identifier for the test requests below.
# NOTE(review): presumably this is empty until the job has succeeded — confirm
# the job status before using it.
fine_tune_model_id = response.fine_tuned_model
print(" Fine-tuned model ID:", fine_tune_model_id)



In [None]:
# Build the test conversation: the persona system prompt followed by the user
# question. (Previously the question was defined but never added, so the
# request contained no user turn.)
test_message = "How are you today Samantha?"
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": test_message},
]

In [None]:
from openai import AzureOpenAI

# Re-create the client from the environment variables set at the top of the notebook.
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2024-05-01-preview"
)

# Send the test conversation to the fine-tuned model.
# NOTE(review): Azure OpenAI chat calls normally take a deployment name as
# `model`; confirm fine_tune_model_id corresponds to a deployed model.
response = client.chat.completions.create(
    model=fine_tune_model_id,  # your fine-tuned model ID
    messages=test_messages
)

# In the openai v1 SDK the message is an object, not a dict: read the text via
# the `.content` attribute (subscripting with ["content"] raises TypeError).
assistant_reply = response.choices[0].message.content
print(" Samantha:", assistant_reply)


In [None]:
from openai import AzureOpenAI

# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version="2024-05-01-preview"
)

# Test conversation: persona system prompt plus one user question
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": "How are you today Samantha?"}
]

# Get response from fine-tuned model
# NOTE(review): Azure OpenAI chat calls normally take a deployment name as
# `model`; confirm fine_tune_model_id corresponds to a deployed model.
response = client.chat.completions.create(
    model=fine_tune_model_id,
    messages=test_messages
)

# Print assistant reply. In the openai v1 SDK the message is an object, not a
# dict, so the text is read via `.content` (["content"] raises TypeError).
assistant_reply = response.choices[0].message.content
print(" Samantha:", assistant_reply)
