In [None]:
!pip install matplotlib tiktoken openai numpy wandb

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from openai import OpenAI


In [None]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [None]:
import json
from collections import defaultdict
from tiktoken import get_encoding

def validate_and_estimate_finetuning_data(file_path):
    """Validate a chat fine-tuning JSONL file and count the tokens in it.

    Every non-blank line of ``file_path`` is parsed as one JSON example that is
    expected to look like ``{"messages": [{"role": ..., "content": ...}, ...]}``.
    Structural problems are tallied by name instead of raising, so one pass
    reports everything that is wrong with the file.

    Token counts cover only each message's ``content`` text encoded with
    ``cl100k_base``; role names and any per-message formatting overhead are
    not included.

    Args:
        file_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A dict with three keys:
          * ``"format_errors"``: mapping of error name -> number of occurrences.
          * ``"token_counts"``: one integer per example that had a usable
            ``messages`` list, in file order.
          * ``"total_tokens"``: sum of ``token_counts``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    format_errors = defaultdict(int)
    token_counts = []
    total_tokens = 0
    encoding = get_encoding("cl100k_base")  # For OpenAI models

    # Load the dataset. Blank lines (typically a trailing newline at the end of
    # the file) are skipped rather than handed to json.loads, which would raise.
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    for ex in dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        # A missing, empty, or non-list value cannot be iterated as messages.
        if not messages or not isinstance(messages, list):
            format_errors["missing_messages_list"] += 1
            continue

        conversation_tokens = 0
        assistant_message_found = False

        for message in messages:
            # Non-dict entries cannot carry the required keys; count them the
            # same way instead of crashing on the membership/.get calls below.
            if (
                not isinstance(message, dict)
                or "role" not in message
                or "content" not in message
            ):
                format_errors["message_missing_key"] += 1
                continue

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

            # Count tokens for each message. Content that cannot be encoded
            # (for example a non-string value) is tallied, not raised.
            try:
                conversation_tokens += len(encoding.encode(content))
            except Exception:
                format_errors["tokenization_error"] += 1

            if message.get("role") == "assistant":
                assistant_message_found = True

        if not assistant_message_found:
            format_errors["example_missing_assistant_message"] += 1

        token_counts.append(conversation_tokens)
        total_tokens += conversation_tokens

    return {
        "format_errors": dict(format_errors),
        "token_counts": token_counts,
        "total_tokens": total_tokens,
    }



In [None]:
# JSONL datasets on the mounted Google Drive: the file used for training and the
# file used for validation. Note the differing capitalisation of the two names.
training_File_Path = "/content/drive/MyDrive/Finetuned_Physics_Sarcasticbot/physics_train.jsonl"
validation_File_Path = "/content/drive/MyDrive/Finetuned_Physics_Sarcasticbot/Physics_test.jsonl"

In [None]:
# Run the validator over both datasets and print the same three-line report for
# each. After the loop `result` holds the report for the validation file.
for report_heading, dataset_path in (
    ("Training Data", training_File_Path),
    ("\n\nTest Data", validation_File_Path),
):
    result = validate_and_estimate_finetuning_data(dataset_path)

    print(report_heading)
    print("Format Errors:", result["format_errors"])
    print("Token Counts per Conversation:", result["token_counts"])
    print("Total Tokens:", result["total_tokens"])

Training Data
Format Errors: {}
Token Counts per Conversation: [64, 62, 65, 65, 61, 82, 61, 60, 67, 71, 63, 57, 71, 65, 71, 57, 71, 68, 63, 61, 64, 64, 64, 72, 61, 60, 64, 68, 64, 61, 66, 59, 69, 71, 63, 67, 70, 60, 67, 58, 58, 60, 67, 72, 61, 57, 65, 57, 61, 65, 65, 70, 58, 65, 65, 58, 68, 64, 65, 72, 76, 60, 62, 68, 60, 59, 65, 70, 67, 58, 64, 62, 65, 56, 68, 62, 54, 65, 70, 61, 71, 70, 57, 62, 61, 71, 53, 62, 55, 64, 66, 58, 69, 68, 66, 65, 56, 60, 71, 63]
Total Tokens: 6410


Test Data
Format Errors: {}
Token Counts per Conversation: [65, 72, 62, 76, 68, 63, 73, 59, 72, 70, 64, 72, 70, 78, 59, 67, 74, 67, 66, 65, 64, 70, 75, 56, 72]
Total Tokens: 1699


In [None]:
import wandb
from google.colab import userdata
# Read the OpenAI API key from Colab's userdata store rather than hardcoding it.
api_key = userdata.get("OPENAI_API_KEY")

# Log in to Weights & Biases; prompts for an API key interactively if needed.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaimanojbera[0m ([33mberasaimanoj[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Create the OpenAI client used by every API call below, authenticated with
# the `api_key` value read in the previous cell.
client = OpenAI(api_key=api_key)

# Function to check if a file already exists on OpenAI
def get_existing_file_id(filename):
    """Return the ID of an uploaded OpenAI file matching ``filename``.

    The file listing reports uploads by base name only (for example
    ``physics_train.jsonl``), so a local path such as
    ``/content/drive/.../physics_train.jsonl`` is reduced to its base name
    before comparing. An exact match on the value as given is accepted too.

    Args:
        filename: File name or local path of the file to look for.

    Returns:
        The ``id`` of the first matching file, or ``None`` if none matches.
    """
    import os  # local import keeps this helper self-contained in its cell

    candidates = (filename, os.path.basename(filename))
    # Iterate the list result itself rather than `.data`, so that files beyond
    # the first page of results are also inspected.
    for file in client.files.list():
        if file.filename in candidates:
            return file.id  # Return the existing file ID
    return None  # File does not exist

# Function to delete a file by ID
def delete_file(file_id):
    """Delete the uploaded file ``file_id`` via ``client.files.delete``.

    Returns the ``deleted`` attribute of the API response.
    """
    return client.files.delete(file_id).deleted

import os  # os.path.basename is used for the existing-file lookups below

# The file listing reports uploads by base name only (e.g. "physics_train.jsonl"),
# so look up earlier uploads by base name, not by the full Drive path; the full
# path never matches and stale copies would pile up.

# Check and delete training file
file_name = os.path.basename(training_File_Path)
training_file_id = get_existing_file_id(file_name)
if training_file_id:
    print(f"Deleting existing training file: {training_File_Path}")
    delete_file(training_file_id)

# Check and delete validation file
file_name = os.path.basename(validation_File_Path)
validation_file_id = get_existing_file_id(file_name)
if validation_file_id:
    print(f"Deleting existing validation file: {validation_File_Path}")
    delete_file(validation_file_id)

# Upload the training file; the `with` block closes the handle after the upload.
with open(training_File_Path, "rb") as training_handle:
    training = client.files.create(
        file=training_handle,
        purpose="fine-tune"
    )
print(f"Training file uploaded: {training.id}")

# Upload the validation file
with open(validation_File_Path, "rb") as validation_handle:
    validation = client.files.create(
        file=validation_handle,
        purpose="fine-tune"
    )
print(f"Validation file uploaded: {validation.id}")

Training file uploaded: file-Y95HvxsPAzdS96xyJgxh2c
Validation file uploaded: file-AsYMPuvcgDtor4cMrsX9Bn


In [None]:
## List the uploaded files so a file ID can be chosen for fine-tuning.
## Only the `.data` attribute of the list result is printed.
files = client.files.list()
print(files.data)

[FileObject(id='file-AsYMPuvcgDtor4cMrsX9Bn', bytes=11904, created_at=1739675699, filename='Physics_test.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-Y95HvxsPAzdS96xyJgxh2c', bytes=44045, created_at=1739675699, filename='physics_train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-SNZ5ivgeGU6PHea6YVae99', bytes=884, created_at=1739517027, filename='step_metrics.csv', object='file', purpose='fine-tune-results', status='processed', status_details=None), FileObject(id='file-FLN6xi2T536n5U8t28s5zM', bytes=9659392, created_at=1739515980, filename='val.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-FLVeKPvFraYfFMJBFX9Lr6', bytes=18887348, created_at=1739515979, filename='train.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None), FileObject(id='file-9pX8RLawjekzKDVoVPsDmP', bytes=9651

In [None]:
## Start a supervised fine-tuning job. The training/validation file IDs come from
## the upload cell (`training.id`, `validation.id`); change the model or the
## hyperparameters here to tune the run.
job = client.fine_tuning.jobs.create(
    training_file= training.id,
    validation_file=validation.id,
    model = "gpt-3.5-turbo-0125",
    method={
        "type": "supervised",
        "supervised": {
            "hyperparameters": {
                "n_epochs": 5,  # Number of epochs
                "batch_size": 8,  # Batch size
                "learning_rate_multiplier": 0.7,  # Learning rate scaling factor
            }
        }
    },
    # Send the job's metrics to Weights & Biases.
    # NOTE(review): the project name is spelled "Sacastic"; left unchanged because
    # editing the string would point the job at a differently named project.
    integrations= [
        {
            "type": "wandb",
            "wandb": {
                "project": "Finetuned_Physics_Sacastic_bot",
                "tags": ["Physics", "Sarcastic", "finetuning"]
            }
        }
    ]
)
print(job)

FineTuningJob(id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j', created_at=1739676182, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-QL5uq9QCQwO0sZxVglpgmpWc', result_files=[], seed=339231874, status='validating_files', trained_tokens=None, training_file='file-Y95HvxsPAzdS96xyJgxh2c', validation_file='file-AsYMPuvcgDtor4cMrsX9Bn', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='Finetuned_Physics_Sacastic_bot', entity=None, name=None, tags=None, run_id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5)), type='supervised'), user_provided_suffix=None)


In [None]:
## List up to 10 fine-tuning jobs. Later cells use all_jobs[0] as the most
## recent job. NOTE(review): assumes the API returns newest first - confirm.
all_jobs = client.fine_tuning.jobs.list(limit=10).data
print(all_jobs)

[FineTuningJob(id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j', created_at=1739676182, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-QL5uq9QCQwO0sZxVglpgmpWc', result_files=[], seed=339231874, status='running', trained_tokens=None, training_file='file-Y95HvxsPAzdS96xyJgxh2c', validation_file='file-AsYMPuvcgDtor4cMrsX9Bn', estimated_finish=1739676674, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='Finetuned_Physics_Sacastic_bot', entity=None, name=None, tags=None, run_id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob

In [None]:
## Print the first listed job twice: the entry from the earlier listing, then a
## fresh retrieve() of the same job ID, to read the fine-tuned model name
## (`fine_tuned_model`).
print(all_jobs[0])
print(client.fine_tuning.jobs.retrieve(all_jobs[0].id))

FineTuningJob(id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j', created_at=1739676182, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-QL5uq9QCQwO0sZxVglpgmpWc', result_files=[], seed=339231874, status='running', trained_tokens=None, training_file='file-Y95HvxsPAzdS96xyJgxh2c', validation_file='file-AsYMPuvcgDtor4cMrsX9Bn', estimated_finish=1739676674, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='Finetuned_Physics_Sacastic_bot', entity=None, name=None, tags=None, run_id='ftjob-GzVu6JtwwW2SEONO9vhnDa2j'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=8, learning_rate_multiplier=0.7, n_epochs=5)), type='supervised'), user_provided_suffix=None)
FineTuningJob(id='ftjob-G

In [None]:
import time
import requests
checkpoints = None

# Function to get the latest accuracy and loss from checkpoints
def get_latest_accuracy(job_id, api_key):
    """Fetch validation accuracy and loss from a job's most recent checkpoint.

    Calls the fine-tuning checkpoints endpoint for ``job_id`` and picks the
    checkpoint with the highest ``step_number``.

    Args:
        job_id: ID of the fine-tuning job.
        api_key: OpenAI API key, sent as a Bearer token.

    Returns:
        A ``(accuracy, loss)`` tuple read from the checkpoint's
        ``full_valid_mean_token_accuracy`` and ``full_valid_loss`` metrics.
        Either element is ``None`` when that metric is absent, and the result
        is ``(None, None)`` when the job has no checkpoints yet.

    Raises:
        requests.HTTPError: If the API answers with an error status, so an
            authentication or request failure is not mistaken for
            "no checkpoints".
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
    headers = {"Authorization": f"Bearer {api_key}"}

    # A timeout keeps a stalled connection from hanging the polling loop.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    checkpoints = response.json().get("data", [])

    if not checkpoints:
        return None, None  # Return None if no checkpoints are available

    # Find the latest checkpoint based on step_number
    latest_checkpoint = max(checkpoints, key=lambda c: c["step_number"])
    # Use .get so a checkpoint without validation metrics yields None values
    # instead of raising KeyError.
    metrics = latest_checkpoint.get("metrics") or {}
    latest_accuracy = metrics.get("full_valid_mean_token_accuracy")
    latest_loss = metrics.get("full_valid_loss")
    return latest_accuracy, latest_loss

# Function to monitor fine-tuning job and print training/validation metrics
def monitor_finetuning_progress(job_id, api_key, check_interval=10):
    """Poll a fine-tuning job until it reaches a terminal status.

    On every iteration the job is retrieved through ``client``, its ID and
    status are printed, and, while it is still in progress, the latest
    checkpoint accuracy/loss are printed. Any exception raised during an
    iteration is printed and the iteration is retried after
    ``check_interval`` seconds.

    Args:
        job_id: ID of the fine-tuning job to watch.
        api_key: OpenAI API key, forwarded to ``get_latest_accuracy``.
        check_interval: Seconds to sleep between polls.

    Returns:
        A ``(job, model_id, result_file_id)`` tuple: the retrieved job object,
        its ``fine_tuned_model``, and the first entry of ``result_files``, or
        ``None`` when the job produced no result files.
    """
    # "cancelled" is terminal as well; without it a cancelled job would be
    # polled forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")

    while True:
        try:
            # Retrieve the fine-tuning job status
            job_status = client.fine_tuning.jobs.retrieve(job_id)

            # Print basic job details
            print(f"Job ID: {job_status.id}")
            print(f"Status: {job_status.status}")

            # Check if the job has completed
            if job_status.status in terminal_statuses:
                print(f"Fine-tuning job {job_status.status}.")
                model_id = job_status.fine_tuned_model
                # An unsuccessful job can finish with no result files. Indexing
                # an empty list here would raise inside this try block and send
                # the loop into endless retries.
                result_files = job_status.result_files or []
                result_file_id = result_files[0] if result_files else None
                return job_status, model_id, result_file_id

            # Retrieve and print the latest accuracy and loss
            latest_accuracy, latest_loss = get_latest_accuracy(job_id, api_key)
            if latest_accuracy is not None and latest_loss is not None:
                print(f"Latest Accuracy: {latest_accuracy:.3f}")
                print(f"Latest Loss: {latest_loss:.3f}")
            else:
                print("No checkpoints available yet.")

            # Wait before the next check
            print(f"Checking again in {check_interval} seconds...\n")
            time.sleep(check_interval)

        except Exception as e:
            # Best-effort polling: report the problem and try again.
            print(f"An error occurred: {e}. Retrying in {check_interval} seconds...\n")
            time.sleep(check_interval)


# Monitor the first job from the earlier listing; assign a different ID to
# `fine_tuning_job_id` to watch another job. Polls every 10 seconds.
# Note: `status` receives the full job object returned by the monitor function,
# so the "Status:" line prints the whole object, not just the status string.
fine_tuning_job_id = all_jobs[0].id
status, model_name, result_file_id = monitor_finetuning_progress(fine_tuning_job_id, api_key, 10)
print(f"Status: {status}")
print(f"Model Name: {model_name}")
print(f"Result file id: {result_file_id}")

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-GzVu6JtwwW2SEONO9vhnDa2j
Status: running
N