# Fine-tuning GPT-4o-mini on GSM8K

This notebook demonstrates how to fine-tune OpenAI's GPT-4o-mini model on the GSM8K math reasoning dataset.

## Setup


In [26]:
! pip install -q openai datasets python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [16]:
import re
import os
import json
import time
from tqdm import tqdm
from openai import OpenAI, RateLimitError
from datasets import load_dataset
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Locations of the pre-split JSONL files, relative to this notebook.
DATA_DIR = "../../data"
train_data_path = os.path.join(DATA_DIR, "train_data.jsonl")
val_data_path = os.path.join(DATA_DIR, "val_data.jsonl")
test_data_path = os.path.join(DATA_DIR, "test_data.jsonl")

# Pull variables from a local .env file into the process environment,
# then build the API client from the key found there.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)


## Load Dataset

Load the pre-split GSM8K JSONL files: training (7,073 samples) and validation (400 samples) sets. A test split is loaded as well but is not used in this notebook.


In [17]:
# Map each split name to its JSONL file and let `datasets` parse all three at once.
split_files = {
    "train": train_data_path,
    "val": val_data_path,
    "test": test_data_path,
}
dataset = load_dataset("json", data_files=split_files)

# Only the train and validation splits are bound to their own names here.
train_data, validation_data = dataset["train"], dataset["val"]


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

## Evaluate Base Model

Test GPT-4o-mini's baseline accuracy on the validation set (no fine-tuning).


In [34]:
def extract_number(text):
    """Return the last number appearing in ``text`` as a float.

    Recognises optionally signed integers and decimals. Digit groups joined
    by thousands separators ("1,234" or "12,345.50") are read as a single
    number instead of being split at the comma.

    Args:
        text: String to scan. ``None`` or an empty string is accepted.

    Returns:
        The last number found, as a ``float``, or ``None`` when ``text`` is
        empty/``None`` or contains no digits.
    """
    if not text:
        return None
    # First alternative: 1-3 digits followed by one or more ",ddd" groups
    # (the lookahead rejects "1,2345"). Second alternative: a plain digit run.
    matches = re.findall(r"-?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?", text)
    if not matches:
        return None
    return float(matches[-1].replace(",", ""))

def process_sample(sample, retries=5):
    """Score one sample against the base ``gpt-4o-mini`` model.

    Sends the question to the chat completions API, extracts the last number
    from the reply and compares it with the last number in the reference
    answer. Rate-limit errors are retried with exponential backoff.

    Args:
        sample: Mapping with ``"question"`` and ``"answer"`` entries.
        retries: Maximum number of API attempts.

    Returns:
        ``True`` when the reply contains a number equal to the reference
        number. ``False`` when the numbers differ, when the reply contains no
        number, when every attempt was rate limited, or when any other error
        occurred (the error is printed, not raised).
    """
    q = sample["question"]
    gt = extract_number(sample["answer"])

    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": f"Question: {q}\nAnswer:"}],
                temperature=0
            )
            pred = completion.choices[0].message.content
            pred_num = extract_number(pred)
            # extract_number yields None when no number is found; without this
            # guard an unparseable reply would match an unparseable reference.
            return pred_num is not None and pred_num == gt

        except RateLimitError:
            if attempt == retries - 1:
                break  # out of attempts: sleeping again would only waste time
            sleep_time = 2 ** attempt + 0.5  # exponential backoff
            time.sleep(sleep_time)

        except Exception as e:
            # catch-all for unexpected errors: report and count the sample as wrong
            print(f"Error processing sample: {e}")
            return False

    return False  # if all retries failed


# Score the validation set concurrently; each worker handles its own rate-limit retries.
correct = 0
results = []

with ThreadPoolExecutor(max_workers=10) as executor:
    # Remember which sample each future belongs to.
    futures = {}
    for sample in validation_data:
        futures[executor.submit(process_sample, sample)] = sample

    progress = tqdm(as_completed(futures), total=len(futures))
    for future in progress:
        try:
            outcome = future.result()
        except Exception as e:
            # A worker that raised is counted as an incorrect answer.
            print(f"Worker failed: {e}")
            outcome = False
        results.append(outcome)

# Each True counts as 1, so the sum is the number of correct answers.
correct = sum(results)
base_accuracy = correct / len(validation_data)
print(f"\nBase GPT-4o-mini accuracy on validation set: {base_accuracy:.2%}")

100%|██████████| 400/400 [03:30<00:00,  1.90it/s]


Base GPT-4o-mini accuracy on validation set: 86.00%





## Convert to OpenAI Format

Convert the dataset to JSONL format with the required `messages` structure.


In [26]:
# Convert training data to JSONL
with open("gsm8k_train.jsonl", "w") as f:
    for sample in train_data:
        record = {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that solves math reasoning problems."},
                {"role": "user", "content": sample["question"]},
                {"role": "assistant", "content": sample["answer"]}
            ]
        }
        f.write(json.dumps(record) + "\n")

# Convert validation data to JSONL
with open("gsm8k_val.jsonl", "w") as f:
    for sample in validation_data:
        record = {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that solves math reasoning problems."},
                {"role": "user", "content": sample["question"]},
                {"role": "assistant", "content": sample["answer"]}
            ]
        }
        f.write(json.dumps(record) + "\n")

print("Training and validation files created successfully")

Training and validation files created successfully


## Upload Data to OpenAI

In [None]:
# Upload training file
print("Uploading training file...")
train_file = client.files.create(
    file=open("gsm8k_train.jsonl", "rb"),
    purpose="fine-tune"
)
print(f"Training file ID: {train_file.id}")

# Upload validation file
print("Uploading validation file...")
val_file = client.files.create(
    file=open("gsm8k_val.jsonl", "rb"),
    purpose="fine-tune"
)
print(f"Validation file ID: {val_file.id}")

Uploading training file...
Training file ID: file-Du4Z1kpVTrZBRUJTKuCxWb
Uploading validation file...
Validation file ID: file-TB5KkLH9wMVSfic6rEn59u


## Create Fine-tuning Job

Start several fine-tuning jobs, each running for 1 epoch, that differ only in the learning-rate multiplier (1, 0.5, 0.1 and 0.01).


In [24]:
# Create fine-tuning job
print("\nCreating fine-tuning job...")
job1 = client.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=val_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={
        "n_epochs": 1,
        "learning_rate_multiplier": 1,
        "batch_size": 8,

    },
    suffix="gsm8k-math"
)

print(f"Fine-tuning job created: {job1.id}")
print(f"Status: {job1.status}")


job2 = client.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=val_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={
        "n_epochs": 1,
        "learning_rate_multiplier": 0.5,
        "batch_size": 8,

    },
    suffix="gsm8k-math"
)

print(f"Fine-tuning job created: {job2.id}")
print(f"Status: {job2.status}")


Creating fine-tuning job...
Fine-tuning job created: ftjob-3MUiWEkMxghpsYlfpXpvnFSj
Status: validating_files
Fine-tuning job created: ftjob-JvsiiLXWnyBdtDBA9xo7QCFy
Status: validating_files


In [29]:
# Same files, base model and suffix as the earlier jobs; learning-rate multiplier lowered to 0.1.
job3_hyperparameters = {
    "n_epochs": 1,
    "learning_rate_multiplier": 0.1,
    "batch_size": 8,
}
job3 = client.fine_tuning.jobs.create(
    model="gpt-4o-mini-2024-07-18",
    training_file=train_file.id,
    validation_file=val_file.id,
    hyperparameters=job3_hyperparameters,
    suffix="gsm8k-math",
)

In [31]:
# Learning-rate multiplier 0.01. Bound to `job4` rather than `job3`, because
# rebinding `job3` would drop the handle to the multiplier-0.1 job created in
# the previous cell.
job4 = client.fine_tuning.jobs.create(
    training_file=train_file.id,
    validation_file=val_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={
        "n_epochs": 1,
        "learning_rate_multiplier": 0.01,
        "batch_size": 8,

    },
    suffix="gsm8k-math"
)

## Evaluate Fine-tuned Model

Test the fine-tuned model on the validation set and compare to baseline.


In [12]:
def evaluate_model(model_id, retries=5):
    """Count how many validation samples ``model_id`` answers correctly.

    Every sample in ``validation_data`` is sent to the chat completions API
    from a pool of 10 threads. A reply is correct when the last number it
    contains equals the last number in the reference answer.

    NOTE(review): the fine-tuning records are written with a system prompt and
    the bare question, whereas this prompt has no system message and wraps the
    question as "Question: ...\\nAnswer:". Confirm the mismatch is intended.

    Args:
        model_id: Model name passed to the API.
        retries: Maximum API attempts per sample when rate limited.

    Returns:
        Number of correctly answered samples.

    Raises:
        RateLimitError: When a sample is still rate limited on its last attempt.
        Exception: Any other error raised while scoring a sample propagates
            out of ``f.result()``.
    """
    attempts = max(retries, 1)  # always make at least one API call

    def evaluate_sample(sample):
        q = sample["question"]
        gt = extract_number(sample["answer"])

        for attempt in range(attempts):
            try:
                completion = client.chat.completions.create(
                    model=model_id,
                    messages=[{"role": "user", "content": f"Question: {q}\nAnswer:"}],
                    temperature=0
                )
                break
            except RateLimitError:
                if attempt == attempts - 1:
                    raise  # surface the failure instead of scoring it as wrong
                time.sleep(2 ** attempt + 0.5)  # exponential backoff

        pred = completion.choices[0].message.content
        pred_num = extract_number(pred)

        # extract_number yields None when no number is found; an unparseable
        # reply must not match an unparseable reference.
        return pred_num is not None and pred_num == gt

    results = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_sample, s) for s in validation_data]
        for f in tqdm(as_completed(futures), total=len(futures)):
            results.append(f.result())

    return sum(results)


In [27]:
n_correct = evaluate_model('ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:gsm8k-math:CXxChjym')

tuned_accuracy = n_correct / len(validation_data)
print(f"\nFine-tuned GPT-4o-mini accuracy: {tuned_accuracy:.2%}")
# The "+" format flag prints the sign itself, so a regression reads "-2.0" rather than "+-2.0".
print(f"Improvement: {(tuned_accuracy - base_accuracy) * 100:+.1f} percentage points")

100%|██████████| 400/400 [05:42<00:00,  1.17it/s]


Fine-tuned GPT-4o-mini accuracy: 84.00%
Improvement: +-2.0 percentage points





In [None]:
n_correct = evaluate_model('ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:gsm8k-math:CXxAVAcF')

tuned_accuracy = n_correct / len(validation_data)
print(f"\nFine-tuned GPT-4o-mini accuracy: {tuned_accuracy:.2%}")
# The "+" format flag prints the sign itself, so a regression reads "-3.0" rather than "+-3.0".
print(f"Improvement: {(tuned_accuracy - base_accuracy) * 100:+.1f} percentage points")


Fine-tuned GPT-4o-mini accuracy: 83.00%
Improvement: +-3.0 percentage points
