# Fine-tuning GPT-4o-mini on SAMSUM dataset


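This notebook demonstrates how to fine-tune OpenAI's GPT-4o-mini model for dialogue summarization on the SAMSum dataset, and compares ROUGE scores before and after fine-tuning.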

## Setup


In [1]:
! pip install -q openai datasets python-dotenv evaluate rouge_score

In [None]:
import os
import json
import time
import evaluate
from tqdm import tqdm
from openai import OpenAI, RateLimitError
from datasets import load_dataset
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load settings from a local .env file (if one exists) into the process environment.
load_dotenv()

# The API client is configured from the OPENAI_API_KEY environment variable.
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

# The sampled splits are written as JSONL files under DATA_DIR.
DATA_DIR = "./"
train_data_path, val_data_path, test_data_path = (
    os.path.join(DATA_DIR, f"{split}_data.jsonl")
    for split in ("train", "validation", "test")
)

# Number of worker threads used when sending samples to the API in parallel.
num_workers_to_use = 10


## Load Dataset


In [3]:
def _sample_split(split_name, n_examples):
    """Shuffle one split of `dataset` with seed 42 and keep its first `n_examples` rows."""
    return dataset[split_name].shuffle(seed=42).select(range(n_examples))


dataset = load_dataset("knkarthick/samsum")

# Work on small, reproducibly shuffled subsets of each split.
train_data = _sample_split("train", 1000)
test_data = _sample_split("test", 200)
val_data = _sample_split("validation", 200)


def save_jsonl(data, filename):
    """Write each record of `data` to `filename` as one JSON document per line.

    Args:
        data: Iterable of JSON-serializable records.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)


def format_prompt(example):
    """Return the dialogue of `example` wrapped in Dialogue/Summary section headers."""
    template = "## Dialogue:\n{}\n## Summary:\n"
    return template.format(example["dialogue"])

def _with_prompt_text(example):
    """Return the extra `text` column (the formatted prompt) for one example."""
    return {"text": format_prompt(example)}


# Add the prompt column to every split.
val_data = val_data.map(_with_prompt_text)
test_data = test_data.map(_with_prompt_text)
train_data = train_data.map(_with_prompt_text)

# Persist each split as JSONL at its configured path.
for split_rows, split_path in (
    (val_data, val_data_path),
    (test_data, test_data_path),
    (train_data, train_data_path),
):
    save_jsonl(split_rows, split_path)


README.md: 0.00B [00:00, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

validation.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/14731 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Evaluate Base Model

Measure GPT-4o-mini's baseline ROUGE scores on the validation set (no fine-tuning).


In [64]:
def calculate_rouge(generated_texts, true_summary):
    """Compute ROUGE for generated summaries against their reference summaries.

    Args:
        generated_texts: Predicted summaries.
        true_summary: Reference summaries, in the same order as the predictions.

    Returns:
        The result of the `rouge` metric's `compute` call.
    """
    return evaluate.load("rouge").compute(
        predictions=generated_texts,
        references=true_summary,
    )

def build_user_prompt(dialog: str):
    """Return the summarization instruction prompt with `dialog` embedded in it."""
    # Pieces are concatenated verbatim; the wording must stay exactly as the models were prompted.
    prompt_parts = [
        "You are a helpful assistant who writes concise, factual summaries of conversations. ",
        "Summarize the following conversation into a single sentence. ",
        "If it can summarized in a short sentence, do it. \n",
        f"## Dialog: {dialog}\n",
        "## Summary:\n",
    ]
    return "".join(prompt_parts)

def build_messages_for_sample(sample):
    """Return a single-turn message list in OpenAI chat format for one sample.

    The only message is a user turn whose content is the prompt built from
    the sample's "dialogue" field.
    """
    user_turn = {"role": "user", "content": build_user_prompt(sample["dialogue"])}
    return [user_turn]

def process_sample(model_name, sample, retries=5):
    """Request a summary for one sample, retrying on rate-limit errors.

    Args:
        model_name: Model identifier passed to the chat completions API.
        sample: Record with a "dialogue" field (consumed by build_messages_for_sample).
        retries: Maximum number of API attempts.

    Returns:
        The generated message content, or False when an unexpected error
        occurred or every attempt was rate limited.
    """
    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model=model_name,
                messages=build_messages_for_sample(sample),
                temperature=0,
            )
            return completion.choices[0].message.content

        except RateLimitError:
            # Back off only if another attempt follows; sleeping after the
            # final attempt would just delay the failure result.
            if attempt < retries - 1:
                time.sleep(2 ** attempt + 0.5)  # exponential backoff

        except Exception as e:
            # catch-all for unexpected errors: report and give up on this sample
            print(f"Error processing sample: {e}")
            return False

    return False  # if all retries failed

def process_all_samples(model_name, samples):
    """Process every sample on a thread pool and return outputs in input order.

    Each slot of the returned list holds whatever `process_sample` returned
    for the sample at that position, or None if the worker raised.
    """
    ordered = [None] * len(samples)

    with ThreadPoolExecutor(max_workers=num_workers_to_use) as pool:
        # Remember which input position each future belongs to, because
        # as_completed yields futures in completion order.
        position_of = {}
        for position, item in enumerate(samples):
            position_of[pool.submit(process_sample, model_name, item)] = position

        finished = tqdm(
            as_completed(position_of),
            total=len(position_of),
            desc="Processing samples",
        )
        for done in finished:
            position = position_of[done]
            try:
                ordered[position] = done.result()
            except Exception as e:
                print(f"Worker failed at index {position}: {e}")
                ordered[position] = None

    return ordered

def evaluate_model(model_name, samples):
    """Generate summaries for `samples` with `model_name` and report ROUGE.

    Args:
        model_name: Model identifier passed through to `process_all_samples`.
        samples: Records with "dialogue" and "summary" fields.

    Returns:
        A `(results, rouge)` tuple: the raw per-sample outputs (failed samples
        keep their False/None marker) and the ROUGE scores.
    """
    results = process_all_samples(model_name=model_name, samples=samples)

    # Score against the references of the samples that were actually sent to
    # the model, not against a global dataset.
    references = [sample["summary"] for sample in samples]

    # Failed requests come back as False/None; score them as empty summaries
    # so one failure does not break the metric computation.
    predictions = [pred if isinstance(pred, str) else "" for pred in results]
    n_failed = sum(1 for pred in results if not isinstance(pred, str))
    if n_failed:
        print(f"Warning: {n_failed} of {len(results)} samples produced no summary and are scored as empty.")

    rouge = calculate_rouge(predictions, references)
    print(
        f"\nModel `{model_name}` ROUGE scores on validation set:\n"
        f"Rouge-1: {rouge['rouge1']:.2%}\n"
        f"Rouge-2: {rouge['rouge2']:.2%}\n"
        f"Rouge-L: {rouge['rougeL']:.2%}"
    )
    return results, rouge
    

In [65]:
# Baseline: score the untuned model on the validation subset.
model_name = "gpt-4o-mini"
results, rouge = evaluate_model(model_name=model_name, samples=val_data)

Processing samples: 100%|██████████| 200/200 [00:19<00:00, 10.42it/s]



Model `gpt-4o-mini` ROUGE scores on validation set:
Rouge-1: 40.76%
Rouge-2: 16.15%
Rouge-L: 32.91%


In [51]:
# Spot-check the first validation example: reference summary vs. model output.
first_reference = val_data["summary"][0]
first_prediction = results[0]
print(f"Ground truth summary = {first_reference}")
print(f"Generated summary = {first_prediction}")

Ground truth summary = Magda and Victoria feel broke. 
Generated summary = Victoria and Magda discuss their financial struggles, with Victoria feeling broke after overspending and Magda expressing frustration after paying her car insurance.


## Convert to OpenAI Format

Convert the dataset to JSONL format with the required `messages` structure.


In [8]:
def _to_chat_record(sample):
    """Build one fine-tuning record: the user prompt followed by the reference summary."""
    return {
        "messages": [
            {"role": "user", "content": build_user_prompt(sample["dialogue"])},
            {"role": "assistant", "content": sample["summary"]},
        ]
    }


# Training and validation data share one record layout, so both go through
# the same helper and the same JSONL writer.
save_jsonl(map(_to_chat_record, train_data), "samsum_train.jsonl")
save_jsonl(map(_to_chat_record, val_data), "samsum_val.jsonl")

print("Training and validation files created successfully")

Training and validation files created successfully


## Upload Data to OpenAI

In [9]:
def _upload_for_fine_tuning(path):
    """Upload the file at `path` with purpose "fine-tune" and return the created file object.

    The local file handle is closed as soon as the upload call returns.
    """
    with open(path, "rb") as fh:
        return client.files.create(file=fh, purpose="fine-tune")


# Upload training file
print("Uploading training file...")
train_file = _upload_for_fine_tuning("samsum_train.jsonl")
print(f"Training file ID: {train_file.id}")

# Upload validation file
print("Uploading validation file...")
val_file = _upload_for_fine_tuning("samsum_val.jsonl")
print(f"Validation file ID: {val_file.id}")

Uploading training file...
Training file ID: file-SNXVCoSrGDpvnARDNR84Zq
Uploading validation file...
Validation file ID: file-S3jojKW4n7QxahNcECkCdh


## Create Fine-tuning Jobs

Start the fine-tuning jobs with 1 epoch each.  
We will tune 3 models with learning rate multipliers = (0.1, 0.5, 1.0).  
You may try other values, especially > 1.0, since we see the best results with multiplier = 1.0 below.


In [10]:
def _create_samsum_job(learning_rate_multiplier):
    """Create one fine-tuning job on the uploaded files and print its id and status.

    Every job uses the same base model, one epoch, and batch size 8; only the
    learning rate multiplier varies.
    """
    job = client.fine_tuning.jobs.create(
        training_file=train_file.id,
        validation_file=val_file.id,
        model="gpt-4o-mini-2024-07-18",
        hyperparameters={
            "n_epochs": 1,
            "learning_rate_multiplier": learning_rate_multiplier,
            "batch_size": 8,
        },
        suffix="samsum",
    )
    print(f"Fine-tuning job created: {job.id}")
    print(f"Status: {job.status}")
    return job


# Create one fine-tuning job per learning rate multiplier
print("\nCreating fine-tuning job...")
job1 = _create_samsum_job(1)
job2 = _create_samsum_job(0.5)
job3 = _create_samsum_job(0.1)


Creating fine-tuning job...
Fine-tuning job created: ftjob-U26WNWTAuJGgNvTtKutPrw4B
Status: validating_files
Fine-tuning job created: ftjob-hAuLEXy1WdHqexGeY2u81z6F
Status: validating_files
Fine-tuning job created: ftjob-FV8juQuguXnTPECNPza85e11
Status: validating_files


## Evaluate Fine-tuned Models

Test the fine-tuned models on the validation set and compare them to the baseline.


In [67]:
# Get the model ids from the OpenAI console - https://platform.openai.com/finetune/
base_model = "gpt-4o-mini-2024-07-18"
ft_prefix = f"ft:{base_model}:ready-tensor-inc:samsum"

# Display label -> fine-tuned model id, one entry per learning rate multiplier.
tuned_models = {
    f"{base_model}  lr_multiplier = 0.1": f"{ft_prefix}:CZBT3p87",
    f"{base_model}  lr_multiplier = 0.5": f"{ft_prefix}:CZBT67y8",
    f"{base_model}  lr_multiplier = 1.0": f"{ft_prefix}:CZBTzGfE",
}

In [68]:
# One fixed example, used to show a sample response from every tuned model.
demo_sample = val_data[0]

for model_name, model_id in tuned_models.items():
    print("="*60)
    print(f"Model = {model_name} Rouge Results")
    results, rouge = evaluate_model(model_id, samples=val_data)
    print("-"*20)

    print("sample response:")
    # Query the model being evaluated in this iteration with the demo sample
    # (previously referenced names that are not defined in this notebook).
    print(process_sample(model_id, demo_sample))
    print("="*60)
    

Model = gpt-4o-mini-2024-07-18-lr_multiplier=0.1 Rouge Results


Processing samples: 100%|██████████| 200/200 [00:12<00:00, 16.24it/s]



Model `ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZBT3p87` ROUGE scores on validation set:
Rouge-1: 48.62%
Rouge-2: 26.24%
Rouge-L: 41.31%
--------------------
sample response:
Victoria is broke, but her car insurance is paid for the rest of the year. 
Model = gpt-4o-mini-2024-07-18-lr_multiplier=0.5 Rouge Results


Processing samples: 100%|██████████| 200/200 [00:32<00:00,  6.24it/s]



Model `ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZBT67y8` ROUGE scores on validation set:
Rouge-1: 53.25%
Rouge-2: 30.13%
Rouge-L: 45.61%
--------------------
sample response:
Victoria and Magda are both broke this month. Magda just paid her car insurance. Victoria's is paid for the rest of the year. 
Model = gpt-4o-mini-2024-07-18-lr_multiplier=1.0 Rouge Results


Processing samples: 100%|██████████| 200/200 [00:13<00:00, 14.67it/s]



Model `ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZBTzGfE` ROUGE scores on validation set:
Rouge-1: 53.68%
Rouge-2: 30.73%
Rouge-L: 45.96%
--------------------
sample response:
Victoria is broke, but her car insurance is paid for the rest of the year. 
