# Fine-tuning GPT-4o-mini on SAMSUM dataset


This notebook demonstrates how to fine-tune OpenAI's GPT-4o-mini model.

## Setup


In [2]:
! pip install -q openai datasets python-dotenv evaluate rouge_score

In [3]:
import os
import json
import time
import evaluate
from tqdm import tqdm
from openai import OpenAI, RateLimitError
from datasets import load_dataset
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Read variables from a local .env file into the process environment.
load_dotenv()

# Directory and file names for the raw JSONL dumps of each split.
DATA_DIR = "./"
train_data_path, val_data_path, test_data_path = (
    os.path.join(DATA_DIR, f"{split}_data.jsonl")
    for split in ("train", "validation", "test")
)

# Shared OpenAI client used by every API call in this notebook.
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)


## Load Dataset


In [4]:
dataset = load_dataset("knkarthick/samsum")

# Shuffle every split with the same fixed seed, then keep a small prefix so
# the experiment stays cheap and repeatable.
SHUFFLE_SEED = 42
shuffled_splits = {
    name: dataset[name].shuffle(seed=SHUFFLE_SEED)
    for name in ("train", "test", "validation")
}

train_data = shuffled_splits["train"].select(range(1000))
test_data = shuffled_splits["test"].select(range(200))
val_data = shuffled_splits["validation"].select(range(200))


def save_jsonl(data, filename):
    """Write each item of `data` to `filename` as one JSON document per line.

    Args:
        data: Iterable of JSON-serializable records.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)


def format_prompt(example):
    """Build the plain-text prompt for one example.

    Args:
        example: Mapping with a "dialogue" entry.

    Returns:
        The dialogue wrapped in "## Dialogue:" / "## Summary:" markers, ending
        with a newline after the summary marker.
    """
    template = "## Dialogue:\n{}\n## Summary:\n"
    return template.format(example["dialogue"])

def _with_prompt_text(example):
    """Return the extra "text" column (formatted prompt) for one example."""
    return {"text": format_prompt(example)}


# Attach the formatted prompt to every split.
val_data = val_data.map(_with_prompt_text)
test_data = test_data.map(_with_prompt_text)
train_data = train_data.map(_with_prompt_text)

# Persist each split as JSONL next to the notebook.
for split_rows, split_path in (
    (val_data, val_data_path),
    (test_data, test_data_path),
    (train_data, train_data_path),
):
    save_jsonl(split_rows, split_path)


## Evaluate Base Model

Measure GPT-4o-mini's baseline ROUGE scores on the validation set (no fine-tuning).


In [5]:
def calculate_rouge(generated_texts, true_summary):
    """Score generated summaries against references with the `rouge` metric.

    Args:
        generated_texts: Model predictions, one per example.
        true_summary: Reference summaries, aligned with `generated_texts`.

    Returns:
        The result of the metric's `compute` call; callers in this notebook
        read its 'rouge1', 'rouge2' and 'rougeL' entries.
    """
    metric = evaluate.load("rouge")
    return metric.compute(predictions=generated_texts, references=true_summary)

def build_user_prompt(dialog):
    """Compose the single user message sent to the model for one dialogue.

    The wording is reproduced exactly as used for both the baseline run and
    the fine-tuning files, so it must not be edited casually.

    Args:
        dialog: The conversation text to summarize.

    Returns:
        The instruction text followed by the dialogue and a "## Summary:" cue.
    """
    instructions = (
        "You are a helpful assistant who writes concise, factual summaries of conversations. "
        "Summarize the following conversation into a single sentence. "
        "If it can summarized in a short sentence, do it. \n"
    )
    body = "## Dialog: {}\n".format(dialog)
    return instructions + body + "## Summary:\n"

def process_sample(sample, retries=5, model="gpt-4o-mini"):
    """Summarize one sample with a chat model, retrying on rate limit errors.

    Args:
        sample: Mapping with a "dialogue" entry.
        retries: Maximum number of request attempts.
        model: Chat model to query; defaults to the base "gpt-4o-mini".

    Returns:
        The model's summary text, or False if the request failed (unexpected
        error, or still rate-limited after `retries` attempts).
    """
    dialog = sample["dialogue"]

    for attempt in range(retries):
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": build_user_prompt(dialog)}],
                temperature=0
            )
            return completion.choices[0].message.content

        except RateLimitError:
            # No point in backing off after the last attempt: nothing will be
            # retried, so give up immediately instead of sleeping.
            if attempt == retries - 1:
                break
            sleep_time = 2 ** attempt + 0.5  # exponential backoff
            time.sleep(sleep_time)

        except Exception as e:
            # catch-all for unexpected errors (deliberately not retried)
            print(f"Error processing sample: {e}")
            return False

    return False  # if all retries failed

def process_all_samples(samples):
    """Run `process_sample` over all samples concurrently, keeping input order.

    Args:
        samples: Sized, iterable collection of samples.

    Returns:
        A list aligned with `samples`; each slot holds that sample's result,
        or None if the worker raised.
    """
    ordered = [None] * len(samples)

    with ThreadPoolExecutor(max_workers=10) as pool:
        # Remember which input position every future belongs to, because
        # as_completed yields them in completion order.
        position_of = {}
        for position, item in enumerate(samples):
            position_of[pool.submit(process_sample, item)] = position

        finished = tqdm(
            as_completed(position_of),
            total=len(position_of),
            desc="Processing samples",
        )
        for done in finished:
            position = position_of[done]
            try:
                ordered[position] = done.result()
            except Exception as e:
                print(f"Worker failed at index {position}: {e}")
                ordered[position] = None

    return ordered

In [6]:
results = process_all_samples(samples=val_data)

# Failed requests come back as False/None rather than text. ROUGE needs
# strings, so score those samples as empty predictions and report how many.
failed_indices = [i for i, r in enumerate(results) if not isinstance(r, str)]
if failed_indices:
    print(f"Warning: {len(failed_indices)} of {len(results)} samples failed and are scored as empty summaries")
predictions = [r if isinstance(r, str) else "" for r in results]

rouge = calculate_rouge(predictions, list(val_data["summary"]))
print(
    f"\nBase GPT-4o-mini ROUGE score on validation set:\n"
    f"Rouge-1: {rouge['rouge1']:.2%}\n"
    f"Rouge-2: {rouge['rouge2']:.2%}\n"
    f"Rouge-L: {rouge['rougeL']:.2%}"
)

Processing samples: 100%|██████████| 200/200 [00:40<00:00,  4.98it/s]



Base GPT-4o-mini ROUGE score on validation set:
Rouge-1: 40.48%
Rouge-2: 15.93%
Rouge-L: 32.55%


In [7]:
# Spot-check: first baseline prediction next to its reference summary.
first_prediction = results[0]
first_reference = val_data["summary"][0]
first_prediction, first_reference

('Victoria and Magda discuss their financial struggles, with Victoria feeling broke after overspending and Magda expressing frustration after paying her car insurance, while Victoria is relieved that hers is already paid for the year.',
 'Magda and Victoria feel broke. ')

## Convert to OpenAI Format

Convert the dataset to JSONL format with the required `messages` structure.


In [8]:
def write_chat_jsonl(samples, path):
    """Write samples to `path` in the chat fine-tuning JSONL layout.

    Each line is a JSON object with a "messages" list: the user prompt built
    from the sample's dialogue, followed by the reference summary as the
    assistant reply.

    Args:
        samples: Iterable of mappings with "dialogue" and "summary" entries.
        path: Destination file; an existing file is overwritten.
    """
    with open(path, "w", encoding="utf-8") as f:
        for sample in samples:
            record = {
                "messages": [
                    {"role": "user", "content": build_user_prompt(sample["dialogue"])},
                    {"role": "assistant", "content": sample["summary"]}
                ]
            }
            f.write(json.dumps(record) + "\n")


# Convert training and validation data to JSONL
write_chat_jsonl(train_data, "samsum_train.jsonl")
write_chat_jsonl(val_data, "samsum_val.jsonl")

print("Training and validation files created successfully")

Training and validation files created successfully


## Upload Data to OpenAI

In [9]:
# Upload training file (the context manager closes the handle after upload)
print("Uploading training file...")
with open("samsum_train.jsonl", "rb") as train_fh:
    train_file = client.files.create(
        file=train_fh,
        purpose="fine-tune"
    )
print(f"Training file ID: {train_file.id}")

# Upload validation file
print("Uploading validation file...")
with open("samsum_val.jsonl", "rb") as val_fh:
    val_file = client.files.create(
        file=val_fh,
        purpose="fine-tune"
    )
print(f"Validation file ID: {val_file.id}")

Uploading training file...
Training file ID: file-SNXVCoSrGDpvnARDNR84Zq
Uploading validation file...
Validation file ID: file-S3jojKW4n7QxahNcECkCdh


## Create Fine-tuning Job

Start three fine-tuning jobs, each with 1 epoch and a different learning-rate multiplier (1, 0.5, and 0.1).


In [10]:
def create_samsum_job(learning_rate_multiplier):
    """Start one fine-tuning job on the uploaded files and print its id/status.

    All jobs share the same base model, epoch count, batch size and suffix;
    only the learning-rate multiplier varies.

    Args:
        learning_rate_multiplier: Value passed as the job's
            "learning_rate_multiplier" hyperparameter.

    Returns:
        The job object returned by the API.
    """
    job = client.fine_tuning.jobs.create(
        training_file=train_file.id,
        validation_file=val_file.id,
        model="gpt-4o-mini-2024-07-18",
        hyperparameters={
            "n_epochs": 1,
            "learning_rate_multiplier": learning_rate_multiplier,
            "batch_size": 8,
        },
        suffix="samsum"
    )
    print(f"Fine-tuning job created: {job.id}")
    print(f"Status: {job.status}")
    return job


# Create one fine-tuning job per learning-rate multiplier
print("\nCreating fine-tuning job...")
job1 = create_samsum_job(1)
job2 = create_samsum_job(0.5)
job3 = create_samsum_job(0.1)


Creating fine-tuning job...
Fine-tuning job created: ftjob-U26WNWTAuJGgNvTtKutPrw4B
Status: validating_files
Fine-tuning job created: ftjob-hAuLEXy1WdHqexGeY2u81z6F
Status: validating_files
Fine-tuning job created: ftjob-FV8juQuguXnTPECNPza85e11
Status: validating_files


## Evaluate Fine-tuned Model

Test each fine-tuned model on the validation set and compare it to the baseline.


In [41]:
def evaluate_model(model_id, retries=5):
    """Evaluate a model on the validation set and compute ROUGE.

    Args:
        model_id: Name of the chat model to query (base or fine-tuned).
        retries: Maximum attempts per sample when the API reports a rate
            limit; values below 1 are treated as 1.

    Returns:
        The ROUGE result from `calculate_rouge` for the model's summaries of
        `val_data` against the reference summaries.

    Raises:
        RateLimitError: If a sample is still rate-limited after `retries`
            attempts. Other API errors propagate unchanged.
    """
    attempts = max(retries, 1)

    def evaluate_sample(sample):
        q = sample["dialogue"]
        for attempt in range(attempts):
            try:
                completion = client.chat.completions.create(
                    model=model_id,
                    messages=[{"role": "user", "content": build_user_prompt(q)}],
                    temperature=0
                )
                # content can be None; ROUGE needs a string.
                return completion.choices[0].message.content or ""
            except RateLimitError:
                # With 10 parallel workers a transient 429 is likely; back off
                # like process_sample does instead of aborting the whole run.
                if attempt == attempts - 1:
                    raise
                time.sleep(2 ** attempt + 0.5)

    # Run samples in parallel but preserve order
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(evaluate_sample, val_data), total=len(val_data)))

    rouge = calculate_rouge(results, list(val_data["summary"]))
    return rouge

## lr_multiplier = 0.1


In [36]:
# ROUGE for the fine-tuned model listed under the lr_multiplier = 0.1 heading.
rouge_score = evaluate_model('ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4rgruH')

for label, key in (('Rouge-1', 'rouge1'), ('Rouge-2', 'rouge2'), ('Rouge-L', 'rougeL')):
    print(f'{label}: ', rouge_score[key])

100%|██████████| 200/200 [00:33<00:00,  5.92it/s]


Rouge-1:  0.11296262397526508
Rouge-2:  0.01201384855827984
Rouge-L:  0.0882997597030008


In [49]:
# `dialogue` is not defined anywhere else in the notebook, so define it here:
# the first validation dialogue, which matches the recorded outputs.
dialogue = val_data[0]["dialogue"]

completion = client.chat.completions.create(
    model='ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4rgruH',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant that summarize conversations. Summarize the conversation in a few sentences.'},
        {"role": "user", "content": f"Dialogue: {dialogue}\nSummary:"}
        ],
    temperature=0
)

pred = completion.choices[0].message.content
pred

"Victoria and Magda are both feeling financially strained this month. Magda just paid her car insurance, while Victoria's is paid for the rest of the year."

In [45]:
# `dialogue` is not defined anywhere else in the notebook, so define it here:
# the first validation dialogue, which matches the recorded outputs.
dialogue = val_data[0]["dialogue"]

completion = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant that summarize conversations. Summarize the conversation in a few sentences.'},
        {"role": "user", "content": f"Dialogue: {dialogue}\nSummary:"}
        ],
    temperature=0
)
pred = completion.choices[0].message.content
pred

'Victoria expresses her frustration about being broke after overspending, but looks forward to getting paid soon. Magda relates to her feelings and mentions just paying her car insurance, feeling like she was robbed. Victoria is relieved that her car insurance is already paid for the year.'

## lr_multiplier = 0.5


In [37]:
# ROUGE for the fine-tuned model listed under the lr_multiplier = 0.5 heading.
rouge_score = evaluate_model('ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4qjCPA')

for label, key in (('Rouge-1', 'rouge1'), ('Rouge-2', 'rouge2'), ('Rouge-L', 'rougeL')):
    print(f'{label}: ', rouge_score[key])


100%|██████████| 200/200 [00:26<00:00,  7.63it/s]


Rouge-1:  0.12924997802793536
Rouge-2:  0.02352655682291644
Rouge-L:  0.10569437415041297


In [48]:
# `dialogue` is not defined anywhere else in the notebook, so define it here:
# the first validation dialogue, which matches the recorded outputs.
dialogue = val_data[0]["dialogue"]

completion = client.chat.completions.create(
    model='ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4qjCPA',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant that summarize conversations. Summarize the conversation in a few sentences.'},
        {"role": "user", "content": f"Dialogue: {dialogue}\nSummary:"}
        ],
    temperature=0
)

pred = completion.choices[0].message.content
pred

'Victoria and Magda are broke. Magda just paid her car insurance.'

## lr_multiplier = 1


In [38]:
# ROUGE for the fine-tuned model listed under the lr_multiplier = 1 heading.
rouge_score = evaluate_model('ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4rbTPa')

for label, key in (('Rouge-1', 'rouge1'), ('Rouge-2', 'rouge2'), ('Rouge-L', 'rougeL')):
    print(f'{label}: ', rouge_score[key])


100%|██████████| 200/200 [01:37<00:00,  2.04it/s]


Rouge-1:  0.1149850396819559
Rouge-2:  0.0124089839747357
Rouge-L:  0.09043878235459887


In [47]:
# `dialogue` is not defined anywhere else in the notebook, so define it here:
# the first validation dialogue, which matches the recorded outputs.
dialogue = val_data[0]["dialogue"]

completion = client.chat.completions.create(
    model='ft:gpt-4o-mini-2024-07-18:ready-tensor-inc:samsum:CZ4rbTPa',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant that summarize conversations. Summarize the conversation in a few sentences.'},
        {"role": "user", "content": f"Dialogue: {dialogue}\nSummary:"}
        ],
    temperature=0
)

pred = completion.choices[0].message.content
pred

"Victoria and Magda are broke. Victoria's car insurance is paid for the rest of the year."