## Setup
Fine-tuned models are trained on datasets (distribution and size) similar to those in notebook 3.

#### Imports
See `requirements.txt` for full dependency versions.

In [None]:
import os
import json
import glob

from openai import OpenAI
from openai.types.fine_tuning import SupervisedMethod, SupervisedHyperparameters
from mistralai import Mistral

#### Global Paths, Directories, Variables, and Models

In [None]:
# Root of the Demo Study: the parent of the current working directory
DEMO_PATH = os.path.abspath("..")

# Directory holding the API-key .txt files. Read from the API_KEY_DIR
# environment variable; 'PATH' is only a placeholder fallback and must be
# replaced with the real directory. A leading "~" is expanded.
API_KEY_DIR = os.path.expanduser(os.getenv("API_KEY_DIR", 'PATH'))

# Folder containing the training JSONL files (job summaries are written here too)
LLM_API = os.path.join(DEMO_PATH, 'LLM_API')

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# that the FILE_BATCH_SIZE / SKIP_BATCH slicing further down selects the same
# files on every run and on every machine.
JSONL_FILES = sorted(glob.glob(os.path.join(LLM_API, 'demo_app_updates_train_*.jsonl')))

# Seed handed to the fine-tuning jobs for reproducibility
SEED = 94032

# Read each provider's API key from its text file inside API_KEY_DIR
mistral_key_path = os.path.join(API_KEY_DIR, "mistral_api_key_ai-measurement.txt")
with open(mistral_key_path, encoding="utf-8") as f:
    MISTRAL_API_KEY = f.read().strip()

openai_key_path = os.path.join(API_KEY_DIR, "openai_api_key_ai-measurement.txt")
with open(openai_key_path, encoding="utf-8") as f:
    OPENAI_API_KEY = f.read().strip()

# One client per provider, authenticated with the keys read above
MISTRAL_CLIENT = Mistral(api_key=MISTRAL_API_KEY)
OPENAI_CLIENT = OpenAI(api_key=OPENAI_API_KEY)

### Base models to fine-tune through each provider's API
# OpenAI GPT models, given by their versioned model names
GPT_MODELS = [
    # GPT-3.5 and GPT-4o families
    "gpt-3.5-turbo-0125",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-2024-08-06",
    # GPT-4.1 family
    "gpt-4.1-2025-04-14",
    "gpt-4.1-mini-2025-04-14",
    "gpt-4.1-nano-2025-04-14",
]

# Mistral models, referenced by their API names (mostly "-latest" aliases)
MISTRAL_MODELS = [
    "mistral-large-latest",
    "mistral-small-latest",
    "open-mistral-nemo",
    "ministral-8b-latest",
]
###

# Training hyperparameters; "auto" leaves the choice to the API
N_EPOCHS = "auto"
BATCH_SIZE = "auto"  # training batch size -- NOT the number of files per batch
LR_MULT = "auto"

# The training files are worked through in slices to fit API limitations
FILE_BATCH_SIZE = 3  # How many files each model is fine-tuned on per run
SKIP_BATCH = 3  # Zero-based index of the slice to run (0 = first 3, 1 = 4–6, etc)

## Fine-Tuning via API
Upload JSONL files, launch fine-tuning jobs per model and training split, and summarize jobs in a JSON file.

### OpenAI Models

In [None]:
# Push every training JSONL to OpenAI and remember the file ID returned for each
file_ids = {}
for jsonl_path in JSONL_FILES:
    fname = os.path.basename(jsonl_path)
    print(f"Uploading {fname}...")
    # Binary mode: the raw file handle is handed to the client for upload
    with open(jsonl_path, mode='rb') as handle:
        resp = OPENAI_CLIENT.files.create(purpose='fine-tune', file=handle)
    file_ids[fname] = resp.id
    print(f"-> {fname} uploaded as {resp.id}")


In [None]:
# Pre-compute a list of (fname, file_id)
all_files = list(file_ids.items())
tasks_summary = []

# Pick out exactly FILE_BATCH_SIZE files starting at SKIP_BATCH * FILE_BATCH_SIZE.
# The slice is the same for every model, so it is computed once up front.
start = SKIP_BATCH * FILE_BATCH_SIZE
end = start + FILE_BATCH_SIZE
batch = all_files[start:end]
if not batch:
    # An out-of-range SKIP_BATCH yields an empty slice; say so instead of
    # silently creating no jobs.
    print(
        f"No files in batch {SKIP_BATCH} (files {start + 1}-{end} of {len(all_files)}); "
        "no fine-tuning jobs will be created."
    )

# Fine-tune each OpenAI model on each file in the current batch
for model in GPT_MODELS:
    # Short model id for a readable suffix: the 2nd and 3rd dash-separated
    # parts of the model name (e.g. 'gpt-3.5-turbo-0125' -> '3.5-turbo')
    model_short = '-'.join(model.split('-')[1:3])

    for fname, training_file_id in batch:
        # Parse dataset info from the filename: the last two '_'-separated
        # parts of the stem are used as split type and size
        base = os.path.splitext(fname)[0]
        parts = base.split('_')
        split_type = parts[-2]
        size = parts[-1]

        suffix = f"{model_short}-{split_type}-{size}"

        # Prepare OpenAI API fine-tuning job config
        tune_params = {
            'training_file': training_file_id,
            'model': model,
            'method': {
                "type": "supervised",
                "supervised": SupervisedMethod(
                    hyperparameters=SupervisedHyperparameters(
                        n_epochs=N_EPOCHS,
                        batch_size=BATCH_SIZE,
                        learning_rate_multiplier=LR_MULT
                    )
                )
            },
            'seed': SEED,
            'suffix': suffix
        }

        print(f"Creating fine-tune job for {fname} on {model} with suffix {suffix}...")
        ft_resp = OPENAI_CLIENT.fine_tuning.jobs.create(**tune_params)
        print(f"  → Job {ft_resp.id} created, status: {ft_resp.status}")

        # Record the job; 'status' is the status reported at creation time
        tasks_summary.append({
            'model': model,
            'dataset': fname,
            'job_id': ft_resp.id,
            'suffix': suffix,
            'status': ft_resp.status
        })

In [None]:
# Persist the collected job records as JSON in the LLM_API folder
summary_path = os.path.join(LLM_API, 'fine_tune_jobs_summary.json')
with open(summary_path, mode='w', encoding='utf-8') as summary_file:
    json.dump(tasks_summary, fp=summary_file, indent=2)
print(f"All jobs created. Summary saved to {summary_path}")

### Mistral Models

In [None]:
# Send every training JSONL to Mistral; keep the local path next to the
# returned file ID so both are available per filename afterwards
mistral_file_ids = {}
for jsonl_path in JSONL_FILES:
    fname = os.path.basename(jsonl_path)
    print(f"Uploading {fname} to Mistral…")
    with open(jsonl_path, mode="rb") as handle:
        payload = {"file_name": fname, "content": handle}
        resp = MISTRAL_CLIENT.files.upload(file=payload)
    mistral_file_ids[fname] = (jsonl_path, resp.id)
    print(f" -> {fname} → {resp.id}")

In [None]:
DESIRED_EPOCHS = 3      # Number of epochs to approximate for each run

# Turn dict into a list of (fname, (local_path, file_id)) entries
all_mistral_files = list(mistral_file_ids.items())
mistral_jobs = []

# Compute the slice for the current batch once; it is the same for every model.
# This must use FILE_BATCH_SIZE (an int). BATCH_SIZE is the training
# hyperparameter "auto", which would produce string slice indices and raise
# a TypeError.
start = SKIP_BATCH * FILE_BATCH_SIZE
end = start + FILE_BATCH_SIZE
batch = all_mistral_files[start:end]
if not batch:
    # An out-of-range SKIP_BATCH yields an empty slice; say so instead of
    # silently creating no jobs.
    print(
        f"No files in batch {SKIP_BATCH} (files {start + 1}-{end} of {len(all_mistral_files)}); "
        "no Mistral fine-tuning jobs will be created."
    )

# Fine-tune each Mistral model on each file in the batch
for model in MISTRAL_MODELS:
    for fname, (local_path, file_id) in batch:
        # Compute training steps as a function of file size to match epochs.
        # NOTE(review): assumes roughly one training step per MB of training
        # data per epoch -- confirm against Mistral's fine-tuning docs.
        size_bytes = os.path.getsize(local_path)
        size_mb = size_bytes / (1024 * 1024)
        # At least one step, even for files well below 1 MB
        training_steps = max(1, int(DESIRED_EPOCHS * size_mb))

        hyperparams = {
            "training_steps": training_steps,   # ~Epochs × MB
            "learning_rate": 1e-4,              # Standard starting LR
        }

        print(
            f"Creating Mistral FT job on {model} with {fname}: "
            f"{training_steps} steps (~{DESIRED_EPOCHS} epochs)…"
        )
        job = MISTRAL_CLIENT.fine_tuning.jobs.create(
            model=model,
            training_files=[{"file_id": file_id, "weight": 1}],
            hyperparameters=hyperparams,
            auto_start=True
        )

        print(f" → job {job.id}, status={job.status}")

        # Record the job; 'status' is the status reported at creation time
        mistral_jobs.append({
            "model":          model,
            "dataset":        fname,
            "file_size_mb":   round(size_mb, 2),
            "training_steps": training_steps,
            "job_id":         job.id,
            "status":         job.status
        })

# Save all Mistral job info to a summary file (explicit UTF-8, matching the
# OpenAI summary file)
with open(os.path.join(LLM_API, "mistral_fine_tune_summary.json"), "w", encoding="utf-8") as out:
    json.dump(mistral_jobs, out, indent=2)