In [None]:
import logging
import os
import dspy
import gqr
import pandas as pd

from pathlib import Path
from time import perf_counter
from datetime import datetime
from tqdm import tqdm
from dotenv import load_dotenv

from gqr.core.evaluator import Evaluator, evaluate, evaluate_by_dataset
from util import Classify, SafePredict, build_examples, make_gepa_metric, score_program

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Per-logger verbosity for the two DSPy loggers used in this notebook.
for logger_name, logger_level in (
    ("dspy.teleprompt.gepa.gepa", logging.WARNING),
    ("dspy.evaluate.evaluate", logging.INFO),
):
    logging.getLogger(logger_name).setLevel(logger_level)

In [None]:
# Set CUDA_VISIBLE_DEVICES to "1" for this process.
# NOTE(review): this cell runs after the import cell; confirm that no imported
# library has already initialised CUDA, otherwise the setting may be ignored.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

## Models
Run DSPy + GEPA baselines for the selected models only.


In [None]:
# Model tags to benchmark, in the order they will be run.
models_to_test = [
    "mistral:7b", "granite3.3:2b",
    "phi4:14b", "qwen3:14b",
]

In [None]:
# Load the GQR development data. The call returns two splits: `train_data`
# and `eval_data`, which later cells turn into the GEPA train and validation
# example sets.
train_data, eval_data = gqr.load_train_dataset()

## Run GEPA
Optimize each model's prompt with GEPA, then evaluate the optimized program on the in-distribution (ID) and out-of-distribution (OOD) test splits.


In [None]:
# Column order of the per-model results table written to CSV.
result_columns = [
    "model",
    "avg_latency",
    "id_acc",
    "ood_acc",
    "gqr_score",
    "dataset_acc",
]

# Rows accumulate here, one dict per model (including failed runs).
results = []

# Each run of this cell writes to its own timestamped CSV under ../results.
results_dir = Path("../results")
results_dir.mkdir(exist_ok=True)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = results_dir.joinpath(f"gepa_results_{timestamp}.csv")

# Cap the example sets handed to GEPA: first 5000 train, first 1000 validation.
valset = build_examples(eval_data)[:1000]
trainset = build_examples(train_data)[:5000]

# For each model: optimize the classifier with GEPA, save the optimized program,
# score it on the ID and OOD test sets while timing every prediction, and
# checkpoint the running results table to CSV. If anything fails for a model,
# an empty row is recorded for it and the loop moves on to the next model.
for model_name in models_to_test:
    print()
    print("=" * 60)
    print(f"DSPy + GEPA: {model_name}")
    print("=" * 60)
    try:
        # Task model, reached through the Ollama endpoint on localhost.
        # Caching is disabled so every call is actually issued.
        lm = dspy.LM(
            f"ollama_chat/{model_name}",
            api_base="http://localhost:11434",
            cache=False,
            reasoning_effort='low' if model_name.startswith('gpt-oss') else False,
        )
        dspy.configure(lm=lm)

        # Separate model used by GEPA for reflection; endpoint and key are
        # read from the API_BASE / API_KEY environment variables.
        reflection_lm = dspy.LM(
            model="gpt-5-fiit",
            api_base=os.getenv("API_BASE"),
            api_key=os.getenv("API_KEY"),
            cache=False,
            reasoning_effort=False,
        )

        metric = make_gepa_metric()
        student = SafePredict(Classify)
        gepa = dspy.GEPA(
            metric=metric,
            # Fix: pass the dedicated reflection model built above. The
            # previous code passed the task `lm` here, which left
            # `reflection_lm` constructed but never used.
            reflection_lm=reflection_lm,
            auto="light",
            seed=22,
        )
        optimized_program = gepa.compile(student, trainset=trainset, valset=valset)

        # Persist the optimized program; ':' in the model tag is replaced so
        # the tag can be used as a file name.
        saves_dir = Path("../saves")
        saves_dir.mkdir(exist_ok=True)
        program_path = saves_dir / f"{model_name.replace(':', '_')}.json"
        optimized_program.save(program_path)
        print(f"Saved program to: {program_path}")

        prompt_latencies = []
        current_pbar = None

        def timed_score(text: str) -> int:
            """Score one text with the optimized program and record its latency.

            Appends the wall-clock duration of the `score_program` call to
            `prompt_latencies` and advances the active progress bar, if any.
            """
            start = perf_counter()
            prediction = score_program(text, program=optimized_program)
            prompt_latencies.append(perf_counter() - start)
            if current_pbar is not None:
                current_pbar.update(1)
            return prediction

        id_test_data = gqr.load_id_test_dataset()
        ood_test_data = gqr.load_ood_test_dataset()
        total_size = len(id_test_data) + len(ood_test_data)

        # One progress bar spans both test sets.
        with tqdm(total=total_size, desc=f"{model_name}") as pbar:
            current_pbar = pbar
            id_test_data["predictions"] = [timed_score(doc) for doc in id_test_data["text"].values]
            ood_test_data["predictions"] = [timed_score(doc) for doc in ood_test_data["text"].values]

        id_scores = evaluate(
            predictions=id_test_data["predictions"],
            ground_truth=id_test_data["label"],
        )
        id_acc = id_scores["accuracy"]

        ood_overall_scores = Evaluator.evaluate(
            predicted_labels=ood_test_data["predictions"],
            true_labels=ood_test_data["label"],
        )

        ood_scores_df = evaluate_by_dataset(
            ood_test_data, pred_col="predictions", true_col="label", dataset_col="dataset"
        )
        if ood_scores_df.empty:
            # No per-dataset breakdown available: fall back to overall OOD accuracy.
            ood_acc = ood_overall_scores["accuracy"]
            dataset_acc = {}
        else:
            # OOD accuracy is the unweighted mean of the per-dataset accuracies.
            ood_acc = ood_scores_df["accuracy"].mean()
            dataset_acc = dict(zip(ood_scores_df["dataset"], ood_scores_df["accuracy"]))

        # Harmonic mean of ID and OOD accuracy; 0.0 when both are zero.
        gqr_score = 2 * (id_acc * ood_acc) / (id_acc + ood_acc) if (id_acc + ood_acc) > 0 else 0.0
        avg_latency = sum(prompt_latencies) / len(prompt_latencies) if prompt_latencies else None
        latency_display = f"{avg_latency:.3f}s" if avg_latency is not None else "n/a"

        result = {
            "model": model_name,
            "avg_latency": avg_latency,
            "id_acc": id_acc,
            "ood_acc": ood_acc,
            "gqr_score": gqr_score,
            "dataset_acc": str(dataset_acc),
        }
        results.append(result)

        print()
        print(
            f"ID: {id_acc:.4f} | OOD: {ood_acc:.4f} | GQR: {gqr_score:.4f} | Latency: {latency_display}"
        )
        print(f"Per-dataset: {dataset_acc}")
        # Rewrite the full CSV after every model so partial runs are preserved.
        pd.DataFrame(results, columns=result_columns).to_csv(output_file, index=False)
        print(f"Saved to: {output_file}")
    except Exception as e:
        # Best-effort: report the failure, record an empty row for this model,
        # and continue with the remaining models.
        print(f"Failed: {e}")
        results.append({
            "model": model_name,
            "avg_latency": None,
            "id_acc": None,
            "ood_acc": None,
            "gqr_score": None,
            "dataset_acc": None,
        })
        pd.DataFrame(results, columns=result_columns).to_csv(output_file, index=False)

# Closing banner: blank line, rule, message, rule.
print("", "=" * 60, "GEPA runs complete!", "=" * 60, sep="\n")