In [1]:
import logging
import os
import dspy
import gqr
import pandas as pd

from pathlib import Path
from time import perf_counter
from datetime import datetime
from tqdm import tqdm

from gqr.core.evaluator import Evaluator, evaluate, evaluate_by_dataset
from util import Classify, SafePredict, build_examples, metric, score_program

# Per-logger verbosity: the bootstrap optimizer logger only surfaces warnings,
# while the evaluation logger keeps emitting its INFO-level score summaries.
# NOTE(review): confirm "dspy.optimizers.bootstrap_fewshot" matches the logger
# name used by the installed DSPy version; setLevel on an unused name is a no-op.
for _logger_name, _logger_level in (
    ("dspy.optimizers.bootstrap_fewshot", logging.WARNING),
    ("dspy.evaluate.evaluate", logging.INFO),
):
    logging.getLogger(_logger_name).setLevel(_logger_level)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Restrict CUDA device visibility for this kernel process to devices 0 and 1.
# NOTE(review): the models below are reached through an HTTP endpoint, so this
# setting presumably only matters for GPU work done in-process — confirm it is needed.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0,1"})

## Models
Run DSPy + BootstrapFewShot baselines for the selected models only.


In [3]:
# Model tags to benchmark; the evaluation loop processes them in this order.
models_to_test = ["mistral:7b", "granite3.3:2b", "phi4:14b", "qwen3:14b"]

In [4]:
# Fetch the GQR training data; the loader returns two parts, which are
# unpacked here as the training split and the evaluation split.
train_splits = gqr.load_train_dataset()
train_data, eval_data = train_splits

## Run BootstrapFewShot 
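Optimize each model with `BootstrapFewShotWithRandomSearch`, then evaluate the compiled program on the in-distribution (ID) and out-of-distribution (OOD) test splits.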


In [5]:
# Benchmark driver. For every model in `models_to_test`:
#   1. compile a few-shot program with dspy.BootstrapFewShotWithRandomSearch,
#   2. save the compiled program under ../saves,
#   3. score it on the ID and OOD test sets while timing each prediction,
#   4. append one row to `results` and rewrite the CSV checkpoint.
# A failure for one model is recorded as a row of missing values and the loop
# continues with the next model.
result_columns = ["model", "avg_latency", "id_acc", "ood_acc", "gqr_score", "dataset_acc"]

results = []
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)
output_file = results_dir / f"fewshot_results_{timestamp}.csv"

# Output directory for compiled programs; created once instead of per model.
saves_dir = Path("../saves")
saves_dir.mkdir(parents=True, exist_ok=True)

# Only the first 300 training examples are handed to the optimizer.
trainset = build_examples(train_data)[:300]

# The test sets do not depend on the model, so load them once up front. This
# also surfaces a loading problem before any optimization time is spent.
id_test_source = gqr.load_id_test_dataset()
ood_test_source = gqr.load_ood_test_dataset()
total_size = len(id_test_source) + len(ood_test_source)

for model_name in models_to_test:
    print()
    print("=" * 60)
    print(f"DSPy + BootstrapFewShot: {model_name}")
    print("=" * 60)
    try:
        # NOTE(review): models other than gpt-oss receive reasoning_effort=False,
        # presumably to switch off "thinking" output — confirm the backend honors it.
        lm = dspy.LM(
            f"ollama_chat/{model_name}",
            api_base="http://localhost:11434",
            cache=False,
            reasoning_effort='low' if model_name.startswith('gpt-oss') else False,
        )
        dspy.configure(lm=lm)

        student = SafePredict(Classify)
        optimizer = dspy.BootstrapFewShotWithRandomSearch(
            max_labeled_demos=6,
            num_candidate_programs=4,
            metric=metric,
        )
        optimized_program = optimizer.compile(student, trainset=trainset)
        # ':' in the model tag is replaced so the tag can be used in a file name.
        program_path = saves_dir / f"fewshot_{model_name.replace(':', '_')}.json"
        optimized_program.save(program_path)
        print(f"Saved program to: {program_path}")

        prompt_latencies = []
        current_pbar = None

        def timed_score(text: str) -> int:
            """Score one text with the compiled program and record the elapsed seconds.

            The timer stops before the progress bar is advanced, so progress-bar
            overhead is not included in the recorded latency.
            """
            start = perf_counter()
            prediction = score_program(text, program=optimized_program)
            prompt_latencies.append(perf_counter() - start)
            if current_pbar is not None:
                current_pbar.update(1)
            return prediction

        # Work on per-model copies so one model's predictions never leak into
        # the next model's evaluation frame.
        id_test_data = id_test_source.copy()
        ood_test_data = ood_test_source.copy()

        with tqdm(total=total_size, desc=f"{model_name}") as pbar:
            current_pbar = pbar
            id_test_data["predictions"] = [timed_score(doc) for doc in id_test_data["text"].values]
            ood_test_data["predictions"] = [timed_score(doc) for doc in ood_test_data["text"].values]

        id_scores = evaluate(
            predictions=id_test_data["predictions"],
            ground_truth=id_test_data["label"],
        )
        id_acc = id_scores["accuracy"]

        ood_overall_scores = Evaluator.evaluate(
            predicted_labels=ood_test_data["predictions"],
            true_labels=ood_test_data["label"],
        )

        ood_scores_df = evaluate_by_dataset(
            ood_test_data, pred_col="predictions", true_col="label", dataset_col="dataset"
        )
        if ood_scores_df.empty:
            # No per-dataset breakdown available: fall back to the pooled accuracy.
            ood_acc = ood_overall_scores["accuracy"]
            dataset_acc = {}
        else:
            # OOD accuracy is the unweighted mean of the per-dataset accuracies.
            ood_acc = ood_scores_df["accuracy"].mean()
            dataset_acc = dict(zip(ood_scores_df["dataset"], ood_scores_df["accuracy"]))

        # Harmonic mean of ID and OOD accuracy; defined as 0.0 when both are zero.
        gqr_score = 2 * (id_acc * ood_acc) / (id_acc + ood_acc) if (id_acc + ood_acc) > 0 else 0.0
        avg_latency = sum(prompt_latencies) / len(prompt_latencies) if prompt_latencies else None
        latency_display = f"{avg_latency:.3f}s" if avg_latency is not None else "n/a"

        result = {
            "model": model_name,
            "avg_latency": avg_latency,
            "id_acc": id_acc,
            "ood_acc": ood_acc,
            "gqr_score": gqr_score,
            "dataset_acc": str(dataset_acc),
        }

        print()
        print(
            f"ID: {id_acc:.4f} | OOD: {ood_acc:.4f} | GQR: {gqr_score:.4f} | Latency: {latency_display}"
        )
        print(f"Per-dataset: {dataset_acc}")
    except Exception as e:
        print(f"Failed: {e}")
        # Emit the full traceback as well; the one-line message above is rarely
        # enough to diagnose a failure that happens hours into a run.
        logging.getLogger(__name__).exception("Run failed for model %s", model_name)
        # Placeholder row: every metric is None, only the model name is filled in.
        result = {**dict.fromkeys(result_columns), "model": model_name}
        model_succeeded = False
    else:
        model_succeeded = True

    # Checkpoint after every model (successful or not) so partial results
    # survive an interrupted run.
    results.append(result)
    pd.DataFrame(results, columns=result_columns).to_csv(output_file, index=False)
    if model_succeeded:
        print(f"Saved to: {output_file}")

print()
print("=" * 60)
print("BootstrapFewShot runs complete!")
print("=" * 60)


DSPy + BootstrapFewShot: mistral:7b
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 4 candidate sets.
Average Metric: 259.00 / 300 (86.3%): 100%|██████████| 300/300 [01:00<00:00,  4.94it/s]

2025/12/30 11:26:48 INFO dspy.evaluate.evaluate: Average Metric: 259 / 300 (86.3%)



New best score: 86.33 for seed -3
Scores so far: [86.33]
Best score so far: 86.33
Average Metric: 283.00 / 300 (94.3%): 100%|██████████| 300/300 [01:24<00:00,  3.56it/s]

2025/12/30 11:28:12 INFO dspy.evaluate.evaluate: Average Metric: 283 / 300 (94.3%)



New best score: 94.33 for seed -2
Scores so far: [86.33, 94.33]
Best score so far: 94.33


  1%|▏         | 4/300 [00:02<02:57,  1.66it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 268.00 / 300 (89.3%): 100%|██████████| 300/300 [01:23<00:00,  3.59it/s]

2025/12/30 11:29:38 INFO dspy.evaluate.evaluate: Average Metric: 268 / 300 (89.3%)



Scores so far: [86.33, 94.33, 89.33]
Best score so far: 94.33


  1%|▏         | 4/300 [00:02<02:42,  1.82it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 272.00 / 300 (90.7%): 100%|██████████| 300/300 [01:33<00:00,  3.22it/s]

2025/12/30 11:31:13 INFO dspy.evaluate.evaluate: Average Metric: 272 / 300 (90.7%)



Scores so far: [86.33, 94.33, 89.33, 90.67]
Best score so far: 94.33


  1%|          | 2/300 [00:01<02:36,  1.91it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 283.00 / 300 (94.3%): 100%|██████████| 300/300 [01:25<00:00,  3.53it/s]

2025/12/30 11:32:39 INFO dspy.evaluate.evaluate: Average Metric: 283 / 300 (94.3%)



Scores so far: [86.33, 94.33, 89.33, 90.67, 94.33]
Best score so far: 94.33


  0%|          | 1/300 [00:00<03:37,  1.37it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 278.00 / 300 (92.7%): 100%|██████████| 300/300 [01:35<00:00,  3.14it/s]

2025/12/30 11:34:16 INFO dspy.evaluate.evaluate: Average Metric: 278 / 300 (92.7%)



Scores so far: [86.33, 94.33, 89.33, 90.67, 94.33, 92.67]
Best score so far: 94.33


  1%|          | 2/300 [00:00<02:27,  2.02it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 278.00 / 300 (92.7%): 100%|██████████| 300/300 [01:28<00:00,  3.38it/s]

2025/12/30 11:35:45 INFO dspy.evaluate.evaluate: Average Metric: 278 / 300 (92.7%)



Scores so far: [86.33, 94.33, 89.33, 90.67, 94.33, 92.67, 92.67]
Best score so far: 94.33
7 candidate programs found.
Saved program to: ../saves/fewshot_mistral_7b.json


mistral:7b: 100%|██████████| 22457/22457 [3:44:19<00:00,  1.67it/s]  



ID: 0.9408 | OOD: 0.9103 | GQR: 0.9253 | Latency: 0.599s
Per-dataset: {'jigsaw': 0.9682638456751711, 'olid': 0.8744186046511628, 'hate_xplain': 0.9720303285593934, 'hate_speech_slovak ': 0.8143899895724713, 'dkhate': 0.9118541033434651, 'web_questions': 0.9483267716535433, 'ml_questions': 0.8828125}
Saved to: ../results/fewshot_results_20251230_112547.csv

DSPy + BootstrapFewShot: granite3.3:2b
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 4 candidate sets.
Average Metric: 244.00 / 300 (81.3%): 100%|██████████| 300/300 [00:27<00:00, 10.91it/s]

2025/12/30 15:20:48 INFO dspy.evaluate.evaluate: Average Metric: 244 / 300 (81.3%)



New best score: 81.33 for seed -3
Scores so far: [81.33]
Best score so far: 81.33
Average Metric: 284.00 / 300 (94.7%): 100%|██████████| 300/300 [04:20<00:00,  1.15it/s]

2025/12/30 15:25:09 INFO dspy.evaluate.evaluate: Average Metric: 284 / 300 (94.7%)



New best score: 94.67 for seed -2
Scores so far: [81.33, 94.67]
Best score so far: 94.67


  1%|▏         | 4/300 [00:03<04:21,  1.13it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 270.00 / 300 (90.0%): 100%|██████████| 300/300 [02:35<00:00,  1.92it/s]

2025/12/30 15:27:48 INFO dspy.evaluate.evaluate: Average Metric: 270 / 300 (90.0%)



Scores so far: [81.33, 94.67, 90.0]
Best score so far: 94.67


  2%|▏         | 6/300 [00:01<01:00,  4.83it/s]


Bootstrapped 4 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Average Metric: 269.00 / 300 (89.7%): 100%|██████████| 300/300 [04:51<00:00,  1.03it/s]

2025/12/30 15:32:41 INFO dspy.evaluate.evaluate: Average Metric: 269 / 300 (89.7%)



Scores so far: [81.33, 94.67, 90.0, 89.67]
Best score so far: 94.67


  1%|          | 2/300 [00:02<05:08,  1.04s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 272.00 / 300 (90.7%): 100%|██████████| 300/300 [04:53<00:00,  1.02it/s]

2025/12/30 15:37:37 INFO dspy.evaluate.evaluate: Average Metric: 272 / 300 (90.7%)



Scores so far: [81.33, 94.67, 90.0, 89.67, 90.67]
Best score so far: 94.67


  0%|          | 1/300 [00:01<09:49,  1.97s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 274.00 / 300 (91.3%): 100%|██████████| 300/300 [04:02<00:00,  1.24it/s]

2025/12/30 15:41:41 INFO dspy.evaluate.evaluate: Average Metric: 274 / 300 (91.3%)



Scores so far: [81.33, 94.67, 90.0, 89.67, 90.67, 91.33]
Best score so far: 94.67


  1%|          | 2/300 [00:03<08:05,  1.63s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 264.00 / 300 (88.0%): 100%|██████████| 300/300 [06:07<00:00,  1.23s/it]

2025/12/30 15:47:52 INFO dspy.evaluate.evaluate: Average Metric: 264 / 300 (88.0%)



Scores so far: [81.33, 94.67, 90.0, 89.67, 90.67, 91.33, 88.0]
Best score so far: 94.67
7 candidate programs found.
Saved program to: ../saves/fewshot_granite3.3_2b.json


granite3.3:2b: 100%|██████████| 22457/22457 [3:01:38<00:00,  2.06it/s]  



ID: 0.9508 | OOD: 0.9007 | GQR: 0.9251 | Latency: 0.485s
Per-dataset: {'jigsaw': 0.977909147479776, 'olid': 0.9488372093023256, 'hate_xplain': 0.9962931760741365, 'hate_speech_slovak ': 0.9770594369134515, 'dkhate': 0.9483282674772037, 'web_questions': 0.8395669291338582, 'ml_questions': 0.6171875}
Saved to: ../results/fewshot_results_20251230_112547.csv

DSPy + BootstrapFewShot: phi4:14b
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 4 candidate sets.
Average Metric: 291.00 / 300 (97.0%): 100%|██████████| 300/300 [00:56<00:00,  5.30it/s]

2025/12/30 18:50:38 INFO dspy.evaluate.evaluate: Average Metric: 291 / 300 (97.0%)



New best score: 97.0 for seed -3
Scores so far: [97.0]
Best score so far: 97.0
Average Metric: 285.00 / 300 (95.0%): 100%|██████████| 300/300 [03:50<00:00,  1.30it/s]

2025/12/30 18:54:28 INFO dspy.evaluate.evaluate: Average Metric: 285 / 300 (95.0%)



Scores so far: [97.0, 95.0]
Best score so far: 97.0


  1%|▏         | 4/300 [00:01<01:37,  3.02it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 288.00 / 300 (96.0%): 100%|██████████| 300/300 [02:59<00:00,  1.67it/s]

2025/12/30 18:57:29 INFO dspy.evaluate.evaluate: Average Metric: 288 / 300 (96.0%)



Scores so far: [97.0, 95.0, 96.0]
Best score so far: 97.0


  2%|▏         | 7/300 [00:02<01:33,  3.13it/s]


Bootstrapped 4 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Average Metric: 284.00 / 300 (94.7%): 100%|██████████| 300/300 [01:51<00:00,  2.69it/s]

2025/12/30 18:59:22 INFO dspy.evaluate.evaluate: Average Metric: 284 / 300 (94.7%)



Scores so far: [97.0, 95.0, 96.0, 94.67]
Best score so far: 97.0


  1%|          | 2/300 [00:04<10:17,  2.07s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 277.00 / 300 (92.3%): 100%|██████████| 300/300 [02:19<00:00,  2.16it/s]

2025/12/30 19:01:46 INFO dspy.evaluate.evaluate: Average Metric: 277 / 300 (92.3%)



Scores so far: [97.0, 95.0, 96.0, 94.67, 92.33]
Best score so far: 97.0


  0%|          | 1/300 [00:00<02:36,  1.92it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 285.00 / 300 (95.0%): 100%|██████████| 300/300 [02:21<00:00,  2.12it/s]

2025/12/30 19:04:08 INFO dspy.evaluate.evaluate: Average Metric: 285 / 300 (95.0%)



Scores so far: [97.0, 95.0, 96.0, 94.67, 92.33, 95.0]
Best score so far: 97.0


  1%|          | 2/300 [00:00<01:38,  3.03it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 271.00 / 300 (90.3%): 100%|██████████| 300/300 [03:46<00:00,  1.33it/s]

2025/12/30 19:07:55 INFO dspy.evaluate.evaluate: Average Metric: 271 / 300 (90.3%)



Scores so far: [97.0, 95.0, 96.0, 94.67, 92.33, 95.0, 90.33]
Best score so far: 97.0
7 candidate programs found.
Saved program to: ../saves/fewshot_phi4_14b.json


phi4:14b: 100%|██████████| 22457/22457 [1:42:20<00:00,  3.66it/s] 



ID: 0.9656 | OOD: 0.9017 | GQR: 0.9325 | Latency: 0.273s
Per-dataset: {'jigsaw': 0.9620410703173615, 'olid': 0.7674418604651163, 'hate_xplain': 0.9629317607413648, 'hate_speech_slovak ': 0.8769551616266945, 'dkhate': 0.9361702127659575, 'web_questions': 0.8843503937007874, 'ml_questions': 0.921875}
Saved to: ../results/fewshot_results_20251230_112547.csv

DSPy + BootstrapFewShot: qwen3:14b
Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 4 candidate sets.
Average Metric: 284.00 / 300 (94.7%): 100%|██████████| 300/300 [01:14<00:00,  4.00it/s]

2025/12/30 20:51:43 INFO dspy.evaluate.evaluate: Average Metric: 284 / 300 (94.7%)



New best score: 94.67 for seed -3
Scores so far: [94.67]
Best score so far: 94.67
Average Metric: 285.00 / 300 (95.0%): 100%|██████████| 300/300 [00:57<00:00,  5.19it/s]

2025/12/30 20:52:41 INFO dspy.evaluate.evaluate: Average Metric: 285 / 300 (95.0%)



New best score: 95.0 for seed -2
Scores so far: [94.67, 95.0]
Best score so far: 95.0


  1%|▏         | 4/300 [00:01<01:58,  2.50it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 286.00 / 300 (95.3%): 100%|██████████| 300/300 [00:57<00:00,  5.25it/s]

2025/12/30 20:53:40 INFO dspy.evaluate.evaluate: Average Metric: 286 / 300 (95.3%)



New best score: 95.33 for seed -1
Scores so far: [94.67, 95.0, 95.33]
Best score so far: 95.33


  1%|▏         | 4/300 [00:01<02:09,  2.29it/s]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Average Metric: 290.00 / 300 (96.7%): 100%|██████████| 300/300 [00:59<00:00,  5.06it/s]

2025/12/30 20:54:41 INFO dspy.evaluate.evaluate: Average Metric: 290 / 300 (96.7%)



New best score: 96.67 for seed 0
Scores so far: [94.67, 95.0, 95.33, 96.67]
Best score so far: 96.67


  1%|          | 2/300 [00:00<02:11,  2.26it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 287.00 / 300 (95.7%): 100%|██████████| 300/300 [00:58<00:00,  5.16it/s]

2025/12/30 20:55:40 INFO dspy.evaluate.evaluate: Average Metric: 287 / 300 (95.7%)



Scores so far: [94.67, 95.0, 95.33, 96.67, 95.67]
Best score so far: 96.67


  0%|          | 1/300 [00:00<02:51,  1.75it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Average Metric: 290.00 / 300 (96.7%): 100%|██████████| 300/300 [01:05<00:00,  4.55it/s]

2025/12/30 20:56:46 INFO dspy.evaluate.evaluate: Average Metric: 290 / 300 (96.7%)



Scores so far: [94.67, 95.0, 95.33, 96.67, 95.67, 96.67]
Best score so far: 96.67


  1%|          | 2/300 [00:00<02:07,  2.34it/s]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Average Metric: 268.00 / 300 (89.3%): 100%|██████████| 300/300 [00:58<00:00,  5.15it/s]

2025/12/30 20:57:46 INFO dspy.evaluate.evaluate: Average Metric: 268 / 300 (89.3%)



Scores so far: [94.67, 95.0, 95.33, 96.67, 95.67, 96.67, 89.33]
Best score so far: 96.67
7 candidate programs found.
Saved program to: ../saves/fewshot_qwen3_14b.json


qwen3:14b: 100%|██████████| 22457/22457 [2:25:38<00:00,  2.57it/s]  


ID: 0.9637 | OOD: 0.9198 | GQR: 0.9412 | Latency: 0.389s
Per-dataset: {'jigsaw': 0.9281269446172993, 'olid': 0.8104651162790698, 'hate_xplain': 0.9435551811288964, 'hate_speech_slovak ': 0.9238790406673618, 'dkhate': 0.9240121580547113, 'web_questions': 0.9084645669291339, 'ml_questions': 1.0}
Saved to: ../results/fewshot_results_20251230_112547.csv

BootstrapFewShot runs complete!



