## DSPy train/eval

Note: this notebook uses my own fork of DSPy, in which I implemented the asynchronous batching it relies on: https://github.com/rayanehmi/dspy/tree/feat/async_batching

In [1]:
from pathlib import Path
from typing import Literal
import os 

# Dataset split to work on; it names the sub-folder of the data directory.
DATA_TYPE: Literal["train", "rank", "final"] = "final"
# The data directory lives next to the current working directory.
DATA_PATH = Path.cwd().parent.joinpath("data")
# Kept as a plain string (os.path.join), not a Path.
OUTPUT_DIR = os.path.join(DATA_PATH, DATA_TYPE)


In [3]:
# Load complete data
import polars as pl

# Parquet file holding the segments of the selected split.
SEGMENTS_PATH = os.path.join(OUTPUT_DIR, f"llm_segments_{DATA_TYPE}.parquet")
print(SEGMENTS_PATH)
df = pl.read_parquet(SEGMENTS_PATH)
df.head()

c:\Users\rayte\Work\prc2025dspy\data\final\llm_segments_final.parquet


FileNotFoundError: Le chemin d’accès spécifié est introuvable. (os error 3): c:\Users\rayte\Work\prc2025dspy\data\final\llm_segments_final.parquet

This error occurred with the following context stack:
	[1] 'parquet scan'
	[2] 'sink'


In [8]:
import dspy
from dotenv import load_dotenv
load_dotenv()

True

In [44]:
# API keys are read from environment variables; os.getenv returns None when a
# variable is not set.
api_key = os.getenv("OPENAI_API_KEY")
gpt_4_1 = dspy.LM("openai/gpt-4.1", api_key=api_key)
gpt_4_1_nano = dspy.LM("openai/gpt-4.1-nano", api_key=api_key)
groq_api_key = os.getenv("GROQ_API_KEY")
# The two models below are created with cache=False.
gpt_oss_120b = dspy.LM("groq/openai/gpt-oss-120b", api_key=groq_api_key, cache=False)
gpt_5_1_instant = dspy.LM("openai/gpt-5.1", api_key=api_key, temperature=1.0, max_tokens=32000, reasoning_effort="none", cache=False)

json_adapter = dspy.JSONAdapter()

# Global default LM and adapter; individual cells override the LM with dspy.context.
dspy.configure(lm=gpt_4_1, adapter=json_adapter)

In [10]:
from typing import Any

# Signature mapping a dictionary of segment features to a single float output.
# NOTE: the class docstring is not plain documentation. It shows up verbatim as
# the signature's `instructions` in the batch metadata printed later in this
# notebook, so editing it changes what the model is asked to do.
class BurntFuelPrediction(dspy.Signature):
    """Predict the amount of fuel burnt in kgs by the plane over the given segment of flight.
    Segment data is constructed from noisy telemetry: use your common sense if values seem wrong.
    hint: vertical_rate_balance contains positive_frac, negative_frac and near_zero_frac, each corresponding 
    to the share of samples where vertical rate is respectively more than, less than or around 64 ft/min.
    hint 2: estimate the fuel weight penalty (heavy in the beginning, lighter in the end).
    """
    # Input: feature name -> value for one flight segment.
    features : dict[str, Any] = dspy.InputField()
    # Output: predicted fuel burnt over the segment, in kg.
    fuel_kg : float = dspy.OutputField() 

# Zero-shot chain of thought
# (the module adds a `reasoning` output ahead of `fuel_kg`, as the printed predictions show)
fuel_cot = dspy.ChainOfThought(BurntFuelPrediction)

In [52]:
def df_to_dspy_examples(row, with_fuel: bool = True):
    """Convert one segment row into a ``dspy.Example``.

    Args:
        row: Either a mapping of column name to value, or an object exposing
            ``to_dicts()`` (in which case its first record is used).
        with_fuel: When True, attach the ``fuel_kg`` label to the example.

    Returns:
        A ``dspy.Example`` whose only input field is ``features``; it also
        carries ``fuel_kg`` when ``with_fuel`` is True.
    """
    row_data = row.to_dicts()[0] if hasattr(row, "to_dicts") else row

    def clean(value):
        # Missing features are passed on as an empty string instead of None.
        return "" if value is None else value

    # Each feature is listed once (the original list repeated
    # "track_points_compact", which the dict below collapsed anyway).
    # NOTE(review): "origin_destination" comes back empty in this notebook's
    # sample output - possibly not the real column name; confirm against the
    # parquet schema.
    inputs = (
        "aircraft_type",
        "origin_name",
        "origin_destination",
        "track_points_compact",
        "vertical_rate_balance",
    )

    features = {key: clean(row_data.get(key)) for key in inputs}
    example = dspy.Example(features=features).with_inputs("features")
    if with_fuel:
        # A missing label stays None: float_metric tests `is None`, whereas an
        # empty string would slip past that test and fail on subtraction.
        example.fuel_kg = row_data.get("fuel_kg")
    return example


In [53]:
# Build one example per dataframe row. Labels are not attached
# (with_fuel=False), so these examples carry no `fuel_kg`.
# NOTE(review): float_metric reads gold.fuel_kg, so the evaluation cells need
# examples built with with_fuel=True - confirm which splits have labels.
examples = [
    df_to_dspy_examples(row, with_fuel=False)
    for row in df.iter_rows(named=True)
]
# Show the first example as a quick sanity check.
examples[0]

Example({'features': {'aircraft_type': 'B789', 'origin_name': 'Kuala Lumpur International Airport', 'origin_destination': '', 'track_points_compact': 'time 2025-04-13T02:31:04.447000->2025-04-13T03:01:04.487000 (30 min) | sources acars:2, adsb:3076 | altitude 3.597e+04 -> 3.597e+04 -> 3.6e+04 -> 3.6e+04 (delta 24.99, range 24.99, mean 3.599e+04) | groundspeed 467 -> 474 -> 471 -> 476 (delta 9, range 15, mean 472) | vertical_rate 0 -> 0 -> 0 -> -64 (delta -64, range 128, mean -3.36) | mach 0.86 -> 0.86 (delta 0, range 0, mean 0.86) | path 45.18/24.35 -> 45.9/22.72 -> 46.62/20.97 -> 47.22/19.52 | delta_lat 2.033 delta_lon -4.833 | phase cruise | vr balance +0.00 / -0.00 / ~0 1.00', 'vertical_rate_balance': {'positive_frac': 0.0, 'negative_frac': 0.0, 'near_zero_frac': 1.0}}}) (input_keys={'features'})

In [None]:
# NOTE(review): undocumented constants - presumably a cost estimate
# (0.59 + 0.84 measured over 1100 examples, scaled to the dataset size);
# confirm the units and where the figures come from.
(0.59 + 0.84) / 1100 * len(examples)

170.989

In [13]:
import random
import copy

# Shuffle a deep copy with a fixed seed (42): `examples` keeps its original
# order and the shuffled order is the same on every run.
randomized_examples = copy.deepcopy(examples)
random.Random(42).shuffle(randomized_examples)

## Metrics

In [14]:
import math

def _as_float(value):
    """Return ``value`` as a float, or None when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def float_metric(gold: dspy.Example, pred: dspy.Prediction, trace=None):
    """Score a fuel prediction against its label.

    Args:
        gold: Example carrying the reference ``fuel_kg``.
        pred: Prediction carrying the predicted ``fuel_kg``.
        trace: None during evaluation/optimization; any other value selects
            the boolean "good demo" mode used while bootstrapping.

    Returns:
        When ``trace`` is None: the negative squared error, or NaN if either
        value is missing or not numeric.
        Otherwise: True if the absolute error is below 200, else False.
    """
    true_value = _as_float(getattr(gold, "fuel_kg", None))
    pred_value = _as_float(getattr(pred, "fuel_kg", None))
    bootstrapping = trace is not None

    if true_value is None or pred_value is None:
        # NaN is truthy, so returning it while bootstrapping would mark an
        # unusable pair as a good demo; reject it explicitly instead.
        return False if bootstrapping else float("nan")

    squared_error = (true_value - pred_value) ** 2

    if bootstrapping:
        # 40000 == 200 ** 2, i.e. an absolute error below 200.
        return squared_error < 40000
    return -squared_error
    
# Sanity check on a hand-made pair whose values differ by 100.
fake_example = dspy.Example(features={"foo": "bar"}, fuel_kg=500.0)
fake_prediction = dspy.Prediction(features={"foo": "bar"}, fuel_kg=600.0)
print(float_metric(fake_example, fake_prediction))  # -(100 ** 2) = -10000.0
print(float_metric(fake_example, fake_prediction, trace='foo'))  # True

-10000.0
True


## Evaluation

In [15]:
from dspy.evaluate import Evaluate

In [16]:
# Quick evaluator: first 50 shuffled examples, one thread per example.
evaluator_50 = Evaluate(
    devset=randomized_examples[:50],
    num_threads=50, 
    display_progress=True, 
    display_table=True
)

# NOTE(review): despite its name, this evaluator covers the first 400 shuffled
# examples, not 500 - confirm which size is intended.
evaluator_500 = Evaluate(
    devset=randomized_examples[:400],
    num_threads=15, 
    display_progress=True, 
    display_table=True
)

In [None]:
# Evaluate the zero-shot chain-of-thought module with gpt_oss_120b instead of
# the globally configured LM.
with dspy.context(lm=gpt_oss_120b):
    eval_results = evaluator_500(fuel_cot, metric=float_metric)

In [17]:
def compute_rmse(eval_results):
    """Root-mean-square error of an evaluation run scored with ``float_metric``.

    The value is derived from the per-example scores stored in
    ``eval_results.results`` (third item of each entry, a negative squared
    error) rather than from the aggregate ``eval_results.score``: the
    aggregate is already averaged and scaled by the evaluator, so dividing it
    by the number of examples again does not yield a mean squared error.

    Args:
        eval_results: Object exposing ``results``, a sequence of
            ``(example, prediction, score)`` entries.

    Returns:
        The RMSE over the entries with a finite numeric score, or NaN when
        there is none (empty run, or every score is NaN).
    """
    squared_errors = [
        abs(entry[2])
        for entry in eval_results.results
        # NaN scores (missing label or prediction) are left out of the mean.
        if isinstance(entry[2], (int, float)) and math.isfinite(entry[2])
    ]
    if not squared_errors:
        return float("nan")
    return math.sqrt(sum(squared_errors) / len(squared_errors))

In [None]:
compute_rmse(eval_results)

In [None]:
# dump in a csv file
import pandas as pd
import csv

# One row per evaluated example: label, model reasoning, prediction and metric.
# Each entry of eval_results.results is read as (example, prediction, metric).
# The signature's field is `fuel_kg`; the previous `.fuel_burnt` lookups named
# a field that does not exist. getattr keeps the export going if an entry
# lacks a field.
results_list = [
    {
        "true_value": getattr(result[0], "fuel_kg", None),
        "reasoning": getattr(result[1], "reasoning", None),
        "pred_value": getattr(result[1], "fuel_kg", None),
        "metric": result[2],
    }
    for result in eval_results.results
]
results_df = pd.DataFrame(results_list)
# Quote every field so multi-line reasoning text stays inside one CSV cell.
results_df.to_csv("eval_results.csv", index=False, quoting=csv.QUOTE_ALL)

## Batch API

In [54]:
from tqdm import tqdm

EXAMPLES_PER_BATCH = 100

# 0. slice `examples` into consecutive batches of at most EXAMPLES_PER_BATCH items
list_of_batches = [
    examples[start : start + EXAMPLES_PER_BATCH]
    for start in tqdm(range(0, len(examples), EXAMPLES_PER_BATCH))
]


100%|██████████| 1316/1316 [00:00<00:00, 997056.37it/s]


In [56]:
list_of_batches[0][0]

Example({'features': {'aircraft_type': 'B789', 'origin_name': 'Kuala Lumpur International Airport', 'origin_destination': '', 'track_points_compact': 'time 2025-04-13T02:31:04.447000->2025-04-13T03:01:04.487000 (30 min) | sources acars:2, adsb:3076 | altitude 3.597e+04 -> 3.597e+04 -> 3.6e+04 -> 3.6e+04 (delta 24.99, range 24.99, mean 3.599e+04) | groundspeed 467 -> 474 -> 471 -> 476 (delta 9, range 15, mean 472) | vertical_rate 0 -> 0 -> 0 -> -64 (delta -64, range 128, mean -3.36) | mach 0.86 -> 0.86 (delta 0, range 0, mean 0.86) | path 45.18/24.35 -> 45.9/22.72 -> 46.62/20.97 -> 47.22/19.52 | delta_lat 2.033 delta_lon -4.833 | phase cruise | vr balance +0.00 / -0.00 / ~0 1.00', 'vertical_rate_balance': {'positive_frac': 0.0, 'negative_frac': 0.0, 'near_zero_frac': 1.0}}}) (input_keys={'features'})

In [57]:
import os

list_of_artifacts = []
batches_dir = os.path.join(OUTPUT_DIR, "batches")
# Make sure the target folder exists before request files are written into it.
os.makedirs(batches_dir, exist_ok=True)
# The progress total comes from list_of_batches: list_of_artifacts is still
# empty at this point, so using its length left the bar without a total.
for (i, minibatch) in tqdm(enumerate(list_of_batches), total=len(list_of_batches)):
    # 1. create a jsonl file
    artifacts = fuel_cot.create_batch_file(
        minibatch,
        input_file_path=os.path.join(batches_dir, str(i) + ".jsonl"),
        # endpoint="/v1/chat/completions"
    )
    list_of_artifacts.append(artifacts)


195it [00:03, 60.42it/s]


KeyboardInterrupt: 

In [31]:
artifact = list_of_artifacts[0]
artifact

BatchRequestArtifacts(request_file=WindowsPath('c:/Users/rayte/Work/prc2025dspy/data/train/batches/0.jsonl'), metadata_file=WindowsPath('c:/Users/rayte/Work/prc2025dspy/data/train/batches/0.jsonl.metadata.json'), metadata={'generated_at': '2025-11-29T21:53:08.615742Z', 'request_file': 'c:\\Users\\rayte\\Work\\prc2025dspy\\data\\train\\batches\\0.jsonl', 'endpoint': 'openai/v1/batches', 'dependency_versions': {'python': '3.13', 'dspy': '3.1.0b1', 'cloudpickle': '3.1'}, 'module': {'class_path': 'dspy.predict.chain_of_thought.ChainOfThought', 'repr': 'predict = Predict(StringSignature(features -> reasoning, fuel_kg\n    instructions=\'Predict the amount of fuel burnt in kgs by the plane over the given segment of flight.\\nSegment data is constructed from noisy telemetry: use your common sense if values seem wrong.\\nhint: vertical_rate_balance contains positive_frac, negative_frac and near_zero_frac, each corresponding \\nto the share of samples where vertical rate is respectively more th

In [58]:
# 2. send it to the endpoint
# (submitting the prepared batch file is currently commented out)
# batch_handle = await fuel_cot.asubmit_batch_file(
#     artifact,
#     completion_window="24h",
# )

# Instead, run only the first batch of examples through the module's `abatch`
# coroutine and keep the returned predictions.
predictions = await fuel_cot.abatch(
    examples=list_of_batches[0]
)

In [59]:
predictions

[Prediction(
     reasoning='This segment covers a 30-minute cruise phase in a Boeing 787-9 (B789), at a near-constant cruise altitude and speed (FL360, Mach 0.86, 472 knots mean). Vertical rate is essentially zero, indicating steady level flight. For a B789, typical cruise fuel burn ranges from about 5 to 6 metric tons per hour, depending on weight and engines, but this appears to be a mid-late cruise segment (fuel weight somewhat reduced, so better efficiency than in initial cruise climb). For 30 minutes, the fuel burn would therefore be (5 to 6)/2 = 2.5 to 3.0 metric tons (2500-3000 kg). Given steady cruise at optimal speed and altitude, use 2700 kg as a reasonable average.',
     fuel_kg=2700.0
 ),
 Prediction(
     reasoning="This is a mid-cruise segment for a Boeing 787-9 (B789), spanning 15 minutes at near-constant cruise altitude (~36,000 ft), stable mach (0.86), and no significant vertical movement (vertical_rate consistently ~0, with 'near_zero_frac' 1.0). Cruise phases are t

## Usage

In [None]:
for lm in (gpt_oss_120b,):
    # Add up the cost of every history entry that recorded one.
    cost = sum(entry['cost'] for entry in lm.history if entry['cost'] is not None)
    print(cost)