## DSPy train/eval

Note: I use my own fork of DSPy because I had to implement asynchronous batching: https://github.com/rayanehmi/dspy/tree/feat/async_batching

In [4]:
from pathlib import Path
from typing import Literal
import os 

DATA_PATH = Path.cwd().parent / "data"
DATA_TYPE : Literal["train", "rank", "final"] = "train"
OUTPUT_DIR = os.path.join(DATA_PATH, "output_batches", DATA_TYPE)


In [6]:
# Load complete data
import polars as pl

SEGMENTS_PATH = os.path.join(OUTPUT_DIR, "complete.parquet")
df = pl.read_parquet(SEGMENTS_PATH)
df = df.cast({
    "alt_start": int,
    "alt_end": int,
    "peak_flight_altitude": int,
    "vertical_delta_up": int,
    "vertical_delta_down": int,
    "ground_speed_start": int,
    "ground_speed_end": int
})
# df.head()

In [7]:
df_masked = df.drop(("fuel_burnt"))
df_masked.head()

idx,flight_id,aircraft,duration_min,route_icao,alt_start,alt_end,peak_flight_altitude,vertical_delta_up,vertical_delta_down,ground_speed_start,ground_speed_end
i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64
1143,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37025,37025,0,25,496,496
1144,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37025,37025,25,0,496,496
1145,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37000,37025,0,25,496,496
1146,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37000,37000,37000,25,0,496,496
2474,"""prc771127708""","""B788""","""25.0min""","""SKBO-LEMD""",41000,41000,41000,0,25,495,538


In [27]:
import dspy
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
api_key = os.getenv("OPENAI_API_KEY")
gpt_4_1_nano = dspy.LM("openai/gpt-4.1-nano", api_key=api_key)
groq_api_key = os.getenv("GROQ_API_KEY")
gpt_oss_120b = dspy.LM("groq/openai/gpt-oss-120b", api_key=groq_api_key, cache=False)
gpt_5_1_instant = dspy.LM("openai/gpt-5.1", api_key=api_key, temperature=1.0, max_tokens=32000, reasoning_effort="none", cache=False)

json_adapter = dspy.JSONAdapter()

dspy.configure(lm=gpt_oss_120b, adapter=json_adapter)

In [9]:
from typing import Any, Annotated
import pydantic


class BurntFuelPrediction(dspy.Signature):
    """Predict the amount of fuel burnt in kgs by the plane over the given segment of flight.
    Segment data is constructed from noisy telemetry: use your common sense if values seem wrong.
    """
    aircraft: str = dspy.InputField()
    duration_min: str = dspy.InputField()
    route_icao: str = dspy.InputField()
    alt_start: int = dspy.InputField(desc="Altitude at the start of the segment in feet.")
    alt_end: int = dspy.InputField()
    peak_flight_altitude: int = dspy.InputField(desc="Peak altitude of the flight. Can be used to estimate the phase of the segment.")
    vertical_delta_up:  int = dspy.InputField(desc="Cumulative climb within the segment (summing positive altitude diffs).")
    vertical_delta_down: int = dspy.InputField()
    ground_speed_start: int = dspy.InputField(desc="Ground speed at the start of the segment.")
    ground_speed_end: int = dspy.InputField()
    fuel_burnt: int = dspy.OutputField(desc="Estimated amount of fuel burnt in kilograms.")  #  Can be an expression that will be evaluated (like '801 * 60 / 3.6')


# Zero-shot chain of thought
fuel_cot = dspy.ChainOfThought(BurntFuelPrediction)

In [13]:
eval_set = df[:25]
eval_set.head()

idx,fuel_burnt,flight_id,aircraft,duration_min,route_icao,alt_start,alt_end,peak_flight_altitude,vertical_delta_up,vertical_delta_down,ground_speed_start,ground_speed_end
i64,f64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64
1143,1106.76448,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37025,37025,0,25,496,496
1144,1070.47712,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37025,37025,25,0,496,496
1145,925.32768,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37025,37000,37025,0,25,496,496
1146,943.47136,"""prc770962946""","""A332""","""12.0min""","""ZSHC-NZAA""",37000,37000,37000,25,0,496,496
2474,2000.0,"""prc771127708""","""B788""","""25.0min""","""SKBO-LEMD""",41000,41000,41000,0,25,495,538


In [47]:
def df_to_dspy_examples(row, with_fuel: bool = False):
    """Converts a row to a dspy.Example."""
    row_data = row.to_dicts()[0] if hasattr(row, "to_dicts") else row

    def clean(value):
        return "" if value is None else value

    inputs = [
        "aircraft",
        "duration_min",
        "route_icao",
        "alt_start",
        "alt_end",
        "peak_flight_altitude",
        # "vertical_delta_up",
        # "vertical_delta_down",
        # "ground_speed_start",
        # "ground_speed_end",
    ]
    payload = {key: clean(row_data.get(key)) for key in inputs}
    example = dspy.Example(**payload).with_inputs(*inputs)
    if with_fuel:
        example.fuel_burnt = clean(row_data.get("fuel_burnt"))
    return example

In [48]:
examples = [
    df_to_dspy_examples(row, with_fuel=True)
    for row in df.iter_rows(named=True)
]
examples[0]

Example({'aircraft': 'A332', 'duration_min': '12.0min', 'route_icao': 'ZSHC-NZAA', 'alt_start': 37025, 'alt_end': 37025, 'peak_flight_altitude': 37025, 'fuel_burnt': 1106.76448}) (input_keys={'alt_end', 'route_icao', 'alt_start', 'peak_flight_altitude', 'duration_min', 'aircraft'})

In [49]:
import random
import copy

randomized_examples = copy.deepcopy(examples)
random.Random(42).shuffle(randomized_examples)

## Metrics

In [None]:
import math

def float_metric(gold: dspy.Example, pred: dspy.Prediction, trace=None):
    """Return a scalar score (negative squared error) for the evaluator."""
    true_value = gold.fuel_burnt
    pred_value = pred.fuel_burnt
    if true_value is None or pred_value is None:
        return float("nan")

    squared_error = (true_value - pred_value) ** 2

    if trace is None: # if we're doing evaluation or optimization
        return -squared_error
    else:  # During bootstrapping / trace collection we simply mark good demos.
        return squared_error < 40000  # Squared error 200.
    
fake_example = dspy.Example(features={"foo": "bar"}, fuel_burnt=500.0)
fake_prediction = dspy.Prediction(features={"foo": "bar"}, fuel_burnt=600.0)
print(float_metric(fake_example, fake_prediction))  # error -100.0
print(float_metric(fake_example, fake_prediction, trace='foo'))  # True

-10000.0
True


## Evaluation

In [22]:
from dspy.evaluate import Evaluate

In [39]:
evaluator_50 = Evaluate(
    devset=randomized_examples[:50],
    num_threads=50, 
    display_progress=True, 
    display_table=True
)

In [50]:
with dspy.context(lm=gpt_5_1_instant):
    eval_results = evaluator_50(fuel_cot, metric=float_metric, num_threads=25)

Average Metric: -7212037.73 / 50 (-14424075.5%): 100%|██████████| 50/50 [00:07<00:00,  6.57it/s]

2025/11/29 18:16:16 INFO dspy.evaluate.evaluate: Average Metric: -7212037.727646933 / 50 (-14424075.5%)





Unnamed: 0,aircraft,duration_min,route_icao,alt_start,alt_end,peak_flight_altitude,vertical_delta_up,vertical_delta_down,ground_speed_start,ground_speed_end,example_fuel_burnt,reasoning,pred_fuel_burnt,float_metric
0,B38M,1.0min,RCTP-WMKK,36000,33975,36000,3900.0,5950.0,473,462,200.0,Aircraft B38M (737 MAX 8). Segment is 1 minute long at high cruise...,43,✔️ [-24649.000]
1,A359,3.0min,ZSPD-LEMD,38025,38025,38050,75.0,75.0,420,410,355.616128,Aircraft is an Airbus A350-900 (A359). Altitude is essentially con...,280,✔️ [-5717.799]
2,B738,4.0min,LTFJ-EGCC,36000,36000,36000,0.0,0.0,389,400,217.72416,"Aircraft: Boeing 737-800 at cruise (FL360 to FL360, no vertical ch...",170,✔️ [-2277.595]
3,A20N,4.0min,KMDW-MMGL,37975,38025,38075,425.0,375.0,456,460,45.3592,Aircraft A20N (A320neo). Segment duration 4 minutes at about FL380...,150,✔️ [-10949.697]
4,A20N,1.0min,SKBO-KMCO,4150,2200,6075,6450.0,8375.0,239,204,41.276872,"Aircraft A20N (A320neo). Segment duration is 1 minute, altitude dr...",24,✔️ [-298.490]
5,A20N,5.0min,MMTJ-MMUN,37000,37000,37025,150.0,175.0,496,506,90.7184,Aircraft A20N (A320neo) at essentially constant cruise altitude FL...,200,✔️ [-11942.468]
6,B789,14.0min,EHAM-KPDX,40000,40025,40025,2950.0,2925.0,465,461,1100.0,Aircraft: Boeing 787-9 (B789). Segment duration: 14 minutes (~0.23...,1400,✔️ [-90000.000]
7,A320,4.0min,MMGL-KCLT,37000,37000,37025,675.0,675.0,441,442,100.0,"Segment is 4 minutes at essentially constant cruise altitude (37,0...",170,✔️ [-4900.000]
8,A20N,5.0min,SKBO-KORD,38000,38000,38075,950.0,950.0,455,455,163.0,Aircraft A20N (A320neo). Segment is 5 minutes at essentially const...,200,✔️ [-1369.000]
9,A320,5.0min,KORD-MMGL,36025,36000,36025,400.0,425.0,356,354,200.0,Segment is at essentially peak cruise altitude (~FL360) with very ...,225,✔️ [-625.000]


In [42]:
def compute_rmse(eval_results):
    rmse = math.sqrt(abs(eval_results.score)/len(eval_results.results))
    return rmse

In [51]:
compute_rmse(eval_results)

537.1047469535156

In [46]:
# dump in a csv file
import pandas as pd
import csv

results_list = []
for result in eval_results.results:
    true_value = result[0].fuel_burnt
    reasoning = result[1].reasoning
    pred_value = result[1].fuel_burnt
    metric = result[2]
    results_list.append({
        "true_value": true_value,
        "reasoning": reasoning,
        "pred_value": pred_value,
        "metric": metric
    })
results_df = pd.DataFrame(results_list)
results_df.to_csv("eval_results.csv", index=False, quoting=csv.QUOTE_ALL)

## Usage

In [30]:
for lm in [gpt_5_1_instant]:
  cost = sum([x['cost'] for x in lm.history if x['cost'] is not None])
  print(cost)

0.134725
