## DSPy process

Note: I use my own fork of DSPy because I had to implement asynchronous batching: https://github.com/rayanehmi/dspy/tree/feat/async_batching

In [1]:
from pathlib import Path
from typing import Literal
import os 

DATA_PATH = Path.cwd().parent / "data"
DATA_TYPE : Literal["train", "rank", "final"] = "final"
OUTPUT_DIR = os.path.join(DATA_PATH, DATA_TYPE)


In [76]:
# Load complete data
import polars as pl

SEGMENTS_PATH = os.path.join(OUTPUT_DIR, "llm_segments.parquet")
print(SEGMENTS_PATH)
initial_df = pl.read_parquet(SEGMENTS_PATH)
initial_df.head()

c:\Users\rayte\Work\prc2025dspy\data\final\llm_segments.parquet


idx,flight_id,fuel_kg,flight_date,aircraft_type,origin_name,destination_name,track_points_compact,vertical_rate_balance
i64,str,null,date,str,str,str,str,struct[3]
0,"""prc806615763""",,2025-09-01,"""A320""","""Guadalajara International Airp…","""Oakland International Airport""","""time 2025-09-01T03:03:10.92500…","{0.0,0.0,1.0}"
1,"""prc806615763""",,2025-09-01,"""A320""","""Guadalajara International Airp…","""Oakland International Airport""","""time 2025-09-01T03:07:51.58400…","{0.0,0.0,1.0}"
2,"""prc806615763""",,2025-09-01,"""A320""","""Guadalajara International Airp…","""Oakland International Airport""","""time 2025-09-01T03:12:50.92100…","{0.0,0.0,1.0}"
3,"""prc806615763""",,2025-09-01,"""A320""","""Guadalajara International Airp…","""Oakland International Airport""","""time 2025-09-01T03:17:51.40400…","{0.0,0.0,1.0}"
4,"""prc806615763""",,2025-09-01,"""A320""","""Guadalajara International Airp…","""Oakland International Airport""","""time 2025-09-01T03:22:50.53900…","{0.0,0.0,1.0}"


In [2]:
import dspy
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
api_key = os.getenv("OPENAI_API_KEY")
gpt_4_1 = dspy.LM("openai/gpt-4.1", api_key=api_key)
gpt_4_1_nano = dspy.LM("openai/gpt-4.1-nano", api_key=api_key)
groq_api_key = os.getenv("GROQ_API_KEY")
gpt_oss_120b = dspy.LM("groq/openai/gpt-oss-120b", api_key=groq_api_key, cache=False)
gpt_5_1_instant = dspy.LM("openai/gpt-5.1", api_key=api_key, temperature=1.0, max_tokens=32000, reasoning_effort="none", cache=False)

# json_adapter = dspy.JSONAdapter()  # made the first batch fail?

dspy.configure(lm=gpt_oss_120b)

In [4]:
from typing import Any

class BurntFuelPrediction(dspy.Signature):
    """Predict the amount of fuel burnt in kgs by the plane over the given segment of flight.
    Segment data is constructed from noisy telemetry: use your common sense if values seem wrong.
    hint: vertical_rate_balance contains positive_frac, negative_frac and near_zero_frac, each corresponding 
    to the share of samples where vertical rate is respectively more than, less than or around 64 ft/min.
    hint 2: estimate the fuel weight penalty (heavy in the beginning, lighter in the end).
    """
    features : dict[str, Any] = dspy.InputField()
    fuel_kg : float = dspy.OutputField() 

# Zero-shot chain of thought
fuel_cot = dspy.ChainOfThought(BurntFuelPrediction)

In [50]:
def df_to_dspy_examples(row, with_fuel: bool = True):
    """Converts a row to a dspy.Example."""
    row_data = row.to_dicts()[0] if hasattr(row, "to_dicts") else row

    def clean(value):
        return "" if value is None else value

    inputs = [
        "idx",
        "aircraft_type",
        "origin_name",
        "origin_destination",
        "track_points_compact",
        "track_points_compact",
        "vertical_rate_balance"
    ]
    
    features = {key: clean(row_data.get(key)) for key in inputs}
    example = dspy.Example(features=features).with_inputs("features")
    if with_fuel:
        example.fuel_kg = clean(row_data.get("fuel_kg"))
    return example


In [77]:
examples = [
    df_to_dspy_examples(row, with_fuel=False)
    for row in initial_df.iter_rows(named=True)
]
examples[0]

Example({'features': {'idx': 0, 'aircraft_type': 'A320', 'origin_name': 'Guadalajara International Airport', 'origin_destination': '', 'track_points_compact': 'time 2025-09-01T03:03:10.925000->2025-09-01T03:07:51.584000 (4.678 min) | sources acars:2, adsb:557 | altitude 3.6e+04 -> 3.598e+04 -> 3.602e+04 -> 3.601e+04 (delta 9, range 50, mean 3.6e+04) | groundspeed 446 -> 448 -> 447 -> 447 (delta 1, range 2, mean 447.3) | vertical_rate 0 -> 0 -> 0 -> -64 (delta -64, range 128, mean 8.989) | mach 0.746 -> 0.749 (delta 0.003, range 0.003, mean 0.7475) | path 31.75/-116.8 -> 31.96/-116.9 -> 32.13/-117 -> 32.29/-117.1 | delta_lat 0.5339 delta_lon -0.3661 | phase cruise | vr balance +0.00 / -0.00 / ~0 1.00', 'vertical_rate_balance': {'positive_frac': 0.0, 'negative_frac': 0.0, 'near_zero_frac': 1.0}}}) (input_keys={'features'})

In [8]:
import random
import copy

randomized_examples = copy.deepcopy(examples)
random.Random(42).shuffle(randomized_examples)

## Metrics

In [7]:
import math

def float_metric(gold: dspy.Example, pred: dspy.Prediction, trace=None):
    """Return a scalar score (negative squared error) for the evaluator."""
    true_value = gold.fuel_kg
    pred_value = pred.fuel_kg
    if true_value is None or pred_value is None:
        return float("nan")

    squared_error = (true_value - pred_value) ** 2

    if trace is None: # if we're doing evaluation or optimization
        return -squared_error
    else:  # During bootstrapping / trace collection we simply mark good demos.
        return squared_error < 40000  # Squared error 200.
    
fake_example = dspy.Example(features={"foo": "bar"}, fuel_kg=500.0)
fake_prediction = dspy.Prediction(features={"foo": "bar"}, fuel_kg=600.0)
print(float_metric(fake_example, fake_prediction))  # error -100.0
print(float_metric(fake_example, fake_prediction, trace='foo'))  # True

-10000.0
True


## Evaluation

In [6]:
from dspy.evaluate import Evaluate

In [16]:
evaluator_50 = Evaluate(
    devset=randomized_examples[:50],
    num_threads=50, 
    display_progress=True, 
    display_table=True
)

evaluator_500 = Evaluate(
    devset=randomized_examples[:400],
    num_threads=15, 
    display_progress=True, 
    display_table=True
)

In [None]:
with dspy.context(lm=gpt_oss_120b):
    eval_results = evaluator_500(fuel_cot, metric=float_metric)

In [17]:
def compute_rmse(eval_results):
    rmse = math.sqrt(abs(eval_results.score)/len(eval_results.results))
    return rmse

In [None]:
compute_rmse(eval_results)

In [None]:
# dump in a csv file
import pandas as pd
import csv

results_list = []
for result in eval_results.results:
    true_value = result[0].fuel_burnt
    reasoning = result[1].reasoning
    pred_value = result[1].fuel_burnt
    metric = result[2]
    results_list.append({
        "true_value": true_value,
        "reasoning": reasoning,
        "pred_value": pred_value,
        "metric": metric
    })
results_df = pd.DataFrame(results_list)
results_df.to_csv("eval_results.csv", index=False, quoting=csv.QUOTE_ALL)

## Batch API

In [None]:
from tqdm import tqdm

In [None]:
def create_minibatch(examples, n_examples_per_batch = 10000):
    list_of_batches = []
    for i in tqdm(
        range(0, len(examples), n_examples_per_batch)
    ):
        batch_range = examples[i : i + n_examples_per_batch]
        list_of_batches.append(batch_range)
    return list_of_batches

list_of_batches = create_minibatch(examples=examples)

100%|██████████| 7/7 [00:00<00:00, 14926.35it/s]


In [None]:
import os

def create_batch_files(list_of_batches: list):
    """Takes a list of minibatches and creates the files"""
    list_of_artifacts = []
    for (i, minibatch) in tqdm(enumerate(list_of_batches), total=len(list_of_batches)):
        # 1. create a jsonl file
        artifact = fuel_cot.create_batch_file(
            minibatch,
            input_file_path=os.path.join(OUTPUT_DIR, "batches", str(i)+".jsonl"),
            # endpoint="/v1/chat/completions"
        )
        list_of_artifacts.append(artifact)
    return list_of_artifacts

list_of_artifacts = create_batch_files(list_of_batches)


NameError: name 'tqdm' is not defined

In [None]:
# 2. send it to the endpoint
# DANGER :)
async def send_batches(list_of_artifacts):
    batch_handles = [
        await fuel_cot.asubmit_batch_file(
            artifact,
            completion_window="24h",
        ) for artifact in list_of_artifacts
    ]
    return batch_handles

batch_handles = await send_batches(list_of_artifacts)

In [None]:
# import pickle
# with open('checkpoint_list_of_batches.pkl', 'wb') as f:  # open a text file
#     pickle.dump(list_of_batches, f) # serialize the list
# with open('checkpoint_list_of_artifacts.pkl', 'wb') as f:  # open a text file
#     pickle.dump(list_of_artifacts, f) # serialize the list
# with open('checkpoint_batch_handles.pkl', 'wb') as f:  # open a text file
#     pickle.dump(batch_handles, f) # serialize the list

NameError: name 'list_of_batches' is not defined

In [9]:
import pickle
with open('checkpoint_list_of_batches.pkl', 'rb') as f:
    list_of_batches = pickle.load(f)
with open('checkpoint_list_of_artifacts.pkl', 'rb') as f:
    list_of_artifacts = pickle.load(f)
with open('checkpoint_batch_handles.pkl', 'rb') as f:
    batch_handles = pickle.load(f)

In [10]:
# 3. and finally retrieve the predictions
responses = [
    await fuel_cot.aretrieve_batch_predictions(
        batch_handle.batch_id,
        artifact,
        custom_llm_provider="groq",
        download_output_path=os.path.join(OUTPUT_DIR, "batches_output", batch_handle.batch_id),  # optional, defaults next to metadata
        return_failed_items=True,
    ) for batch_handle, artifact in zip(batch_handles, list_of_artifacts)
]

Create a df with input/preds

In [None]:
import pandas as pd

rows = []
failures_to_retry = []

for (preds, failures), artifact in zip(responses, list_of_artifacts):
    examples = artifact.metadata["examples"]
    for idx, pred in enumerate(preds):
        row = {
            **examples[idx]["inputs"],              # original features
            "custom_id": examples[idx]["custom_id"]
        }
        if pred is None:
            row["fuel_kg"] = None
            row["reasoning"] = None
        else:
            outputs = pred.toDict()
            row["fuel_kg"] = outputs["fuel_kg"]
            row["reasoning"] = outputs["reasoning"]
        rows.append(row)

    for failure in failures:
        failures_to_retry.append(
            examples[failure.index]["inputs"]
        )

final_df = pd.DataFrame(rows)


In [86]:
import pandas as pd

def responses_to_dataframe(
    responses,
    artifacts,
    *,
    feature_col: str = "features",
    prediction_cols: tuple[str, str] = ("reasoning", "fuel_kg"),
    nested_feature_key: str = "features",  # the key inside artifact metadata
    index_field: str = "idx",
):
    rows: list[dict] = []
    failures_to_retry: list[dict] = []

    for (preds, failures), artifact in zip(responses, artifacts):
        examples = artifact.metadata["examples"]
        for idx, pred in enumerate(preds):
            inputs = examples[idx]["inputs"]
            nested_features = inputs.get(nested_feature_key, inputs)

            row = {
                feature_col: nested_features,                # keep the full feature dict
                "custom_id": examples[idx]["custom_id"],
                index_field: nested_features.get(index_field),
            }

            if pred is None:
                for col in prediction_cols:
                    row[col] = None
            else:
                outputs = pred.toDict()
                for col in prediction_cols:
                    row[col] = outputs.get(col)

            rows.append(row)

        for failure in failures:
            failures_to_retry.append(examples[failure.index]["inputs"])

    final_df = pd.DataFrame(rows)
    return final_df, failures_to_retry


In [87]:
final_df, retry_payloads = responses_to_dataframe(
    responses,
    list_of_artifacts,
    feature_col="features",
    prediction_cols=("reasoning", "fuel_kg"),
    nested_feature_key="features",
    index_field="idx",
)


In [None]:
final_df

In [88]:
final_df.head()

Unnamed: 0,features,custom_id,idx,reasoning,fuel_kg
0,"{'idx': 0, 'aircraft_type': 'A320', 'origin_na...",chainofthought-0-e95bae8c,0,The segment lasts 4.678 minutes (≈0.078 hours)...,190.0
1,"{'idx': 1, 'aircraft_type': 'A320', 'origin_na...",chainofthought-1-2cca347b,1,The segment lasts about 5 minutes (4.989 min ≈...,208.3
2,"{'idx': 2, 'aircraft_type': 'A320', 'origin_na...",chainofthought-2-c0dfb547,2,The segment lasts about 5 minutes (0.0835 h) a...,200.3
3,"{'idx': 3, 'aircraft_type': 'A320', 'origin_na...",chainofthought-3-0a205acf,3,The segment lasts just under 5 minutes (4.986 ...,210.0
4,"{'idx': 4, 'aircraft_type': 'A320', 'origin_na...",chainofthought-4-f785a9d4,4,The segment lasts about 5.0 minutes (0.0833 ho...,219.0


In [91]:
len(final_df)

61745

In [93]:
final_df.to_csv("final_df.csv")

In [90]:
len(final_df[final_df["fuel_kg"].isna()])

3989

In [89]:
import pandas as pd
import dspy

async def round_v2(
    module: dspy.Module,
    final_df: pd.DataFrame,
    *,
    feature_column: str = "features",
    prediction_columns: tuple[str, str] = ("reasoning", "fuel_kg"),
    index_column: str = "idx",
    n_pred_per_batch: int = 10_000,
    minimal_batch_threshold: int = 200,
):
    """
    Fill missing predictions in `final_df` by calling `module`.
    Adds/updates `index_column` so you can join back to the original dataset.
    """

    def to_example(payload):
        if isinstance(payload, dspy.Example):
            return payload
        if isinstance(payload, dict):
            nested = payload.get("features", payload)
            return dspy.Example(features=nested).with_inputs("features")
        raise TypeError(...)


    def get_idx(payload):
        if isinstance(payload, dspy.Example):
            data = payload.inputs().toDict().get("features", payload.inputs().toDict())
        elif isinstance(payload, dict):
            data = payload.get("features", payload)
        else:
            return None
        return data.get("idx")


    if index_column not in final_df.columns:
        final_df[index_column] = pd.NA

    missing_mask = final_df[prediction_columns[-1]].isna()
    pending_idx = final_df.index[missing_mask].tolist()
    if not pending_idx:
        return final_df, 0, 0

    pending_idx = pending_idx[:n_pred_per_batch]
    indexed_examples = []
    for row_idx in pending_idx:
        payload = final_df.at[row_idx, feature_column]
        example = to_example(payload)
        # make sure we keep the original idx visible on the dataframe
        idx_value = get_idx(payload)
        if idx_value is not None:
            final_df.at[row_idx, index_column] = idx_value
        indexed_examples.append((row_idx, example))

    examples = [ex for _, ex in indexed_examples]
    newly_filled = 0

    if len(examples) < minimal_batch_threshold:
        predictions, failed_examples, _ = module.batch(
            examples,
            return_failed_examples=True,
        )
        for (row_idx, _), pred in zip(indexed_examples, predictions):
            if pred is None:
                continue
            values = pred.toDict()
            for col in prediction_columns:
                final_df.at[row_idx, col] = values.get(col)
            newly_filled += 1
        remaining = len(pending_idx) - newly_filled
        return final_df, newly_filled, remaining

    preds, failures = await module.abatch(
        examples,
        return_failed_items=True,
    )
    for position, pred in enumerate(preds):
        if pred is None:
            continue
        row_idx = indexed_examples[position][0]
        values = pred.toDict()
        for col in prediction_columns:
            final_df.at[row_idx, col] = values.get(col)
        newly_filled += 1

    remaining = len(pending_idx) - newly_filled
    return final_df, newly_filled, remaining


In [96]:
final_df, filled, remaining = await round_v2(
    fuel_cot,
    final_df,
    n_pred_per_batch=10_000,
    minimal_batch_threshold=200,
)

print(f"Filled {filled} rows; {remaining} still NaN.")


Processed 2 / 2 examples: 100%|██████████| 2/2 [00:01<00:00,  1.44it/s]
Filled 2 rows; 0 still NaN.


In [97]:
len(final_df[final_df["fuel_kg"].isna()])

0

In [98]:
import pickle
with open('final_df.pkl', 'wb') as f:  # open a text file
    pickle.dump(final_df, f) # serialize the list

In [101]:
final_df.columns

Index(['features', 'custom_id', 'idx', 'reasoning', 'fuel_kg'], dtype='object')

In [None]:
submission_file = pd.read_parquet(os.path.join(DATA_PATH, "fuel_final_submission.parquet"))
submission_file.columns

Index(['idx', 'flight_id', 'start', 'end', 'fuel_kg'], dtype='object')

In [103]:
submission_file.head()

Unnamed: 0,idx,flight_id,start,end,fuel_kg
0,0,prc806615763,2025-09-01 03:03:10.925,2025-09-01 03:07:51.584,
1,1,prc806615763,2025-09-01 03:07:51.584,2025-09-01 03:12:50.921,
2,2,prc806615763,2025-09-01 03:12:50.921,2025-09-01 03:17:51.404,
3,3,prc806615763,2025-09-01 03:17:51.404,2025-09-01 03:22:50.539,
4,4,prc806615763,2025-09-01 03:22:50.539,2025-09-01 03:27:50.727,


In [104]:
# Ensure the idx column is unique in final_df (drop duplicates if you retried rows)
pred_lookup = (
    final_df
    .dropna(subset=["fuel_kg"])
    .drop_duplicates(subset=["idx"], keep="last")
    .set_index("idx")["fuel_kg"]
)

submission_file["fuel_kg"] = submission_file["idx"].map(pred_lookup)


In [None]:
submission_file.head()

Unnamed: 0,idx,flight_id,start,end,fuel_kg
0,0,prc806615763,2025-09-01 03:03:10.925,2025-09-01 03:07:51.584,163.0
1,1,prc806615763,2025-09-01 03:07:51.584,2025-09-01 03:12:50.921,165.0
2,2,prc806615763,2025-09-01 03:12:50.921,2025-09-01 03:17:51.404,120.0
3,3,prc806615763,2025-09-01 03:17:51.404,2025-09-01 03:22:50.539,30.0
4,4,prc806615763,2025-09-01 03:22:50.539,2025-09-01 03:27:50.727,38.0


In [106]:
submission_file.to_parquet("llm_submission.parquet")

## Rank

In [108]:
rank_submission_file = pd.read_parquet(os.path.join(DATA_PATH, "fuel_rank_submission.parquet"))
rank_submission_file.head()

Unnamed: 0,idx,flight_id,start,end,fuel_kg
0,0,prc806615763,2025-09-01 03:03:10.925,2025-09-01 03:07:51.584,
1,1,prc806615763,2025-09-01 03:07:51.584,2025-09-01 03:12:50.921,
2,2,prc806615763,2025-09-01 03:12:50.921,2025-09-01 03:17:51.404,
3,3,prc806615763,2025-09-01 03:17:51.404,2025-09-01 03:22:50.539,
4,4,prc806615763,2025-09-01 03:22:50.539,2025-09-01 03:27:50.727,


In [109]:
pred_lookup = (
    final_df
    .dropna(subset=["fuel_kg"])
    .drop_duplicates(subset=["idx"], keep="last")
    .set_index("idx")["fuel_kg"]
)

rank_submission_file["fuel_kg"] = rank_submission_file["idx"].map(pred_lookup)

In [None]:
rank_submission_file.head()

Unnamed: 0,idx,flight_id,start,end,fuel_kg
0,0,prc806615763,2025-09-01 03:03:10.925,2025-09-01 03:07:51.584,163.0
1,1,prc806615763,2025-09-01 03:07:51.584,2025-09-01 03:12:50.921,165.0
2,2,prc806615763,2025-09-01 03:12:50.921,2025-09-01 03:17:51.404,120.0
3,3,prc806615763,2025-09-01 03:17:51.404,2025-09-01 03:22:50.539,30.0
4,4,prc806615763,2025-09-01 03:22:50.539,2025-09-01 03:27:50.727,38.0


In [111]:
rank_submission_file.to_parquet("polite-emu_v0.parquet")

In [113]:
shorter_df = final_df.drop(columns="features")
shorter_df.to_parquet("final_df.parquet")

## Usage

In [None]:
for lm in [gpt_oss_120b]:
  cost = sum([x['cost'] for x in lm.history if x['cost'] is not None])
  print(cost)