# Setup

## Import dependencies

In [1]:
%pip install -U dspy datasets tabulate duckdb pandas numpy ipywidgets "sqlglot[rs]" wandb --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import dspy
from datasets import load_dataset
import tabulate
import pandas as pd
import os
from dotenv import load_dotenv

In [None]:
load_dotenv(".env.local")
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

wandb_api_key = os.getenv("WANDB_API_KEY")
if not wandb_api_key:
    raise ValueError("WANDB_API_KEY not found in environment variables")

lm = dspy.LM("openai/gpt-5-mini", api_key=openai_api_key, temperature=1, max_tokens=16000)
dspy.configure(lm=lm)

## Load data

In [4]:
ds = load_dataset("gretelai/synthetic_text_to_sql")

# Set up DSPy

## Set up Signature and Modules

In [5]:
class ProblemDef(dspy.Signature):
    """You are a database expert. You are provided with context for how some table(s) were constructed, and a natural language prompt for what the user wants. Your job is to write a SQL query to provide them with the required data."""
    
    sql_context: str = dspy.InputField(description="SQL queries for creating the table(s) and loading some data")
    sql_prompt: str = dspy.InputField(description="User's natural language prompt")
    sql: str = dspy.OutputField(description="SQL query that delivers on the user's request. Format as code that can be directly run without any changes – do not use new lines or anything else of that sort.")

program = dspy.ChainOfThought(ProblemDef)

In [6]:
# !pip install duckdb pandas numpy sqlglot --quiet
import duckdb, pandas as pd, numpy as np, re
import sqlglot
from sqlglot import parse_one

_ORDER_BY = re.compile(r"\border\s+by\b", re.IGNORECASE)

def _split_sql_statements(script: str):
    out, buf, q = [], [], None
    i, n = 0, len(script)
    while i < n:
        ch = script[i]
        if q:
            buf.append(ch)
            if ch == q:
                if i + 1 < n and script[i+1] == q:
                    buf.append(script[i+1]); i += 1
                else:
                    q = None
        else:
            if ch in ("'", '"', "`"):
                q = ch; buf.append(ch)
            elif ch == ';':
                s = "".join(buf).strip()
                if s: out.append(s)
                buf = []
            else:
                buf.append(ch)
        i += 1
    tail = "".join(buf).strip()
    if tail: out.append(tail)
    return out

import re
from sqlglot import parse_one

_SQLITE_DATE_RE = re.compile(
    r"""\bdate\s*\(\s*'now'\s*(?:,\s*'([+-])\s*(\d+)\s*(year|month|day)s?'\s*)?\)""",
    re.IGNORECASE,
)
_SQLITE_DATETIME_RE = re.compile(
    r"""\bdatetime\s*\(\s*'now'\s*(?:,\s*'([+-])\s*(\d+)\s*(year|month|day|hour|minute|second)s?'\s*)?\)""",
    re.IGNORECASE,
)

def _normalize_sqlite_dates(sql: str) -> str:
    # date('now') or date('now','-1 year') -> CURRENT_DATE +/- INTERVAL 'N unit'
    def _date_subst(m):
        sign, num, unit = m.group(1), m.group(2), m.group(3)
        if not sign:  # just date('now')
            return "CURRENT_DATE"
        op = "-" if sign == "-" else "+"
        return f"CURRENT_DATE {op} INTERVAL '{num} {unit.lower()}'"
    sql = _SQLITE_DATE_RE.sub(_date_subst, sql)

    # datetime('now') / datetime('now','+/-N unit') -> CURRENT_TIMESTAMP +/- INTERVAL 'N unit'
    def _dt_subst(m):
        sign, num, unit = m.group(1), m.group(2), m.group(3)
        if not sign:
            return "CURRENT_TIMESTAMP"
        op = "-" if sign == "-" else "+"
        return f"CURRENT_TIMESTAMP {op} INTERVAL '{num} {unit.lower()}'"
    sql = _SQLITE_DATETIME_RE.sub(_dt_subst, sql)

    return sql

def _mysql_to_duckdb(stmt: str) -> str:
    s = _normalize_sqlite_dates(stmt)  # <-- NEW: normalize SQLite first
    try:
        return parse_one(s, read="mysql").sql(dialect="duckdb")
    except Exception:
        # minimal fallbacks for MySQLisms if parse fails
        s = re.sub(r"`([^`]+)`", r'"\1"', s)
        s = re.sub(
            r"DATE_SUB\s*\(\s*(CURRENT_DATE|NOW\(\))\s*,\s*INTERVAL\s+(\d+)\s+(YEAR|MONTH|DAY)\s*\)",
            lambda m: f"{'CURRENT_DATE' if m.group(1).startswith('CURRENT') else 'CURRENT_DATE'} - INTERVAL '{m.group(2)} {m.group(3).lower()}'",
            s, flags=re.IGNORECASE,
        )
        s = re.sub(
            r"DATE_ADD\s*\(\s*(CURRENT_DATE|NOW\(\))\s*,\s*INTERVAL\s+(\d+)\s+(YEAR|MONTH|DAY)\s*\)",
            lambda m: f"{'CURRENT_DATE' if m.group(1).startswith('CURRENT') else 'CURRENT_DATE'} + INTERVAL '{m.group(2)} {m.group(3).lower()}'",
            s, flags=re.IGNORECASE,
        )
        s = re.sub(r"\bIFNULL\s*\(", "COALESCE(", s, flags=re.IGNORECASE)
        s = re.sub(r"\bLOCATE\s*\(\s*([^,]+)\s*,\s*([^)]+)\)", r"STRPOS(\2, \1)", s, flags=re.IGNORECASE)
        return s

def _normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in df.columns:
        if df[c].dtype == "O":
            try:
                df[c] = pd.to_numeric(df[c])
            except Exception:
                pass
    return df.replace({np.nan: None})

def _exec_script_capture_last_select(con, script: str):
    last_df, last_sel_sql = None, None
    for raw in _split_sql_statements(script):
        stmt = _mysql_to_duckdb(raw)
        # detect SELECT after minimal comment strip
        s = re.sub(r"^\s*(--[^\n]*\n|/\*.*?\*/\s*)*", "", stmt, flags=re.DOTALL)
        if re.match(r"(?is)^\s*(with\b.*?select|select)\b", s):
            last_df = con.execute(stmt).fetchdf()
            last_sel_sql = stmt
        else:
            con.execute(stmt)
    if last_df is not None:
        last_df = _normalize_df(last_df)
    return last_df, last_sel_sql

def evaluate_sql(sql_context: str, golden_sql: str, predicted_sql: str):
    con = duckdb.connect(":memory:")

    # context
    try:
        for raw in _split_sql_statements(sql_context):
            con.execute(_mysql_to_duckdb(raw))
    except Exception as e:
        return 0, {"reason": "context_error", "detail": str(e)}

    # golden
    try:
        gold_df, gold_last_select = _exec_script_capture_last_select(con, golden_sql)
    except Exception as e:
        return 0, {"reason": "gold_error", "detail": str(e)}
    if gold_df is None:
        return 0, {"reason": "gold_no_select", "detail": "No SELECT in golden_sql."}

    # predicted
    try:
        pred_df, pred_last_select = _exec_script_capture_last_select(con, predicted_sql)
    except Exception as e:
        return 0, {"reason": "pred_error", "detail": str(e)}
    if pred_df is None:
        return 0, {"reason": "pred_no_select", "detail": "No SELECT in predicted_sql."}

    # column alignment (allow pred supersets; else try set/positional)
    gold_cols, pred_cols = list(gold_df.columns), list(pred_df.columns)
    if gold_cols == pred_cols:
        pass
    elif set(gold_cols).issubset(pred_cols):
        pred_df = pred_df[gold_cols]
    elif set(gold_cols) == set(pred_cols):
        pred_df = pred_df[gold_cols]
    elif gold_df.shape[1] == pred_df.shape[1]:
        new_names = [f"c{i}" for i in range(gold_df.shape[1])]
        gold_df = gold_df.copy(); pred_df = pred_df.copy()
        gold_df.columns = new_names; pred_df.columns = new_names
    else:
        return 0, {"reason": "column_mismatch",
                   "detail": f"Different number of columns: expected {gold_df.shape[1]}, got {pred_df.shape[1]}"}

    # ordering rule from gold's last SELECT
    gold_has_order = bool(_ORDER_BY.search(gold_last_select or ""))
    if not gold_has_order:
        try:
            g = gold_df.sort_values(by=list(gold_df.columns), kind="mergesort").reset_index(drop=True)
            p = pred_df.sort_values(by=list(gold_df.columns), kind="mergesort").reset_index(drop=True)
        except Exception:
            g = gold_df.reset_index(drop=True); p = pred_df.reset_index(drop=True)
    else:
        g = gold_df.reset_index(drop=True); p = pred_df.reset_index(drop=True)

    # value compare
    if g.shape != p.shape:
        return 0, {"reason": "shape_mismatch", "detail": f"gold {g.shape} vs pred {p.shape}"}

    for c in g.columns:
        if pd.api.types.is_numeric_dtype(g[c]) and pd.api.types.is_numeric_dtype(p[c]):
            if not np.allclose(g[c].values, p[c].values, rtol=1e-6, atol=1e-8, equal_nan=True):
                return 0, {"reason": "value_mismatch", "detail": f"Numeric mismatch in '{c}'",
                           "gold_head": g.head(10).to_dict("records"),
                           "pred_head": p.head(10).to_dict("records")}
        else:
            eq = [(x == y) or (x is None and y is None) for x, y in zip(g[c].values, p[c].values)]
            if not all(eq):
                return 0, {"reason": "value_mismatch", "detail": f"Mismatch in '{c}'",
                           "gold_head": g.head(10).to_dict("records"),
                           "pred_head": p.head(10).to_dict("records")}
    return 1, None


## Test

In [7]:
demo_index = 4
context = ds['train'][demo_index]['sql_context']
prompt = ds['train'][demo_index]['sql_prompt']
golden_sql = ds['train'][demo_index]['sql']

print(f"Context: {context}")
print(f"Prompt: {prompt}")
print(f"Golden sql: {golden_sql}")
result = program(sql_context=context, sql_prompt=prompt)
print(result)

Context: CREATE TABLE upgrades (id INT, cost FLOAT, type TEXT); INSERT INTO upgrades (id, cost, type) VALUES (1, 500, 'Insulation'), (2, 1000, 'HVAC'), (3, 1500, 'Lighting');
Prompt: Find the energy efficiency upgrades with the highest cost and their types.
Golden sql: SELECT type, cost FROM (SELECT type, cost, ROW_NUMBER() OVER (ORDER BY cost DESC) as rn FROM upgrades) sub WHERE rn = 1;
Prediction(
    reasoning='We need the upgrade(s) that have the maximum cost. Use a subquery to get MAX(cost) and return rows matching that value (including id, type, and cost).',
    sql='SELECT id, type, cost FROM upgrades WHERE cost = (SELECT MAX(cost) FROM upgrades);'
)


In [8]:
score, info = evaluate_sql(context, golden_sql, result.sql)
print(score, info)


1 None


## Environment didn't work, let's use LLM as Judge

In [9]:
class Judge(dspy.Signature):
    """You are required to judge two SQL queries for functional similarity. You will be given a context of how the table(s) and data were created, and the natural language prompt from the user"""

    sql_context: str = dspy.InputField(description="SQL statement(s) creating the table(s) and the input data")
    sql_prompt: str = dspy.InputField(description="Natural language prompt from the user")
    golden_sql: str = dspy.InputField(description="The golden SQL query from our dataset")
    candidate_sql: str = dspy.InputField(description="A SQL query generated by a model for the same prompt")
    similar: bool = dspy.OutputField(description="True if the candidate SQL query is functionally similar to the golden SQL query")

judge = dspy.ChainOfThought(Judge)
    

In [10]:
judge_response = judge(sql_context=context, sql_prompt=prompt, golden_sql=golden_sql, candidate_sql=result.sql)
print(f"Context: {context}")
print(f"Prompt: {prompt}")
print(f"Golden SQL: {golden_sql}")
print(f"Candidate SQL: {result.sql}")
print(f"Judge Response: {judge_response}")


Context: CREATE TABLE upgrades (id INT, cost FLOAT, type TEXT); INSERT INTO upgrades (id, cost, type) VALUES (1, 500, 'Insulation'), (2, 1000, 'HVAC'), (3, 1500, 'Lighting');
Prompt: Find the energy efficiency upgrades with the highest cost and their types.
Golden SQL: SELECT type, cost FROM (SELECT type, cost, ROW_NUMBER() OVER (ORDER BY cost DESC) as rn FROM upgrades) sub WHERE rn = 1;
Candidate SQL: SELECT id, type, cost FROM upgrades WHERE cost = (SELECT MAX(cost) FROM upgrades);
Judge Response: Prediction(
    reasoning='Both queries return the upgrade(s) that have the maximum cost and include the type and cost information. Differences:\n- The candidate also returns the id column (extra column not present in the golden query).\n- The golden query uses ROW_NUMBER() and will return a single row (even if there are ties), whereas the candidate uses cost = MAX(cost) and will return all rows that tie for the maximum cost.\n\nDespite these differences in returned columns and tie-handling

# Get ready to GEPA

In [11]:
# pip install datasets dspy-ai
import math, random
from typing import Callable, List, Tuple, Optional
from datasets import Dataset, DatasetDict
from dspy import GEPA

def split_for_gepa(
    ds: Dataset,
    to_example: Callable[[dict], "dspy.Example"],
    val_size: float = 0.15,
    seed: int = 42,
    group_col: Optional[str] = None,
    stratify_col: Optional[str] = None,
) -> Tuple[List["dspy.Example"], List["dspy.Example"]]:
    """
    Return (train_set, val_set) as lists of dspy.Example.
    - If group_col is set: group-aware split (no group leakage).
    - Else if stratify_col is set: use HF stratified split.
    - Else: random split.
    """
    assert 0.0 < val_size < 1.0, "val_size must be in (0,1)"
    rng = random.Random(seed)

    # --- Group-aware split (preferred for text2sql) ---
    if group_col:
        groups = ds[group_col]
        # Build group -> indices
        g2idx = {}
        for i, g in enumerate(groups):
            g2idx.setdefault(g, []).append(i)
        uniq_groups = list(g2idx.keys())
        rng.shuffle(uniq_groups)
        n_val_groups = max(1, math.floor(val_size * len(uniq_groups)))
        val_groups = set(uniq_groups[:n_val_groups])

        val_idx = [i for g in val_groups for i in g2idx[g]]
        train_idx = [i for g in uniq_groups[n_val_groups:] for i in g2idx[g]]

        # Edge case: if a group is gigantic, ensure both splits non-empty
        if not train_idx or not val_idx:
            # fallback: plain random split
            perm = list(range(len(ds)))
            rng.shuffle(perm)
            cut = max(1, math.floor(val_size * len(ds)))
            val_idx, train_idx = perm[:cut], perm[cut:]

        ds_train = ds.select(train_idx)
        ds_val = ds.select(val_idx)

    # --- Stratified split (when you have a label/cluster column) ---
    elif stratify_col:
        # HF does stratify on categorical-like columns
        parts: DatasetDict = ds.train_test_split(
            test_size=val_size,
            seed=seed,
            stratify_by_column=stratify_col,
        )
        ds_train, ds_val = parts["train"], parts["test"]

    # --- Simple random split ---
    else:
        parts: DatasetDict = ds.train_test_split(test_size=val_size, seed=seed)
        ds_train, ds_val = parts["train"], parts["test"]

    # Map to dspy.Example lists
    train_set = [to_example(r) for r in ds_train]
    val_set = [to_example(r) for r in ds_val]
    return train_set, val_set

def to_dspy_example(row):
    # mark inputs; leave gold 'sql' as label
    return dspy.Example(
        sql_prompt=row["sql_prompt"],
        sql_context=row["sql_context"],
        sql=row["sql"],          # gold label
    ).with_inputs("sql_prompt", "sql_context")


# call function that splits ds['train'] into train_set and val_set as needed
# ds is your loaded HF dataset dict; we split ds["train"]
train_set, val_set = split_for_gepa(
    ds["train"],
    to_dspy_example,          # your to_dspy_example(row)
    val_size=0.05,
    seed=42,
    group_col=None,      # e.g., "db_id" if available
    stratify_col=None,   # or a column like "op_class" if you want stratification
)

In [None]:
max_variants_to_try = 20 # number of variants to test
mini_batch_size = 3 # mini-batch size
val_set_size = 16 # val-set size

def budget_for_variants(N, V, k, slack=2):
    # slack handles occasional extra probes/promotions
    return V + N * (k + slack)

def metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    judge_response = judge(sql_context=example.sql_context, sql_prompt=example.sql_prompt, golden_sql=example.sql, candidate_sql=pred.sql)
    score = 0
    if (judge_response.similar):
        score = 1
    return dspy.Prediction(score=score, feedback=judge_response.reasoning)

val_for_tracking = val_set[:val_set_size]   # 128–512 is a good range

optimizer = GEPA(
    metric=metric_with_feedback,
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=mini_batch_size,
    reflection_lm=lm,
    use_wandb=False,
    wandb_api_key=wandb_api_key,
    max_metric_calls=budget_for_variants(max_variants_to_try, mini_batch_size, val_set_size),
    log_dir="logs"
)

optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_for_tracking,
)

2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 363 metric calls of the program. This amounts to 0.00 full evals on the train+val set.
2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Using 16 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
2025/10/12 16:25:57 INFO dspy.evaluate.evaluate: Average Metric: 7.0 / 16 (43.8%)
2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.4375
2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.4375


Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:00<00:00, 124.80it/s]

2025/10/12 16:25:57 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)
2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: You are a database expert assistant whose job is to read a small database schema + sample data (sql_context) and a natural-language user request (sql_prompt) and produce a single correct SQL query that returns the data the user wants.

Follow these rules every time:

1) Parse schema and sample data
   - Read all CREATE TABLE and INSERT statements in sql_context to learn table names, column names and types, and example rows.
   - Use the sample rows only to understand likely data shapes and edge cases (e.g., whether values repeat, whether some entities have no related rows). Do NOT hardcode answers from sample data unless the prompt explicitly asks for the current value(s).

2) Preserve the user's requested semantics exactly
   - If the prompt asks for totals, counts, averages, minima, maxima, or similar, use the


Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:00<00:00, 124.51it/s]

2025/10/12 16:25:57 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:25:57 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: You are a SQL-writing assistant (a database expert). You will be given two inputs:
- sql_context: SQL DDL/DML statements (CREATE TABLE, INSERT INTO, maybe CREATE SCHEMA) that fully describe the available tables, columns, types, sample values, and optional schema qualification.
- sql_prompt: a short natural-language request describing the data the user wants.

Your job: produce a correct, minimal SQL statement that returns the requested data or performs the requested change, and a short explanation of your reasoning. Output exactly two labeled sections like in the examples:
- reasoning: a concise explanation of how you derived the query from the prompt and context, including any important semantic choices (e.g., why you used DISTINCT, GROUP BY, a WHERE on a PK, date truncation, etc.).
- sql: the final SQL statement.

Rules, assumptions and domain-specific guidance (apply these when constructi

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:00<00:00, 76.88it/s]

2025/10/12 16:25:57 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:26:14 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: You are a "database expert" assistant whose job is to write a SQL query to satisfy a user's natural-language request given a schema and sample data. The assistant will be given two inputs:
- sql_prompt: a natural-language description of what the user wants (e.g., "What is the total revenue generated by each funding source between 2015 and 2019?")
- sql_context: DDL (CREATE TABLE) and sometimes INSERT statements that define table structures, sample data, and any lookup mappings.

Your output must be a correct SQL statement that implements the requested logic against the provided schema, plus a short reasoning note explaining assumptions you made. Follow these rules and best practices when producing the SQL and reasoning:

1. Preserve the requested semantics exactly (functional equivalence)
   - Match the shape of the requested output: return only the columns the prompt asks for (or those clea

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:15<00:00,  5.29s/it]

2025/10/12 16:26:52 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:27:11 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: You are a SQL-writing assistant and database expert. You will be given two inputs:
- sql_context: SQL DDL/INSERTs/VIEWs that define the available schema (table and view names, column names, and sometimes sample rows).
- sql_prompt: a natural-language request asking for data derived from that schema.

Your job: produce a correct SQL query that answers sql_prompt using only the objects and columns available in sql_context. Also produce a short reasoning that explains your approach.

Required output format (always include both sections):
- reasoning: 1–4 concise sentences describing which table(s)/view(s) you used, how you filtered/joined/aggregated, and any critical interpretation decisions.
- sql: a single SQL statement (standard SQL) that implements the reasoning. Prefer readable SQL with table aliases. Use column aliases when helpful.

Rules, conventions and important details to follow:
1. 

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:26<00:00,  8.96s/it] 

2025/10/12 16:27:51 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:28:28 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: You are a database expert whose job is to write a single SQL query (plus a one‑ or two‑sentence reasoning summary) to satisfy a user's natural‑language data request given a provided SQL schema and optional sample data (the "sql_context") and the user's prompt (the "sql_prompt").

Follow these rules and best practices every time:

1. Use only the tables, columns and data types that appear in the provided sql_context. Do not assume columns, flags or relationships that are not present in the schema. If a required column is missing, ask a clarification question instead of guessing.

2. Precise interpretation of the prompt:
   - Apply exactly the filters and constraints stated in the prompt (e.g., state = 'Oregon', year = 2018). Do not omit required filters.
   - Do not add extra semantic constraints that the prompt does not ask for (e.g., do not add a "graduate student" filter unless the prompt 

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:23<00:00,  7.74s/it] 

2025/10/12 16:29:20 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:29:54 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: You are a SQL generation expert. Your job: given a natural-language sql_prompt and a sql_context (CREATE/INSERT statements that define the available tables and columns), produce a single correct SQL query that returns the data the user asked for — plus a short, explicit reasoning paragraph explaining the key design choices, any assumptions or ambiguities, and any potential alternatives the user might prefer.

Follow these rules every time:

1. Parse the prompt and the provided schema literally
   - Use only table and column names that exist in sql_context.
   - Respect column datatypes implied by the CREATE statements (dates, booleans, numeric types).

2. Clarify ambiguity proactively (and ask the user if necessary)
   - If the prompt is ambiguous about time windows (e.g., "past quarter", "last month"), inclusion of boundary dates, whether future-dated rows should be excluded, or whether zer

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:27<00:00,  9.13s/it] 

2025/10/12 16:30:49 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:31:18 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: You are a database expert whose job is to read a provided schema/data context and a natural-language prompt, then produce a correct SQL query that returns the data requested.

Input format you will receive:
- sql_context: one or more CREATE TABLE and INSERT INTO statements that fully describe the available tables and sample data.
- sql_prompt: a natural-language question describing the data the user wants.

Required output format:
- Provide two labeled sections: a short "reasoning" that explains your approach/assumptions in 1–3 sentences, and a "sql" containing the single SQL statement (no additional SQL statements). The SQL should be standard SQL where possible.

Behavioral and technical rules (include these facts/strategies in your reasoning and follow them in SQL):
1. Understand the schema from the CREATE/INSERT statements. Use only those table and column names; do not invent tables/colum

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:35<00:00, 11.89s/it]

2025/10/12 16:32:52 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/10/12 16:33:26 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: 
You are an SQL generation assistant. You will be given two inputs:
- sql_context: one or more CREATE TABLE and INSERT INTO statements that fully describe the available tables and sample data.
- sql_prompt: a natural-language question describing the data the user wants.

Your job is to produce a single correct SQL query that returns the data requested, plus a very short explanation of your approach. Follow these rules exactly for every response.

Output format (required):
- Provide exactly two labeled sections in plain text:
  1. reasoning — 1–3 concise sentences explaining approach and any assumptions (include the chosen join type and rounding choice when applicable).
  2. sql — a single SQL statement only (no other SQL statements, no trailing semicolons unless the context shows semicolons are used, and no explanatory SQL comments).

Core behavioral rules (apply these to choose query struct

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:16<00:00,  5.65s/it] 

2025/10/12 16:34:30 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:35:09 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: You are a database expert whose job is to produce a correct SQL query (and a brief explanation of your approach) given:
- sql_context: SQL DDL/DML that defines one or more tables (CREATE TABLE, optional schema qualification, and optionally INSERT rows) that you can use to infer table and column names and types.
- sql_prompt: a natural-language request for the data to return from the schema in sql_context.

Input format you will receive:
- Two strings: sql_context and sql_prompt. Parse sql_context to discover table names, schema names (if present), and column names. Use that information when writing queries.

Output format you must return (two parts):
1. reasoning — a short plain-English description of the plan you used to produce the query, including any assumptions you explicitly made that affect results (for example whether NULLs are excluded, whether ties are broken, whether ordering is a

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:40<00:00, 13.63s/it] 

2025/10/12 16:36:51 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:37:52 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: You are a database expert assistant. Given a database schema and a natural-language request, produce a correct single SQL statement that answers the request plus a short plain-English explanation of the approach and any assumptions.

Input format you will receive:
- Two strings: sql_context and sql_prompt.
  - sql_context contains SQL DDL/DML (CREATE TABLE, optional schema qualification, and optional INSERT rows). Parse this to discover all table names, schema names (if present), and column names and types; only use these names in your SQL.
  - sql_prompt is a natural-language request describing the data to return from the schema.

Required output (two parts):
1. reasoning — A concise (one to a few sentences) plain-English description of the plan used to produce the query and any explicit assumptions that affect results (for example: NULLs excluded, tie-breaking, chosen join keys when not e

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:21<00:00,  7.07s/it] 

2025/10/12 16:38:46 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:39:14 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: You are a "database expert" assistant that receives two inputs:

- sql_context: SQL DDL/DML snippets (CREATE TABLE and/or INSERT statements, or a short schema description) that define the table(s), column names and types, and sometimes sample data.
- sql_prompt: a natural-language request describing the data the user wants or an operation to perform (SELECT/UPDATE/DELETE/etc).

Your job: produce a correct SQL statement that accomplishes the user's request against the schema in sql_context. Also produce a short, explicit "reasoning" note that documents any assumptions you made or important implementation choices (thresholds, filters, NULL handling, quoting, etc.).

Behavioral rules and details (follow these precisely):

1. Output format
   - Always return two parts: a brief "reasoning" paragraph explaining your interpretation and assumptions, followed by the final "sql" statement. Keep the r

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:21<00:00,  7.17s/it]

2025/10/12 16:40:00 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:40:21 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: You are a SQL-writing database expert assistant. You will be given two pieces of input for each task:
- sql_prompt: a natural-language request describing the data the user wants (or a data-changing request).
- sql_context: DDL and sample INSERT statements that define table names, column names and types, and example data.

Your job: produce a precise SQL statement that satisfies the sql_prompt using the provided schema and data. Also produce a short, clear "reasoning" explanation of assumptions and important choices you made.

Output format (strict):
- A single "sql" section containing the exact SQL statement to run.

Rules and guidance you must follow:

1. Use the provided schema and sample data to infer:
   - Exact table and column names to use (do not invent or change names).
   - Data types (e.g., DATE vs INT) and any implications for filters/aggregations.
   - Whether certain result row

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:16<00:00,  5.43s/it]

2025/10/12 16:41:57 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 16:41:57 INFO dspy.teleprompt.gepa.gepa: Iteration 13: All subsample scores perfect. Skipping.
2025/10/12 16:41:57 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Reflective mutation did not propose a new candidate
2025/10/12 16:41:57 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Selected program 2 score: 0.5



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:25<00:00,  8.43s/it] 

2025/10/12 16:42:22 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:42:56 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: You are building an assistant that, given a SQL schema/context and a natural-language request, must produce a single correct SQL statement that answers the request plus a short plain-English reasoning paragraph describing the approach and any explicit assumptions. The instruction below describes exactly how to behave and all domain-specific rules learned from past examples.

Input format you will receive:
- Two strings: sql_context and sql_prompt.
  - sql_context contains DDL (CREATE SCHEMA, CREATE TABLE, optional fully-qualified names) and may include INSERT rows.
  - sql_prompt is a natural language question asking for data from the schema.

Output format (two parts, in this order):
1) reasoning — a concise (one–a few sentences) plain-English description of the plan used to produce the query and any explicit assumptions that affect results (for example: NULLs excluded, choice of JOIN type

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:26<00:00,  8.70s/it]

2025/10/12 16:43:38 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/10/12 16:44:12 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for predict: You are a SQL-generation expert. You will be given:
- sql_prompt: a natural-language request describing the data needed or an action to perform.
- sql_context: DDL (CREATE TABLE) and optional INSERTs that define the schema and sample data.

Your job: produce a correct, precise SQL statement that implements the sql_prompt using only the tables/columns shown in sql_context, plus a short reasoning/assumptions section describing any interpretation choices.

Rules and guidance (encode all domain-specific details and common pitfalls from the examples):

1. Parse schema and data first
   - Use the CREATE TABLE and INSERT statements to learn column names, data types, primary-key-like columns (e.g., *_id), and whether values are unique in the sample data.
   - If a column is clearly an identifier (client_id, Company_ID) prefer using it for uniqueness checks; if selecting a non-key column (e.g., Comp

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:24<00:00,  8.26s/it]

2025/10/12 16:45:10 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 16:45:10 INFO dspy.teleprompt.gepa.gepa: Iteration 16: All subsample scores perfect. Skipping.
2025/10/12 16:45:10 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Reflective mutation did not propose a new candidate
2025/10/12 16:45:10 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Selected program 0 score: 0.4375



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:23<00:00,  7.75s/it]

2025/10/12 16:45:34 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:45:59 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for predict: You are a SQL-writing assistant (a database expert). You will be given:
- sql_context: SQL DDL/DML statements (CREATE TABLE, INSERT, etc.) that define the available tables, columns and sample data.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a single SQL query (the final answer) that returns the data the user requested, using only the schema and names provided in sql_context.

Rules and expectations:
1. Output format
   - Return only the final SQL query text. Do not include any additional commentary, reasoning, or explanation.
   - The SQL should run against the schema in sql_context (use the exact table and column names shown there).

2. Result shape must match the user's intent precisely
   - Pay careful attention to whether the user wants:
     - a single scalar/aggregate (e.g., "the total", "combined sum", "total revenue for the United

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [01:30<00:00, 30.09s/it] 

2025/10/12 16:48:35 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:49:06 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for predict: You are a database SQL expert whose job is to produce one correct SQL statement and a very short explanation of the plan used to produce it, given two inputs:
- sql_context: SQL DDL/DML that defines one or more tables (CREATE TABLE, optional CREATE SCHEMA, optional fully-qualified table names, and optional INSERT rows). Use this to discover exact table names, schema names (if present), and column names and types.
- sql_prompt: a natural-language request for the data or DML change to perform against the schema in sql_context.

Required output format (exactly two parts; return both every time):
1. reasoning — a concise plain-English description (one to a few sentences) describing the plan used to write the query. It must:
   - state any assumptions that affect results (for example: how NULLs are treated, whether ties are broken, whether you used DISTINCT, whether you excluded NULLs, which joi

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:33<00:00, 11.06s/it] 

2025/10/12 16:50:30 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:50:53 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for predict: You are a SQL-writing database expert. Input to you will always consist of two labelled fields:
- sql_prompt: a natural-language request describing the data the user wants.
- sql_context: SQL DDL and sample INSERT statements that define the schema and show example data; this lets you infer column names, types, and example values.

Your job:
- Produce a correct SQL query that returns the data requested by sql_prompt, using the schema in sql_context.
- Also produce a short, clear reasoning/explanation (1–6 sentences) that describes the approach and any assumptions you made.

Output format:
- Provide two clearly-labelled sections: "reasoning" and "sql".
  - In "reasoning" state your intent, any assumptions, edge cases you considered, and why the chosen SQL matches the prompt.
  - In "sql" provide the final SQL query only (no extra text). Use ANSI SQL where possible; when you use dialect-specif

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [01:08<00:00, 22.81s/it] 

2025/10/12 16:52:41 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:53:05 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for predict: You are a SQL/database expert assistant whose job is to read a natural-language data request plus a provided SQL schema/data context, map the user's intent to a precise SQL aggregation/selection, and return a correct SQL query plus a short explanation of your mapping/assumptions. Use the examples and feedback below to guide style and correctness.

Required output format
- Provide two labeled sections: "reasoning" and "sql".
  - reasoning: 2–6 sentences explaining how you interpreted the natural-language prompt, which tables/columns you used, any assumptions you made, and any safeguards for edge cases (e.g., division by zero, missing tables/columns).
  - sql: a single SQL statement (or a small set of statements using CTEs) that implements the described metric. Keep SQL standard and compatible with mainstream RDBMS (Postgres/MySQL/SQL Server) where possible. Use COALESCE/NULLIF/CASE to safely

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:22<00:00,  7.50s/it] 

2025/10/12 16:54:28 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:54:53 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for predict: You are a database expert SQL generator. You will be given two text inputs:
- sql_context: a sequence of CREATE TABLE and INSERT statements (sometimes schema-qualified) that define the available tables, columns, sample values and types.
- sql_prompt: a natural-language request describing the data the user wants from those tables.

Your job: produce a single SQL query that returns exactly the data requested by sql_prompt, using only the tables/columns and values shown in sql_context. Also produce a short reasoning explanation that documents how you mapped the natural-language request into SQL filters/joins/aggregations. Follow these rules precisely:

1. Parse the context literally
   - Use the exact table names (including schema prefix if present) and column names from sql_context.
   - Match string literal casing exactly as shown in the INSERT values when writing equality comparisons (e.g.,

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:24<00:00,  8.00s/it] 

2025/10/12 16:56:18 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:56:45 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Proposed new text for predict: You are a SQL-generation assistant. Given a database schema snippet (sql_context) and a natural-language request (sql_prompt), produce a correct single SQL statement that answers the prompt plus a very short explanation of your plan and any assumptions.

Input format you will receive:
- Two strings:
  1. sql_context — DDL/DML that defines tables (CREATE TABLE, optional schema qualification, and may include INSERT rows). Use this to discover table names, schema(s), and column names/types.
  2. sql_prompt — a natural-language request describing the desired data.

Output format (must follow exactly):
- Provide two parts only:
  1. reasoning — A concise plain-English description (one to a few sentences) of the approach you used to produce the query and any explicit assumptions that affect results (e.g., whether NULLs are excluded, how ties are broken when using ORDER BY/LIMIT, any inferred cate

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:30<00:00, 10.21s/it] 

2025/10/12 16:57:32 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 16:58:05 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Proposed new text for predict: You are a SQL-writing assistant ("database expert") whose job is: given (1) a natural-language sql_prompt describing what the user wants and (2) a sql_context containing CREATE TABLE and optional INSERT statements that define the schema and sample data, produce a correct SQL query (and a short explanation of your reasoning) that returns the requested result using only the schema and information provided.

Required behavior and conventions
- Output format: always include two clearly separated parts:
  1. reasoning — a concise explanation of how you translated the prompt to SQL (one or a few short sentences).
  2. sql — the exact SQL statement to run against the provided schema.
  Follow the example format in the prompts (labels "reasoning" and "sql").
- Use only tables, columns, and values that are present in sql_context (the CREATE TABLE / INSERT statements). Do not invent new tables, colum

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:18<00:00,  6.15s/it] 

2025/10/12 16:58:46 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 16:59:14 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Proposed new text for predict: You are a SQL-generation assistant. For each request you will receive two strings:
- sql_context: DDL (CREATE TABLE, optional CREATE SCHEMA) and optional INSERTs that define the available schema (tables, columns, types, and example rows).
- sql_prompt: a natural-language request describing the data to return from that schema.

Your job is to produce two outputs exactly as follows:
1) reasoning — a short plain-English description (one to a few sentences) of the plan you used to produce the SQL statement and any assumptions you made that affect results.
2) sql — a single valid standard SQL statement that answers the prompt and will run against the provided schema.

Follow these rules exactly:

General
- Parse sql_context to discover actual table names, schema names (if any), and column names and types. Use only those tables and columns that appear in sql_context. Do NOT invent or fabricate an

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:21<00:00,  7.33s/it] 

2025/10/12 16:59:48 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:00:15 INFO dspy.teleprompt.gepa.gepa: Iteration 25: Proposed new text for predict: You are a SQL-writing assistant whose job is to read a provided database construction context and a natural-language prompt, then produce the SQL statement(s) that satisfy the prompt. Follow these rules and conventions precisely.

Input format you will receive
- sql_context: contains CREATE TABLE and INSERT statements that define the schema and sample data.
- sql_prompt: a natural-language request describing what the user wants (SELECT, UPDATE, INSERT, DELETE, aggregation, join, etc.).

Output format (always produce both parts)
1) reasoning — a short explanation (1–4 sentences) of the approach you took and any assumptions or clarifying questions if applicable.
2) sql — the final SQL statement(s) that implement the requested change or query. Include semicolons and produce only SQL in this section (no commentary). If multiple statements are required, separate them with semicolons.

Behavioral

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [02:57<00:00, 59.09s/it]

2025/10/12 17:03:42 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:03:42 INFO dspy.teleprompt.gepa.gepa: Iteration 26: All subsample scores perfect. Skipping.
2025/10/12 17:03:42 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Reflective mutation did not propose a new candidate
2025/10/12 17:03:42 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Selected program 2 score: 0.5



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:26<00:00,  9.00s/it] 

2025/10/12 17:04:09 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 17:04:30 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Proposed new text for predict: You are a SQL-generation assistant. Given two input strings:
- sql_context: DDL/DML (CREATE TABLE, optional schema qualification, optional INSERTs) describing the available tables and columns.
- sql_prompt: a natural-language request for data or data changes.

Your job is to produce two parts of output exactly as described below:

1) reasoning — a short plain-English description (concise: 1–3 sentences) of the plan you used to produce the SQL and any explicit assumptions that affect results. Always state any assumptions you make that change semantics (for example excluding NULLs, strict vs inclusive date boundaries, which ties are broken and how, or which join keys you assume when none are explicit). If the prompt cannot be satisfied because required tables/columns are missing from sql_context, state that clearly in reasoning and do not return a SQL statement.

2) sql — a single valid stand

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:26<00:00,  8.80s/it]

2025/10/12 17:05:09 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:05:09 INFO dspy.teleprompt.gepa.gepa: Iteration 28: All subsample scores perfect. Skipping.
2025/10/12 17:05:09 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Reflective mutation did not propose a new candidate
2025/10/12 17:05:09 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Selected program 2 score: 0.5



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:30<00:00, 10.22s/it] 

2025/10/12 17:05:39 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:05:56 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Proposed new text for predict: You are a SQL-generation assistant. Given a database schema/context and a natural-language request, produce one concise plain-English reasoning statement and a single valid SQL statement that answers the request using only objects in the provided schema/context.

Input format you will receive:
- Two strings:
  1) sql_context — SQL DDL/DML (CREATE TABLE, optional schema qualification, and optionally INSERT rows). Use this to discover table names, schema names, and column names/types. Do not invent tables/columns not present here.
  2) sql_prompt — a natural-language request describing the data to return or the change to make.

Output format (exact):
- reasoning — A short (1–3 sentences) plain-English description of the plan and any explicit assumptions that affect the results (for example: NULLs excluded, tie-breaking for top-N, join assumptions, or inability to satisfy because of missing co

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:28<00:00,  9.62s/it] 

2025/10/12 17:06:40 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 17:07:12 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Proposed new text for predict: You are a SQL-writing database expert. You will be given two inputs:
- sql_context: DDL (CREATE TABLE) statements and example INSERTs that define the schema and sample data.
- sql_prompt: a natural-language question asking for some data derived from that schema.

Your job is to return a correct SQL query that answers the prompt, plus a brief explanation of your reasoning and any assumptions you made.

Rules and expectations (detailed and prescriptive):
1. Use the provided schema and sample data in sql_context to determine which columns, tables and values are available and how to interpret ambiguous language in the prompt. Do not invent columns or tables that are not present.

2. Output format:
   - Provide a short "reasoning" section that:
     - states how you mapped the natural-language request to specific table(s) and column(s),
     - lists any assumptions you made (for any ambiguous or

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:19<00:00,  6.49s/it]

2025/10/12 17:07:57 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:07:57 INFO dspy.teleprompt.gepa.gepa: Iteration 31: All subsample scores perfect. Skipping.
2025/10/12 17:07:57 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Reflective mutation did not propose a new candidate
2025/10/12 17:07:57 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Selected program 2 score: 0.5



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:22<00:00,  7.33s/it]

2025/10/12 17:08:19 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:08:19 INFO dspy.teleprompt.gepa.gepa: Iteration 32: All subsample scores perfect. Skipping.
2025/10/12 17:08:19 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Reflective mutation did not propose a new candidate
2025/10/12 17:08:19 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Selected program 0 score: 0.4375



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:28<00:00,  9.34s/it]

2025/10/12 17:08:47 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 17:09:21 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Proposed new text for predict: You are a SQL-generation expert. You will be given two inputs:
- sql_context: a string containing CREATE TABLE statements and sample INSERTs that show the table names, column names and example values.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a single, correct SQL query (standard SQL) that answers the sql_prompt against the schema in sql_context. Also produce a very short plain-English reasoning/plan (1–3 sentences) describing what the query does.

Follow these explicit rules and heuristics so your SQL is robust and general (not just tailored to the sample INSERT rows):

1. Parsing and intent
   - Parse table names and column names from sql_context. Use those columns in the SQL.
   - Use the sample INSERTs only to understand possible values and column semantics; do not hard-code filters that only match the sample rows unless the prompt e

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:18<00:00,  6.07s/it] 

2025/10/12 17:10:15 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:10:43 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Proposed new text for predict: You are an SQL-generation assistant. Given an input pair of strings (sql_context and sql_prompt), produce a correct SQL query and a short reasoning paragraph describing how you built the query and any assumptions you made.

Input format
- You will receive two strings:
  1. sql_context: one or more CREATE TABLE/CREATE SCHEMA and optional INSERT statements describing the schema and example data. Use this to learn exact table names, schema names (if any), and column names/types.
  2. sql_prompt: a natural-language request describing the data to return from the schema in sql_context.

Output format (two parts; return both exactly as described)
1. reasoning — A short plain-English description (one to a few sentences) of the plan used to produce the SQL, including any explicit assumptions that affect results (e.g., how NULLs are treated, tie-breaking choices, chosen join keys if not obvious). Be 

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:17<00:00,  5.69s/it] 

2025/10/12 17:11:19 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:11:43 INFO dspy.teleprompt.gepa.gepa: Iteration 35: Proposed new text for predict: You are a SQL-generation assistant. Your job: given a schema/context and a natural-language request, produce (A) a concise reasoning paragraph that explains the plan and any assumptions, and (B) a single SQL statement that answers the request and will run against the provided schema.

Input format (you will receive exactly these two inputs):
- sql_context (string): DDL/DML that defines the available table(s) and columns (CREATE TABLE, optional CREATE SCHEMA, and possibly INSERT rows). Use this to discover table names, schema names (if present), and column names and types.
- sql_prompt (string): a natural-language question asking for data from the schema in sql_context.

Required output format (return exactly two parts):
1. reasoning — a short, plain-English description (one to a few sentences) of how you produced the SQL, including any explicit assumptions that affect results (for example: 

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:24<00:00,  8.16s/it] 

2025/10/12 17:12:30 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:13:01 INFO dspy.teleprompt.gepa.gepa: Iteration 36: Proposed new text for predict: You are a database expert SQL generator. For each input you receive you will be given:
- sql_context: CREATE TABLE / INSERT statements that define the schema and show example data.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a single correct ANSI-SQL query that returns the data requested by sql_prompt using only the tables and columns present in sql_context.

Rules & detailed strategy (use these every time):

1. Parse the schema and sample rows from sql_context first.
   - Use the exact table and column names from sql_context in your SQL.
   - Note data encodings visible in sample rows (how categories are represented, boolean values, id foreign keys, string patterns, etc.) and use them to interpret the prompt.

2. Determine required tables and joins:
   - If a column name ends with _id and matches another table's primary id column, join on

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:26<00:00,  8.94s/it] 

2025/10/12 17:13:54 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:14:20 INFO dspy.teleprompt.gepa.gepa: Iteration 37: Proposed new text for predict: You are an SQL-generation assistant. Your job is to produce one correct, standard-SQL query (and a very short explanation of how you constructed it) given a database schema/data snippet and a natural-language request. Follow these rules exactly.

Inputs you will receive
- Two strings:
  1. sql_context — SQL DDL/DML that defines one or more tables (e.g., CREATE TABLE, optional CREATE SCHEMA, and optionally INSERT statements). Use sql_context to discover exact table names, optional schema qualification, and column names and types.
  2. sql_prompt — a natural-language request describing the data to return from the schema in sql_context.

Required output format (two parts)
- reasoning — A concise (one to a few sentences) plain-English description of the plan you used to produce the query and any explicit assumptions you made that affect results (for example: whether NULLs are excluded, how ties

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:16<00:00,  5.58s/it]

2025/10/12 17:15:08 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 17:15:39 INFO dspy.teleprompt.gepa.gepa: Iteration 38: Proposed new text for predict: You are a SQL-generation assistant (a "database expert"). You will be given two inputs:
- sql_context: SQL DDL/DML statements that define tables and show example rows (CREATE TABLE and INSERT INTO). Use this to discover exact table names, column names, data types, and sample values.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a single correct SQL query that returns the data requested, plus a brief reasoning paragraph that (1) explains how the natural-language request maps to SQL operations and (2) lists any assumptions you had to make. Do not produce extra unrelated text.

Rules and checklist (follow exactly):

1. Inspect schema and sample rows first
   - Use the exact table and column names as shown in sql_context (case and spelling).
   - Use the sample values to understand how types are represented (e.g., year as INT, date as DATE/TIMES

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:17<00:00,  5.71s/it] 

2025/10/12 17:17:31 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:18:02 INFO dspy.teleprompt.gepa.gepa: Iteration 39: Proposed new text for predict: You are building an assistant whose job is: given a SQL schema/data snippet and a natural-language request, produce a correct single SQL statement that answers the request plus a brief, factual explanation of the plan/assumptions. Use the detailed rules below exactly.

Input format you will receive (two strings):
- sql_context: SQL DDL/DML (CREATE TABLE statements, optional schema qualification, and optionally INSERT rows). Parse this to discover table names, schema names (if present), column names and types. Do not invent tables or columns.
- sql_prompt: a natural-language request describing the data to return from the schema(s) in sql_context.

Required output (two parts):
1) reasoning — a short plain-English description (one to a few sentences) describing:
   - the plan used to produce the query (which tables and columns used, any grouping/aggregation, join keys).
   - any explicit assum

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:33<00:00, 11.03s/it] 

2025/10/12 17:19:25 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:19:50 INFO dspy.teleprompt.gepa.gepa: Iteration 40: Proposed new text for predict: You are a SQL-query-writing assistant. Given two input strings (sql_context and sql_prompt), produce a correct SQL statement that answers the natural-language sql_prompt using only the tables and columns discoverable in sql_context, plus a very short explanation of your plan/assumptions.

Input format you will receive:
- sql_context: SQL DDL/DML (CREATE TABLE, optional CREATE SCHEMA, optional fully-qualified names, and optionally INSERT rows). Parse it to discover table names, schema names (if present), column names and types. Use those exact identifiers in your SQL; do not invent table or column names.
- sql_prompt: a natural-language request describing the data to return from the schema in sql_context.

Required output format (exactly two labeled parts):
1) reasoning — a concise (one to a few sentences) plain-English description of the plan used to produce the SQL statement and any explic

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:26<00:00,  8.90s/it]

2025/10/12 17:20:44 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:20:44 INFO dspy.teleprompt.gepa.gepa: Iteration 41: All subsample scores perfect. Skipping.
2025/10/12 17:20:44 INFO dspy.teleprompt.gepa.gepa: Iteration 41: Reflective mutation did not propose a new candidate
2025/10/12 17:20:44 INFO dspy.teleprompt.gepa.gepa: Iteration 42: Selected program 6 score: 0.4375



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:36<00:00, 12.27s/it] 

2025/10/12 17:21:21 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:21:46 INFO dspy.teleprompt.gepa.gepa: Iteration 42: Proposed new text for predict: You are a SQL-generation assistant (a "database expert"). You will be given two inputs:
- sql_context: one or more CREATE TABLE and INSERT INTO statements (DDL/DML) that define the schema and show sample rows.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce exactly two sections in plain text (no extra commentary): a short "reasoning" paragraph (1–4 sentences) followed by an executable SQL query prefixed by the token "sql" (on its own line or immediately before the query). Follow these detailed rules exactly.

1) Inspect schema and samples first
   - Parse sql_context to learn exact table names, column names, and sample values. Use those exact names (case and spelling) in your SQL.
   - Use sample rows to infer data types and value formatting (e.g., whether years are stored as INT, dates as DATE/TIMESTAMP, booleans as true/false, categories as 

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:19<00:00,  6.64s/it]

2025/10/12 17:22:42 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 17:22:42 INFO dspy.teleprompt.gepa.gepa: Iteration 43: All subsample scores perfect. Skipping.
2025/10/12 17:22:42 INFO dspy.teleprompt.gepa.gepa: Iteration 43: Reflective mutation did not propose a new candidate
2025/10/12 17:22:42 INFO dspy.teleprompt.gepa.gepa: Iteration 44: Selected program 2 score: 0.5



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:20<00:00,  6.86s/it] 

2025/10/12 17:23:03 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 17:23:26 INFO dspy.teleprompt.gepa.gepa: Iteration 44: Proposed new text for predict: You are a database expert whose job is to produce a correct single SQL statement (and a short explanation of the approach) given two text inputs:
- sql_context: SQL DDL/DML (CREATE TABLE, optional schema qualification, and optional INSERTs) that defines the available tables, schemas and columns (and sometimes sample rows). Parse sql_context to discover table names, optional schema names, column names and types and use only those names in your query.
- sql_prompt: a natural-language request describing the data to return from the schema in sql_context.

Input format you will receive:
- Two strings, named sql_context and sql_prompt.

Output format (must follow exactly):
Return two parts in plain text labeled exactly as:
1. reasoning — a concise (one to a few sentences) plain-English description of the plan you used to produce the SQL, and any explicit assumptions you made that affect results (

## Review original and optimized prompts

In [19]:
print(program.predict.signature.instructions)

You are a database expert. You are provided with context for how some table(s) were constructed, and a natural language prompt for what the user wants. Your job is to write a SQL query to provide them with the required data.


In [20]:
print(optimized_program.predict.signature.instructions)

You are a database expert whose job is to produce a correct SQL query (and a brief explanation of your approach) given:
- sql_context: SQL DDL/DML that defines one or more tables (CREATE TABLE, optional schema qualification, and optionally INSERT rows) that you can use to infer table and column names and types.
- sql_prompt: a natural-language request for the data to return from the schema in sql_context.

Input format you will receive:
- Two strings: sql_context and sql_prompt. Parse sql_context to discover table names, schema names (if present), and column names. Use that information when writing queries.

Output format you must return (two parts):
1. reasoning — a short plain-English description of the plan you used to produce the query, including any assumptions you explicitly made that affect results (for example whether NULLs are excluded, whether ties are broken, whether ordering is added for readability). Keep this concise and factual.
2. sql — a single SQL statement that answe

## Evals

In [21]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset
from time import perf_counter
from typing import Dict, Any, Optional

def evaluate_program(
    program,
    ds_test: Dataset,
    limit: int = 100,
    max_workers: int = 8,
    field_map: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
    """
    Evaluate a DSPy program on the first `limit` rows of a HF Dataset split.

    Args:
        program: a DSPy Module with signature program(sql_prompt=..., sql_context=...)
        ds_test: Hugging Face Dataset (e.g., ds["test"])
        limit: number of rows to evaluate (default 100)
        max_workers: parallel threads for I/O-bound LM + judge
        field_map: optional mapping if your column names differ:
                   {"sql_prompt": "...", "sql_context": "...", "sql": "..."}

    Returns:
        {
          "accuracy": float,
          "correct": int,
          "total": int,
          "avg_latency_s": float,
          "failures": [ {idx, reason, pred_sql, feedback} ... up to 20 ],
        }
    """
    if field_map is None:
        field_map = {"sql_prompt": "sql_prompt", "sql_context": "sql_context", "sql": "sql"}

    n = min(limit, len(ds_test))
    subset = ds_test.select(range(n))
    start = perf_counter()

    def _eval_one(i_row):
        i, row = i_row
        try:
            pred = program(
                sql_prompt=row[field_map["sql_prompt"]],
                sql_context=row[field_map["sql_context"]],
            )
            pred_sql = getattr(pred, "sql", None) or (pred.get("sql") if isinstance(pred, dict) else None) or ""
            jr = judge(
                sql_context=row[field_map["sql_context"]],
                sql_prompt=row[field_map["sql_prompt"]],
                golden_sql=row[field_map["sql"]],
                candidate_sql=pred_sql,
            )
            ok = bool(getattr(jr, "similar", False))
            feedback = getattr(jr, "reasoning", "") or ""
            return (i, ok, pred_sql, feedback, None)
        except Exception as e:
            return (i, False, "", "", f"{type(e).__name__}: {e}")

    results = []
    # Threaded evaluation (I/O bound: LM + judge). Tune max_workers to your provider limits.
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_eval_one, (i, subset[i])) for i in range(n)]
        for f in as_completed(futures):
            results.append(f.result())

    # Sort back to input order
    results.sort(key=lambda x: x[0])

    correct = sum(1 for _, ok, *_ in results if ok)
    total = n
    acc = correct / total if total else 0.0
    elapsed = perf_counter() - start
    avg_lat = elapsed / total if total else 0.0

    failures = []
    for i, ok, pred_sql, feedback, err in results:
        if not ok and len(failures) < 20:
            failures.append({
                "idx": i,
                "reason": ("error: " + err) if err else "mismatch",
                "pred_sql": pred_sql,
                "feedback": feedback,
            })

    return {
        "accuracy": acc,
        "correct": correct,
        "total": total,
        "avg_latency_s": avg_lat,
        "failures": failures,
    }


In [27]:
# Evaluate original and optimized on ds["test"][:100]
test_split = ds["test"]
orig_metrics = evaluate_program(program, test_split, limit=500, max_workers=32)

print("Original:", orig_metrics["accuracy"], f"({orig_metrics['correct']}/{orig_metrics['total']})")

Original: 0.634 (317/500)


In [28]:
opt_metrics  = evaluate_program(optimized_program, test_split, limit=500, max_workers=32)
print("Optimized:", opt_metrics["accuracy"], f"({opt_metrics['correct']}/{opt_metrics['total']})")

Optimized: 0.624 (312/500)
