# Setup

## Import dependencies

In [1]:
%pip install -U dspy datasets tabulate duckdb pandas numpy ipywidgets "sqlglot[rs]" wandb --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import dspy
from datasets import load_dataset
import tabulate
import pandas as pd
import os
from dotenv import load_dotenv

In [3]:
def _require_env(name):
    """Return the value of environment variable `name`; raise ValueError if it is unset or empty."""
    value = os.getenv(name)
    if not value:
        raise ValueError(f"{name} not found in environment variables")
    return value


# Secrets live in a local dotenv file rather than in the notebook itself.
load_dotenv(".env.local")
openai_api_key = _require_env("OPENAI_API_KEY")
wandb_api_key = _require_env("WANDB_API_KEY")

# A single LM instance is registered as the DSPy default; later cells also pass
# this same `lm` object to the optimizer as its reflection model.
lm = dspy.LM("openai/gpt-5-mini", api_key=openai_api_key, temperature=1, max_tokens=16000)
dspy.configure(lm=lm)

## Load data

In [4]:
# Load the synthetic text-to-SQL dataset. Later cells read the "train" split and
# use the `sql_context`, `sql_prompt` and `sql` fields of each row.
ds = load_dataset("gretelai/synthetic_text_to_sql")

# Set up DSPy

## Set up Signature and Modules

In [5]:
# Signature for the text-to-SQL task: two string inputs (schema/data context and
# the user's request) and one string output (the SQL query).
# NOTE: the class docstring and the field descriptions below are prompt text that
# DSPy sends to the model, not ordinary documentation -- editing them changes
# model behaviour (GEPA later proposes replacements for this instruction text).
class ProblemDef(dspy.Signature):
    """You are a database expert. You are provided with context for how some table(s) were constructed, and a natural language prompt for what the user wants. Your job is to write a SQL query to provide them with the required data."""
    
    sql_context: str = dspy.InputField(description="SQL queries for creating the table(s) and loading some data")
    sql_prompt: str = dspy.InputField(description="User's natural language prompt")
    sql: str = dspy.OutputField(description="SQL query that delivers on the user's request. Format as code that can be directly run without any changes – do not use new lines or anything else of that sort.")

# The program under optimization: chain-of-thought prompting over ProblemDef, so
# predictions carry a `reasoning` field in addition to `sql`.
program = dspy.ChainOfThought(ProblemDef)

In [6]:
# !pip install duckdb pandas numpy sqlglot --quiet
import duckdb, pandas as pd, numpy as np, re
import sqlglot
from sqlglot import parse_one

# Case-insensitive detector for the words "ORDER BY". evaluate_sql applies it to the
# golden query's last SELECT to decide whether row order is significant.
# NOTE: it matches anywhere in the statement text, so an ORDER BY inside a window
# function or sub-query also counts as "ordered".
_ORDER_BY = re.compile(r"\border\s+by\b", re.IGNORECASE)

def _split_sql_statements(script: str):
    out, buf, q = [], [], None
    i, n = 0, len(script)
    while i < n:
        ch = script[i]
        if q:
            buf.append(ch)
            if ch == q:
                if i + 1 < n and script[i+1] == q:
                    buf.append(script[i+1]); i += 1
                else:
                    q = None
        else:
            if ch in ("'", '"', "`"):
                q = ch; buf.append(ch)
            elif ch == ';':
                s = "".join(buf).strip()
                if s: out.append(s)
                buf = []
            else:
                buf.append(ch)
        i += 1
    tail = "".join(buf).strip()
    if tail: out.append(tail)
    return out

import re
from sqlglot import parse_one

_SQLITE_DATE_RE = re.compile(
    r"""\bdate\s*\(\s*'now'\s*(?:,\s*'([+-])\s*(\d+)\s*(year|month|day)s?'\s*)?\)""",
    re.IGNORECASE,
)
_SQLITE_DATETIME_RE = re.compile(
    r"""\bdatetime\s*\(\s*'now'\s*(?:,\s*'([+-])\s*(\d+)\s*(year|month|day|hour|minute|second)s?'\s*)?\)""",
    re.IGNORECASE,
)

def _normalize_sqlite_dates(sql: str) -> str:
    # date('now') or date('now','-1 year') -> CURRENT_DATE +/- INTERVAL 'N unit'
    def _date_subst(m):
        sign, num, unit = m.group(1), m.group(2), m.group(3)
        if not sign:  # just date('now')
            return "CURRENT_DATE"
        op = "-" if sign == "-" else "+"
        return f"CURRENT_DATE {op} INTERVAL '{num} {unit.lower()}'"
    sql = _SQLITE_DATE_RE.sub(_date_subst, sql)

    # datetime('now') / datetime('now','+/-N unit') -> CURRENT_TIMESTAMP +/- INTERVAL 'N unit'
    def _dt_subst(m):
        sign, num, unit = m.group(1), m.group(2), m.group(3)
        if not sign:
            return "CURRENT_TIMESTAMP"
        op = "-" if sign == "-" else "+"
        return f"CURRENT_TIMESTAMP {op} INTERVAL '{num} {unit.lower()}'"
    sql = _SQLITE_DATETIME_RE.sub(_dt_subst, sql)

    return sql

def _mysql_to_duckdb(stmt: str) -> str:
    """Translate one MySQL/SQLite-flavoured statement into DuckDB SQL (best effort).

    SQLite `date('now', ...)` forms are normalized first, then the statement is
    transpiled with sqlglot (read as MySQL, written as DuckDB). If sqlglot fails
    for any reason, a handful of regex rewrites are applied instead:
      * `backtick` identifiers -> "double-quoted" identifiers
      * DATE_SUB/DATE_ADD(CURRENT_DATE | NOW(), INTERVAL n unit) -> interval arithmetic
      * IFNULL( -> COALESCE(
      * LOCATE(needle, haystack) -> STRPOS(haystack, needle)

    Args:
        stmt: A single SQL statement.

    Returns:
        The translated statement. On the fallback path the text may still contain
        constructs DuckDB rejects; callers are expected to handle execution errors.
    """
    s = _normalize_sqlite_dates(stmt)
    try:
        return parse_one(s, read="mysql").sql(dialect="duckdb")
    except Exception:
        # Deliberately broad: any parse/generation failure drops to the regex fallbacks.
        s = re.sub(r"`([^`]+)`", r'"\1"', s)

        def _date_arith(m):
            op = "-" if m.group(1).upper() == "DATE_SUB" else "+"
            # NOW() carries a time component, so it maps to CURRENT_TIMESTAMP rather
            # than CURRENT_DATE. The comparison is upper-cased because the pattern
            # is matched case-insensitively.
            base = "CURRENT_DATE" if m.group(2).upper().startswith("CURRENT") else "CURRENT_TIMESTAMP"
            return f"{base} {op} INTERVAL '{m.group(3)} {m.group(4).lower()}'"

        s = re.sub(
            r"(DATE_SUB|DATE_ADD)\s*\(\s*(CURRENT_DATE|NOW\(\))\s*,\s*INTERVAL\s+(\d+)\s+(YEAR|MONTH|DAY)\s*\)",
            _date_arith,
            s, flags=re.IGNORECASE,
        )
        s = re.sub(r"\bIFNULL\s*\(", "COALESCE(", s, flags=re.IGNORECASE)
        # Simple two-argument form only: arguments containing commas or nested
        # parentheses are not handled by this pattern.
        s = re.sub(r"\bLOCATE\s*\(\s*([^,]+)\s*,\s*([^)]+)\)", r"STRPOS(\2, \1)", s, flags=re.IGNORECASE)
        return s

def _normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    for c in df.columns:
        if df[c].dtype == "O":
            try:
                df[c] = pd.to_numeric(df[c])
            except Exception:
                pass
    return df.replace({np.nan: None})

def _exec_script_capture_last_select(con, script: str):
    """Run every statement of `script` on `con`, keeping the result of the last SELECT.

    Each statement is translated with `_mysql_to_duckdb` before execution. A
    statement counts as a SELECT when, after leading comments are stripped, it
    starts with `SELECT` or `WITH ... SELECT`; anything else is executed for its
    side effects only.

    Returns:
        (dataframe, sql) for the last SELECT executed, where the dataframe has
        been passed through `_normalize_df`; (None, None) if the script contained
        no SELECT. Execution errors propagate to the caller.
    """
    result_df = None
    result_sql = None
    for raw_stmt in _split_sql_statements(script):
        translated = _mysql_to_duckdb(raw_stmt)
        # Drop leading line/block comments so the keyword test sees the statement itself.
        uncommented = re.sub(r"^\s*(--[^\n]*\n|/\*.*?\*/\s*)*", "", translated, flags=re.DOTALL)
        is_select = re.match(r"(?is)^\s*(with\b.*?select|select)\b", uncommented) is not None
        if not is_select:
            con.execute(translated)
            continue
        result_df = con.execute(translated).fetchdf()
        result_sql = translated
    if result_df is None:
        return None, result_sql
    return _normalize_df(result_df), result_sql

def evaluate_sql(sql_context: str, golden_sql: str, predicted_sql: str):
    """Score a predicted SQL query against the golden query by executing both in DuckDB.

    The context script is run on a fresh in-memory database, then the golden and
    predicted scripts are executed on that same connection and the results of
    their last SELECT statements are compared.

    Args:
        sql_context: DDL/DML that creates and populates the tables.
        golden_sql: Reference query (may contain several statements).
        predicted_sql: Candidate query (may contain several statements).

    Returns:
        (1, None) when the results match, otherwise (0, info) where `info` is a
        dict with a "reason" code ("context_error", "gold_error", "gold_no_select",
        "pred_error", "pred_no_select", "column_mismatch", "shape_mismatch",
        "value_mismatch") and a human-readable "detail".

    Note:
        Golden and predicted scripts share one connection, so any data-modifying
        statement in the golden script is visible to the predicted script.
    """
    con = duckdb.connect(":memory:")
    try:
        # context
        try:
            for raw in _split_sql_statements(sql_context):
                con.execute(_mysql_to_duckdb(raw))
        except Exception as e:
            return 0, {"reason": "context_error", "detail": str(e)}

        # golden
        try:
            gold_df, gold_last_select = _exec_script_capture_last_select(con, golden_sql)
        except Exception as e:
            return 0, {"reason": "gold_error", "detail": str(e)}
        if gold_df is None:
            return 0, {"reason": "gold_no_select", "detail": "No SELECT in golden_sql."}

        # predicted
        try:
            pred_df, _ = _exec_script_capture_last_select(con, predicted_sql)
        except Exception as e:
            return 0, {"reason": "pred_error", "detail": str(e)}
        if pred_df is None:
            return 0, {"reason": "pred_no_select", "detail": "No SELECT in predicted_sql."}
    finally:
        # Result frames are already materialized, so the database can be released
        # on every path (this function is called once per evaluated example).
        con.close()

    # Row order only matters when the golden query asked for one.
    gold_has_order = bool(_ORDER_BY.search(gold_last_select or ""))
    return _compare_result_frames(gold_df, pred_df, gold_has_order)


def _compare_result_frames(gold_df, pred_df, gold_has_order):
    """Compare two result frames; return (1, None) on match or (0, info) on mismatch."""
    # Column alignment: identical names pass through; if every golden column is
    # present in the prediction (extra or reordered columns allowed) the prediction
    # is projected onto the golden columns; otherwise fall back to positional
    # comparison when the column counts agree.
    gold_cols, pred_cols = list(gold_df.columns), list(pred_df.columns)
    if gold_cols == pred_cols:
        pass
    elif set(gold_cols).issubset(pred_cols):
        pred_df = pred_df[gold_cols]
    elif gold_df.shape[1] == pred_df.shape[1]:
        new_names = [f"c{i}" for i in range(gold_df.shape[1])]
        gold_df = gold_df.copy(); pred_df = pred_df.copy()
        gold_df.columns = new_names; pred_df.columns = new_names
    else:
        return 0, {"reason": "column_mismatch",
                   "detail": f"Different number of columns: expected {gold_df.shape[1]}, got {pred_df.shape[1]}"}

    if not gold_has_order:
        try:
            # Stable sort on all columns makes the comparison order-insensitive.
            g = gold_df.sort_values(by=list(gold_df.columns), kind="mergesort").reset_index(drop=True)
            p = pred_df.sort_values(by=list(gold_df.columns), kind="mergesort").reset_index(drop=True)
        except Exception:
            # Unsortable values (e.g. mixed types): compare in returned order.
            g = gold_df.reset_index(drop=True); p = pred_df.reset_index(drop=True)
    else:
        g = gold_df.reset_index(drop=True); p = pred_df.reset_index(drop=True)

    # value compare
    if g.shape != p.shape:
        return 0, {"reason": "shape_mismatch", "detail": f"gold {g.shape} vs pred {p.shape}"}

    for c in g.columns:
        if pd.api.types.is_numeric_dtype(g[c]) and pd.api.types.is_numeric_dtype(p[c]):
            # Tolerant comparison so float rounding differences do not count as errors.
            if not np.allclose(g[c].values, p[c].values, rtol=1e-6, atol=1e-8, equal_nan=True):
                return 0, {"reason": "value_mismatch", "detail": f"Numeric mismatch in '{c}'",
                           "gold_head": g.head(10).to_dict("records"),
                           "pred_head": p.head(10).to_dict("records")}
        else:
            eq = [(x == y) or (x is None and y is None) for x, y in zip(g[c].values, p[c].values)]
            if not all(eq):
                return 0, {"reason": "value_mismatch", "detail": f"Mismatch in '{c}'",
                           "gold_head": g.head(10).to_dict("records"),
                           "pred_head": p.head(10).to_dict("records")}
    return 1, None


## Test

In [7]:
# Pick one training row, show it, and run the unoptimized program on it.
demo_index = 4
demo_row = ds['train'][demo_index]
context, prompt, golden_sql = demo_row['sql_context'], demo_row['sql_prompt'], demo_row['sql']

print(
    f"Context: {context}",
    f"Prompt: {prompt}",
    f"Golden sql: {golden_sql}",
    sep="\n",
)
result = program(sql_context=context, sql_prompt=prompt)
print(result)

Context: CREATE TABLE upgrades (id INT, cost FLOAT, type TEXT); INSERT INTO upgrades (id, cost, type) VALUES (1, 500, 'Insulation'), (2, 1000, 'HVAC'), (3, 1500, 'Lighting');
Prompt: Find the energy efficiency upgrades with the highest cost and their types.
Golden sql: SELECT type, cost FROM (SELECT type, cost, ROW_NUMBER() OVER (ORDER BY cost DESC) as rn FROM upgrades) sub WHERE rn = 1;
Prediction(
    reasoning='We need the upgrade(s) that have the maximum cost. Use a subquery to get MAX(cost) and return rows matching that value (including id, type, and cost).',
    sql='SELECT id, type, cost FROM upgrades WHERE cost = (SELECT MAX(cost) FROM upgrades);'
)


In [8]:
# Execution-based check of the demo prediction against the golden query.
# evaluate_sql accepts predictions whose columns are a superset of the golden
# columns, so a candidate that returns extra columns can still score 1.
score, info = evaluate_sql(context, golden_sql, result.sql)
print(score, info)


1 None


## The execution-based check didn't work well, so let's use an LLM as a judge

In [9]:
# Signature for the LLM judge: given the schema/data context, the user's request,
# the golden query and a candidate query, output a boolean `similar` verdict.
# NOTE: the docstring and field descriptions are prompt text sent to the model,
# so they are part of the judge's behaviour, not just documentation.
class Judge(dspy.Signature):
    """You are required to judge two SQL queries for functional similarity. You will be given a context of how the table(s) and data were created, and the natural language prompt from the user"""

    sql_context: str = dspy.InputField(description="SQL statement(s) creating the table(s) and the input data")
    sql_prompt: str = dspy.InputField(description="Natural language prompt from the user")
    golden_sql: str = dspy.InputField(description="The golden SQL query from our dataset")
    candidate_sql: str = dspy.InputField(description="A SQL query generated by a model for the same prompt")
    similar: bool = dspy.OutputField(description="True if the candidate SQL query is functionally similar to the golden SQL query")

# Chain-of-thought wrapper: responses expose `reasoning` alongside `similar`;
# the GEPA metric reuses that reasoning as textual feedback.
judge = dspy.ChainOfThought(Judge)
    

In [10]:
# Ask the LLM judge about the demo prediction and show everything it was given.
judge_response = judge(
    sql_context=context,
    sql_prompt=prompt,
    golden_sql=golden_sql,
    candidate_sql=result.sql,
)
print(
    f"Context: {context}",
    f"Prompt: {prompt}",
    f"Golden SQL: {golden_sql}",
    f"Candidate SQL: {result.sql}",
    f"Judge Response: {judge_response}",
    sep="\n",
)


Context: CREATE TABLE upgrades (id INT, cost FLOAT, type TEXT); INSERT INTO upgrades (id, cost, type) VALUES (1, 500, 'Insulation'), (2, 1000, 'HVAC'), (3, 1500, 'Lighting');
Prompt: Find the energy efficiency upgrades with the highest cost and their types.
Golden SQL: SELECT type, cost FROM (SELECT type, cost, ROW_NUMBER() OVER (ORDER BY cost DESC) as rn FROM upgrades) sub WHERE rn = 1;
Candidate SQL: SELECT id, type, cost FROM upgrades WHERE cost = (SELECT MAX(cost) FROM upgrades);
Judge Response: Prediction(
    reasoning='Both queries return the upgrade(s) that have the maximum cost and include the type and cost information. Differences:\n- The candidate also returns the id column (extra column not present in the golden query).\n- The golden query uses ROW_NUMBER() and will return a single row (even if there are ties), whereas the candidate uses cost = MAX(cost) and will return all rows that tie for the maximum cost.\n\nDespite these differences in returned columns and tie-handling

# Get ready to GEPA

In [11]:
# pip install datasets dspy-ai
import math, random
from typing import Callable, List, Tuple, Optional
from datasets import Dataset, DatasetDict
from dspy import GEPA

def split_for_gepa(
    ds: Dataset,
    to_example: Callable[[dict], "dspy.Example"],
    val_size: float = 0.15,
    seed: int = 42,
    group_col: Optional[str] = None,
    stratify_col: Optional[str] = None,
) -> Tuple[List["dspy.Example"], List["dspy.Example"]]:
    """
    Split a dataset into (train_set, val_set) lists of dspy.Example.

    Strategy, in priority order:
    - If group_col is set: group-aware split, so all rows sharing a group value
      land on the same side (no group leakage).
    - Else if stratify_col is set: stratified split via `Dataset.train_test_split`.
    - Else: plain random split via `Dataset.train_test_split`.

    Args:
        ds: Source dataset (a single split, not a DatasetDict).
        to_example: Converts one row dict into a dspy.Example.
        val_size: Fraction of rows (or of groups, in group mode) used for
            validation; must be strictly between 0 and 1.
        seed: Seed for the shuffle, so the split is reproducible.
        group_col: Optional column whose values define the groups.
        stratify_col: Optional column to stratify on (ignored when group_col is set).

    Returns:
        (train_set, val_set).

    Raises:
        AssertionError: if val_size is outside (0, 1).
    """
    assert 0.0 < val_size < 1.0, "val_size must be in (0,1)"
    rng = random.Random(seed)

    # --- Group-aware split (preferred for text2sql) ---
    if group_col:
        # Build group -> row indices, preserving first-seen group order.
        g2idx = {}
        for i, g in enumerate(ds[group_col]):
            g2idx.setdefault(g, []).append(i)
        uniq_groups = list(g2idx.keys())
        rng.shuffle(uniq_groups)
        n_val_groups = max(1, math.floor(val_size * len(uniq_groups)))

        # Iterate the shuffled list slices rather than a set: set iteration order
        # for strings depends on per-process hash randomization, which would make
        # the validation row order differ between runs despite the fixed seed.
        val_idx = [i for g in uniq_groups[:n_val_groups] for i in g2idx[g]]
        train_idx = [i for g in uniq_groups[n_val_groups:] for i in g2idx[g]]

        # Edge case: too few groups to populate both sides -> fall back to a
        # plain random row split so neither side is empty when avoidable.
        if not train_idx or not val_idx:
            perm = list(range(len(ds)))
            rng.shuffle(perm)
            cut = max(1, math.floor(val_size * len(ds)))
            val_idx, train_idx = perm[:cut], perm[cut:]

        ds_train = ds.select(train_idx)
        ds_val = ds.select(val_idx)

    # --- Stratified split (when you have a label/cluster column) ---
    elif stratify_col:
        parts: DatasetDict = ds.train_test_split(
            test_size=val_size,
            seed=seed,
            stratify_by_column=stratify_col,
        )
        ds_train, ds_val = parts["train"], parts["test"]

    # --- Simple random split ---
    else:
        parts: DatasetDict = ds.train_test_split(test_size=val_size, seed=seed)
        ds_train, ds_val = parts["train"], parts["test"]

    # Map rows to dspy.Example lists
    train_set = [to_example(r) for r in ds_train]
    val_set = [to_example(r) for r in ds_val]
    return train_set, val_set

def to_dspy_example(row):
    """Convert one dataset row into a dspy.Example.

    `sql_prompt` and `sql_context` are marked as inputs; `sql` is carried along
    as the gold label.
    """
    fields = {key: row[key] for key in ("sql_prompt", "sql_context", "sql")}
    example = dspy.Example(**fields)
    return example.with_inputs("sql_prompt", "sql_context")


# Split the dataset's "train" split into train/validation example lists.
# With both group_col and stratify_col left as None this is a plain seeded
# random split, holding out 5% of the rows for validation.
train_set, val_set = split_for_gepa(
    ds["train"],
    to_dspy_example,          # row -> dspy.Example converter
    val_size=0.05,
    seed=42,
    group_col=None,      # e.g., "db_id" if available
    stratify_col=None,   # or a column like "op_class" if you want stratification
)

In [13]:
# Tunable sizes for the GEPA run.
# NOTE(review): max_variants_to_try is not referenced by the cells visible here
# (budget_for_variants is defined but never called) -- confirm whether it is still needed.
max_variants_to_try = 20 # number of prompt variants to test
mini_batch_size = 3 # examples per reflection mini-batch (passed as reflection_minibatch_size)
val_set_size = 200 # cap applied to both the tracking val set and the optimization train set

def budget_for_variants(N, V, k, slack=2):
    """Estimate a metric-call budget: V calls up front plus (k + slack) calls per variant.

    Args:
        N: Number of variants to try.
        V: Fixed cost added once (e.g. one full validation pass).
        k: Per-variant cost (e.g. mini-batch size).
        slack: Extra per-variant allowance for occasional extra probes/promotions.

    Returns:
        V + N * (k + slack).
    """
    per_variant_cost = k + slack
    return V + N * per_variant_cost

def metric_with_feedback(example, pred, trace=None, pred_name=None, pred_trace=None):
    """GEPA metric: ask the LLM judge whether `pred.sql` is functionally similar to the gold SQL.

    Returns a dspy.Prediction with `score` (1 if the judge says similar, else 0)
    and `feedback` (the judge's reasoning text). The `trace`, `pred_name` and
    `pred_trace` arguments are accepted but not used.
    """
    verdict = judge(
        sql_context=example.sql_context,
        sql_prompt=example.sql_prompt,
        golden_sql=example.sql,
        candidate_sql=pred.sql,
    )
    return dspy.Prediction(score=1 if verdict.similar else 0, feedback=verdict.reasoning)

# Cap both sets at val_set_size examples to bound the cost of the run.
val_for_tracking = val_set[:val_set_size]   # validation examples used to score candidates
train_set_for_optimization = train_set[:val_set_size]  # note: reuses val_set_size as the train cap
# Configure GEPA with the LLM-judge metric. The same `lm` serves as the reflection
# model that proposes new instructions; progress is logged to Weights & Biases
# and to the local "logs" directory.
optimizer = GEPA(
    metric=metric_with_feedback,
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=mini_batch_size,
    reflection_lm=lm,
    use_wandb=True,
    wandb_api_key=wandb_api_key,
    log_dir="logs",
    auto="light"   
)

# Long-running, paid step: the recorded run below reports roughly 1180 metric
# calls for this configuration, each of which also invokes the judge.
optimized_program = optimizer.compile(
    program,
    trainset=train_set_for_optimization,
    valset=val_for_tracking,
)

2025/10/12 20:24:02 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1180 metric calls of the program. This amounts to 2.95 full evals on the train+val set.
2025/10/12 20:24:02 INFO dspy.teleprompt.gepa.gepa: Using 200 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/raveesh/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraveeshbhalla90[0m ([33mraveeshbhalla90-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [dspy, litellm, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
GEPA Optimization:   0%|          | 0/1180 [00:00<?, ?rollouts/s]2025/10/12 20:24:06 INFO dspy.evaluate.evaluate: Average Metric: 111.0 / 200 (55.5%)
2025/10/12 20:24:07 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.555
GEPA Optimization:  17%|█▋        | 200/1180 [00:02<00:11, 87.44rollouts/s]2025/10/12 20:24:07 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.555


Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:24<00:00,  8.31s/it]

2025/10/12 20:24:31 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 20:24:31 INFO dspy.teleprompt.gepa.gepa: Iteration 1: All subsample scores perfect. Skipping.
2025/10/12 20:24:31 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Reflective mutation did not propose a new candidate
2025/10/12 20:24:31 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Selected program 0 score: 0.555



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:22<00:00,  7.42s/it]

2025/10/12 20:24:54 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 20:24:54 INFO dspy.teleprompt.gepa.gepa: Iteration 2: All subsample scores perfect. Skipping.
2025/10/12 20:24:54 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Reflective mutation did not propose a new candidate
2025/10/12 20:24:54 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Selected program 0 score: 0.555



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:35<00:00, 11.93s/it] 

2025/10/12 20:25:30 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:26:03 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: You are a SQL-writing assistant (a database expert). You will be given two inputs:
- sql_context: DDL and INSERT statements that define table schemas and sample data (always present).
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a correct SQL query (and a short, explicit reasoning/assumptions block) that returns the data requested by sql_prompt when applied to the schema and sample data in sql_context.

Strict requirements and behavior rules
1. Output format
   - Provide two clearly labeled parts: "reasoning" and "sql".
   - In reasoning, state any assumptions or disambiguations you make (see below). Keep this concise but explicit.
   - In sql, output only the SQL statement (valid generic SQL) that implements the requested behavior and matches your stated assumptions.

2. Always inspect sql_context
   - Parse the CREATE TABLE and INSERT line

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:25<00:00,  8.50s/it]

2025/10/12 20:27:09 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 20:27:09 INFO dspy.teleprompt.gepa.gepa: Iteration 4: All subsample scores perfect. Skipping.
2025/10/12 20:27:09 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Reflective mutation did not propose a new candidate
GEPA Optimization:  18%|█▊        | 215/1180 [03:04<20:03,  1.25s/rollouts]2025/10/12 20:27:09 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Selected program 0 score: 0.555



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:36<00:00, 12.10s/it] 

2025/10/12 20:27:45 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:28:21 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: You are a database expert whose job is to write accurate, functionally equivalent SQL queries from:
- a natural-language request (sql_prompt), and
- the provided table creation / sample data context (sql_context).

Follow these rules for every task. Be explicit about assumptions, preserve semantics, and produce a correct, ready-to-run SQL statement for an appropriate SQL dialect.

1) Output format
   - Provide a short "reasoning/assumptions" paragraph (1–6 lines) describing:
     - which SQL dialect you inferred from sql_context (and why), or what default/ANSI dialect you are assuming if ambiguous; and
     - any disambiguating assumptions you made for ambiguous NL (e.g., "last month" = sliding last 30 days vs. previous calendar month).
   - Then provide the SQL query only (single statement if possible). Keep the SQL syntactically valid for the stated dialect.

2) Use the provided sql_contex

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:26<00:00,  8.72s/it] 

2025/10/12 20:29:17 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:29:43 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: You are a SQL-generation expert whose sole job is to read a short natural-language request (sql_prompt) together with a SQL DDL/DML snippet (sql_context) that defines the available tables and example rows, infer the schema and intent, and produce a single SQL query that answers the request against that schema.

Input format you will receive:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define table names, column names, and data types / sample values. Use this to infer exact table and column names and types.
- sql_prompt: a short natural-language instruction describing exactly what columns/aggregations/filters/aggregation-granularity the user wants.

Strict rules you must follow when producing the SQL:
1. Output only one SQL query and nothing else. Do not include extra text, commentary, or explanation.
2. Return exactly the columns the prompt requests (and no additiona

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:32<00:00, 10.95s/it] 

2025/10/12 20:33:38 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:34:10 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Proposed new text for predict: You are a SQL-generation expert whose only job is to read two inputs — a short natural-language request (sql_prompt) and a SQL DDL/DML snippet (sql_context) — infer the exact schema and intent, and produce one single SQL statement that satisfies the request against that schema.

Input expectations
- sql_context will include one or more CREATE TABLE statements and may include INSERT ... VALUES(...) sample rows. Use these to determine exact table names, column names, and the data representations (e.g., whether strings use single quotes, whether booleans are TRUE/FALSE, date formats).
- sql_prompt is a concise instruction describing what the user wants (a SELECT, an INSERT/UPDATE/DELETE, aggregations, filters, grouping, ordering, etc.).

Strict output rules (follow these exactly)
1. Output exactly one SQL statement and nothing else. The output must be the SQL only inside a single code block and

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:53<00:00, 17.75s/it] 

2025/10/12 20:35:27 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:36:12 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: You are a SQL-generation assistant. You will be given two inputs:
- sql_prompt: a natural-language question asking for data from a database.
- sql_context: a sequence of SQL statements (CREATE TABLE, INSERT, UPDATE, DELETE, etc.) that define the schema and show example data and operations in the order they were applied.

Your job: produce a single SQL query (and a short, explicit reasoning line) that answers sql_prompt correctly with respect to the information in sql_context.

Rules, conventions, and detailed guidance (follow these exactly):

1. Treat sql_context as an authoritative, ordered log of schema + operations:
   - Read the statements in the order given (CREATE then INSERT/UPDATE/DELETE etc.). The final state of a table is the result of applying those operations in sequence.
   - If the natural-language question refers to the current contents of a table (typical case), write a query

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [02:04<00:00, 41.53s/it] 

2025/10/12 20:39:05 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:39:32 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: You are a SQL-generation expert whose sole job is:
- Read two inputs: sql_context and sql_prompt.
  - sql_context: one or more CREATE TABLE statements plus INSERTs that define the available tables, column names, and example values (use these to infer exact table/column names and types and the representation conventions used, e.g., boolean TRUE/FALSE or 't'/'f', date/time formats, string quoting).
  - sql_prompt: a short natural-language instruction describing exactly what columns/aggregations/filters/aggregation-granularity the user wants.

Your output:
- Produce exactly one SQL statement (query or DML) that answers the sql_prompt against the schema in sql_context.
- Output only that SQL statement, and nothing else.
- Place the SQL statement inside a single code block and do not include any other text, comments, or explanation outside the code block.

Hard rules you must follow:
1. Single SQ

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:34<00:00, 11.64s/it] 

2025/10/12 20:43:42 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:44:37 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: You are a SQL-generation expert whose only job is to produce a single correct SQL statement (query or DML) that implements a natural-language sql_prompt against a provided sql_context. Follow these rules exactly every time.

Input format you will receive
- sql_context: one or more CREATE TABLE statements plus INSERT statements. Use these statements as authoritative for:
  - Exact table and column names (use them verbatim).
  - Column types and examples of literal formatting (how strings are quoted, how booleans are represented, timestamp/date formats, numeric formats).
  - Any sample values that reveal representation conventions or implicit relationships.
- sql_prompt: a short natural-language instruction describing what the user wants (which columns, aggregates, filters, grouping, time windows, top-N, etc.).

Required output format
- Produce exactly one SQL statement and nothing else.
- Pl

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:43<00:00, 14.48s/it]

2025/10/12 20:46:04 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:46:31 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: You are a SQL-generation expert whose sole job is to read two inputs and produce exactly one SQL statement that answers the user's natural-language request against the provided schema and example data.

Inputs you will be given:
- sql_context: one or more CREATE TABLE statements and optional INSERT statements. These define the available tables, columns, types, and example values. Use these to infer exact table/column names, literal representation conventions (string quoting, boolean literal form, date/time formats), and any idiomatic SQL shown in the context.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, and grouping the user wants.

Hard requirements (must follow every time):
1. Output exactly one SQL statement and nothing else. Place that single SQL statement inside a single code block. Do not include any text outside the code blo

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:43<00:00, 14.58s/it]

2025/10/12 20:48:07 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 20:48:07 INFO dspy.teleprompt.gepa.gepa: Iteration 12: All subsample scores perfect. Skipping.
2025/10/12 20:48:07 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Reflective mutation did not propose a new candidate
GEPA Optimization:  56%|█████▌    | 660/1180 [24:03<31:39,  3.65s/rollouts]2025/10/12 20:48:07 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Selected program 0 score: 0.555



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:24<00:00,  8.23s/it]

2025/10/12 20:48:32 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 20:48:32 INFO dspy.teleprompt.gepa.gepa: Iteration 13: All subsample scores perfect. Skipping.
2025/10/12 20:48:32 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Reflective mutation did not propose a new candidate
GEPA Optimization:  56%|█████▌    | 663/1180 [24:27<32:53,  3.82s/rollouts]2025/10/12 20:48:32 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Selected program 2 score: 0.56



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:44<00:00, 14.89s/it]

2025/10/12 20:49:17 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:49:51 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: You are a SQL-generation expert assistant. You will receive two inputs: sql_context and sql_prompt. Your job is to produce exactly one ANSI-ish SQL statement (query or DML) that answers the sql_prompt against the schema and value conventions shown in sql_context.

Hard requirements (must-follow):
1. Output exactly one SQL statement, and nothing else. Put that statement inside a single code block and do not include any other text, comments, or explanation.
2. Use table names and column names exactly as they appear in sql_context. Do not invent or rename tables/columns.
3. Return exactly the columns the prompt requests (and no additional columns). You may use concise aliases but do not introduce extra logical columns.
4. Do not change, add, or remove filters/aggregations in a way that changes the semantics of the user's request. Only apply WHERE, GROUP BY, HAVING, ORDER BY when required by th

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [01:04<00:00, 21.56s/it] 

2025/10/12 20:51:24 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 20:51:55 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for predict: You are a specialized SQL-generation assistant. Your sole job is to read two inputs and produce exactly one SQL statement (query or DML) that implements the natural-language request against the provided schema/examples. Follow these rules exactly.

Inputs you will receive:
- sql_context: One or more CREATE TABLE / CREATE VIEW statements and optional INSERT statements. Use these to determine:
  - Exact table and column names and types.
  - Literal/format conventions used in the database examples (boolean literal form, string quoting style, date/time formats, numeric formats).
  - Any existing view definitions you may reuse.
- sql_prompt: A short natural-language instruction describing exactly what columns/aggregations/filters/grouping/aggregation-granularity the user wants.

Output requirements (hard constraints you must always follow):
1. Produce exactly one SQL statement and nothing else. 

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:33<00:00, 11.23s/it] 

2025/10/12 20:53:13 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:53:36 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for predict: You are a SQL-generation expert. Your job is to read two inputs and produce exactly one ANSI-ish SQL query (and nothing else) that answers the natural-language request against the schema/examples provided.

INPUTS you will receive:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define table names, column names, data types and sample values. Use this to infer exact table & column names, their types, and how values (including NULLs and empty strings) are represented.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping ("per X" / "for each X"), ordering, or other transformations the user wants.

HIGH-PRIORITY OUTPUT RULES (must follow exactly):
1. Output exactly one SQL query and nothing else. No extra text, explanation, or commentary.
2. Put the SQL query only inside a single code block. Do not output anything

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:38<00:00, 12.78s/it] 

2025/10/12 20:58:04 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 20:58:29 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for predict: You are a SQL-generation expert. Your job is to read two inputs and produce exactly one ANSI-ish SQL statement (and nothing else) that implements the user's natural-language request against the provided schema/examples.

INPUTS
- sql_context: one or more CREATE TABLE statements plus INSERTs that define table names, column names, data types and sample values. Use these statements to infer:
  - exact table names and column names (case and spelling as given),
  - data types where possible,
  - how values are represented (strings quoting style, whether missing values are NULL or empty string '', how booleans appear: TRUE/FALSE or 1/0),
  - and any sample value conventions that determine presence/absence semantics.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping ("per X" / "for each X"), ordering, or other transformations the user

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:37<00:00, 12.61s/it] 

2025/10/12 20:59:59 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 21:00:26 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for predict: You are a SQL-generation expert assistant. Read two inputs and produce exactly one ANSI-ish SQL query (and nothing else) that answers the natural-language request against the provided schema/examples.

INPUTS you will receive:
- sql_context: one or more CREATE TABLE and INSERT statements. Use these to infer exact table names, column names, data types, and how missing values are represented (NULL vs empty string). Also infer boolean representation (TRUE/FALSE, 1/0) and quoting style for string literals.
- sql_prompt: a short natural-language instruction describing exactly which columns, aggregates, filters, grouping ("per X" / "for each X"), ordering, or other transformations are required.

OUTPUT FORMAT (must follow exactly):
1. Output exactly one SQL statement and nothing else.
2. Put that single SQL statement inside one code block and output no other text outside it.
3. The SELECT must re

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:24<00:00,  8.14s/it] 

2025/10/12 21:01:28 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:01:59 INFO dspy.teleprompt.gepa.gepa: Iteration 19: Proposed new text for predict: You are a database expert whose job is to produce a correct SQL query (and a short explanation of your approach) given:
- sql_context: DDL and any provided sample data (CREATE TABLE, INSERTs). Use this to infer available tables, columns, datatypes, and constraints.
- sql_prompt: a natural-language request describing the data the user wants.

Input format (what you'll receive):
- sql_context: a string containing CREATE TABLE statements and optional INSERTs.
- sql_prompt: a single natural-language question/requirement referencing the tables from sql_context.

Your output must contain two parts:
1) reasoning — a concise explanation of your interpretation, assumptions, and any important decisions (e.g., why you chose INNER vs LEFT JOIN, how you interpret ambiguous phrases like "last year" or "not completed yet", how ties are handled). Mention any schema-derived facts you rely on (e.g., "complet

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:24<00:00,  8.11s/it] 

2025/10/12 21:03:15 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:03:39 INFO dspy.teleprompt.gepa.gepa: Iteration 20: Proposed new text for predict: You are a database expert assistant. For each task you will be given:
- sql_context: SQL DDL/DML (CREATE TABLE and INSERT statements) describing table schemas, types, and example data.
- sql_prompt: a natural-language request describing the data the user wants (or a modification such as DELETE/UPDATE).

Your job: produce a correct, minimal SQL statement that satisfies the sql_prompt using the schema given in sql_context, plus a short (1–3 sentence) explanation of your approach.

Rules and important domain facts to follow (use these every time):
1. Output format
   - Return two labeled parts: a brief "reasoning" (1–3 sentences) followed by the "sql" statement.
   - Keep the reasoning concise and focused on why the chosen SQL matches the prompt.
   - The SQL should be just the statement(s) needed (no extra commentary or formatting). Use standard SQL syntax appropriate to the functions already

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:59<00:00, 19.81s/it] 

2025/10/12 21:05:00 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:05:21 INFO dspy.teleprompt.gepa.gepa: Iteration 21: Proposed new text for predict: You are a SQL-generation expert. Your job: read a short natural-language request (sql_prompt) together with a SQL DDL/DML snippet (sql_context) that defines available tables and sample rows, infer the schema and the user intent, and produce exactly one SQL query that answers the request against that schema.

Input you will receive:
- sql_context: one or more CREATE TABLE and INSERT statements. Use these to learn exact table names, column names, and sample value formats. Do not invent tables or columns that are not present.
- sql_prompt: a concise natural-language instruction describing exactly what the user wants returned (which columns, which aggregations, filters, grouping/granularity, ordering, top N, etc).

Hard rules you must follow exactly:
1. Output exactly one SQL query and nothing else. No extra text, explanation, or commentary.
2. Wrap the query in a single code block and produce 

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:14<00:00,  4.83s/it]

2025/10/12 21:06:35 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 21:06:35 INFO dspy.teleprompt.gepa.gepa: Iteration 22: All subsample scores perfect. Skipping.
2025/10/12 21:06:35 INFO dspy.teleprompt.gepa.gepa: Iteration 22: Reflective mutation did not propose a new candidate
GEPA Optimization:  77%|███████▋  | 914/1180 [42:31<30:01,  6.77s/rollouts]2025/10/12 21:06:35 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Selected program 3 score: 0.555



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:29<00:00,  9.81s/it] 

2025/10/12 21:07:05 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:07:29 INFO dspy.teleprompt.gepa.gepa: Iteration 23: Proposed new text for predict: You are a SQL-generation expert whose job is to read two inputs and produce exactly one ANSI-ish SQL query (and nothing else) that answers the natural-language request against the schema/examples provided.

Inputs:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define table names, column names, data types and sample values. Use these to infer exact table & column names, value conventions (how NULLs and empty strings are used), and boolean/string literal styles.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping ("per X" / "for each X"), ordering, or other transformations to return.

Hard output rules (must be followed exactly):
1. Output exactly one SQL query and nothing else.
2. Put the SQL query only inside a single code block. Do not output anything outside the code block.
3. Return exactly the columns t

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:18<00:00,  6.03s/it] 

2025/10/12 21:08:08 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 21:08:37 INFO dspy.teleprompt.gepa.gepa: Iteration 24: Proposed new text for predict: You are a SQL-generation assistant (a database expert). You are given two inputs:
- sql_context: a string containing DDL/DML for the available tables (CREATE TABLE and INSERT statements). Use these to discover table and column names and example values. Do not invent tables or columns not present in sql_context.
- sql_prompt: a natural-language request describing the data the user wants.

Your job: produce a correct, minimal, portable SQL query that satisfies the sql_prompt when run against the schema/data in sql_context. Also produce a short, 1–3 sentence reasoning that states any assumptions you made (especially if the prompt is ambiguous) and explains key choices (e.g., aggregating across tables vs per-table, substring vs exact string matching, NULL handling).

Rules, heuristics and domain details to follow:

1. Respect the schema in sql_context
   - Always use the exact table names and c

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:32<00:00, 10.67s/it] 

2025/10/12 21:11:25 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:11:47 INFO dspy.teleprompt.gepa.gepa: Iteration 25: Proposed new text for predict: You are a SQL-generation expert. Your job is to read two inputs and produce exactly one SQL statement that answers the user's natural-language request against the provided schema/examples.

Inputs you will receive:
- sql_context: one or more CREATE TABLE statements and optional INSERT statements that define table names, column names, types, and example values. Use these to infer:
  - Exact table and column names (use them verbatim).
  - Literal conventions for values (string quoting style, boolean literal form, date/time formats).
  - Which columns are numeric, text, boolean, timestamps, etc., and any representation quirks shown by the INSERT examples.
- sql_prompt: a short natural-language instruction describing exactly what to return (columns, filters, aggregations, grouping, time windows, DML operations).

Hard rules you must always follow (no exceptions):
1. Output exactly one SQL state

Average Metric: 0.00 / 3 (0.0%): 100%|██████████| 3/3 [00:33<00:00, 11.31s/it]

2025/10/12 21:12:55 INFO dspy.evaluate.evaluate: Average Metric: 0.0 / 3 (0.0%)





2025/10/12 21:13:30 INFO dspy.teleprompt.gepa.gepa: Iteration 26: Proposed new text for predict: You are a SQL-generation expert. Your job is to read two inputs and produce exactly one ANSI-ish SQL statement (and nothing else) that answers the natural-language request against the schema/examples provided.

INPUT FORMAT you will receive:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define the schema and show sample values. Use these to infer exact table and column names, data types, and conventions for representing missing values and booleans.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping ("per X"/"for each X"), ordering, or other transformations the user wants.

PRIMARY OBJECTIVE:
- Produce exactly one SQL statement that correctly implements the sql_prompt using only the schema and value conventions shown in sql_context.

HIGH-PRIORITY OUTPUT RULES (must follow exactly):
1. Output exactly one SQ

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:23<00:00,  7.86s/it] 

2025/10/12 21:14:34 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:15:11 INFO dspy.teleprompt.gepa.gepa: Iteration 27: Proposed new text for predict: You are a SQL-generation expert. You will be given two inputs each time:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define table names, column names, data types and sample values. Use these to infer exact table & column identifiers, the types, how NULLs and empty strings are represented, and how booleans are represented.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping, ordering, or other transformations are required.

Your job: read the inputs and produce exactly one ANSI-ish SQL query (and nothing else) that answers the natural-language request against the schema/examples in sql_context.

Hard output rules (follow exactly):
1. Output exactly one SQL statement and nothing else. No extra text, explanation, or commentary.
2. Put the SQL statement inside a single code block only. Do not output anything 

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:38<00:00, 12.88s/it]

2025/10/12 21:16:08 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 21:16:08 INFO dspy.teleprompt.gepa.gepa: Iteration 28: All subsample scores perfect. Skipping.
2025/10/12 21:16:08 INFO dspy.teleprompt.gepa.gepa: Iteration 28: Reflective mutation did not propose a new candidate
GEPA Optimization:  97%|█████████▋| 1147/1180 [52:03<02:10,  3.96s/rollouts]2025/10/12 21:16:08 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Selected program 0 score: 0.555



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:21<00:00,  7.08s/it] 

2025/10/12 21:16:29 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 21:16:53 INFO dspy.teleprompt.gepa.gepa: Iteration 29: Proposed new text for predict: You are a database expert assistant whose job is to produce a correct, schema-valid SQL query (and a short explanation of the approach) given:
- sql_context: DDL and INSERT statements that define the available table names, column names and example data.
- sql_prompt: a natural-language request describing the data the user wants.

General requirements
1. Always read sql_context first and use only tables and columns that actually exist in that schema. Do not reference columns (e.g., continent) or functions that are not present in the provided context unless the prompt explicitly allows or asks you to assume them.
2. If sql_context uses particular identifier types (e.g., integer id columns), prefer joining on those keys rather than on textual columns such as names (unless the schema only provides names).
3. Do not hardcode values (for example, a list of country names) unless:
   - the schema c

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:34<00:00, 11.44s/it]

2025/10/12 21:17:49 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 21:17:49 INFO dspy.teleprompt.gepa.gepa: Iteration 30: All subsample scores perfect. Skipping.
2025/10/12 21:17:49 INFO dspy.teleprompt.gepa.gepa: Iteration 30: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 1156/1180 [53:44<02:06,  5.26s/rollouts]2025/10/12 21:17:49 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Selected program 4 score: 0.525



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:17<00:00,  5.86s/it]

2025/10/12 21:18:06 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 21:18:06 INFO dspy.teleprompt.gepa.gepa: Iteration 31: All subsample scores perfect. Skipping.
2025/10/12 21:18:06 INFO dspy.teleprompt.gepa.gepa: Iteration 31: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 1159/1180 [54:01<01:51,  5.32s/rollouts]2025/10/12 21:18:06 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Selected program 0 score: 0.555



Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:17<00:00,  5.86s/it]

2025/10/12 21:18:24 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/10/12 21:18:24 INFO dspy.teleprompt.gepa.gepa: Iteration 32: All subsample scores perfect. Skipping.
2025/10/12 21:18:24 INFO dspy.teleprompt.gepa.gepa: Iteration 32: Reflective mutation did not propose a new candidate
GEPA Optimization:  98%|█████████▊| 1162/1180 [54:19<01:36,  5.39s/rollouts]2025/10/12 21:18:24 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Selected program 3 score: 0.555



Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [00:29<00:00,  9.97s/it] 

2025/10/12 21:18:54 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/10/12 21:19:25 INFO dspy.teleprompt.gepa.gepa: Iteration 33: Proposed new text for predict: You are a SQL-generation expert assistant. Your job is: given two inputs (sql_context and sql_prompt), produce exactly one ANSI-ish SQL query (and nothing else) that answers the natural-language request against the schema and example data provided in sql_context.

INPUTS you will receive:
- sql_context: one or more CREATE TABLE and INSERT statements. Use these to infer exact table names, column names, data types, value conventions (how NULLs and empty strings are represented) and quoting style.
- sql_prompt: a short natural-language instruction describing exactly what columns, aggregates, filters, grouping ("per X" / "for each X"), ordering, or other transformations are required.

OUTPUT REQUIREMENTS (must follow exactly):
1. Output exactly one SQL query and nothing else. Do not output any explanation, commentary, or extra characters.
2. Put the SQL query only inside a single code block (tr

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:21<00:00,  7.06s/it] 

2025/10/12 21:20:16 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:20:45 INFO dspy.teleprompt.gepa.gepa: Iteration 34: Proposed new text for predict: You are a SQL-generation expert. Your single job: read a natural-language request (sql_prompt) together with a SQL snippet that defines the schema and sample rows (sql_context), infer the schema and intent, and produce exactly one SQL query that answers the request against that schema.

Input you will receive:
- sql_context: one or more CREATE TABLE statements plus INSERTs that define exact table names, column names, data types and sample values. Use this to infer the available tables, the exact identifier spellings/casing, data types, boolean formatting (TRUE/FALSE, 1/0, etc.), date formats, and any other representation details.
- sql_prompt: a short natural-language instruction that specifies the columns/aggregations/filters/ordering/grouping the user wants.

Hard requirements for your output:
1. Output exactly one SQL statement and nothing else. No commentary, no explanation, no surround

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [00:14<00:00,  4.88s/it] 

2025/10/12 21:21:24 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/10/12 21:21:53 INFO dspy.teleprompt.gepa.gepa: Iteration 35: Proposed new text for predict: You are a SQL-writing database expert. For each task you will be given:
- sql_context: SQL CREATE TABLE and INSERT statements that define the available tables, columns and sample data.
- sql_prompt: a natural-language request for the data the user wants.

Your job: produce a correct, minimal, and robust SQL query that answers sql_prompt given the schema and sample data, plus a 1–3 sentence concise reasoning that explains your approach and any assumptions. Always base your answer on the exact schema and sample INSERTs in sql_context.

Detailed rules and best practices you must follow:

1. Examine schema and sample data first
   - Use the exact table and column names from the CREATE TABLE statements.
   - Use sample INSERTs to sanity-check that the query will return the expected rows/values.

2. Match the prompt semantics exactly
   - If the prompt implies a filter (e.g., "union members" in c

## Review original and optimized prompts

In [14]:
# Baseline prompt: the instruction text currently attached to the un-optimized program's predictor.
baseline_instructions = program.predict.signature.instructions
print(baseline_instructions)

You are a database expert. You are provided with context for how some table(s) were constructed, and a natural language prompt for what the user wants. Your job is to write a SQL query to provide them with the required data.


In [15]:
# Instruction text attached to the optimized program's predictor, for side-by-side comparison with the baseline.
optimized_instructions = optimized_program.predict.signature.instructions
print(optimized_instructions)

You are a SQL-generation expert whose sole job is:
- Read two inputs: sql_context and sql_prompt.
  - sql_context: one or more CREATE TABLE statements plus INSERTs that define the available tables, column names, and example values (use these to infer exact table/column names and types and the representation conventions used, e.g., boolean TRUE/FALSE or 't'/'f', date/time formats, string quoting).
  - sql_prompt: a short natural-language instruction describing exactly what columns/aggregations/filters/aggregation-granularity the user wants.

Your output:
- Produce exactly one SQL statement (query or DML) that answers the sql_prompt against the schema in sql_context.
- Output only that SQL statement, and nothing else.
- Place the SQL statement inside a single code block and do not include any other text, comments, or explanation outside the code block.

Hard rules you must follow:
1. Single SQL statement only. No extra text, no explanations, no alternative queries.
2. Use table names and

## Evals

In [16]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from datasets import Dataset
from time import perf_counter
from typing import Dict, Any, Optional

def evaluate_program(
    program,
    ds_test: Dataset,
    limit: int = 100,
    max_workers: int = 8,
    field_map: Optional[Dict[str, str]] = None,
) -> Dict[str, Any]:
    """
    Evaluate a DSPy program on the first `limit` rows of a HF Dataset split.

    Every row is passed through `program`; the predicted SQL is then compared with
    the row's reference SQL by the notebook-level `judge` callable. A row counts as
    correct when the judge result has a truthy `similar` attribute. Exceptions raised
    by the program or the judge are caught per row and reported as failures, so one
    bad row never aborts the whole run.

    Args:
        program: a DSPy Module with signature program(sql_prompt=..., sql_context=...)
        ds_test: Hugging Face Dataset (e.g., ds["test"])
        limit: number of rows to evaluate (default 100); values below 0 are treated as 0
        max_workers: parallel threads for I/O-bound LM + judge
        field_map: optional mapping if your column names differ:
                   {"sql_prompt": "...", "sql_context": "...", "sql": "..."}
                   Keys that are omitted fall back to the default column names.

    Returns:
        {
          "accuracy": float,        # correct / total, 0.0 when nothing was evaluated
          "correct": int,
          "total": int,
          "avg_latency_s": float,   # mean per-row time (program + judge), not wall time / rows
          "elapsed_s": float,       # wall-clock time of the whole (parallel) run
          "failures": [ {idx, reason, pred_sql, feedback} ... up to 20 ],
        }
    """
    default_fields = {"sql_prompt": "sql_prompt", "sql_context": "sql_context", "sql": "sql"}
    # Merge instead of replace so a partial mapping only overrides the columns it names.
    field_map = {**default_fields, **(field_map or {})}

    # Clamp at 0: a negative limit would otherwise yield a negative `total`.
    n = max(0, min(limit, len(ds_test)))
    subset = ds_test.select(range(n))
    start = perf_counter()

    def _eval_one(i_row):
        """Run program + judge for one row; returns (idx, ok, pred_sql, feedback, error, latency_s)."""
        i, row = i_row
        pred_sql = ""  # kept across the try so a judge failure still reports the prediction
        row_start = perf_counter()
        try:
            pred = program(
                sql_prompt=row[field_map["sql_prompt"]],
                sql_context=row[field_map["sql_context"]],
            )
            pred_sql = getattr(pred, "sql", None) or (pred.get("sql") if isinstance(pred, dict) else None) or ""
            jr = judge(
                sql_context=row[field_map["sql_context"]],
                sql_prompt=row[field_map["sql_prompt"]],
                golden_sql=row[field_map["sql"]],
                candidate_sql=pred_sql,
            )
            ok = bool(getattr(jr, "similar", False))
            feedback = getattr(jr, "reasoning", "") or ""
            return (i, ok, pred_sql, feedback, None, perf_counter() - row_start)
        except Exception as e:
            # Deliberate best-effort: record the error and keep evaluating the other rows.
            return (i, False, pred_sql, "", f"{type(e).__name__}: {e}", perf_counter() - row_start)

    results = []
    # Threaded evaluation (I/O bound: LM + judge). Tune max_workers to your provider limits.
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        # Rows are fetched here, on the calling thread; only the LM/judge work is parallel.
        futures = [ex.submit(_eval_one, (i, subset[i])) for i in range(n)]
        for f in as_completed(futures):
            results.append(f.result())

    # Sort back to input order
    results.sort(key=lambda x: x[0])

    correct = sum(1 for _, ok, *_ in results if ok)
    total = n
    acc = correct / total if total else 0.0
    elapsed = perf_counter() - start
    # Average the per-row timings: dividing wall time by `total` would understate
    # latency because rows run concurrently.
    avg_lat = sum(r[5] for r in results) / total if total else 0.0

    failures = []
    for i, ok, pred_sql, feedback, err, _latency in results:
        if not ok and len(failures) < 20:
            failures.append({
                "idx": i,
                "reason": ("error: " + err) if err else "mismatch",
                "pred_sql": pred_sql,
                "feedback": feedback,
            })

    return {
        "accuracy": acc,
        "correct": correct,
        "total": total,
        "avg_latency_s": avg_lat,
        "elapsed_s": elapsed,
        "failures": failures,
    }


In [17]:
# Score the original (un-optimized) program on the first 500 rows of ds["test"].
test_split = ds["test"]
orig_metrics = evaluate_program(program, ds_test=test_split, limit=500, max_workers=32)

print(f"Original: {orig_metrics['accuracy']} ({orig_metrics['correct']}/{orig_metrics['total']})")

Original: 0.634 (317/500)


In [18]:
# Score the optimized program on the same split and row limit as the original, for a like-for-like comparison.
opt_metrics = evaluate_program(optimized_program, ds_test=test_split, limit=500, max_workers=32)
print(f"Optimized: {opt_metrics['accuracy']} ({opt_metrics['correct']}/{opt_metrics['total']})")

Optimized: 0.646 (323/500)
