In [40]:
USE_GROQ = True  # Set to False to use OpenAI
REQUEST_DELAY = 0.5

In [41]:
!pip install datasets



In [42]:
!pip install openai
!pip install groq



In [44]:
# Core libraries
import os
import re
import json
from groq import Groq
import openai
import json
import ast

# Data handling
import pandas as pd
from datasets import load_dataset
from collections import defaultdict


# Model & inference (if using Hugging Face transformers)
# from transformers import (
#     AutoTokenizer,
#     AutoModelForCausalLM,
#     BitsAndBytesConfig,
#     pipeline
# )

# Evaluation (optional at this stage since you're not doing full metrics yet)
from sklearn.metrics import accuracy_score

# PyTorch (only if needed for model management or tensor ops)
import torch

from itertools import cycle

from huggingface_hub import login
login("token")


In [45]:

# List of your GROQ API keys
GROQ_API_KEYS = [
    # paid
    "TEST_KEY"
]

# Create a cycling iterator over the API keys
groq_clients = [Groq(api_key=key) for key in GROQ_API_KEYS]
groq_client_cycle = cycle(groq_clients)



In [48]:
dataset_name = "cardiffnlp/databench"
semeval_train = load_dataset(dataset_name, name="qa", split="train")
# semeval_dev = load_dataset(dataset_name, name="semeval", split="dev")

# dev_target_datasets = ['051_Pokemon', '052_Professional', '055_German']
# semeval_dev = semeval_dev.filter(lambda sample: sample['dataset'] in dev_target_datasets)

# train_target_datasets = ['051_Pokemon', '052_Professional', '055_German']


train_target_datasets = [
    "001_Forbes", "002_Titanic", "004_Taxi", "005_NYC",
    "006_London", "007_Fifa", "008_Tornados", "009_Central", "010_ECommerce",
    "011_SF", "012_Heart", "013_Roller", "015_Food",
    "016_Holiday", "017_Hacker", "018_Staff", "019_Aircraft",
    "021_Telco", "022_Airbnbs", "023_Climate", "024_Salary", "025_Data",
    "026_Predicting", "027_Supermarket", "028_Predict", "029_NYTimes", "030_Professionals",
    "031_Trustpilot", "032_Delicatessen", "033_Employee", "034_World",
    "036_US", "037_Ted", "038_Stroke", "039_Happy", "040_Speed",
    "041_Airline", "042_Predict", "043_Predict", "044_IMDb", "045_Predict",
    "046_120", "047_Bank", "048_Data",  "050_ING",
    "051_Pokemon", "052_Professional", "053_Patents", "055_German",
    "056_Emoji", "057_Spain", "058_US", "059_Second", "060_Bakery",
    "061_Disneyland", "062_Trump", "063_Influencers", "064_Clustering", "065_RFM"
]



# Not running:  "020_Real", "035_Billboard", "049_Boris", "054_Joe",

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

In [49]:
print(len(train_target_datasets))

59


In [53]:
dev_dataset_map = {}
unique_dev_datasets = list(set(semeval_train.unique("dataset")))

for dataset in train_target_datasets:
    dev_dataset_map[dataset] = pd.read_parquet(f"hf://datasets/cardiffnlp/databench/data/{dataset}/sample.parquet")


In [54]:
print(len(dev_dataset_map))

59


In [55]:
def clean_column_name(col_name):
    # Use regex to remove the <gx:...> part
    cleaned = re.sub(r'<gx:[^>]+>', '', col_name)
    return cleaned

for ds in dev_dataset_map.keys():
    # Get the original dataframe
    df = dev_dataset_map[ds]

    column_mapping = {col: clean_column_name(col) for col in df.columns}

    # Rename the columns in the dataframe
    df = df.rename(columns=column_mapping)

    # Store the updated dataframe back in the hashmap
    dev_dataset_map[ds] = df

In [56]:
def safe_parse_list(raw):
    import ast
    import re

    if isinstance(raw, list):
        return raw

    if isinstance(raw, str):
        try:
            # Step 1: Try literal_eval first (may work if quotes are fine)
            parsed = ast.literal_eval(raw)
            if isinstance(parsed, list):
                return parsed
        except:
            pass

        try:
            # Step 2: Manually repair unquoted or single-quoted elements
            if raw.startswith("[") and raw.endswith("]"):
                inner = raw[1:-1].strip()
                if inner and not inner.startswith(("'", '"')):
                    # Case: [Weight, Height]
                    parts = [p.strip() for p in inner.split(",")]
                    quoted = [f'"{p}"' for p in parts if p]
                    fixed = "[" + ", ".join(quoted) + "]"
                else:
                    # Case: ["What's your name?"]
                    # Escape inner quotes
                    inner = raw[1:-1]
                    fixed_inner = re.sub(r'(["\'])', r'\\\1', inner)
                    fixed = f'["{fixed_inner}"]'
                parsed = ast.literal_eval(fixed)
                if isinstance(parsed, list):
                    return parsed
        except Exception as e:
            print(f"Failed to parse used_cols string: {raw} | Reason: {e}")

    return raw # should never happen

def clean_columns_used(col_list):
    parsed = safe_parse_list(col_list)

    if not isinstance(parsed, list):
        return [str(parsed)] if parsed is not None else []

    cleaned = []
    for col in parsed:
        col = str(col)
        col = re.sub(r"<gx:[^>]+>", "", col).strip()
        cleaned.append(col)
    return cleaned

def update_columns_used(example):
    example["columns_used"] = clean_columns_used(example.get("columns_used"))
    return example

semeval_train = semeval_train.map(update_columns_used)

In [57]:
def serialize_to_kv_format(df, dropna=True):
    kv_serialized = []
    for _, row in df.iterrows():
        kv_pairs = []
        for col, val in row.items():
            if pd.isna(val) and dropna:
                continue
            if isinstance(val, str):
                val = f'"{val}"'
            kv_pairs.append(f"{col}: {val}")
        row_str = "{" + ", ".join(kv_pairs) + "}"
        kv_serialized.append(row_str)
    return kv_serialized

def build_prompt(df: pd.DataFrame, question: str, explain: bool = False, use_csv_format: bool = True) -> str:
    if use_csv_format:
        serialized_data = df.to_csv(index=False)
        data_block = f"In the following CSV formatted data:\n```csv\n{serialized_data}```"
    else:
        serialized_data = serialize_to_kv_format(df)
        data_block = f"In the following key-value formatted data:\n⁠kv\n{serialized_data}\n⁠ "

    response_format = (
        'You must answer in a single JSON with two fields:\n'
        '* "answer": your final answer based on the records.\n'
        '* "columns_used": list of relevant columns.'
    )

    prompt_body = (
        "You are an assistant tasked with answering questions asked of a given dataset in JSON format.\n"
        f"{response_format}\n"
        "Requirements:\n"
        "* Only respond with the JSON. Do not include explanations or full objects.\n"
        "* Your answer must use valid Python data types:\n"
        "  - Use `True` or `False` (capitalized) for boolean values.\n"
        "  - Use numbers as Python `int` or `float` (e.g., `3`, `3.14`).\n"
        "  - Use double-quoted Python strings for categorical values (e.g., \"USA\").\n"
        "  - Use Python lists for answers involving multiple values:\n"
        "    - For list[category], return a list of strings.\n"
        "    - For list[number], return a list of ints or floats.\n"
        "    - Ensure all inner values match the correct type.\n"
        "* Do not include explanations or justification.\n\n"
        f"{data_block}\n"
        f"USER: {question}\n"
        "ASSISTANT:"
    )

    return f"[INST]\n{prompt_body}\n[/INST]"



In [58]:
# def generate_model_response(prompt):
#     if USE_GROQ:
#         response = groq_client.chat.completions.create(
#             # model="llama3-8b-8192",
#             model="meta-llama/llama-4-scout-17b-16e-instruct",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.7,
#             max_tokens=512,
#             top_p=1.0
#         )
#         return response.choices[0].message.content

#     else:
#         response = openai.ChatCompletion.create(
#             model="gpt-3.5-turbo",
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.7,
#             max_tokens=512,
#             top_p=1.0
#         )
#         return response.choices[0].message["content"]

def generate_model_response(prompt):
    """
    Generates a response using Groq (LLaMA-3) or OpenAI (GPT-3.5).
    Falls back across multiple Groq keys if needed.
    """
    if USE_GROQ:
        for attempt in range(len(groq_clients)):
            groq_client = next(groq_client_cycle)

            try:
                response = groq_client.chat.completions.create(
                    model="llama3-8b-8192",
                    # model="llama3-70b-8192",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.7,
                    max_tokens=512,
                    top_p=1.0
                )
                return response.choices[0].message.content

            except Exception as e:
                print(f"[Groq Attempt {attempt + 1}] Error: {e}")
                continue

        return "All Groq API keys exhausted or rate limited."

    else:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=512,
                top_p=1.0
            )
            return response.choices[0].message["content"]

        except Exception as e:
            return f"OpenAI request failed: {e}"


In [59]:
def normalize_number(value):
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        return float(value.strip())
    raise ValueError(f"Expected numeric type for number, got: {type(value)}")

def normalize_category(value):
    if isinstance(value, str):
        return value.strip()
    raise ValueError(f"Expected string for category, got: {type(value)}")

def normalize_boolean(value):
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        val = value.strip().lower()
        if val in {"true", "1", "yes"}:
            return True
        elif val in {"false", "0", "no"}:
            return False
        else:
            raise ValueError(f"Unrecognized string for boolean: {value!r}")
    if isinstance(value, (int, float)):
        if value == 1:
            return True
        elif value == 0:
            return False
        else:
            raise ValueError(f"Numeric value not valid for boolean: {value}")
    raise ValueError(f"Expected bool, int, float, or string for boolean, got: {type(value)}")

In [60]:
def normalize_list_category(answer):
    """
    Normalize an answer of type list[category] into a set of cleaned strings.
    Handles both true lists and stringified list representations,
    and removes stray square brackets from individual elements.
    """
    def clean_item(x):
        x = str(x).strip()
        if x.startswith("["):
            x = x[1:]
        if x.endswith("]"):
            x = x[:-1]
        return x.strip()

    if isinstance(answer, str):
        try:
            parsed = ast.literal_eval(answer)
            if isinstance(parsed, list):
                answer = parsed
            else:
                answer = [item.strip() for item in answer.split(",") if item.strip()]
        except:
            answer = [item.strip() for item in answer.split(",") if item.strip()]

    if isinstance(answer, list):
        return set(clean_item(x) for x in answer)

    return set()  # fallback


In [61]:
def normalize_list_number(value):
    """
    Normalizes a predicted or gold value for list[number] questions.

    Expected input:
    - a string representing a list of numbers, e.g., "[2, 2, 2]"
    - OR a Python list of numbers
    - OR a stringified CSV like "2, 2, 2"

    Returns:
    - A set of floats
    """
    try:
        if isinstance(value, str):
            try:
                # Try parsing as JSON list
                value = json.loads(value)
            except json.JSONDecodeError:
                try:
                    value = ast.literal_eval(value)
                except:
                    # fallback: comma-separated
                    value = [item.strip() for item in value.split(",") if item.strip()]
        return set(float(v) for v in value)
    except Exception as e:
        print(f"normalize_list_number error: {e}")
        return None


In [62]:
def normalize_answer(value, expected_type):
    """
    Dispatches to the appropriate normalization function based on expected_type.
    """
    try:
        if expected_type == "number":
            return normalize_number(value)
        elif expected_type == "category":
            return normalize_category(value)
        elif expected_type == "boolean":
            return normalize_boolean(value)
        elif expected_type == "list[category]":
            return normalize_list_category(value)
        elif expected_type == "list[number]":
            return normalize_list_number(value)
        else:
            raise ValueError(f"Unsupported expected type: {expected_type}")
    except Exception as e:
        print(f"normalize_answer error for type '{expected_type}': {e}")
        return None


In [63]:
def normalize_columns(value):
    """
    Normalize a gold or predicted column list into a set of strings,
    preserving casing and special characters.
    """
    try:
        if isinstance(value, str):
            # Try parsing as a list
            try:
                parsed = ast.literal_eval(value)
                if isinstance(parsed, list):
                    value = parsed
                else:
                    # fallback: comma-split string
                    value = [item.strip() for item in value.split(",") if item.strip()]
            except:
                value = [item.strip() for item in value.split(",") if item.strip()]

        if isinstance(value, list):
            return set(str(x).strip() for x in value)

    except Exception as e:
        print(f"normalize_columns error: {e}")

    return set()  # fallback


In [None]:
error_dict = {}

In [64]:
import json
import ast

def process_response(generated_text, question, expected_type, error_set):
    """
    Processes the raw LLM response to extract and normalize the answer and columns.

    Args:
        generated_text (str): Raw text output from the LLM.
        question (str): The question (used for debugging).
        expected_type (str): The expected type of the answer (e.g., boolean, number).
        error_set (set): A set to store questions that had format errors.

    Returns:
        Tuple[bool, Any, List[str]]:
            - is_error (bool): True if formatting/parsing failed.
            - norm_answer: normalized answer or None on failure.
            - norm_columns: normalized list of columns or [] on failure.
    """
    is_error = False
    norm_answer = None
    norm_columns = []

    try:
        # Step 1: Trim response
        generated_text = generated_text.strip()

        # Step 2: Try to isolate a dictionary from the output
        start = generated_text.find('{')
        end = generated_text.rfind('}') + 1
        if start == -1 or end == -1:
            raise ValueError("Could not find a JSON-like object")

        json_str = generated_text[start:end]

        # Fix lowercase true/false if needed
        json_str_fixed = json_str.replace("true", "True").replace("false", "False")

        # Step 3: Try parsing as JSON first, fallback to ast.literal_eval
        try:
            response_json = json.loads(json_str)
        except json.JSONDecodeError:
            response_json = ast.literal_eval(json_str_fixed)

        # Step 4: Ensure expected keys exist
        if "answer" not in response_json or "columns_used" not in response_json:
            raise KeyError("Missing 'answer' or 'columns_used' in response")

        raw_answer = response_json["answer"]
        raw_columns = response_json["columns_used"]

        # Step 5: Normalize both fields
        norm_answer = normalize_answer(raw_answer, expected_type)
        norm_columns = normalize_columns(raw_columns)

    except Exception as e:
        # On any failure, flag error and return safe defaults
        print(f"[process_response] Failed to parse response for question: {question}")
        print(f"Error: {e}")
        is_error = True
        error_set.add(question)
        norm_answer = None
        norm_columns = []

    # Return tuple: (was error?, normalized answer, normalized columns)
    return is_error, norm_answer, norm_columns


In [65]:
example = semeval_train[0]
question = example["question"]
dataset_id = example["dataset"]
expected_type = example["type"]
true_answer = normalize_answer(example["sample_answer"], expected_type)
true_columns = normalize_columns(example["columns_used"])

print(f"Type(true_answer) {type(true_answer)}")
print(f"Type(true_columns) {type(true_columns)}")

df = dev_dataset_map[dataset_id]
prompt = build_prompt(df, question)
# print("=== Prompt ===\n", prompt[:1000])  # Truncated for readability

model_output = generate_model_response(prompt)
# print("=== Raw Model Output ===\n", model_output)

sample_error_set = set()

isPredError, pred_answer, pred_cols = process_response(model_output, question, expected_type, sample_error_set)

print(f"\nType(pred_answer) {type(pred_answer)}")
print(f"Type(pred_columns) {type(pred_cols)}")

print(f"\nQuestion: {question}")
print(f"Is Error? : {isPredError}")
print(f"Gold Answer: {true_answer}, Predicted: {pred_answer}")
print(f"Gold Columns: {true_columns}, Predicted: {pred_cols}\n")


Type(true_answer) <class 'bool'>
Type(true_columns) <class 'set'>

Type(pred_answer) <class 'bool'>
Type(pred_columns) <class 'set'>

Question: Is the person with the highest net worth self-made?
Is Error? : False
Gold Answer: False, Predicted: True
Gold Columns: {'selfMade', 'finalWorth'}, Predicted: {'selfMade', 'finalWorth'}



In [66]:
import time

def safe_generate_response(prompt, retries=1, delay=3):
    attempts = 0
    while attempts <= retries:
        try:
            return generate_model_response(prompt)
        except Exception as e:
            print(f"[Attempt {attempts+1}] Error: {repr(e)}")
            time.sleep(delay)
            attempts += 1
    print("Failed all attempts. Returning empty response.")
    return ""


In [67]:
def evaluate_dataset(dataset_rows, dataset_name, dev_dataset_map, request_delay=1.5):
    pred_answers = []
    gold_answers = []
    pred_columns = []
    gold_columns = []
    question_types = []

    type_wise_correct = defaultdict(int)
    type_wise_total = defaultdict(int)
    column_match_count = 0
    error_set = set()
    formatting_errors_by_type = defaultdict(int)

    print(f"# Questions in {dataset_name}: {len(dataset_rows)}")

    for i, row in enumerate(dataset_rows):
        print(f"\n--- Query {i+1}/{len(dataset_rows)} ---")

        question = row["question"]
        dataset = row["dataset"]
        expected_type = row["type"]

        gold_answer = normalize_answer(row["sample_answer"], expected_type)
        gold_cols = normalize_columns(row["columns_used"])

        df = dev_dataset_map[dataset]
        prompt = build_prompt(df, question)
        # print(f"promot: {prompt}")

        response = safe_generate_response(prompt)
        time.sleep(request_delay)
        print("\n--- Raw LLM Response ---")
        print(response)

        is_error, pred_answer, pred_cols = process_response(response, question, expected_type, error_set)
        if is_error:
            formatting_errors_by_type[expected_type] += 1

        print(f"\nQuestion: {question}")
        print(f"Pred Answer: {pred_answer}, Gold Answer: {gold_answer}")
        print(f"Pred Columns: {pred_cols}, Gold Columns: {gold_cols}")

        pred_answers.append(pred_answer)
        gold_answers.append(gold_answer)
        pred_columns.append(pred_cols)
        gold_columns.append(gold_cols)
        question_types.append(expected_type)
        type_wise_total[expected_type] += 1

        correct = False
        try:
            if expected_type == "number":
                correct = abs(pred_answer - gold_answer) < 1e-3
            else:
                correct = pred_answer == gold_answer
        except:
            correct = False

        if correct:
            type_wise_correct[expected_type] += 1

        if isinstance(pred_cols, (list, set)) and set(pred_cols) == set(gold_cols):
            column_match_count += 1

    print("\n=== Answer Accuracy by Type ===")
    for qtype in type_wise_total:
        total = type_wise_total[qtype]
        correct = type_wise_correct[qtype]
        acc = correct / total if total else 0
        print(f"{qtype:15}: {acc:.2%} ({correct}/{total})")

    total = len(dataset_rows)
    col_acc = column_match_count / total if total else 0
    print(f"\n=== Column Selection Accuracy ===\n{col_acc:.2%} ({column_match_count}/{total})")

    eval_records = []
    for i in range(len(dataset_rows)):
        eval_records.append({
            "type": question_types[i],
            "gold_answer": gold_answers[i],
            "pred_answer": pred_answers[i],
            "gold_columns": gold_columns[i],
            "pred_columns": pred_columns[i],
        })

    # Final column stats
    wrong_cols = 0
    right_cols = 0
    format_errors = 0
    for i, (pred, gold) in enumerate(zip(pred_columns, gold_columns)):
        question = dataset_rows[i]["question"]

        if question in error_set:
            format_errors += 1
            continue

        if isinstance(pred, (list, set)):
            if set(pred) == set(gold):
                right_cols += 1
            else:
                wrong_cols += 1
        else:
            format_errors += 1  # fallback, shouldn't happen


    column_stats = {
        "wrong_cols": wrong_cols,
        "right_cols": right_cols,
        "format_error": format_errors,
        "total": len(dataset_rows)
    }

    return eval_records, column_stats, formatting_errors_by_type

In [68]:
def compute_metrics(eval_records, model_name, model_results):
    results = defaultdict(float)
    total = len(eval_records)
    correct_all = 0
    typewise = defaultdict(lambda: [0, 0])
    colwise = {'single': [0, 0], 'multi': [0, 0]}

    for record in eval_records:
        t = record['type'].strip().lower()
        gold_answer = record['gold_answer']
        pred_answer = record['pred_answer']
        gold_cols = set(record['gold_columns'])
        pred_cols = set(record['pred_columns'])

        try:
            if t == "number":
                answer_match = abs(pred_answer - gold_answer) < 1e-3
            else:
                answer_match = pred_answer == gold_answer
        except:
            answer_match = False

        col_match = gold_cols == pred_cols
        joint_match = answer_match and col_match
        if joint_match:
            correct_all += 1

        typewise[t][1] += 1
        if joint_match:
            typewise[t][0] += 1

        col_count = len(gold_cols)
        if col_count == 1:
            colwise['single'][1] += 1
            if joint_match:
                colwise['single'][0] += 1
        else:
            colwise['multi'][1] += 1
            if joint_match:
                colwise['multi'][0] += 1

    def get_acc(dic, key):
        correct, total = dic[key]
        return correct / total if total else 0

    results['avg'] = correct_all / total if total else 0
    results['boolean'] = get_acc(typewise, 'boolean')
    results['number'] = get_acc(typewise, 'number')
    results['category'] = get_acc(typewise, 'category')
    results['list[category]'] = get_acc(typewise, 'list[category]')
    results['list[number]'] = get_acc(typewise, 'list[number]')
    results['single col'] = get_acc(colwise, 'single')
    results['multiple cols'] = get_acc(colwise, 'multi')

    model_results[model_name] = dict(results)


In [69]:
def format_percent_and_count(val, total):
    percent = 100 * val / total if total else 0
    return f"{percent:.1f} ({val})"

In [None]:
# # Step 1: Filter dataset
# pokemon_rows = semeval_dev.filter(lambda sample: sample["dataset"] == "051_Pokemon")

# # Step 2: Dictionary to hold results
# model_results = {}

# # Step 3: Run evaluation
# records, col_stats, formatting_errors_by_type = evaluate_dataset(
#     pokemon_rows, dataset_name="Pokemon", dev_dataset_map=dev_dataset_map
# )

# # Step 4: Compute main metrics
# compute_metrics(records, "baseline_llm", model_results)

# # Step 5a: Table 1 – main metrics
# results_df = pd.DataFrame(model_results).T.round(3)
# print("### Main Metrics Table:")
# print(results_df.to_markdown())

# # Step 5b: Table 2 – column quality summary

# col_eval_table = [{
#     "model": "baseline_llm",
#     "wrong cols": format_percent_and_count(col_stats["wrong_cols"], col_stats["total"]),
#     "right cols": format_percent_and_count(col_stats["right_cols"], col_stats["total"]),
#     "format error": format_percent_and_count(col_stats["format_error"], col_stats["total"]),
# }]

# col_df = pd.DataFrame(col_eval_table)
# print("\n### Column Quality Table:")
# print(col_df.to_markdown(index=False))

# # Step 5c: Formatting errors by type
# print("\n### Formatting Errors by Question Type:")
# for qtype, count in formatting_errors_by_type.items():
#     print(f"{qtype:15}: {count} formatting errors")


In [70]:
# model_results = {}
# column_quality_table = []
# formatting_errors_summary = defaultdict(int)

# for dataset_id in train_target_datasets:

#     dataset_rows = semeval_train.filter(lambda sample: sample["dataset"] == dataset_id)
#     print(f"\n\n### Evaluating {dataset_id} ###")

#     records, col_stats, formatting_errors_by_type = evaluate_dataset(
#         dataset_rows, dataset_name=dataset_id, dev_dataset_map=dev_dataset_map
#     )

#     compute_metrics(records, dataset_id, model_results)

#     column_quality_table.append({
#         "model": dataset_id,
#         "wrong cols": format_percent_and_count(col_stats["wrong_cols"], col_stats["total"]),
#         "right cols": format_percent_and_count(col_stats["right_cols"], col_stats["total"]),
#         "format error": format_percent_and_count(col_stats["format_error"], col_stats["total"]),
#     })

#     for qtype, count in formatting_errors_by_type.items():
#         formatting_errors_summary[qtype] += count



all_eval_records = []
model_results = {}
column_quality_table = []
formatting_errors_summary = defaultdict(int)

for dataset_id in train_target_datasets:

    dataset_rows = semeval_train.filter(lambda sample: sample["dataset"] == dataset_id)
    print(f"\n\n### Evaluating {dataset_id} ###")

    records, col_stats, formatting_errors_by_type = evaluate_dataset(
        dataset_rows, dataset_name=dataset_id, dev_dataset_map=dev_dataset_map
    )

    compute_metrics(records, dataset_id, model_results)

    all_eval_records.extend(records)

    column_quality_table.append({
        "model": dataset_id,
        "wrong cols": format_percent_and_count(col_stats["wrong_cols"], col_stats["total"]),
        "right cols": format_percent_and_count(col_stats["right_cols"], col_stats["total"]),
        "format error": format_percent_and_count(col_stats["format_error"], col_stats["total"]),
    })

    for qtype, count in formatting_errors_by_type.items():
        formatting_errors_summary[qtype] += count


Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 001_Forbes ###
# Questions in 001_Forbes: 25

--- Query 1/25 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["selfMade", "finalWorth"]
}

Question: Is the person with the highest net worth self-made?
Pred Answer: False, Gold Answer: False
Pred Columns: {'selfMade', 'finalWorth'}, Gold Columns: {'selfMade', 'finalWorth'}

--- Query 2/25 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["gender", "age"]
}

Question: Does the youngest billionaire identify as male?
Pred Answer: True, Gold Answer: True
Pred Columns: {'age', 'gender'}, Gold Columns: {'age', 'gender'}

--- Query 3/25 ---

--- Raw LLM Response ---
{
"answer": "Atlanta",
"columns_used": ["city", "selfMade", "country"]
}
normalize_answer error for type 'boolean': Unrecognized string for boolean: 'Atlanta'

Question: Is the city with the most billionaires in the United States?
Pred Answer: None, Gold Answer: True
Pred Columns: {'country', 'selfMade', 'city'}, Gold Columns: {'count

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 002_Titanic ###
# Questions in 002_Titanic: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["Age"]
}

Question: Did any children below the age of 18 survive?
Pred Answer: False, Gold Answer: True
Pred Columns: {'Age'}, Gold Columns: {'Age', 'Survived'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Fare"]
}

Question: Were there any passengers who paid a fare of more than $500?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Fare'}, Gold Columns: {'Fare'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["Name"]}

Question: Is every passenger's name unique?
Pred Answer: False, Gold Answer: True
Pred Columns: {'Name'}, Gold Columns: {'Name'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
    "answer": True,
    "columns_used": ["Sex", "Pclass", "Survived"]
}

Question: Were there any female passengers in the 3rd class who survived?
Pred Answer: True, Gold Answer

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 004_Taxi ###
# Questions in 004_Taxi: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["trip_distance"]}

Question: Are there any trips with a total distance greater than 30 miles?
Pred Answer: True, Gold Answer: False
Pred Columns: {'trip_distance'}, Gold Columns: {'trip_distance'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["total_amount"]
}

Question: Were there any trips that cost more than $100 in total?
Pred Answer: True, Gold Answer: False
Pred Columns: {'total_amount'}, Gold Columns: {'total_amount'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["passenger_count"]}

Question: Is there any trip with more than 6 passengers?
Pred Answer: True, Gold Answer: False
Pred Columns: {'passenger_count'}, Gold Columns: {'passenger_count'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["payment_type"]
}

Question: Did all the trips use a

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 005_NYC ###
# Questions in 005_NYC: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["borough"]}

Question: Are there any complaints made in Brooklyn?
Pred Answer: True, Gold Answer: True
Pred Columns: {'borough'}, Gold Columns: {'borough'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["descriptor"]}

Question: Do any complaints have 'Dog' as a descriptor?
Pred Answer: False, Gold Answer: False
Pred Columns: {'descriptor'}, Gold Columns: {'descriptor'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["month_name"]
}

Question: Were there any complaints raised in April?
Pred Answer: True, Gold Answer: True
Pred Columns: {'month_name'}, Gold Columns: {'month_name'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["agency"]
}

Question: Is the Mayor's office of special enforcement one of the agencies handling complaints?
Pred Answer: False,

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 006_London ###
# Questions in 006_London: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["host_neighbourhood", "room_type"]}

Question: Are all properties in the dataset located in the same neighbourhood?
Pred Answer: False, Gold Answer: False
Pred Columns: {'room_type', 'host_neighbourhood'}, Gold Columns: {'neighbourhood_cleansed'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["host_identity_verified"]}

Question: Do all hosts verify their identity?
Pred Answer: True, Gold Answer: False
Pred Columns: {'host_identity_verified'}, Gold Columns: {'host_identity_verified'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["reviews_per_month"]
}

Question: Are all reviews_per_month values greater than 5?
Pred Answer: False, Gold Answer: False
Pred Columns: {'reviews_per_month'}, Gold Columns: {'reviews_per_month'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": 

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 007_Fifa ###
# Questions in 007_Fifa: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Joined", "Overall", "Position", "Value_€", "Potential"]
[process_response] Failed to parse response for question: Are there players who have a greater overall score than their potential score?
Error: invalid syntax (<unknown>, line 0)

Question: Are there players who have a greater overall score than their potential score?
Pred Answer: None, Gold Answer: False
Pred Columns: [], Gold Columns: {'Overall', 'Potential'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Joined", "Age"]
}

Question: Are there any players who joined their current club before they were 18 years old?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Age', 'Joined'}, Gold Columns: {'Age', 'Joined'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Preferred Foot", "Nationality"]
}

Question: Are there any pla

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 008_Tornados ###
# Questions in 008_Tornados: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["inj"]
}

Question: There are no tornadoes that resulted in more than 500 injuries.
Pred Answer: False, Gold Answer: True
Pred Columns: {'inj'}, Gold Columns: {'inj'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["yr"]
}

Question: All tornadoes in the dataset occurred in the 21st century.
Pred Answer: False, Gold Answer: False
Pred Columns: {'yr'}, Gold Columns: {'yr'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["len"]}

Question: No tornado has a length greater than 100 miles.
Pred Answer: False, Gold Answer: True
Pred Columns: {'len'}, Gold Columns: {'len'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["fat"]}

Question: There are no tornadoes that resulted in more than 100 fatalities.
Pred Answer: True, Gold Answer: True
Pred Colum

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 009_Central ###
# Questions in 009_Central: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["PRCP"]
}

Question: There were no days when the precipitation was greater than 5 inches.
Pred Answer: True, Gold Answer: True
Pred Columns: {'PRCP'}, Gold Columns: {'PRCP'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["TMIN"]
}

Question: All recorded temperatures are above freezing point.
Pred Answer: True, Gold Answer: False
Pred Columns: {'TMIN'}, Gold Columns: {'TMAX', 'TMIN'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["SNWD"]
}

Question: There were no days when the snow depth was more than 10 inches.
Pred Answer: False, Gold Answer: True
Pred Columns: {'SNWD'}, Gold Columns: {'SNWD'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["TMAX"]
}

Question: There were no days when the maximum temperature was below freezing point.
Pred A

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 010_ECommerce ###
# Questions in 010_ECommerce: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Clothing ID"]
}

Question: Are there more than 20 unique clothing items in the dataset?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Clothing ID'}, Gold Columns: {'Clothing ID'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": 56.0,
"columns_used": ["Age"]
}
normalize_answer error for type 'boolean': Numeric value not valid for boolean: 56.0

Question: Is the age of the reviewers above 50 years on average?
Pred Answer: None, Gold Answer: False
Pred Columns: {'Age'}, Gold Columns: {'Age'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Department Name"]}

Question: Do all reviews come from the same department?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Department Name'}, Gold Columns: {'Department Name'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_u

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 011_SF ###
# Questions in 011_SF: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Incident Year", "Filed Online"]
}

Question: Was the highest reported incident in the year 2023 filed online?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Filed Online', 'Incident Year'}, Gold Columns: {'Incident Number', 'Filed Online', 'Incident Year'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Incident Day of Week", "Resolution"]
}

Question: Are all incidents reported on Mondays resolved?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Resolution', 'Incident Day of Week'}, Gold Columns: {'Resolution', 'Incident Day of Week'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Police District", "Supervisor District"]}

Question: Do any incidents reported in Police District 'Central' fall in Supervisor District 5?
Pred Answer: True, Gold Answer: False
Pred Columns: {'S

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 012_Heart ###
# Questions in 012_Heart: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["ExerciseAngina"]
}

Question: Do all patients experience exercise-induced angina?
Pred Answer: False, Gold Answer: False
Pred Columns: {'ExerciseAngina'}, Gold Columns: {'ExerciseAngina'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["RestingBP"]
}

Question: Does any patient have a resting blood pressure above 200?
Pred Answer: False, Gold Answer: False
Pred Columns: {'RestingBP'}, Gold Columns: {'RestingBP'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
  "answer": true,
  "columns_used": ["HeartDisease"]
}

Question: Are there patients without heart disease?
Pred Answer: True, Gold Answer: True
Pred Columns: {'HeartDisease'}, Gold Columns: {'HeartDisease'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["RestingECG"]}

Question: Does everyone have normal resting electrocard

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 013_Roller ###
# Questions in 013_Roller: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Status", "year_introduced", "coaster_name"]
}

Question: Did the oldest roller coaster in the dataset still operate?
Pred Answer: True, Gold Answer: True
Pred Columns: {'year_introduced', 'Status', 'coaster_name'}, Gold Columns: {'year_introduced', 'Status'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["speed_mph"]
}

Question: Is there a roller coaster in the dataset that operates at a speed more than 100 mph?
Pred Answer: False, Gold Answer: False
Pred Columns: {'speed_mph'}, Gold Columns: {'speed_mph'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["Status", "Designer"]
}

Question: Are all roller coasters in the dataset designed by 'Werner Stengel' removed?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Designer', 'Status'}, Gold Columns: {'Designer', 'Status'}


Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 015_Food ###
# Questions in 015_Food: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["GROUP"]
}

Question: Is there a food item with 'Fruits' as its group?
Pred Answer: False, Gold Answer: False
Pred Columns: {'GROUP'}, Gold Columns: {'GROUP'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["FOOD NAME", "GROUP", "SUB GROUP"]
}

Question: Are there food items with 'Nuts' as their sub group?
Pred Answer: True, Gold Answer: True
Pred Columns: {'GROUP', 'FOOD NAME', 'SUB GROUP'}, Gold Columns: {'SUB GROUP'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["SCIENTIFIC NAME"]
}

Question: Is there a food item with scientific name 'Tilia argentea'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'SCIENTIFIC NAME'}, Gold Columns: {'SCIENTIFIC NAME'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["FOOD NAME"]
}

Question: Is 'Angelica'

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 016_Holiday ###
# Questions in 016_Holiday: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Occupation"]}

Question: Is there a customer with 'Large Business' as their occupation?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Occupation'}, Gold Columns: {'Occupation'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["ProductPitched"]
}

Question: Are there customers with 'King' as their pitched product?
Pred Answer: True, Gold Answer: True
Pred Columns: {'ProductPitched'}, Gold Columns: {'ProductPitched'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Designation"]
}

Question: Is there a customer with designation 'VP'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Designation'}, Gold Columns: {'Designation'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["MaritalStatus"]
}

Question: Is 'Unmarried' listed as a marital s

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 017_Hacker ###
# Questions in 017_Hacker: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["partofday", "weekday_name"]}

Question: Is there any entry posted on a weekend?
Pred Answer: True, Gold Answer: True
Pred Columns: {'partofday', 'weekday_name'}, Gold Columns: {'weekday_name'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["title"]}

Question: Are there titles with more than 100 characters?
Pred Answer: True, Gold Answer: False
Pred Columns: {'title'}, Gold Columns: {'title'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["title_gx_cardiff_nlp_sentiment"]}

Question: Do any entries have a negative sentiment according to the Cardiff NLP model?
Pred Answer: True, Gold Answer: False
Pred Columns: {'title_gx_cardiff_nlp_sentiment'}, Gold Columns: {'title_gx_cardiff_nlp_sentiment'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["Cluste

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 018_Staff ###
# Questions in 018_Staff: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Number of Projects"]
}

Question: Are there any employees with more than 7 projects?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Number of Projects'}, Gold Columns: {'Number of Projects'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Average Monthly Hours"]
}

Question: Has any employee worked for more than 300 hours on average per month?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Average Monthly Hours'}, Gold Columns: {'Average Monthly Hours'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Satisfaction Level"]
}

Question: Are all satisfaction levels above 0.5?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Satisfaction Level'}, Gold Columns: {'Satisfaction Level'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["Da

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 019_Aircraft ###
# Questions in 019_Aircraft: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Aircraft_Damage_Type"]
}

Question: Did any incident result in the total destruction of the aircraft?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Aircraft_Damage_Type'}, Gold Columns: {'Aircaft_Damage_Type'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Aircraft_Damage_Type", "Incident_Cause(es)"]
[process_response] Failed to parse response for question: Have there been any incidents where the cause was related to the undercarriage of the aircraft?
Error: invalid syntax (<unknown>, line 0)

Question: Have there been any incidents where the cause was related to the undercarriage of the aircraft?
Pred Answer: None, Gold Answer: False
Pred Columns: [], Gold Columns: {'Incident_Cause(es)'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Ground_Casualties"]
}

Questi

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 021_Telco ###
# Questions in 021_Telco: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["customerID", "MonthlyCharges"]
}

Question: Are there more than 2000 customers with a monthly charge higher than $80?
Pred Answer: True, Gold Answer: False
Pred Columns: {'MonthlyCharges', 'customerID'}, Gold Columns: {'MonthlyCharges'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["PhoneService"]
}

Question: Do all customers have phone service?
Pred Answer: True, Gold Answer: False
Pred Columns: {'PhoneService'}, Gold Columns: {'PhoneService'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["InternetService"]}

Question: Are there any customers with no internet service?
Pred Answer: True, Gold Answer: True
Pred Columns: {'InternetService'}, Gold Columns: {'InternetService'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["SeniorCitizen", "Dependen

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 022_Airbnbs ###
# Questions in 022_Airbnbs: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["review_scores_rating"]}

Question: Is there a listing with a review score rating of 100?
Pred Answer: True, Gold Answer: True
Pred Columns: {'review_scores_rating'}, Gold Columns: {'review_scores_rating'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["host_total_listings_count"]}

Question: Are there any hosts who have listed more than 10 properties?
Pred Answer: False, Gold Answer: True
Pred Columns: {'host_total_listings_count'}, Gold Columns: {'host_total_listings_count'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["instant_bookable"]
}

Question: Are all listings instantly bookable?
Pred Answer: False, Gold Answer: True
Pred Columns: {'instant_bookable'}, Gold Columns: {'instant_bookable'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": false, "columns_used":

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 023_Climate ###
# Questions in 023_Climate: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["tmin", "prec"]}

Question: Was there a day when the minimum temperature was below zero and it didn't rain?
Pred Answer: True, Gold Answer: True
Pred Columns: {'prec', 'tmin'}, Gold Columns: {'prec', 'tmin'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["sol", "tmax"]
}

Question: Are there records where the solar radiation exceeds 10 but the maximum temperature was below 20?
Pred Answer: True, Gold Answer: True
Pred Columns: {'sol', 'tmax'}, Gold Columns: {'sol', 'tmax'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["velmedia", "tmax"]
}

Question: Did any day with maximum wind speed above 15 also have average wind speed below 5?
Pred Answer: True, Gold Answer: False
Pred Columns: {'velmedia', 'tmax'}, Gold Columns: {'velmedia', 'racha'}

--- Query 4/20 ---

--- Raw LLM Resp

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 024_Salary ###
# Questions in 024_Salary: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["RETRINOIN_xRZI"]
}

Question: Are there records where the RETRINOIN_xRZI exceeds 10,000?
Pred Answer: True, Gold Answer: True
Pred Columns: {'RETRINOIN_xRZI'}, Gold Columns: {'RETRINOIN_xRZI'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["SEXO", "NUTS1"]
}

Question: Are there any female respondents who belong to the ESTE NUTS1 region?
Pred Answer: True, Gold Answer: True
Pred Columns: {'NUTS1', 'SEXO'}, Gold Columns: {'NUTS1', 'SEXO'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["CONTROL", "MERCADO"]
}

Question: Do we have respondents who fall under both PRIVADO control and NACIONAL market?
Pred Answer: True, Gold Answer: True
Pred Columns: {'MERCADO', 'CONTROL'}, Gold Columns: {'MERCADO', 'CONTROL'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 025_Data ###
# Questions in 025_Data: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["URLs", "Keyword"]}

Question: Is the most visited URL related to 'no code data science'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Keyword', 'URLs'}, Gold Columns: {'Keyword', 'Ranking', 'URLs'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
  "answer": true,
  "columns_used": ["Ranking", "Competition", "URLs"]
}

Question: Does any URL have a competition level of 'Low'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Ranking', 'Competition', 'URLs'}, Gold Columns: {'URLs', 'Competition'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["Ranking", "Competition", "Avg. monthly searches", "URLs"]}

Question: Are there any URLs with an average monthly searches above 1000?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Avg. monthly searches', 'Ranking', 'Competition', 'URLs'}, Gold Columns: {'Avg.

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 026_Predicting ###
# Questions in 026_Predicting: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["quality"]
}

Question: Is there any wine with a quality rating of 10?
Pred Answer: False, Gold Answer: False
Pred Columns: {'quality'}, Gold Columns: {'quality'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["residual sugar"]}

Question: Are there any wines with residual sugar above 15g/dm^3?
Pred Answer: True, Gold Answer: False
Pred Columns: {'residual sugar'}, Gold Columns: {'residual sugar'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["quality", "alcohol"]
}

Question: Is the highest alcohol content wine also the one with the highest quality rating?
Pred Answer: False, Gold Answer: False
Pred Columns: {'quality', 'alcohol'}, Gold Columns: {'quality', 'alcohol'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["pH"]
}

Question: 

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 027_Supermarket ###
# Questions in 027_Supermarket: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Total"]}

Question: Is there any purchase with a total cost above 1000?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Total'}, Gold Columns: {'Total'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Payment"]
}

Question: Are there any customers who made a purchase using cash?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Payment'}, Gold Columns: {'Payment'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Total", "Customer type"]
}

Question: Is the customer with the highest total purchase cost a 'Member'?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Customer type', 'Total'}, Gold Columns: {'Customer type', 'Total'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Rating", "Payment"]
}

Question: Does any custome

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 028_Predict ###
# Questions in 028_Predict: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
    "answer": True,
    "columns_used": ["Age"]
}

Question: Are there any individuals in the dataset who are above 60 years of age?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Age'}, Gold Columns: {'Age'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["DiabetesPedigreeFunction"]
}

Question: Does anyone have a Diabetes Pedigree Function score above 2.5?
Pred Answer: False, Gold Answer: False
Pred Columns: {'DiabetesPedigreeFunction'}, Gold Columns: {'DiabetesPedigreeFunction'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Glucose", "Outcome"]
}

Question: Does the person with the highest glucose level also have diabetes?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Outcome', 'Glucose'}, Gold Columns: {'Outcome', 'Glucose'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": False, "c

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 029_NYTimes ###
# Questions in 029_NYTimes: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["material_type"]
}

Question: Are there any articles that have the material type 'Op-Ed'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'material_type'}, Gold Columns: {'material_type'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["date", "keywords", "headline"]}

Question: Does the article with the longest headline contain the keyword 'United States Politics and Government'?
Pred Answer: True, Gold Answer: False
Pred Columns: {'date', 'keywords', 'headline'}, Gold Columns: {'keywords', 'headline'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["date"]
}

Question: Is there any article published on '2021-01-05'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'date'}, Gold Columns: {'date'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"colum

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 030_Professionals ###
# Questions in 030_Professionals: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Geographies"]}

Question: Is the 'USA' the most common entry in the 'Geographies' column?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Geographies'}, Gold Columns: {'Geographies'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["What type of degree do you hold?", "In which country do you currently reside?", "Labeled Clusters"]
}

Question: Are there any participants who are unemployed with a bachelor's degree from Africa?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Labeled Clusters', 'What type of degree do you hold?', 'In which country do you currently reside?'}, Gold Columns: {'Geographies', 'Labeled Clusters'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["What programming language would you recommend an aspiring data scientist to learn first?"]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 031_Trustpilot ###
# Questions in 031_Trustpilot: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
    "answer": true,
    "columns_used": ["rating", "country_code"]
}

Question: Are there more reviews with rating 5 from 'GB' than 'US'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'country_code', 'rating'}, Gold Columns: {'country_code', 'rating'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": 5.0,
"columns_used": ["rating", "Company", "country_code"]
}
normalize_answer error for type 'boolean': Numeric value not valid for boolean: 5.0

Question: Is the average rating for 'Wise' in 'GB' above 4?
Pred Answer: None, Gold Answer: True
Pred Columns: {'Company', 'country_code', 'rating'}, Gold Columns: {'Company', 'country_code', 'rating'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["country_code"]
}

Question: Do any reviews originate from 'AO'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'country_code'}, 

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 032_Delicatessen ###
# Questions in 032_Delicatessen: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Education"]
}

Question: Is there any customer with a PhD education level?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Education'}, Gold Columns: {'Education'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Marital_Status"]
}

Question: Do we have any customers who are married?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Marital_Status'}, Gold Columns: {'Marital_Status'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["Income"]
}

Question: Is there any customer with income higher than 100000?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Income'}, Gold Columns: {'Income'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["NumWebPurchases"]}

Question: Has any customer made more than 10 web purchases?
Pred An

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 033_Employee ###
# Questions in 033_Employee: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["salary"]
}

Question: Is there any customer with a high salary?
Pred Answer: False, Gold Answer: True
Pred Columns: {'salary'}, Gold Columns: {'salary'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Work Accident"]
}

Question: Do we have any employees who suffered a work accident?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Work Accident'}, Gold Columns: {'Work Accident'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Average Monthly Hours"]
}

Question: Is there any employee with more than 100 hours per month on average?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Average Monthly Hours'}, Gold Columns: {'Average Monthly Hours'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Satisfaction Level"]
}

Question: Does any

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 034_World ###
# Questions in 034_World: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Ladder score", "Generosity", "Social support"]
}

Question: Is there any country with a Ladder score above 7.5, a Generosity score above 0.2, and a Social support score above 0.7?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Social support', 'Generosity', 'Ladder score'}, Gold Columns: {'Social support', 'Generosity', 'Ladder score'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Perceptions of corruption", "Ladder score", "Social support", "Regional indicator"]
[process_response] Failed to parse response for question: Are there any countries in Western Europe with a Perceptions of corruption score above 0.5, a Ladder score above 7, and a Social support score above 0.7?
Error: invalid syntax (<unknown>, line 0)

Question: Are there any countries in Western Europe with a Perceptions of corruption score a

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 036_US ###
# Questions in 036_US: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
  "answer": True,
  "columns_used": ["count"]
}

Question: Are there records where the 'count' exceeds 20,000?
Pred Answer: True, Gold Answer: False
Pred Columns: {'count'}, Gold Columns: {'count'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["name_origin"]}

Question: Do we have any records originating from 'Jefferson'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'name_origin'}, Gold Columns: {'name_origin'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["name_dest"]
}

Question: Are there destinations with the name 'Baldwin'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'name_dest'}, Gold Columns: {'name_dest'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["lat_dest"]}

Question: Are there records where the latitude of the destination is greater than 60?
Pred Answe

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 037_Ted ###
# Questions in 037_Ted: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["views"]
}

Question: Are there talks with more than a million views?
Pred Answer: True, Gold Answer: True
Pred Columns: {'views'}, Gold Columns: {'views'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["speaker_1"]}

Question: Is there a talk by 'Elon Musk'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'speaker_1'}, Gold Columns: {'speaker_1'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["event"]
}

Question: Are there any TEDx events included?
Pred Answer: True, Gold Answer: True
Pred Columns: {'event'}, Gold Columns: {'event'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["available_lang"]
}

Question: Are there talks available in more than 10 languages?
Pred Answer: True, Gold Answer: True
Pred Columns: {'available_lang'}, Gold Columns

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 038_Stroke ###
# Questions in 038_Stroke: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["age", "stroke"]
}

Question: Is it true that the oldest person in the dataset has a stroke?
Pred Answer: False, Gold Answer: False
Pred Columns: {'age', 'stroke'}, Gold Columns: {'age', 'stroke'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["smoking_status", "hypertension"]}

Question: Do all smokers have hypertension?
Pred Answer: False, Gold Answer: False
Pred Columns: {'hypertension', 'smoking_status'}, Gold Columns: {'hypertension', 'smoking_status'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["heart_disease", "ever_married"]
}

Question: Are all people with heart diseases married?
Pred Answer: False, Gold Answer: True
Pred Columns: {'heart_disease', 'ever_married'}, Gold Columns: {'heart_disease', 'ever_married'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answ

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 039_Happy ###
# Questions in 039_Happy: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["reflection_period", "num_sentence"]
}

Question: Are there any reflections with more than 10 sentences?
Pred Answer: False, Gold Answer: False
Pred Columns: {'num_sentence', 'reflection_period'}, Gold Columns: {'num_sentence'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["reflection_period", "country"]
}

Question: Is there a reflection from 'USA' with over 5 sentences?
Pred Answer: False, Gold Answer: False
Pred Columns: {'country', 'reflection_period'}, Gold Columns: {'num_sentence', 'country'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["predicted_category", "reflection_period"]}

Question: Are there any reflections categorized as 'affection'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'reflection_period', 'predicted_category'}, Gold Columns: {'predicted_categor

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 040_Speed ###
# Questions in 040_Speed: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["age", "match"]}

Question: Is it true that the youngest participant has met their match?
Pred Answer: True, Gold Answer: False
Pred Columns: {'age', 'match'}, Gold Columns: {'age', 'match'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["race", "samerace", "match"]}

Question: Do all people who got a match have the same race as their partner?
Pred Answer: True, Gold Answer: False
Pred Columns: {'race', 'samerace', 'match'}, Gold Columns: {'samerace', 'match'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["expected_num_matches", "gender"]}

Question: Are all participants who expected more than 5 matches male?
Pred Answer: True, Gold Answer: True
Pred Columns: {'expected_num_matches', 'gender'}, Gold Columns: {'expected_num_matches', 'gender'}

--- Query 4/20 ---

--- Raw LLM Respons

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 041_Airline ###
# Questions in 041_Airline: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["retweet_count"]
}

Question: Are there any tweets with more than 10 retweets?
Pred Answer: True, Gold Answer: False
Pred Columns: {'retweet_count'}, Gold Columns: {'retweet_count'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["airline", "airline_sentiment"]
}

Question: Is there a negative sentiment tweet from 'United' airline?
Pred Answer: True, Gold Answer: True
Pred Columns: {'airline', 'airline_sentiment'}, Gold Columns: {'airline', 'airline_sentiment'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": false, "columns_used": ["negativereason"]}

Question: Are there any tweets categorized with 'Late Flight' as the negative reason?
Pred Answer: False, Gold Answer: True
Pred Columns: {'negativereason'}, Gold Columns: {'negativereason'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": true,


Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 042_Predict ###
# Questions in 042_Predict: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["absences", "Pstatus"]}

Question: Is it true that the student with the highest number of absences is from a rural area?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Pstatus', 'absences'}, Gold Columns: {'address', 'absences'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["romantic", "age"]
}

Question: Are all students who are in a romantic relationship older than 17?
Pred Answer: False, Gold Answer: False
Pred Columns: {'romantic', 'age'}, Gold Columns: {'romantic', 'age'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": 63.5,
"columns_used": ["G3", "Dalc"]
}
normalize_answer error for type 'boolean': Numeric value not valid for boolean: 63.5

Question: Is the average final grade (G3) of students who consume a lot of alcohol (Dalc > 2) lower than those who don't?
Pred Answer: None, Gold An

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 043_Predict ###
# Questions in 043_Predict: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["UrbanRural", "ApprovalFY"]
}

Question: Is it true that the oldest company (based on approval year) is from an urban (UrbanRural=1) area?
Pred Answer: True, Gold Answer: False
Pred Columns: {'ApprovalFY', 'UrbanRural'}, Gold Columns: {'ApprovalFY', 'UrbanRural'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["UrbanRural", "RetainedJob"]
}

Question: Are all companies with more than 10 employees located in an urban (UrbanRural=1) area?
Pred Answer: False, Gold Answer: True
Pred Columns: {'RetainedJob', 'UrbanRural'}, Gold Columns: {'NoEmp', 'UrbanRural'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["default_amount", "Sector"]
}

Question: Are all companies with a default amount greater than $250,000 in the food sector?
Pred Answer: False, Gold Answer: False
Pred Columns: {

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 044_IMDb ###
# Questions in 044_IMDb: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["genre", "year", "country"]
}

Question: Is it true that the oldest movie (based on year) in the drama genre is from the USA?
Pred Answer: True, Gold Answer: False
Pred Columns: {'country', 'year', 'genre'}, Gold Columns: {'country', 'year', 'genre'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["country", "duration", "genre", "language"]
}

Question: Are all movies from the USA, with more than 80 minutes duration in the drama genre, in English?
Pred Answer: False, Gold Answer: False
Pred Columns: {'language', 'country', 'duration', 'genre'}, Gold Columns: {'language', 'country', 'duration', 'genre'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["votes", "country", "language"]
}

Question: Are all movies with votes greater than 90 from the USA in English?
Pred Answer: False, Gol

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 045_Predict ###
# Questions in 045_Predict: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["popularity", "duration_ms"]
}

Question: Is it true that the song with the lowest popularity in the dataset is longer than 300000 ms?
Pred Answer: True, Gold Answer: False
Pred Columns: {'popularity', 'duration_ms'}, Gold Columns: {'popularity', 'duration_ms'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["release_year", "popularity"]}

Question: Did any song released in the year 2020 in the dataset achieve the maximum popularity?
Pred Answer: False, Gold Answer: False
Pred Columns: {'release_year', 'popularity'}, Gold Columns: {'release_year', 'popularity'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["duration_ms", "energy"]}

Question: Does the song with the longest duration also have the highest energy?
Pred Answer: True, Gold Answer: False
Pred Columns: {'energy', 'dur

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 046_120 ###
# Questions in 046_120: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["ID", "Age"]}

Question: Is it true that the athlete with the lowest ID is older than 30 years?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Age', 'ID'}, Gold Columns: {'Age', 'ID'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["ID", "Team", "Medal"]
}

Question: Did any athlete from the team with the most participants win two gold medals?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Team', 'Medal', 'ID'}, Gold Columns: {'Team', 'Medal'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Weight", "Sport"]}

Question: Does the athlete with the highest weight also participate in the sport with the most athletes?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Sport', 'Weight'}, Gold Columns: {'Sport', 'Weight'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer"

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 047_Bank ###
# Questions in 047_Bank: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["customer_age"]}

Question: Does the dataset contain customers who are under the age of 18?
Pred Answer: False, Gold Answer: False
Pred Columns: {'customer_age'}, Gold Columns: {'customer_age'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["total_trans_amt"]
}

Question: Are there any customers with a total transaction amount of zero?
Pred Answer: False, Gold Answer: False
Pred Columns: {'total_trans_amt'}, Gold Columns: {'total_trans_amt'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["credit_limit"]
}

Question: Does the dataset include customers with a credit limit exceeding $50,000?
Pred Answer: True, Gold Answer: False
Pred Columns: {'credit_limit'}, Gold Columns: {'credit_limit'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["months_inacti

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 048_Data ###
# Questions in 048_Data: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Location", "Job Title"]
}

Question: Are there any job positions available in 'New York, NY'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Location', 'Job Title'}, Gold Columns: {'Location'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["Founded"]
}

Question: Are there any job listings from companies founded before 1900?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Founded'}, Gold Columns: {'Founded'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["python_yn"]
}

Question: Are there any job listings that require Python skills?
Pred Answer: True, Gold Answer: True
Pred Columns: {'python_yn'}, Gold Columns: {'python_yn'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Job Title"]}

Question: Is there a job listing for a 'Data Engi

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 050_ING ###
# Questions in 050_ING: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["author_name", "lang"]}

Question: Is the most favorited author mainly communicating in Spanish?
Pred Answer: True, Gold Answer: True
Pred Columns: {'author_name', 'lang'}, Gold Columns: {'favorites', 'lang'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["author_name", "type"]}

Question: Does the author with the longest name post mainly original content?
Pred Answer: True, Gold Answer: False
Pred Columns: {'type', 'author_name'}, Gold Columns: {'type', 'author_name'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["author_name", "retweets"]
}

Question: Is there an author who received no retweets for any of their posts?
Pred Answer: True, Gold Answer: True
Pred Columns: {'author_name', 'retweets'}, Gold Columns: {'author_name', 'retweets'}

--- Query 4/20 ---

--- Raw LLM Response ---


Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 051_Pokemon ###
# Questions in 051_Pokemon: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["name"]}

Question: Is there a Pokémon named 'Pikachu' in the dataset?
Pred Answer: False, Gold Answer: False
Pred Columns: {'name'}, Gold Columns: {'name'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["total"]
}

Question: Are there any Pokémon with a total stat greater than 700?
Pred Answer: True, Gold Answer: False
Pred Columns: {'total'}, Gold Columns: {'total'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["legendary", "generation"]
}

Question: Are all Pokémon in the first generation legendary?
Pred Answer: False, Gold Answer: False
Pred Columns: {'generation', 'legendary'}, Gold Columns: {'generation', 'legendary'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["speed"]
}

Question: Is there any Pokémon with a speed greater than 150

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 052_Professional ###
# Questions in 052_Professional: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Extraversion", "Agreeableness"]
}

Question: Is the maximum level of Extraversion greater than the maximum level of Agreeableness?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Extraversion', 'Agreeableness'}, Gold Columns: {'Extraversion', 'Agreeableness'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
  "answer": False,
  "columns_used": ["Profession", "Openness", "Conscientousness"]
}

Question: Is the profession with the highest Openness the same as the profession with the highest Conscientousness?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Openness', 'Conscientousness', 'Profession'}, Gold Columns: {'Openness', 'Conscientousness', 'Profession'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Emotional_Range", "Conversation", "Profession"]
}

Question: Does the profession wit

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 053_Patents ###
# Questions in 053_Patents: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["kind", "title"]
}

Question: Is there a patent containing the word 'communication' in the title?
Pred Answer: True, Gold Answer: True
Pred Columns: {'title', 'kind'}, Gold Columns: {'title'}

--- Query 2/20 ---
[Groq Attempt 1] Error: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}

--- Raw LLM Response ---
{
"answer": "No, there are no patents associated with the organization 'IBM' in the provided dataset.",
"columns_used": ["kind", "organization"]
}
normalize_answer error for type 'boolean': Unrecognized string for boolean: "No, there are no patents associated with the organization 'IBM' in the provided dataset."

Question: Are there patents associated with the organization 'IBM'?
Pred Answer: None, Gold Answer: False
Pred Columns: {'organization', 'kind'}, Gold Columns: {'organization'}

---

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 055_German ###
# Questions in 055_German: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["Loan Amount", "Loan Duration - Months"]
}

Question: Is the borrower with the highest loan amount also the one with the longest loan duration?
Pred Answer: False, Gold Answer: False
Pred Columns: {'Loan Amount', 'Loan Duration - Months'}, Gold Columns: {'Loan Amount', 'Loan Duration - Months'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Number of Existing Loans", "Loan Amount"]}

Question: Does the borrower with the maximum number of existing loans also have the highest loan amount?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Loan Amount', 'Number of Existing Loans'}, Gold Columns: {'Loan Amount', 'Number of Existing Loans'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": false, "columns_used": ["Loan Duration - Months", "Number of Existing Loans"]}

Question: Does the borrower with the lo

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 056_Emoji ###
# Questions in 056_Emoji: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["Calories (kcal)", "name"]}

Question: Are there any foods with zero calories?
Pred Answer: False, Gold Answer: False
Pred Columns: {'name', 'Calories (kcal)'}, Gold Columns: {'Calories (kcal)'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["name", "Total Sugar (g)"]
}

Question: Do all foods contain sugar?
Pred Answer: True, Gold Answer: False
Pred Columns: {'name', 'Total Sugar (g)'}, Gold Columns: {'Total Sugar (g)'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["Calories (kcal)", "Total Fat (g)", "name"]
}

Question: Are there any foods that contain no fat at all?
Pred Answer: True, Gold Answer: False
Pred Columns: {'name', 'Calories (kcal)', 'Total Fat (g)'}, Gold Columns: {'Total Fat (g)'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["nam

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 057_Spain ###
# Questions in 057_Spain: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["edu"]}

Question: Are there any respondents who say they didn't complete primary education?
Pred Answer: True, Gold Answer: False
Pred Columns: {'edu'}, Gold Columns: {'edu'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["polInterest"]}

Question: Are all respondents interested in politics?
Pred Answer: True, Gold Answer: False
Pred Columns: {'polInterest'}, Gold Columns: {'polInterest'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["voteReason"]}

Question: Did any respondent indicate that they will not vote?
Pred Answer: True, Gold Answer: False
Pred Columns: {'voteReason'}, Gold Columns: {'Vote Intention'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["User self-placement on Left-Right economic values axis"]
}

Question: Did any respondent pl

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 058_US ###
# Questions in 058_US: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Are you registered to vote?"]
}

Question: Are there survey participants who are not registered to vote?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Are you registered to vote?'}, Gold Columns: {'Are you registered to vote?'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Who did you vote for in the 2016 Presidential election? (Four years ago)", "Did you vote in the 2016 Presidential election? (Four years ago)", "How likely are you to vote in the forthcoming US Presidential election? Early Voting Open"]
}

Question: Are there any respondents who participated in the 2016 Presidential election (four years ago) but are not planning to do so in the upcoming election?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Who did you vote for in the 2016 Presidential election? (Four years ago)', 'How likely are you 

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 059_Second ###
# Questions in 059_Second: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["make", "model"]
}

Question: Is there a car model named 'Golf' listed?
Pred Answer: False, Gold Answer: False
Pred Columns: {'model', 'make'}, Gold Columns: {'model'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["dealer"]
}

Question: Are there cars associated with the dealer 'Autos Raymara'?
Pred Answer: False, Gold Answer: False
Pred Columns: {'dealer'}, Gold Columns: {'dealer'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["make"]
}

Question: Is there a car version that mentions 'BMW'?
Pred Answer: True, Gold Answer: True
Pred Columns: {'make'}, Gold Columns: {'version'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["shift"]
}

Question: Are there cars with a 'Manual' shift?
Pred Answer: True, Gold Answer: True
Pred Columns: {'shift'}

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 060_Bakery ###
# Questions in 060_Bakery: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["Transaction"]
}

Question: Are all transactions IDs unique?
Pred Answer: False, Gold Answer: True
Pred Columns: {'Transaction'}, Gold Columns: {'Transaction'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": False,
"columns_used": ["date_time", "Transaction"]
}

Question: Is there any transaction that took place during the night?
Pred Answer: False, Gold Answer: False
Pred Columns: {'date_time', 'Transaction'}, Gold Columns: {'period_day'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["weekday_weekend", "Transaction"]
[process_response] Failed to parse response for question: Do all items have transactions recorded on weekdays?
Error: invalid syntax (<unknown>, line 0)

Question: Do all items have transactions recorded on weekdays?
Pred Answer: None, Gold Answer: False
Pred Columns: [], Gold Columns: 

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 061_Disneyland ###
# Questions in 061_Disneyland: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": false,
"columns_used": ["Rating", "Reviewer_Location"]
}

Question: Are all the reviews from Australia positive (rating > 3)?
Pred Answer: False, Gold Answer: True
Pred Columns: {'Reviewer_Location', 'Rating'}, Gold Columns: {'Reviewer_Location', 'Rating'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Branch"]}

Question: Is Disneyland_HongKong the most reviewed branch?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Branch'}, Gold Columns: {'Branch'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": true, "columns_used": ["Rating"]}

Question: Are there any reviews with a rating of 1?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Rating'}, Gold Columns: {'Rating'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Reviewer_Location", "Rating"]
}

Question: Does every reviewe

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 062_Trump ###
# Questions in 062_Trump: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["lang"]
}

Question: Are all the tweets in English?
Pred Answer: True, Gold Answer: True
Pred Columns: {'lang'}, Gold Columns: {'lang'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["author_handler", "retweets"]
}

Question: Has the author ever been retweeted?
Pred Answer: True, Gold Answer: True
Pred Columns: {'author_handler', 'retweets'}, Gold Columns: {'retweets'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["retweets"]}

Question: Are there any tweets with more than 10000 retweets?
Pred Answer: True, Gold Answer: True
Pred Columns: {'retweets'}, Gold Columns: {'retweets'}

--- Query 4/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["id", "retweets"]}

Question: Have any of the tweets been favorited more than 50000 times?
Pred Answer: True, Gold Answer: Tr

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 063_Influencers ###
# Questions in 063_Influencers: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["is_organization"]
}

Question: Are there any organizations in the dataset?
Pred Answer: True, Gold Answer: False
Pred Columns: {'is_organization'}, Gold Columns: {'is_organization'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["name", "is_organization"]
}

Question: Are there any individuals (non-organizations) in the dataset?
Pred Answer: True, Gold Answer: True
Pred Columns: {'name', 'is_organization'}, Gold Columns: {'is_organization'}

--- Query 3/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["pic"]
}

Question: Do all entities have a picture?
Pred Answer: True, Gold Answer: True
Pred Columns: {'pic'}, Gold Columns: {'pic'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
    "answer": True,
    "columns_used": ["name", "weight"]
}

Question: Are there any entities with a we

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 064_Clustering ###
# Questions in 064_Clustering: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{
"answer": ["gull"],
"columns_used": ["feathers", "animal_name"]
}
normalize_answer error for type 'boolean': Expected bool, int, float, or string for boolean, got: <class 'list'>

Question: Are there any animals with feathers in the dataset?
Pred Answer: None, Gold Answer: True
Pred Columns: {'feathers', 'animal_name'}, Gold Columns: {'feathers'}

--- Query 2/20 ---

--- Raw LLM Response ---
{
"answer": true,
"columns_used": ["venomous", "animal_name"]
}

Question: Are there any venomous animals in the dataset?
Pred Answer: True, Gold Answer: True
Pred Columns: {'venomous', 'animal_name'}, Gold Columns: {'venomous'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["breathes"]}

Question: Do all animals breathe?
Pred Answer: True, Gold Answer: False
Pred Columns: {'breathes'}, Gold Columns: {'breathes'}

--- Query 4/20 ---

--- Raw LLM Resp

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]



### Evaluating 065_RFM ###
# Questions in 065_RFM: 20

--- Query 1/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["Quantity"]}

Question: Are all the quantities greater than 0?
Pred Answer: True, Gold Answer: False
Pred Columns: {'Quantity'}, Gold Columns: {'Quantity'}

--- Query 2/20 ---

--- Raw LLM Response ---
{"answer": True, "columns_used": ["UnitPrice"]}

Question: Are all the unit prices greater than 0?
Pred Answer: True, Gold Answer: True
Pred Columns: {'UnitPrice'}, Gold Columns: {'UnitPrice'}

--- Query 3/20 ---

--- Raw LLM Response ---
{"answer": False, "columns_used": ["CustomerID"]}

Question: Are there any missing customer IDs?
Pred Answer: False, Gold Answer: True
Pred Columns: {'CustomerID'}, Gold Columns: {'CustomerID'}

--- Query 4/20 ---

--- Raw LLM Response ---
{
"answer": True,
"columns_used": ["Country"]
}

Question: Are there any transactions from the United Kingdom?
Pred Answer: True, Gold Answer: True
Pred Columns: {'Country'}, Gold Col

In [71]:
# # Table 1: Main Metrics Table
# results_df = pd.DataFrame(model_results).T.round(3)
# print("### Main Metrics Table:")
# print(results_df.to_markdown())

# # Table 2: Column Quality Table
# col_df = pd.DataFrame(column_quality_table)
# print("\n### Column Quality Table:")
# print(col_df.to_markdown(index=False))

# # Table 3: Formatting Errors by Type
# print("\n### Formatting Errors by Question Type:")
# for qtype, count in formatting_errors_summary.items():
#     print(f"{qtype:15}: {count} formatting errors")

# Table 1: Main Metrics Table (per dataset)
results_df = pd.DataFrame(model_results).T.round(3)
print("### Main Metrics Table:")
print(results_df.to_markdown())

# Table 2: Column Quality Table (per dataset)
col_df = pd.DataFrame(column_quality_table)
print("\n### Column Quality Table:")
print(col_df.to_markdown(index=False))

# Table 3: Formatting Errors by Question Type
print("\n### Formatting Errors by Question Type:")
for qtype, count in formatting_errors_summary.items():
    print(f"{qtype:15}: {count} formatting errors")

### Main Metrics Table:
|                   |   avg |   boolean |   number |   category |   list[category] |   list[number] |   single col |   multiple cols |
|:------------------|------:|----------:|---------:|-----------:|-----------------:|---------------:|-------------:|----------------:|
| 001_Forbes        |  0.32 |      0.6  |    0.4   |       0.4  |             0.2  |           0    |        0.429 |           0.278 |
| 002_Titanic       |  0.25 |      0.25 |    0.5   |       0.25 |             0    |           0.25 |        0.286 |           0.231 |
| 004_Taxi          |  0.1  |      0.25 |    0.25  |       0    |             0    |           0    |        0.1   |           0     |
| 005_NYC           |  0.25 |      1    |    0     |       0.25 |             0    |           0    |        0.25  |           0     |
| 006_London        |  0.2  |      0.25 |    0.5   |       0    |             0    |           0.25 |        0.2   |           0     |
| 007_Fifa          |  0.45 |  

In [72]:
# Store dataset sizes
dataset_sizes = {dataset_id: len(semeval_train.filter(lambda s: s["dataset"] == dataset_id)) for dataset_id in train_target_datasets}

def compute_weighted_global_metrics(model_results, dataset_sizes):
    weighted_sums = defaultdict(float)
    total_size = sum(dataset_sizes[ds] for ds in model_results if ds != "GLOBAL")

    for dataset_id, metrics in model_results.items():
        if dataset_id == "GLOBAL":
            continue
        weight = dataset_sizes[dataset_id]
        for k, v in metrics.items():
            weighted_sums[k] += v * weight

    return {k: weighted_sums[k] / total_size for k in weighted_sums}

# Compute and store weighted global average
global_model_results = {}
global_model_results["GLOBAL"] = compute_weighted_global_metrics(model_results, dataset_sizes)

# Display as markdown table
global_df = pd.DataFrame(global_model_results).T.round(3)
print("\n### Global Metrics Across All Datasets (Weighted):")
print(global_df.to_markdown())

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1308 [00:00<?, ? examples/s]


### Global Metrics Across All Datasets (Weighted):
|        |   avg |   boolean |   number |   category |   list[category] |   list[number] |   single col |   multiple cols |
|:-------|------:|----------:|---------:|-----------:|-----------------:|---------------:|-------------:|----------------:|
| GLOBAL | 0.267 |     0.451 |    0.314 |      0.311 |            0.139 |          0.118 |        0.326 |           0.181 |
