# In this document I will explore the CSV files that are the output from the TESTs of LLMs

## Load Libs

In [1]:
import sqlite3
import pandas as pd
from datetime import datetime
from pathlib import Path

## Load data

In [2]:
df_raw_results = pd.read_csv('MLARagas_results/results/20250923_181049_multilingual_eval_gdpr_EN/raw.csv')
print(df_raw_results.head())

  language                             question  \
0       en            What does GDPR stand for?   
1       en       What is the full form of GDPR?   
2       en        When did GDPR start applying?   
3       en      When did GDPR come into effect?   
4       en  What is the main objective of GDPR?   

                     original_question                           user_input  \
0            What does GDPR stand for?            What does GDPR stand for?   
1            What does GDPR stand for?       What is the full form of GDPR?   
2        When did GDPR start applying?        When did GDPR start applying?   
3        When did GDPR start applying?      When did GDPR come into effect?   
4  What is the main objective of GDPR?  What is the main objective of GDPR?   

          variant  paraphrase_number  question_index source_language  \
0  original_q0_p0                  0               0              en   
1  original_q0_p1                  1               0              en   
2 

In [3]:
df_agg_results = pd.read_csv("MLARagas_results/results/20250923_181049_multilingual_eval_gdpr_EN/agg.csv")
print(df_agg_results.head())

  language  total_rows_lang  unique_strategies_lang  avg_answer_relevancy  \
0       de               40                       1              0.955049   
1       en               40                       1              0.955216   
2       fr               40                       1              0.907272   

   avg_context_precision  avg_context_recall  avg_faithfulness  \
0               0.031250               0.850          0.263409   
1               0.458333               0.950          0.548859   
2               0.133333               0.875          0.155417   

   avg_noise_sensitivity  count_answer_relevancy  count_context_precision  \
0               0.235714                      40                       40   
1               0.403343                      40                       40   
2               0.458333                      40                       40   

   count_context_recall  count_faithfulness  count_noise_sensitivity  
0                    40                  40   

## Populate the SQL DB

In [4]:
DB_PATH = r"ai_sandbox_PSA_16_Oct_2025.db"
CSV_PATH = r"MLARagas_results/results/20250923_181049_multilingual_eval_gdpr_EN/agg.csv" 

In [5]:
# Fallback df
fallback_df = pd.DataFrame([
    {"language": "de", "total_rows_lang": 40, "unique_strategies_lang": 1,
     "avg_answer_relevancy": 0.955049, "avg_context_precision": 0.031250,
     "avg_context_recall": 0.850, "avg_faithfulness": 0.263409,
     "avg_noise_sensitivity": 0.235714, "count_answer_relevancy": 40,
     "count_context_precision": 40, "count_context_recall": 40,
     "count_faithfulness": 40, "count_noise_sensitivity": 4},
    {"language": "en", "total_rows_lang": 40, "unique_strategies_lang": 1,
     "avg_answer_relevancy": 0.955216, "avg_context_precision": 0.458333,
     "avg_context_recall": 0.950, "avg_faithfulness": 0.548859,
     "avg_noise_sensitivity": 0.403343, "count_answer_relevancy": 40,
     "count_context_precision": 40, "count_context_recall": 40,
     "count_faithfulness": 39, "count_noise_sensitivity": 18},
    {"language": "fr", "total_rows_lang": 40, "unique_strategies_lang": 1,
     "avg_answer_relevancy": 0.907272, "avg_context_precision": 0.133333,
     "avg_context_recall": 0.875, "avg_faithfulness": 0.155417,
     "avg_noise_sensitivity": 0.458333, "count_answer_relevancy": 40,
     "count_context_precision": 40, "count_context_recall": 40,
     "count_faithfulness": 40, "count_noise_sensitivity": 2},
])

In [6]:
def load_df():
    p = Path("MLARagas_results/results/20250923_181049_multilingual_eval_gdpr_EN/agg.csv")
    if p.exists():
        return pd.read_csv(p)
    return fallback_df

In [7]:
METRICS = {
    "avg_answer_relevancy":      ("answer_relevancy", "ratio"),
    "avg_context_precision":     ("context_precision", "ratio"),
    "avg_context_recall":        ("context_recall", "ratio"),
    "avg_faithfulness":          ("faithfulness", "ratio"),
    "avg_noise_sensitivity":     ("noise_sensitivity", "ratio"),
    "count_answer_relevancy":    ("answer_relevancy_count", "count"),
    "count_context_precision":   ("context_precision_count", "count"),
    "count_context_recall":      ("context_recall_count", "count"),
    "count_faithfulness":        ("faithfulness_count", "count"),
    "count_noise_sensitivity":   ("noise_sensitivity_count", "count"),
    "total_rows_lang":           ("total_rows", "count"),
    "unique_strategies_lang":    ("unique_strategies", "count"),
}

In [8]:
def one(conn, q, params=()):
    cur = conn.execute(q, params); row = cur.fetchone(); cur.close()
    return row[0] if row else None

def exists(conn, q, params=()):
    cur = conn.execute(q, params); row = cur.fetchone(); cur.close()
    return row is not None

def get_or_create_element(conn, type_spec, name, description, project_id):
    eid = one(conn,
        "SELECT id FROM element WHERE name=? AND type_spec=? AND project_id=?;",
        (name, type_spec, project_id)
    )
    if eid:
        return eid

    conn.execute(
        "INSERT INTO element (type_spec, name, description, project_id) VALUES (?,?,?,?);",
        (type_spec, name, description, project_id)
    )
    return one(conn,
        "SELECT id FROM element WHERE name=? AND type_spec=? AND project_id=?;",
        (name, type_spec, project_id)
    )


def get_or_create_datashape(conn):
    did = one(conn, "SELECT id FROM datashape LIMIT 1;")
    if did: return did
    conn.execute("INSERT INTO datashape (accepted_target_values) VALUES (?);", ("{}",))
    return one(conn, "SELECT id FROM datashape LIMIT 1;")

def get_or_create_project(conn, name="LLM Eval Project", status="Ready"):
    pid = one(conn, "SELECT id FROM project WHERE name=?;", (name,))
    if pid: return pid
    conn.execute("INSERT INTO project (name, status) VALUES (?,?);", (name, status))
    return one(conn, "SELECT id FROM project WHERE name=?;", (name,))

def get_or_create_config(conn, name="LLM Eval Config"):
    cid = one(conn, "SELECT id FROM configuration WHERE name=?;", (name,))
    if cid: return cid
    conn.execute("INSERT INTO configuration (name, description) VALUES (?,?);", (name, "Aggregate import"))
    return one(conn, "SELECT id FROM configuration WHERE name=?;", (name,))

def get_or_create_evaluation(conn, project_id, config_id, status="Done"):
    eid = one(conn, "SELECT id FROM evaluation WHERE project_id=? AND config_id=? LIMIT 1;", (project_id, config_id))
    if eid: return eid
    conn.execute("INSERT INTO evaluation (status, project_id, config_id) VALUES (?,?,?);", (status, project_id, config_id))
    return one(conn, "SELECT id FROM evaluation WHERE project_id=? AND config_id=? LIMIT 1;", (project_id, config_id))

def get_or_create_tool(conn, name="ImporterTool", source="LLM Eval Import", version="1.0", licensing="Open-source"):
    tid = one(conn, "SELECT id FROM tool WHERE name=?;", (name,))
    if tid: return tid
    conn.execute("INSERT INTO tool (source, version, name, licensing) VALUES (?,?,?,?);",
                 (source, version, name, licensing))
    return one(conn, "SELECT id FROM tool WHERE name=?;", (name,))

def get_or_create_metric(conn, name, description="", type_spec="Direct"):
    mid = one(conn, "SELECT id FROM metric WHERE name=?;", (name,))
    if mid:
        if type_spec == "Direct" and not exists(conn, "SELECT 1 FROM direct WHERE id=?;", (mid,)):
            conn.execute("INSERT INTO direct (id) VALUES (?);", (mid,))
        return mid
    conn.execute("INSERT INTO metric (type_spec, name, description) VALUES (?,?,?);",
                 (type_spec, name, description or name))
    mid = one(conn, "SELECT id FROM metric WHERE name=?;", (name,))
    if type_spec == "Direct":
        conn.execute("INSERT INTO direct (id) VALUES (?);", (mid,))
    return mid

def get_or_create_metric_cat(conn, name="LLM Eval Aggregates"):
    cid = one(conn, "SELECT id FROM metriccategory WHERE name=?;", (name,))
    if cid: return cid
    conn.execute("INSERT INTO metriccategory (name, description) VALUES (?,?);",
                 (name, "Language-level aggregate metrics"))
    return one(conn, "SELECT id FROM metriccategory WHERE name=?;", (name,))

def ensure_metric_in_cat(conn, metric_id, category_id):
    if not exists(conn, "SELECT 1 FROM metriccategory_metric WHERE category=? AND metrics=?;", (category_id, metric_id)):
        conn.execute("INSERT INTO metriccategory_metric (category, metrics) VALUES (?,?);", (category_id, metric_id))


def get_or_create_dataset_for_lang(conn, lang, datashape_id, project_id):
    name = f"Lang={lang}"
    descr = f"Language slice for {lang}"

    # element.id == dataset.id (joined inheritance)
    element_id = get_or_create_element(conn, "Dataset", name, descr, project_id)

    if not exists(conn, "SELECT 1 FROM dataset WHERE id=?;", (element_id,)):
        conn.execute("""
            INSERT INTO dataset (id, source, version, licensing, type, datashape_id)
            VALUES (?,?,?,?,?,?);
        """, (element_id, "LLM Eval Import", "agg-1.0", "Open-source", "Validation", datashape_id))

    return element_id


def get_or_create_observation(conn, lang, evaluation_id, tool_id, dataset_id):
    name = f"Obs {lang}"
    oid = one(conn, "SELECT id FROM observation WHERE name=?;", (name,))
    if oid: return oid
    when_str = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    conn.execute("""
        INSERT INTO observation (observer, whenObserved, tool_id, eval_id, dataset_2_id, name, description)
        VALUES (?,?,?,?,?,?,?);
    """, ("aggregate_import", when_str, tool_id, evaluation_id, dataset_id, name, f"Aggregate import for {lang}"))
    return one(conn, "SELECT id FROM observation WHERE name=?;", (name,))

In [9]:
def main():
    # 0) safety first
    print("Tip: back up your DB before running: cp ai_sandbox_PSA_16_Oct_2025.db ai_sandbox_PSA_16_Oct_2025.bak")

    df = load_df()
    conn = sqlite3.connect(DB_PATH)
    conn.execute("PRAGMA foreign_keys = ON;")

    with conn:
        datashape_id = get_or_create_datashape(conn)
        project_id   = get_or_create_project(conn)
        config_id    = get_or_create_config(conn)
        eval_id      = get_or_create_evaluation(conn, project_id, config_id)
        tool_id      = get_or_create_tool(conn)

        cat_id = get_or_create_metric_cat(conn)
        metric_ids = {}
        for _, (metric_name, unit) in METRICS.items():
            m_id = get_or_create_metric(conn, metric_name, f"{metric_name} ({unit})", "Direct")
            metric_ids[metric_name] = (m_id, unit)
            ensure_metric_in_cat(conn, m_id, cat_id)

        inserted = 0
        for _, r in df.iterrows():
            lang  = r["language"]
            ds_id = get_or_create_dataset_for_lang(conn, lang, datashape_id, project_id)
            obs_id = get_or_create_observation(conn, lang, eval_id, tool_id, ds_id)

            for col, (metric_name, unit) in METRICS.items():
                val = r[col]
                metric_id, unit_txt = metric_ids[metric_name]
                conn.execute("""
                    INSERT INTO measure (value, error, uncertainty, unit, metric_id, observation_id, measurand_id)
                    VALUES (?,?,?,?,?,?,?);
                """, (str(val), "", 0.0, unit_txt, metric_id, obs_id, ds_id))
                inserted += 1

    print(f"Inserted measures: {inserted}")
    # Quick check
    with sqlite3.connect(DB_PATH) as c2:
        q = """
        SELECT o.name AS observation, e.name AS measurand, mt.name AS metric, m.value, m.unit
        FROM measure m
        JOIN observation o ON o.id = m.observation_id
        JOIN element e     ON e.id = m.measurand_id
        JOIN metric  mt    ON mt.id = m.metric_id
        WHERE o.name IN ('Obs de','Obs en','Obs fr')
        ORDER BY o.name, mt.name;
        """
        preview = pd.read_sql_query(q, c2)
        print(preview.head(50).to_string(index=False))

In [None]:
if __name__ == "__main__":
    main()

Tip: back up your DB before running: cp ai_sandbox_PSA_16_Oct_2025.db ai_sandbox_PSA_16_Oct_2025.bak
Inserted measures: 36
observation measurand                  metric              value  unit
     Obs de   Lang=de        answer_relevancy 0.9550488333349998 ratio
     Obs de   Lang=de        answer_relevancy 0.9550488333349998 ratio
     Obs de   Lang=de  answer_relevancy_count                 40 count
     Obs de   Lang=de  answer_relevancy_count                 40 count
     Obs de   Lang=de       context_precision 0.0312499999976041 ratio
     Obs de   Lang=de       context_precision 0.0312499999976041 ratio
     Obs de   Lang=de context_precision_count                 40 count
     Obs de   Lang=de context_precision_count                 40 count
     Obs de   Lang=de          context_recall               0.85 ratio
     Obs de   Lang=de          context_recall               0.85 ratio
     Obs de   Lang=de    context_recall_count                 40 count
     Obs de   Lang=de    

: 