In [None]:
# Cell 1: Imports, Config & Data Pull + Parsing
import os, json, random, ast
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# ─── Configuration ──────────────────────────────────────────────────────────────
# The connection string can be overridden with the EVAL_DB_URL environment
# variable so that real credentials never need to be written into the notebook.
# The fallback below targets the database on localhost.
DB_URL       = os.environ.get(
    "EVAL_DB_URL",
    "postgresql+psycopg2://local:password@localhost:5433/mlops_local",
)
TABLE_NAME   = "arxiv_chunks_with_metadata"
EVAL_DIR     = "eval"
HOLDOUT_PCT  = 0.10    # fraction of rows written to the held-out set
DEV_PCT      = 0.10    # fraction of rows used for the dev/slice set
PERTURB_BASE = 200     # number of base records to perturb
DRIFT_N      = 2000    # number of chunks for the drift reference
random.seed(42)        # seeds Python's `random` module only
# ────────────────────────────────────────────────────────────────────────────────

# Output directory for every evaluation artifact; created once, reused on re-runs.
os.makedirs(EVAL_DIR, exist_ok=True)
engine = create_engine(DB_URL, pool_timeout=30, max_overflow=0)

# Pull only needed columns.
# TABLE_NAME is a constant defined above, not user input, so interpolating it
# into the statement is safe here.
df = pd.read_sql(f"""
    SELECT chunk_id,
           paper_cited,
           query,
           chunk_data,
           categories,
           created_yymm
    FROM {TABLE_NAME}
""", con=engine)

# Parse string columns into Python lists
def parse_queries(qstr):
    """Turn a raw ``query`` cell into a list of queries.

    The value is parsed with ``ast.literal_eval``. A list or tuple literal is
    returned as a list, and a quoted string literal becomes a one-element
    list. Text that cannot be parsed, or that parses to some other type
    (number, dict, ``None`` ...), is kept verbatim as a single query so that
    callers can always iterate the result element by element.

    Args:
        qstr: Raw cell value. Normally a string; ``None``/NaN and values that
            are already lists or tuples are accepted as well.

    Returns:
        list: The queries for this cell. Empty when the value is missing.
    """
    # Missing values (None or float NaN) carry no query at all.
    if qstr is None or (isinstance(qstr, float) and qstr != qstr):
        return []
    if isinstance(qstr, (list, tuple)):
        return list(qstr)
    try:
        parsed = ast.literal_eval(qstr)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: treat the raw text as one query.
        return [qstr]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    if isinstance(parsed, str):
        # Returning the bare string would make callers iterate its characters.
        return [parsed]
    # Numbers, dicts, None, ...: fall back to the original text.
    return [qstr]

def parse_papers(pstr):
    """Turn a raw ``paper_cited`` cell into a list of paper identifiers.

    A value wrapped in braces (``{a,b,c}``) is split on commas. Each element
    is stripped of surrounding whitespace and double quotes, and empty
    elements are dropped, so ``{}`` yields an empty list. Any other non-empty
    string is returned as a single-element list.

    Quoted elements that themselves contain commas are not supported; they
    would be split at the comma.

    Args:
        pstr: Raw cell value. Normally a string; ``None``/NaN and values that
            are already lists or tuples are accepted as well.

    Returns:
        list: The identifiers found in the cell. Empty when the value is
        missing or contains no identifiers.
    """
    # Missing values (None or float NaN) cite nothing.
    if pstr is None or (isinstance(pstr, float) and pstr != pstr):
        return []
    if isinstance(pstr, (list, tuple)):
        return list(pstr)
    text = pstr.strip()
    if not text.startswith("{"):
        return [text] if text else []
    cleaned = (item.strip().strip('"') for item in text.strip("{}").split(","))
    return [item for item in cleaned if item]

def parse_categories(cstr):
    """Turn a raw ``categories`` cell into a flat list of category codes.

    The value is parsed with ``ast.literal_eval`` and every string element is
    split on whitespace, e.g. ``'["math.DS math.CV math.PR"]'`` becomes
    ``["math.DS", "math.CV", "math.PR"]``. A bare string literal is treated as
    a one-element list. Non-string elements are ignored.

    Args:
        cstr: Raw cell value. Normally a string holding a Python list
            literal; values that are already lists or tuples are accepted.

    Returns:
        list: Category codes in their original order. Empty when the value is
        missing, unparseable, or not a list/tuple/string.
    """
    if isinstance(cstr, (list, tuple)):
        parsed = cstr
    else:
        try:
            parsed = ast.literal_eval(cstr)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
    if isinstance(parsed, str):
        # Indexing a bare string would yield its first character, not a category.
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple)):
        return []
    categories = []
    for entry in parsed:
        if isinstance(entry, str):
            categories.extend(entry.split())
    return categories

# Derive the parsed list columns from their raw string counterparts.
for target, source, parser in (
    ("query_list",    "query",       parse_queries),
    ("paper_list",    "paper_cited", parse_papers),
    ("category_list", "categories",  parse_categories),
):
    df[target] = df[source].apply(parser)

# Flatten so each row = one (query, chunk_id, paper_list, chunk_data, category_list, created_yymm).
# A source row with k queries contributes k output rows; a row whose query
# list is empty contributes none.
rows = [
    {
        "chunk_id":      chunk_id,
        "query":         q,
        "paper_list":    papers,
        "chunk_data":    text,
        "category_list": cats,
        "created_yymm":  yymm,
    }
    for chunk_id, queries, papers, text, cats, yymm in zip(
        df["chunk_id"],
        df["query_list"],
        df["paper_list"],
        df["chunk_data"],
        df["category_list"],
        df["created_yymm"],
    )
    for q in queries
]

# Shuffle once with a fixed seed; the positional splits taken from df2 depend on this order.
shuffled = pd.DataFrame(rows).sample(frac=1, random_state=42)
df2 = shuffled.reset_index(drop=True)
n = len(df2)
print(f"Total query-chunk rows: {n}")


Total query–chunk rows: 18525


In [4]:
# Cell 2: Held-Out Test Set (~10%)
# The first HOLDOUT_PCT of the shuffled rows form the held-out set.
n_hold = int(HOLDOUT_PCT * n)
hold   = df2.iloc[:n_hold]

# One JSON object per line: the query and the papers it should retrieve.
heldout_path = f"{EVAL_DIR}/heldout.jsonl"
with open(heldout_path, "w") as f:
    f.writelines(
        json.dumps({"query": row.query, "ground_truth": row.paper_list}) + "\n"
        for row in hold.itertuples(index=False)
    )

print(f"Wrote {len(hold)} records to {heldout_path}")


Wrote 1852 records to eval/heldout.jsonl


In [5]:
# Cell 3: Dev Set + Slice Tags (~10%)
# The dev rows are the DEV_PCT block that immediately follows the held-out rows.
n_dev = int(DEV_PCT * n)
dev_start = n_hold
dev_stop  = n_hold + n_dev
dev = df2.iloc[dev_start:dev_stop].copy()   # copy so a column can be added to dev without touching df2

def make_slice(r):
    """Build the slice tag ``"<first category>_<date prefix>"`` for one row.

    Args:
        r: Mapping-like row exposing ``"category_list"`` and ``"created_yymm"``.

    Returns:
        str: The first entry of ``category_list`` (or ``"other"`` when the list
        is empty), an underscore, then the first four characters of
        ``created_yymm`` (or ``"unk"`` when that value is missing).
    """
    categories = r["category_list"]
    stamp = r["created_yymm"]
    primary = "other" if not categories else categories[0]
    # NOTE(review): this keeps the first four characters; if created_yymm is a
    # four-character YYMM code that is the whole value rather than a year.
    # Confirm the stored format.
    period = "unk" if pd.isna(stamp) else stamp[:4]
    return f"{primary}_{period}"

# Tag every dev row with its slice label.
dev["slice"] = dev.apply(make_slice, axis=1)

# One JSON object per line: query, expected papers, and the slice tag.
slices_path = f"{EVAL_DIR}/slices.jsonl"
with open(slices_path, "w") as f:
    for query, papers, tag in zip(dev["query"], dev["paper_list"], dev["slice"]):
        line = json.dumps({"query": query, "ground_truth": papers, "slice": tag})
        f.write(line + "\n")

print(f"Wrote {len(dev)} records to {slices_path}")


Wrote 1852 records to eval/slices.jsonl


In [6]:
# Cell 4: Perturbation Stability (~PERTURB_BASE bases × 2 variants)
# Draw up to PERTURB_BASE dev rows; each one yields two perturbed variants.
base = dev.sample(n=min(PERTURB_BASE, len(dev)), random_state=42)

perturbs = []
for query, papers in zip(base["query"], base["paper_list"]):
    # Variant 1: the query in upper case.
    shouted = {"base_query": query, "perturbed": query.upper(), "expected_papers": papers}
    # Variant 2: every single space widened to three spaces.
    spaced = {"base_query": query, "perturbed": query.replace(" ", "   "), "expected_papers": papers}
    perturbs.extend((shouted, spaced))

perturb_path = f"{EVAL_DIR}/perturbations.jsonl"
with open(perturb_path, "w") as f:
    f.writelines(json.dumps(rec) + "\n" for rec in perturbs)

print(f"Wrote {len(perturbs)} records to {perturb_path}")


Wrote 400 records to eval/perturbations.jsonl


In [10]:
# Cell 5: Known Failure-Mode Set (~50 cases matching a keyword)
FAILURE_KEYWORD = "quantum"   # str.contains treats this as a regular expression by default
FAILURE_MAX     = 50          # upper bound on the number of cases written

# na=False makes rows with a missing query count as "no match". Without it the
# mask contains NaN and boolean indexing raises instead of filtering.
is_hard = dev["query"].str.contains(FAILURE_KEYWORD, case=False, na=False)
hard = dev[is_hard].head(FAILURE_MAX)
failures = [
    {"query": query, "correct_papers": papers}
    for query, papers in zip(hard["query"], hard["paper_list"])
]

# One JSON object per line: the query and the papers it should retrieve.
with open(f"{EVAL_DIR}/failures.jsonl", "w") as f:
    for rec in failures:
        f.write(json.dumps(rec) + "\n")

print(f"Wrote {len(failures)} records to {EVAL_DIR}/failures.jsonl")


Wrote 31 records to eval/failures.jsonl


In [11]:
# Cell 6: Drift Reference (sample DRIFT_N from remaining pool)
# Everything after the held-out and dev rows.
train_pool = df2.iloc[n_hold + n_dev :]

# df2 holds one row per (query, chunk) pair, so a chunk with several queries
# appears several times in the pool. Keep one row per chunk_id so the reference
# is a sample of distinct chunks rather than being weighted by query count.
unique_chunks = train_pool.drop_duplicates(subset="chunk_id")
sampled       = unique_chunks.sample(min(DRIFT_N, len(unique_chunks)), random_state=42)

# Save chunk_ids + raw texts; embeddings happen later.
# NOTE(review): if these columns are object dtype (e.g. Python strings),
# np.savez stores them pickled and np.load needs allow_pickle=True to read them
# back. Confirm the consumer handles that.
np.savez(
    f"{EVAL_DIR}/drift_reference.npz",
    chunk_ids = sampled["chunk_id"].values,
    texts     = sampled["chunk_data"].values
)

print(f"Wrote {len(sampled)} records to {EVAL_DIR}/drift_reference.npz")


Wrote 2000 records to eval/drift_reference.npz
