In [9]:
import os, pandas as pd, importlib.util, subprocess

# 1) fresh clone & install deps
# Lines starting with "!" are IPython/Colab shell escapes, so this cell only
# runs inside a notebook kernel, not as a plain Python script.
# Remove any earlier checkout first: cloning into an existing non-empty
# directory would fail, and a stale copy could hide upstream changes.
!rm -rf design_qa
# -q silences git's progress output.
# NOTE(review): no commit or tag is pinned, so the metrics code tracks the
# upstream default branch and scores may shift between runs.
!git clone -q https://github.com/anniedoris/design_qa.git
# presumably the packages the repo's metrics module imports — TODO confirm
!pip -q install rouge nltk
# Fetch NLTK's "punkt" tokenizer data without printing download progress.
import nltk; nltk.download("punkt", quiet=True)

# 2) import metrics helpers
# The checkout is not an installed package, so metrics.py is loaded directly
# from its file path under the module name "dq_metrics".
repo_dir = os.path.abspath("design_qa")
metrics_path = os.path.join(repo_dir, "eval", "metrics", "metrics.py")

spec = importlib.util.spec_from_file_location("dq_metrics", metrics_path)
dq_metrics = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dq_metrics)

# Short local aliases for the two helpers used by the scoring steps below.
normalize_answer, bow_f1 = (
    getattr(dq_metrics, helper_name)
    for helper_name in ("normalize_answer", "token_f1_score")
)

# 3) Upload GT + preds
import io

from google.colab import files


def _upload_csv(prompt):
    """Prompt for one CSV upload and return ``(filename, DataFrame)``.

    The frame is parsed from the bytes returned by ``files.upload()`` rather
    than re-read from disk, so it does not matter under which (possibly
    renamed, e.g. "name (4).csv") path Colab stored the file.

    Raises:
        RuntimeError: if the upload dialog was dismissed without a file.
    """
    print(prompt)
    uploaded = files.upload()
    if not uploaded:
        raise RuntimeError(f"No file was uploaded ({prompt})")
    # If several files were selected, only the first one is used.
    name, payload = next(iter(uploaded.items()))
    return name, pd.read_csv(io.BytesIO(payload))


GT_PATH, gt = _upload_csv("Upload ground-truth CSV (rule_definition_qa.csv):")
PRED_PATH, pred = _upload_csv("Upload predictions CSV (preds_v49p_scored.csv):")

# normalize column names
def _require_column(frame, target, aliases, label):
    """Return ``frame`` with a column named ``target``.

    If ``target`` is already present the frame is returned unchanged;
    otherwise the first name from ``aliases`` found in the frame is renamed
    to ``target``.

    Raises:
        ValueError: if neither ``target`` nor any alias exists, so the problem
            is reported here instead of as a bare KeyError further down.
    """
    if target in frame.columns:
        return frame
    for alias in aliases:
        if alias in frame.columns:
            return frame.rename(columns={alias: target})
    raise ValueError(
        f"{label} has no '{target}' column (also tried {list(aliases)}); "
        f"found columns: {list(frame.columns)}"
    )


gt = _require_column(
    gt,
    "ground_truth",
    ["gt", "GroundTruth", "label", "expected", "answer"],
    "ground-truth CSV",
)
pred = _require_column(
    pred,
    "model_prediction",
    ["pred", "answer"],
    "predictions CSV",
)

# merge on image
# Both inputs must expose the join key; say which file is at fault if not.
for _label, _frame in (("ground-truth", gt), ("predictions", pred)):
    if "image" not in _frame.columns:
        raise KeyError(
            f"{_label} CSV has no 'image' column; found: {list(_frame.columns)}"
        )

merged = gt.merge(pred, on="image", how="inner")

# An inner join silently drops ground-truth rows without a prediction and
# multiplies rows when a key repeats; both skew the mean F1, so report them.
_has_pred = gt["image"].isin(pred["image"])
_n_unmatched = int((~_has_pred).sum())
if _n_unmatched:
    print(
        f"WARNING: {_n_unmatched} of {len(gt)} ground-truth rows have no "
        "matching prediction and are excluded from the score"
    )
if len(merged) != int(_has_pred.sum()):
    print(
        f"WARNING: merge produced {len(merged)} rows from "
        f"{int(_has_pred.sum())} matched ground-truth rows; "
        "'image' is not unique in one of the files"
    )

# 4) Build results DataFrame
_NO_DEFAULT = object()


def _merged_column(frame, name, suffixed, default=_NO_DEFAULT):
    """Return column ``name`` from a merged frame, tolerating merge suffixes.

    When both inputs of ``DataFrame.merge`` share a non-key column, pandas
    renames them ``<name>_x`` (left) and ``<name>_y`` (right) by default, so
    the plain name disappears. ``suffixed`` names the side to fall back to.

    Raises:
        KeyError: if neither column exists and no ``default`` was given.
    """
    for candidate in (name, suffixed):
        if candidate in frame.columns:
            return frame[candidate]
    if default is not _NO_DEFAULT:
        return default
    raise KeyError(
        f"merged data has neither '{name}' nor '{suffixed}'; "
        f"found columns: {list(frame.columns)}"
    )


df_results = pd.DataFrame({
    "id": range(len(merged)),
    # Questions are optional; prefer the ground-truth file's copy.
    "question": _merged_column(merged, "question", "question_x", default=""),
    # Ground truth comes from the left (ground-truth) side of the merge.
    "ground_truth": _merged_column(merged, "ground_truth", "ground_truth_x"),
    # Predictions come from the right (predictions) side of the merge.
    "prediction": _merged_column(
        merged, "model_prediction", "model_prediction_y"
    ),
})

# normalize text
# Empty CSV cells are read as NaN, and str(NaN) is the literal word "nan",
# which would then be scored as a real token. Treat missing cells as empty
# text before normalizing.
for _col in ("ground_truth", "prediction"):
    df_results[_col] = (
        df_results[_col]
        .fillna("")
        .apply(lambda cell: normalize_answer(str(cell)))
    )

# 5) Compute F1 (BoC) per row
def row_f1(pred, gt):
    """Score one prediction string against its ground-truth string.

    Both strings are split on whitespace and the two token lists are handed
    to ``bow_f1`` (the repo's ``token_f1_score``); its result is returned
    unchanged.

    NOTE(review): whitespace splitting yields word-level tokens, while the
    reported metric is labelled "BoC" — confirm this is the intended
    granularity for the Definition subset.
    """
    pred_tokens = pred.split()
    gt_tokens = gt.split()
    return bow_f1(pred_tokens, gt_tokens)

# Build the column explicitly instead of DataFrame.apply(axis=1): on an empty
# frame (e.g. no image keys matched) apply returns a DataFrame, and assigning
# that to a single column fails with a confusing ValueError.
df_results["f1"] = pd.Series(
    [
        row_f1(prediction, truth)
        for prediction, truth in zip(
            df_results["prediction"], df_results["ground_truth"]
        )
    ],
    index=df_results.index,
    dtype="float64",
)

# 6) Overall F1 = mean of per-row F1s
# With zero rows the mean is NaN, which is printed as "nan on 0 questions".
overall_f1 = df_results["f1"].mean()
print(f"Definition F1 (BoC ↑): {overall_f1:.6f} on {len(df_results)} questions")

# 7) Save outputs
# Per-question table: id, question, normalized texts and the row-level F1.
df_results.to_csv("definition_detailed_with_f1.csv", index=False)

# The summary contains a non-ASCII arrow ("↑"); pin UTF-8 so the write does
# not depend on the platform's default encoding (which may reject it).
with open("definition.txt", "w", encoding="utf-8") as f:
    f.writelines([
        "DesignQA Results\n",
        "Subset: Definition\n",
        f"Num Questions: {len(df_results)}\n",
        f"F1 (BoC ↑): {overall_f1:.6f}\n",
    ])

print("definition_detailed_with_f1.csv written")
print("definition.txt written")


Upload ground-truth CSV (rule_definition_qa.csv):


Saving rule_definition_qa.csv to rule_definition_qa (4).csv
Upload predictions CSV (preds_v49p_scored.csv):


Saving preds_v49p_scored.csv to preds_v49p_scored (4).csv
Definition F1 (BoC ↑): 0.084485 on 31 questions
definition_detailed_with_f1.csv written
definition.txt written
