In [None]:
!pip -q install pymupdf faiss-cpu sentence-transformers pandas numpy rank-bm25 openai rouge nltk

In [None]:
import os, re, json, time, math, datetime, subprocess, random
import numpy as np
import pandas as pd
from collections import defaultdict, Counter

In [None]:
# The OpenAI key is read from the environment and must never be hard-coded here.
# In Google Colab, set it before running this cell, for example:
#   os.environ["OPENAI_API_KEY"] = "<your key>"

import os
from openai import OpenAI

# Fail fast with a readable message when the key is absent or empty.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Missing OPENAI_API_KEY environment variable. Please set it before running.")

# Shared client used by the GPT re-ranking step further down.
client = OpenAI(api_key=api_key)

In [None]:
from google.colab import files

# Opens the Colab upload widget; the result maps each uploaded file name to its content.
uploaded = files.upload()

# Pick the inputs by extension (case-insensitive). When several files share an
# extension, the last one iterated wins.
pdf_path = csv_file = None
for f in uploaded:
    lowered = f.lower()
    if lowered.endswith(".pdf"):
        pdf_path = f
    elif lowered.endswith(".csv"):
        csv_file = f

# Both inputs are required by the cells below.
if not (pdf_path and csv_file):
    raise FileNotFoundError("Upload the FSAE Rulebook PDF and the Rule Compilation CSV.")

print("PDF:", pdf_path, "| CSV:", csv_file)

In [None]:
import fitz, re

# Open the uploaded rulebook PDF; `doc` is indexed by page number below.
doc = fitz.open(pdf_path)

# Whole-line patterns treated as page furniture by clean_lines():
#   1. "Formula SAE ... Page N of M" running header/footer
#   2. "Version <number> <day> <word> <year>" stamp
#   3. a line holding nothing but digits
HEADER_RES = [
    re.compile(r'^\s*Formula SAE.*Page\s+\d+\s+of\s+\d+\s*$', re.I),
    re.compile(r'^\s*Version\s+\d+(\.\d+)?\s+\d{1,2}\s+\w+\s+\d{4}\s*$', re.I),
    re.compile(r'^\s*\d+\s*$'),
]
# Table-of-contents entry: some text, three dots (optionally space-separated),
# whitespace, then a number at the very end of the line.
TOC_LINE_RE       = re.compile(r'.+\.\s?\.\s?\.\s+\d+$')
# Section banner: 1-4 capital letters, a hyphen, then a title (e.g. "XX - Title").
SECTION_BANNER_RE = re.compile(r'^[A-Z]{1,4}\s*-\s+.+$')


# Line that starts with a rule identifier: 1-3 capital letters, an optional
# "-" or ".", then dot-separated numbers.
# NOTE(review): RID_RE, defined in a later cell, accepts 1-4 letters and requires
# the dot - confirm the difference is intentional.
RULE_ID_RE = re.compile(r'^\s*[A-Z]{1,3}\s*[-.]?\s*\d+(\.\d+)*\b')
# A number followed by a length unit (mm, cm, in, inch, inches), case-insensitive.
UNIT_RE    = re.compile(r'\b(\d+(\.\d+)?)\s*(mm|cm|in|inch(es)?)\b', re.I)

def extract_text_from_page(p):
    """Return the text of page object `p` by calling its `get_text` method in "text" mode."""
    mode = "text"
    return p.get_text(mode)

def clean_lines(lines):
    """Normalise page lines and drop page furniture.

    Each line is right-stripped and has non-breaking spaces replaced by plain
    spaces. A line is then discarded when it matches any pattern in HEADER_RES,
    contains a TOC entry (TOC_LINE_RE), or is a section banner
    (SECTION_BANNER_RE). Blank lines are kept.

    Returns the surviving normalised lines in their original order.
    """
    def is_noise(text):
        # Header/footer patterns are anchored, so a prefix match is enough.
        for rx in HEADER_RES:
            if rx.match(text):
                return True
        if TOC_LINE_RE.search(text):
            return True
        return bool(SECTION_BANNER_RE.match(text))

    normalised = (ln.rstrip().replace('\xa0', ' ') for ln in lines)
    return [text for text in normalised if not is_noise(text)]

def page_stats(txt: str):
    """Summarise one page of text for start-page detection.

    Returns a 4-tuple computed over the non-blank lines of `txt`:
    (number of lines, lines containing a TOC entry, lines starting with a
    rule ID, lines containing a length with a unit).
    """
    nonblank = [ln for ln in txt.splitlines() if ln.strip()]

    def count(predicate):
        # Number of non-blank lines for which the regex call returns a match.
        return sum(1 for ln in nonblank if predicate(ln))

    return (
        len(nonblank),
        count(TOC_LINE_RE.search),
        count(RULE_ID_RE.match),
        count(UNIT_RE.search),
    )

def autodetect_start(doc, scan_first=20):
    """Find the index of the first page that looks like rule content.

    Scans at most the first `scan_first` pages. The first page with at least
    three rule-ID lines and few TOC lines (no more than max(1, 15% of its
    non-blank lines)) is returned immediately. Otherwise the last scanned page
    that has at least one rule-ID line and at least three unit mentions is
    returned, or 0 when no page qualifies.
    """
    fallback = 0
    limit = min(scan_first, len(doc))
    for page_no in range(limit):
        page_text = extract_text_from_page(doc[page_no])
        n_lines, n_toc, n_rules, n_units = page_stats(page_text)
        toc_budget = max(1, int(0.15 * n_lines))
        if n_rules >= 3 and n_toc <= toc_budget:
            return page_no
        if n_rules >= 1 and n_units >= 3:
            fallback = page_no
    return fallback

auto_start = autodetect_start(doc)

# A page is treated as table-of-contents (and skipped entirely) when more than
# this share of its non-blank lines look like TOC entries.
TOC_PAGE_RATIO = 0.3

pages = []
for i in range(auto_start, len(doc)):
    t = extract_text_from_page(doc[i])
    raw_lines = t.splitlines()

    # Decide "is this a TOC page?" on the RAW lines. clean_lines() removes every
    # TOC entry, so counting them after cleaning always gives zero and the page
    # would never be skipped. Lines are normalised the same way clean_lines()
    # does so that the end-anchored TOC pattern sees the same text.
    nonblank = [l.rstrip().replace('\xa0', ' ') for l in raw_lines if l.strip()]
    if nonblank:
        toc_like = sum(1 for l in nonblank if TOC_LINE_RE.search(l))
        if toc_like > TOC_PAGE_RATIO * len(nonblank):
            continue

    lines = clean_lines(raw_lines)
    pages.append("\n".join(lines))

# Single text blob consumed by the rule-chunking step and the co-occurrence map.
raw_text = "\n".join(pages)
print(f"Start page (auto): {auto_start}  |  Collected text chars: {len(raw_text)}")
# Split the cleaned text into one chunk per rule. A line that starts with a rule
# ID opens a new chunk keyed by its first whitespace-separated token; following
# lines are appended to that chunk until the next rule ID. Text before the first
# rule ID is discarded, and a repeated ID overwrites the earlier chunk.
rule_chunks = {}
current_rule = None
buffer = []

for line in raw_text.splitlines():
    stripped = line.strip()
    if not RULE_ID_RE.match(stripped):
        # Continuation line: only kept once a rule has been opened.
        if current_rule:
            buffer.append(stripped)
        continue
    # New rule header: flush the chunk collected so far, then start afresh.
    if current_rule and buffer:
        rule_chunks[current_rule] = " ".join(buffer).strip()
    current_rule = stripped.split()[0]
    buffer = [stripped]

# Flush the final rule, which has no following header to trigger it.
if current_rule and buffer:
    rule_chunks[current_rule] = " ".join(buffer).strip()

print("Extracted rules:", len(rule_chunks))

In [None]:
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import faiss

# Parallel lists: rule_texts[i] is the text of rule_ids[i]. Row i of the
# embedding matrix (and therefore of the FAISS index) refers to the same position.
rule_ids   = list(rule_chunks.keys())
rule_texts = [rule_chunks[k] for k in rule_ids]

# Encode every rule chunk with the sentence-transformer model as float32.
model = SentenceTransformer("all-MiniLM-L6-v2")
emb = model.encode(rule_texts, show_progress_bar=True, convert_to_numpy=True).astype("float32")
# Scale each row to unit length (the epsilon avoids division by zero), so the
# inner product of two rows is their cosine similarity.
emb = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-12)

# Flat inner-product index over the normalised embeddings.
# NOTE(review): emb.shape[1] fails if no rules were extracted - confirm
# rule_chunks is non-empty before this cell.
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)
print("FAISS index size:", index.ntotal)

def tokenize(s):
    """Lower-case `s` and return its maximal runs of ASCII letters/digits as a list."""
    lowered = s.lower()
    return re.findall(r"[a-z0-9]+", lowered)
# Lexical (BM25) index over the tokenised rule texts, in the same order as rule_ids.
bm25 = BM25Okapi([tokenize(t) for t in rule_texts])

In [None]:
import re
import numpy as np
from collections import defaultdict, Counter


# Canonical rule-ID shape used to validate IDs: 1-4 capital letters, a dot,
# then one or more dot-separated numbers (e.g. "T.7.1.2"). Used with fullmatch().
RID_RE = re.compile(r'[A-Z]{1,4}\.\d+(?:\.\d+)*')
def tokenize(s: str):
    """Split `s` into lower-case alphanumeric tokens.

    The text is lower-cased first; every maximal run of ASCII letters or digits
    becomes one token, and everything else acts as a separator.
    """
    lowered = s.lower()
    return re.findall(r"[a-z0-9]+", lowered)

def extract_focus_phrase(q: str):
    """Return the topic phrase of a question.

    The first double-quoted span is preferred, then the first back-ticked span;
    when neither exists the whole question is used. The result is stripped of
    surrounding whitespace.
    """
    for pattern in (r'"([^"]+)"', r'`([^`]+)`'):
        found = re.search(pattern, q)
        if found:
            return found.group(1).strip()
    return q.strip()
# Hand-written query-expansion terms. expand_focus() looks up the whole
# lower-cased, stripped focus phrase as the key, so multi-word keys such as
# "impact attenuator" only match when the focus is exactly that phrase.
SYN = {
    "aerodynamics": ["aero", "wing", "splitter", "diffuser", "undertray", "endplate", "bodywork"],
    "aerodynamic":  ["aero", "wing", "splitter", "diffuser", "undertray", "endplate", "bodywork"],
    "impact attenuator": ["ia", "front impact", "energy absorber", "attenuator"],
    "accumulator": ["battery pack", "hv", "shutdown", "ams", "tsal"],
    "seat belt": ["harness", "restraint", "belts"],
    "inspection": ["tilt", "brake test", "noise test", "scrutineering"],
}

def build_cooccur_map(text, window=10, min_freq=5):
    """Build a token co-occurrence table from `text`.

    For every token occurrence, each other token within `window` positions on
    either side is counted as a neighbour. The result maps each token to a dict
    of {neighbour: count}, keeping only neighbours seen at least `min_freq`
    times; a token may map to an empty dict.
    """
    toks = tokenize(text)
    total = len(toks)
    neighbours = defaultdict(Counter)
    for centre, word in enumerate(toks):
        start = max(0, centre - window)
        stop = min(total, centre + window + 1)
        for other in range(start, stop):
            # A token occurrence is not its own neighbour.
            if other != centre:
                neighbours[word][toks[other]] += 1

    pruned = {}
    for word, counts in neighbours.items():
        pruned[word] = {tok: freq for tok, freq in counts.items() if freq >= min_freq}
    return pruned

# Choose the corpus for the co-occurrence statistics: prefer the cleaned
# rulebook text, fall back to the concatenated rule chunks, and use an empty
# string when neither name exists in the kernel (NameError), e.g. because the
# extraction cell has not been run.
try:
    full_text = raw_text
except NameError:
    try:
        full_text = " \n".join(rule_chunks.values())
    except NameError:
        full_text = ""

# Built once when this cell runs; expand_focus() reads this global.
CO = build_cooccur_map(full_text, window=10, min_freq=5)
def expand_focus(focus):
    """Append related terms to a focus phrase to build the retrieval query.

    The lower-cased, stripped focus is looked up in SYN (hand-written synonyms)
    and in CO (its six most frequent co-occurring tokens). Terms of two
    characters or fewer are dropped; the rest are de-duplicated, sorted
    alphabetically and capped at eight. Returns `focus` unchanged when no term
    qualifies, otherwise "<focus> <term> <term> ...".
    """
    key = (focus or "").strip().lower()
    top_cooccurring = [tok for tok, _ in Counter(CO.get(key, {})).most_common(6)]
    related = {term for term in [*SYN.get(key, []), *top_cooccurring] if len(term) > 2}
    if not related:
        return focus
    suffix = " ".join(sorted(related)[:8])
    return f"{focus} " + suffix

# Keyword -> rule-ID section prefixes. guess_sections_from_text() adds the
# listed prefixes whenever the keyword occurs in the query text; the prefixes
# are compared against the part of a rule ID before its first dot, plus ".".
# NOTE(review): the prefix-to-rulebook-section assignment is taken as given
# here - verify it against the rulebook edition in use.
SECTION_MAP = {
    "aero": ["T."], "aerodynamic": ["T."], "aerodynamics": ["T."],
    "wing": ["T."], "splitter": ["T."], "diffuser": ["T."], "undertray": ["T."], "endplate": ["T."], "bodywork": ["T."],
    "electrical": ["EV."], "accumulator": ["EV."], "hv": ["EV."], "shutdown": ["EV."], "insulation": ["EV."], "ams": ["EV."], "tsal": ["EV."],
    "chassis": ["F."], "frame": ["F."], "impact attenuator": ["F."], "attenuator": ["F."], "node": ["F."], "tube": ["F."],
    "inspection": ["IN."], "tilt": ["IN."], "brake": ["IN."], "noise": ["IN."], "scrutineering": ["IN."],
    "presentation": ["S."], "cost": ["C."], "design": ["DR."],
    "engine": ["IC."], "combustion": ["IC."], "intake": ["IC."],
    "electric": ["EV."],
}

def guess_sections_from_text(text, section_map=None):
    """Guess which rule-section prefixes a piece of text is about.

    Each keyword of the mapping is searched for as a whole word (an optional
    plural "s"/"es" is accepted), case-insensitively with respect to `text`.
    Whole-word matching avoids the false positives of plain substring search,
    e.g. "wing" inside "following" or "ams" inside "teams".

    Args:
        text: Query text to inspect; None or "" yields an empty set.
        section_map: Optional {keyword: [section prefixes]} mapping. Defaults
            to the module-level SECTION_MAP.

    Returns:
        A set of section prefixes. "GR." is added whenever at least one
        keyword matched; the set is empty when nothing matched.
    """
    mapping = SECTION_MAP if section_map is None else section_map
    lowered = (text or "").lower()
    sections = set()
    for keyword, secs in mapping.items():
        # \b on both sides keeps the keyword from matching inside a longer word.
        pattern = r"\b" + re.escape(keyword) + r"(?:s|es)?\b"
        if re.search(pattern, lowered):
            sections.update(secs)
    if sections:
        sections.add("GR.")
    return sections

def retrieve_hybrid(q, k_dense=140, k_bm25=140, rrf_k=60, w_dense=1.0, w_bm25=1.35):
    """Retrieve rule chunks for a query by fusing dense and BM25 rankings.

    The query is embedded and searched in the FAISS index (top `k_dense`), and
    scored against the BM25 index (top `k_bm25`). The two rank lists are merged
    with weighted reciprocal-rank fusion: each list contributes
    weight / (rrf_k + rank), and a rule missing from one list is given rank
    10_000 there.

    Args:
        q: Query string.
        k_dense: Number of dense neighbours to request (clamped to the index size).
        k_bm25: Number of BM25 hits to keep.
        rrf_k: Reciprocal-rank-fusion smoothing constant.
        w_dense: Weight of the dense ranking.
        w_bm25: Weight of the BM25 ranking.

    Returns:
        A list of (rule_id, rule_text) tuples, best fused score first. Empty
        when no rules are indexed.
    """
    if not rule_ids:
        return []
    missing_rank = 10_000

    # Dense ranking. FAISS pads the result with label -1 when fewer than k
    # vectors are available; clamping k and skipping negatives prevents
    # rule_ids[-1] from being returned as a spurious hit.
    dense_rank = {}
    k_dense = min(int(k_dense), int(index.ntotal))
    if k_dense > 0:
        qv = model.encode([q], convert_to_numpy=True).astype("float32")
        qv = qv / (np.linalg.norm(qv, axis=1, keepdims=True) + 1e-12)
        _D, I = index.search(qv, k_dense)
        for r, i in enumerate(I[0], start=1):
            if i < 0:
                continue
            dense_rank[rule_ids[i]] = r

    # Lexical ranking. Reversing before slicing keeps k_bm25 == 0 meaning
    # "no hits" (a [-0:] slice would return every document).
    scores = bm25.get_scores(tokenize(q))
    top = np.argsort(scores)[::-1][:max(0, int(k_bm25))]
    bm25_rank = {rule_ids[i]: r for r, i in enumerate(top, start=1)}

    def rrf(rank):
        return 1.0 / (rrf_k + rank)

    fused = []
    for rid in set(dense_rank) | set(bm25_rank):
        s = (w_dense * rrf(dense_rank.get(rid, missing_rank))
             + w_bm25 * rrf(bm25_rank.get(rid, missing_rank)))
        fused.append((s, rid))
    # Ties on score fall back to descending rule ID, which keeps the order deterministic.
    fused.sort(reverse=True)
    return [(rid, rule_chunks[rid]) for _s, rid in fused]
def soft_section_filter(cands, allowed_secs, keep_top=80, spillover=20):
    """Prefer candidates from the allowed sections without excluding the rest.

    `cands` is a ranked list of (rule_id, text). With no allowed sections the
    first `keep_top` candidates are returned as-is. Otherwise the result is the
    first `keep_top` candidates whose section prefix (text before the first
    dot, plus ".") is allowed, followed by the first `spillover` others; the
    relative order inside each group is preserved.
    """
    if not allowed_secs:
        return cands[:keep_top]

    inside, outside = [], []
    for rid, txt in cands:
        prefix = rid.partition(".")[0] + "."
        bucket = inside if prefix in allowed_secs else outside
        bucket.append((rid, txt))
    return inside[:keep_top] + outside[:spillover]

def first_line(rid):
    """Return at most the first 120 characters of the first line of rule `rid`.

    Raises KeyError when `rid` is not in rule_chunks and IndexError when the
    stored text is empty.
    """
    head = rule_chunks[rid].splitlines()[0]
    clipped = head[:120]
    return clipped.replace("\n", " ")

def rerank_with_gpt_strict(query, candidate_ids, max_pick=12):
    """Ask the chat model to pick the most relevant rule IDs from a candidate list.

    Only IDs that were actually offered as candidates are accepted: anything the
    model invents, malformed tokens and repeated IDs are discarded. The
    retrieval order (first `max_pick` candidates) is used as a fallback when no
    client is configured, there are no candidates, the API call fails, or the
    reply contains no usable ID.

    Args:
        query: Retrieval query shown to the model.
        candidate_ids: Ranked list of rule-ID strings to choose from.
        max_pick: Maximum number of IDs to return.

    Returns:
        A list of at most `max_pick` rule IDs, in the order the model gave them.
    """
    fallback = candidate_ids[:max_pick]
    if client is None or not candidate_ids:
        return fallback
    prompt = f"""You are an FSAE rules expert.
From the candidate list, return ONLY rule IDs, comma-separated. At most {max_pick}.
No extra words.

Query: {query}
Candidates: {', '.join(candidate_ids)}
Output:"""
    try:
        resp = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role":"user","content":prompt}],
            temperature=0
        )
        raw = (resp.choices[0].message.content or "").strip()
    except Exception as exc:
        # Best-effort step: keep the pipeline running, but make the degradation
        # visible instead of silently changing the results.
        print(f"[rerank] GPT call failed ({exc!r}); using retrieval order instead.")
        return fallback

    allowed = set(candidate_ids)
    picked, seen = [], set()
    for token in re.split(r'[,\s]+', raw):
        rid = token.strip().rstrip(".")
        # Reject duplicates, IDs that were never offered, and malformed IDs.
        if rid in seen or rid not in allowed or not RID_RE.fullmatch(rid):
            continue
        picked.append(rid)
        seen.add(rid)
    return picked[:max_pick] if picked else fallback

def expand_hierarchy_smart(picked_ids, cands, focus=None, per_parent_cap=3, total_cap=40):
    """Add the most focus-relevant sub-rules of each picked rule.

    For every picked ID, in order, the ID itself is emitted, followed by up to
    `per_parent_cap` of its descendants (candidate IDs starting with
    "<id>."). Descendants are ranked by how many focus tokens occur in their
    text, ties broken by descending ID. Already emitted IDs are not repeated.
    Processing stops once `total_cap` IDs have been collected, and the result
    is truncated to `total_cap`.
    """
    pool_ids = {rid for rid, _ in cands}
    focus_kw = set(tokenize(focus or ""))

    def keyword_hits(rule_id):
        # Number of distinct focus tokens that appear in the rule text.
        body = rule_chunks.get(rule_id, "").lower()
        return sum(1 for w in focus_kw if w and w in body)

    ordered, emitted = [], set()

    def emit(rule_id):
        if rule_id not in emitted:
            ordered.append(rule_id)
            emitted.add(rule_id)

    for parent in picked_ids:
        emit(parent)
        prefix = parent + "."
        ranked = sorted(
            ((keyword_hits(child), child) for child in pool_ids if child.startswith(prefix)),
            reverse=True,
        )
        for _, child in ranked[:per_parent_cap]:
            emit(child)
        if len(ordered) >= total_cap:
            break
    return ordered[:total_cap]

# Negative terms per focus token. post_filter_final() drops any rule whose
# lower-cased text contains one of the terms listed for a token of the focus.
NEG = {
    "aerodynamic": ["presentation", "cost", "business", "design event"],
    "aerodynamics": ["presentation", "cost", "business", "design event"],
}

def post_filter_final(final_ids, focus_kw):
    """Drop rules that look off-topic for the focus keywords.

    A rule is kept when (a) its text contains at least one focus keyword or its
    ID has at most two dot-separated parts, and (b) its text contains none of
    the NEG terms registered for any focus keyword. Matching is by substring on
    the lower-cased rule text. Input order is preserved.
    """
    banned = set()
    for kw in focus_kw:
        banned.update(NEG.get(kw, []))

    kept = []
    for rid in final_ids:
        body = rule_chunks.get(rid, "").lower()
        mentions_focus = any(w in body for w in focus_kw)
        is_shallow = len(rid.split(".")) <= 2
        if not (mentions_focus or is_shallow):
            continue
        if any(term in body for term in banned):
            continue
        kept.append(rid)
    return kept

def normalize_ids(ids, cap=50):
    """Clean a list of rule IDs, keeping first occurrences in order.

    Each entry is stripped of surrounding whitespace and trailing dots, kept
    only if it fully matches RID_RE and has not been seen before. Iteration
    stops as soon as `cap` IDs have been collected; the length check runs after
    each entry is processed.
    """
    unique = []
    taken = set()
    for raw in ids:
        candidate = raw.strip().rstrip(".")
        if candidate not in taken and RID_RE.fullmatch(candidate):
            unique.append(candidate)
            taken.add(candidate)
        if len(unique) >= cap:
            break
    return unique

def natural_key(rid):
    """Sort key that orders rule IDs numerically within a section.

    The ID is split on dots: the first part is kept as a string and every later
    part is converted to int when it consists only of digits, so "T.2.1" sorts
    before "T.10.1". Non-numeric parts stay strings.
    """
    head, *rest = rid.split(".")
    return [head] + [int(seg) if seg.isdigit() else seg for seg in rest]


In [None]:
# Run the full pipeline once per question of the uploaded CSV and collect one
# output record per question.
qa_df = pd.read_csv(csv_file)
compiled = []

for i, row in qa_df.iterrows():
    # "id" and "ground_truth" are optional columns; "question" is required.
    qid   = row.get("id", f"q_{i:04d}")
    q_raw = row["question"]
    gt    = row.get("ground_truth","")

    # 1) Build the retrieval query and guess the relevant rule sections from it.
    focus_phrase = extract_focus_phrase(q_raw)
    query_for_retrieval = expand_focus(focus_phrase)
    allowed_secs = guess_sections_from_text(query_for_retrieval)

    # 2) Hybrid retrieval, then favour candidates from the guessed sections.
    cands = retrieve_hybrid(query_for_retrieval, k_dense=140, k_bm25=140)
    cands = soft_section_filter(cands, allowed_secs, keep_top=80, spillover=20)

    # 3) Only the first 60 candidates are offered to the GPT re-ranker.
    candidate_ids = [rid for rid, _ in cands[:60]]
    picked_ids = rerank_with_gpt_strict(query_for_retrieval, candidate_ids, max_pick=12)

    # 4) Add sub-rules, filter off-topic rules, validate IDs, and sort them
    #    in natural (numeric) order.
    final_ids = expand_hierarchy_smart(picked_ids, cands, focus=focus_phrase, per_parent_cap=3, total_cap=40)
    focus_kw = set(tokenize(focus_phrase))
    final_ids = post_filter_final(final_ids, focus_kw)
    final_ids = normalize_ids(final_ids, cap=50)
    final_ids = sorted(set(final_ids), key=natural_key)[:50]

    compiled.append({
        "id": qid,
        "question": q_raw,
        "focus_used": focus_phrase,
        "ground_truth": gt,
        "compiled_rule_ids": ", ".join(final_ids)
    })

    # Short pause between questions when a client is configured -
    # presumably to stay under the API rate limit; confirm the value.
    if client is not None:
        time.sleep(0.35)

# Persist the predictions; the evaluation cell reads out_df from the kernel.
out_df = pd.DataFrame(compiled)
out_df.to_csv("compiled_rule_answers.csv", index=False)
print("Saved:", "compiled_rule_answers.csv")

In [None]:
import os, subprocess, re, pandas as pd

# Fetch the DesignQA repository once; later runs reuse the existing checkout.
if not os.path.exists("design_qa"):
    !git clone -q https://github.com/anniedoris/design_qa.git
repo_dir = os.path.abspath("design_qa")
metrics_path = os.path.join(repo_dir, "eval", "metrics", "metrics.py")

# Load eval/metrics/metrics.py directly from its file path and bind it as
# dq_metrics, without requiring the repository to be an installed package.
import importlib.util
spec = importlib.util.spec_from_file_location("dq_metrics", metrics_path)
dq_metrics = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dq_metrics)

# Fail early with a readable message if the compilation cell has not produced
# out_df or if it lacks the columns used below.
assert 'out_df' in globals(), "out_df is not defined. Build your predictions DataFrame first."
assert {'ground_truth','compiled_rule_ids'}.issubset(out_df.columns), \
    "out_df must have columns: ground_truth, compiled_rule_ids"

# Canonical rule-ID shape: 1-4 capital letters, a dot, then dot-separated numbers.
RID_RE = re.compile(r'[A-Z]{1,4}\.\d+(?:\.\d+)*')
def normalize_id_list(s):
    """Normalise a delimited string of rule IDs.

    `s` is converted to str and split on commas or semicolons. Each piece is
    stripped of whitespace and trailing dots and kept only if it fully matches
    RID_RE and has not appeared before. Returns the kept IDs joined by ", "
    (an empty string when none qualify).
    """
    unique_ids = []
    for piece in re.split(r'[;,]\s*', str(s)):
        rid = piece.strip().rstrip(".")
        if rid not in unique_ids and RID_RE.fullmatch(rid):
            unique_ids.append(rid)
    return ", ".join(unique_ids)

# File handed to the DesignQA metric; it holds one row per question.
EVAL_CSV = "compilation_evaluation.csv"

# Two-column frame built from out_df: stripped ground truth and the normalised
# prediction string.
df_eval = pd.DataFrame({
    "ground_truth": out_df["ground_truth"].astype(str).str.strip(),
    "model_prediction": out_df["compiled_rule_ids"].fillna("").map(normalize_id_list)
})
df_eval.to_csv(EVAL_CSV, index=False, encoding="utf-8")
print("Wrote:", EVAL_CSV)

# The result is unpacked as (overall score, per-question scores).
# NOTE(review): the return shape is assumed from this unpacking - confirm
# against design_qa/eval/metrics/metrics.py.
overall_f1, per_f1 = dq_metrics.eval_compilation_qa(EVAL_CSV)
print(f"Compilation F1 (macro): {overall_f1:.6f} on {len(per_f1)} questions")
os.makedirs("results", exist_ok=True)
with open(os.path.join("results", "compilation.txt"), "w", encoding="utf-8") as f:
    f.write("DesignQA Results\n")
    f.write("Subset: Compilation\n")
    f.write(f"F1: {overall_f1:.6f}\n")
    f.write(f"Num Questions: {len(per_f1)}\n")
    # NOTE(review): `sha` is computed here but never written or used afterwards -
    # either add it to the results file or remove the lookup.
    try:
        sha = subprocess.check_output(["git","-C","design_qa","rev-parse","--short","HEAD"]).decode().strip()
    except Exception:
        sha = "<git-sha>"

# Per-question detail: the original predictions plus the normalised prediction
# and that question's F1. Assigning per_f1 requires one score per row of out_df.
detail = out_df.copy()
detail["model_prediction_norm"] = df_eval["model_prediction"]
detail["f1"] = per_f1
detail.to_csv("compilation_detailed_with_f1.csv", index=False, encoding="utf-8")
print("Wrote: results/compilation.txt and compilation_detailed_with_f1.csv")