In [None]:
!pip install faiss-cpu sentence-transformers openai pymupdf
!pip -q install rank-bm25

In [None]:
#import os
#os.environ["OPENAI_API_KEY"] = ""
#My Google Colab


In [None]:
import os
from openai import OpenAI

# Read the OpenAI key from the environment so it never has to be hard-coded in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    # Fail fast: an unset or empty key stops the notebook here with an explicit message.
    raise ValueError("Missing OPENAI_API_KEY environment variable. Please set it before running.")

# Module-level client; get_gpt4_answer() sends its chat-completion requests through it.
client = OpenAI(api_key=api_key)

In [None]:
from google.colab import files
import os

# Upload the FSAE rules PDF and the Q&A CSV file through the Colab file picker.
uploaded = files.upload()

pdf_path = None
csv_file = None

# Classify uploads by extension. The comparison is case-insensitive so that
# names such as "Rules.PDF" or "qa.CSV" are accepted as well. When several
# files of the same kind are uploaded, the last one seen wins.
for f in uploaded.keys():
    if f.lower().endswith(".pdf"):
        pdf_path = f
    elif f.lower().endswith(".csv"):
        csv_file = f

# Both inputs are mandatory for the rest of the notebook.
if not pdf_path or not csv_file:
    raise FileNotFoundError("Both a PDF and a CSV file are required.")

In [None]:
#import fitz
##import re

#doc = fitz.open(pdf_path)
#raw_text = "\n".join(page.get_text() for page in doc)

#def clean_text(text):
   # text = text.replace('\xa0', ' ')
    #text = re.sub(r'-\n', '', text)
    #text = re.sub(r'\n+', ' ', text)
    #text = re.sub(r'\s{2,}', ' ', text)
    #return text.strip()
#full_text = clean_text(raw_text)
import fitz, re

# Open the uploaded PDF; `doc` is indexed by page number in the cells below.
doc = fitz.open(pdf_path)

# Whole-line patterns that clean_lines() drops when they match from the start of a line:
HEADER_RES = [
    # a line beginning with "Formula SAE" and ending in "Page <n> of <m>" (case-insensitive)
    re.compile(r'^\s*Formula SAE.*Page\s+\d+\s+of\s+\d+\s*$', re.I),
    # "Version <number> <1-2 digits> <word> <4 digits>" on a line of its own
    re.compile(r'^\s*Version\s+\d+(\.\d+)?\s+\d{1,2}\s+\w+\s+\d{4}\s*$', re.I),
    # a line consisting only of digits
    re.compile(r'^\s*\d+\s*$'),
]
# Text followed by three dots (optionally separated by single whitespace),
# whitespace, and a trailing number at the end of the line.
TOC_LINE_RE       = re.compile(r'.+\.\s?\.\s?\.\s+\d+$')
# 1-4 capital letters, a hyphen, whitespace, then at least one more character.
SECTION_BANNER_RE = re.compile(r'^[A-Z]{1,4}\s*-\s+.+$')


# Line start: 1-3 capital letters, optional "-" or ".", then digits with optional ".digits" groups.
RULE_ID_RE = re.compile(r'^\s*[A-Z]{1,3}\s*[-.]?\s*\d+(\.\d+)*\b')
# A number immediately followed (optional whitespace) by mm / cm / in / inch / inches.
UNIT_RE    = re.compile(r'\b(\d+(\.\d+)?)\s*(mm|cm|in|inch(es)?)\b', re.I)

def extract_text_from_page(p):
    """Return whatever ``p.get_text("text")`` yields for the given page object."""
    page_text = p.get_text("text")
    return page_text

def clean_lines(lines):
    """Normalise each line and drop header, TOC and section-banner lines.

    Every input line is right-stripped and has non-breaking spaces replaced
    by ordinary spaces. A normalised line is discarded when it matches any
    pattern in ``HEADER_RES``, contains a ``TOC_LINE_RE`` hit, or matches
    ``SECTION_BANNER_RE``. Surviving lines are returned in order.
    """
    def _is_noise(text):
        # Same three checks, in the same order, as a chain of early exits.
        if any(pattern.match(text) for pattern in HEADER_RES):
            return True
        if TOC_LINE_RE.search(text):
            return True
        return bool(SECTION_BANNER_RE.match(text))

    normalised = (raw.rstrip().replace('\xa0', ' ') for raw in lines)
    return [text for text in normalised if not _is_noise(text)]

def page_stats(txt: str):
    """Count pattern hits over the non-blank lines of ``txt``.

    Returns a 4-tuple ``(line_count, toc_hits, rule_hits, unit_hits)`` where
    the hit counts are the number of non-blank lines matched by
    ``TOC_LINE_RE`` (search), ``RULE_ID_RE`` (match) and ``UNIT_RE`` (search).
    """
    line_count = toc_count = rule_count = unit_count = 0
    for line in txt.splitlines():
        if not line.strip():
            continue  # blank lines are not counted at all
        line_count += 1
        if TOC_LINE_RE.search(line):
            toc_count += 1
        if RULE_ID_RE.match(line):
            rule_count += 1
        if UNIT_RE.search(line):
            unit_count += 1
    return line_count, toc_count, rule_count, unit_count

def autodetect_start(doc, scan_first=20):
    """Guess the index of the first page that contains rule text.

    Scans at most the first ``scan_first`` pages of ``doc``.

    * Strong signal: a page with at least 3 rule-id lines whose TOC-line count
      does not exceed ``max(1, int(0.15 * line_count))`` is returned at once.
    * Weak signal: a page with at least 1 rule-id line and at least 3
      unit-bearing lines is remembered as a fallback.

    Returns the strong page if one is found, otherwise the remembered weak
    page, otherwise 0.
    """
    # NOTE(review): the fallback is overwritten on every weak hit, so the LAST
    # weak page within the scan window is returned, not the first — confirm
    # this is intended for a "start page" heuristic.
    last_weak_match = 0
    for page_no in range(min(scan_first, len(doc))):
        page_text = extract_text_from_page(doc[page_no])
        line_count, toc_count, rule_count, unit_count = page_stats(page_text)
        toc_budget = max(1, int(0.15 * line_count))
        if rule_count >= 3 and toc_count <= toc_budget:
            return page_no
        if rule_count >= 1 and unit_count >= 3:
            last_weak_match = page_no
    return last_weak_match

# Index of the first page to keep, as guessed by autodetect_start().
auto_start = autodetect_start(doc)

# Collect cleaned text page by page, from the detected start page to the end.
pages = []
for i in range(auto_start, len(doc)):
    t = extract_text_from_page(doc[i])

    # Decide whether this is a table-of-contents page on the RAW lines.
    # clean_lines() removes every TOC line, so counting TOC hits after
    # cleaning always gives zero and the page would never be skipped.
    # The same normalisation as clean_lines() is applied before matching.
    raw_lines = [l.rstrip().replace('\xa0', ' ') for l in t.splitlines() if l.strip()]
    if raw_lines:
        toc_like = sum(1 for l in raw_lines if TOC_LINE_RE.search(l))
        if toc_like > 0.3 * len(raw_lines):
            continue  # more than 30% TOC lines: drop the whole page

    lines = clean_lines(t.splitlines())
    pages.append("\n".join(lines))

raw_text = "\n".join(pages)
print(f"Start page (auto): {auto_start}  |  Collected text chars: {len(raw_text)}")


In [None]:
import re

def clean_text(text):
    """Tidy extracted text while keeping its line structure.

    Steps, in order: non-breaking spaces become plain spaces; a hyphen
    directly followed by a newline is removed (re-joining words split across
    lines); runs of three or more newlines collapse to two; runs of spaces
    and tabs collapse to one space; finally the result is stripped.
    """
    substitutions = (
        (r'-\n', ''),
        (r'\n{3,}', '\n\n'),
        (r'[ \t]+', ' '),
    )
    cleaned = text.replace('\xa0', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Normalise the page text collected above; parse_rules_from_text() consumes `full_text`.
full_text = clean_text(raw_text)
print("Cleaned text chars:", len(full_text))

In [None]:
import re, json
# A rule heading: 1-4 capital letters, a dot, dotted digit groups, then an
# optional title separated by spaces/tabs, filling the whole (stripped) line.
RULE_HEAD_RE = re.compile(r'(?m)^(?P<rid>[A-Z]{1,4}\.\d+(?:\.\d+)*)(?:[ \t]+(?P<title>.+))?$')

def parse_rules_from_text(text):
    """Split ``text`` into ``(rule_id, rule_text)`` pairs, in document order.

    A line whose stripped form matches ``RULE_HEAD_RE`` opens a new rule; the
    stripped heading is the first line of that rule's text. Every following
    non-heading line is appended unchanged until the next heading. Lines
    before the first heading are ignored. Each rule's text is the
    newline-joined lines with outer whitespace stripped, so it always begins
    with the rule id. Repeated ids yield repeated pairs.
    """
    sections = []  # (rule_id, [lines]) per heading encountered
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        heading = RULE_HEAD_RE.match(stripped)
        if heading is not None:
            sections.append((heading.group("rid").strip(), [stripped]))
        elif sections:
            # Continuation of the rule opened most recently.
            sections[-1][1].append(raw_line)
    return [(rid, "\n".join(body).strip()) for rid, body in sections]

# Parse the cleaned text into (rule_id, text) pairs and index them by id.
# A rule id that occurs more than once keeps only its last text (dict semantics).
rule_pairs = parse_rules_from_text(full_text)
rule_chunks = dict(rule_pairs)

# Persist the mapping for inspection.
with open("rule_chunks.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(rule_chunks, indent=2, ensure_ascii=False))

print("Total rules parsed:", len(rule_chunks))
print("Sample keys:", sorted(rule_chunks)[:20])

In [None]:
import pandas as pd
import re

# One preview row per parsed rule: id, character count and full text.
rows = [
    {"rule_id": rule_id, "chars": len(body), "text": body}
    for rule_id, body in rule_chunks.items()
]
df_rules = pd.DataFrame(rows)
df_rules = df_rules.sort_values(["rule_id"]).reset_index(drop=True)
df_rules.to_csv("rule_chunks_preview.csv", index=False)
print("Wrote: rule_chunks_preview.csv")

# Report how many chunks fall under each rule-id prefix ("GR.", "AD.", ...).
families = ["GR","AD","DR","V","F","T","EV","ES","IC","ICV"]
for fam in families:
    in_family = df_rules["rule_id"].str.startswith(fam + ".")
    cnt = in_family.sum()
    print(f"{fam}: {cnt} chunks")

In [None]:
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
import re
import faiss

# Parallel lists: rule_texts[i] is the text of rule_ids[i]. Row i of the
# embedding matrix / FAISS index built below corresponds to the same position.
rule_ids = list(rule_chunks.keys())
rule_texts = [rule_chunks[k] for k in rule_ids]

def tok(s):
    """Lower-case ``s`` and return its runs of letters, digits and dots as tokens."""
    lowered = s.lower()
    return re.findall(r"[A-Za-z0-9\.]+", lowered)

# Lexical BM25 index over the tokenised rule texts.
# NOTE(review): `bm25` is not referenced by any other cell visible here.
bm25 = BM25Okapi([tok(t) for t in rule_texts])

# Dense embeddings, one row per rule, L2-normalised and cast to float32.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(rule_texts, normalize_embeddings=True, convert_to_numpy=True).astype("float32")

# Inner-product index; with unit-length vectors the score equals cosine similarity.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# NOTE(review): the cross-encoder is loaded but not used by any other cell visible here.
ce_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

print(f"FAISS index built with {len(rule_texts)} rules.")

In [None]:
import pandas as pd

# Load the Q&A file; a "ground_truth" column, if present, is exposed as "answer"
# (renaming a column that does not exist is a no-op).
df = pd.read_csv(csv_file).rename(columns={"ground_truth": "answer"})

# Synthesise ids ret_001, ret_002, ... when the file carries none.
if "id" not in df.columns:
    df["id"] = [f"ret_{n:03}" for n in range(1, len(df) + 1)]

# Keep only the three columns used downstream, as a list of row dicts.
df = df.loc[:, ["id", "question", "answer"]]
qa_data = df.to_dict("records")

print(f"Loaded {len(qa_data)} QA pairs.")

In [None]:
def extract_rule_id(question):
    """Return the first rule id (e.g. ``V.1.2.3``) found in ``question``.

    A rule id is one or more capital letters, a dot, and one or more
    dot-separated digit groups. Returns ``None`` when nothing matches.
    """
    found = re.search(r'([A-Z]+\.\d+(?:\.\d+)*)', question)
    if found is None:
        return None
    return found.group(1)

def retrieve_chunks(question, model, index, rule_ids, rule_texts, top_k=5):
    """Return up to ``top_k`` ``(rule_id, rule_text)`` pairs nearest to ``question``.

    Args:
        question: Query string to embed.
        model: Object whose ``encode(list_of_str, normalize_embeddings=True)``
            returns an array with one row per input.
        index: Object whose ``search(query, k)`` returns ``(scores, ids)``,
            each with one row per query.
        rule_ids, rule_texts: Parallel sequences addressed by the positions
            returned from ``index.search``.
        top_k: Number of neighbours to request.

    Returns:
        A list of pairs in the order reported by the index. The list may be
        shorter than ``top_k``, and is empty when ``top_k <= 0`` or there are
        no rules.
    """
    if top_k <= 0 or not rule_ids:
        return []

    q_embedding = model.encode([question], normalize_embeddings=True).astype("float32")
    _, neighbours = index.search(q_embedding, top_k)

    # FAISS pads the id row with -1 when fewer than k neighbours exist. Used
    # as a Python index, -1 would silently return the LAST rule, so positions
    # outside the valid range are dropped.
    positions = (int(pos) for pos in neighbours[0])
    return [
        (rule_ids[pos], rule_texts[pos])
        for pos in positions
        if 0 <= pos < len(rule_ids)
    ]

def build_prompt(question, retrieved_texts):
    """Assemble the extraction prompt sent to the chat model.

    The prompt consists of a role line, the question, the retrieved rule
    texts joined by newlines, and a fixed task list telling the model to
    answer with the verbatim rule text or the fixed "not found" sentence.
    """
    rules_block = "\n".join(retrieved_texts)

    header = "You are an expert in the FSAE competition rules.\n\n"
    question_part = f"Question:\n{question}\n\n"
    rules_part = f"Retrieved Rules:\n{rules_block}\n\n"
    task_part = (
        "Task:\n"
        "- Respond ONLY with the exact rule text as shown in the retrieved rules.\n"
        "- Do not paraphrase, summarize, reformat, or add any extra words.\n"
        "- Keep punctuation, capitalization, and spacing exactly as in the retrieved rule text.\n"
        "- If multiple rules are shown, return only the one that exactly matches the requested rule number.\n"
        "- If no matching rule exists, respond with exactly: No applicable rule found.\n"
    )
    return header + question_part + rules_part + task_part

def get_gpt4_answer(prompt):
    """Send ``prompt`` as a single user message to ``gpt-4o`` and return the reply.

    Uses the module-level ``client``. The reply text is returned with outer
    whitespace stripped; an empty string is returned when the API reports no
    text content for the first choice.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        # The task is verbatim extraction, so sampling randomness is turned
        # down to make the evaluation as repeatable as possible.
        temperature=0,
    )
    # `content` can be None (e.g. a refusal); guard before calling .strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
# Answer every question and collect {id, question, ground_truth, prediction} rows.
results = []
sampled_qa_data = qa_data

for idx, qa in enumerate(sampled_qa_data):
    # Resolve the id BEFORE the try block so the error handler below can
    # always name the failing row (previously it could be unbound, or still
    # hold the previous row's id).
    qa_id = qa.get("id", f"row_{idx}")
    try:
        question = qa["question"]
        ground_truth = qa["answer"]

        rule_id = extract_rule_id(question)
        if rule_id and rule_id in rule_chunks:
            # Direct lookup. Chunk text already begins with its rule id (see
            # parse_rules_from_text), so it is passed through unchanged;
            # prepending the id again would yield "GR.1 GR.1 ...".
            retrieved = [(rule_id, rule_chunks[rule_id])]
        else:
            # No usable id in the question: fall back to the nearest chunk.
            retrieved = retrieve_chunks(question, model, index, rule_ids, rule_texts, top_k=1)

        prompt = build_prompt(question, [text for _, text in retrieved])
        answer = get_gpt4_answer(prompt)

        results.append({
            "id": qa_id,
            "question": question,
            "ground_truth": ground_truth,
            "prediction": answer
        })

        print(f"{qa_id} ")

    except Exception as e:
        # Best effort: report the failure and continue. A failed question
        # contributes no row to `results`.
        print(f"Error with {qa_id}: {e}")

In [None]:
import os, pandas as pd, importlib.util

!rm -rf design_qa
!git clone -q https://github.com/anniedoris/design_qa.git
!pip -q install rouge nltk

# Load eval/metrics/metrics.py from the cloned design_qa checkout by file path
# and expose it as the module object `dq_metrics`.
repo_dir = os.path.abspath("design_qa")
metrics_path = os.path.join(repo_dir, "eval", "metrics", "metrics.py")
spec = importlib.util.spec_from_file_location("dq_metrics", metrics_path)
dq_metrics = importlib.util.module_from_spec(spec)
spec.loader.exec_module(dq_metrics)
# Shortcut used when normalising the result columns below.
normalize_answer = dq_metrics.normalize_answer


# Tabulate the collected answers and normalise both text columns with the
# benchmark's own normalize_answer().
df_results = pd.DataFrame(results)
for text_col in ("ground_truth", "prediction"):
    df_results[text_col] = df_results[text_col].apply(lambda value: normalize_answer(str(value)))


# Write the two-column file consumed by the scoring call.
eval_csv = "retrieval_evaluation.csv"
eval_frame = df_results[["ground_truth", "prediction"]].rename(
    columns={"prediction": "model_prediction"}
)
eval_frame.to_csv(eval_csv, index=False)
print(f"{eval_csv} (columns: ground_truth, model_prediction)")

# Scoring: run the benchmark's retrieval metric on the CSV written above.
# The call is unpacked into an overall score and a per-question collection
# (presumably one F1 value per row — confirm against metrics.py).
overall_f1, per_f1 = dq_metrics.eval_retrieval_qa(eval_csv)
print(f"Official Retrieval F1: {overall_f1:.6f} on {len(per_f1)} questions")

# Write a short plain-text summary of the run to retrieval.txt.
summary_path = "retrieval.txt"
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("DesignQA Results\n")
    f.write("Subset: Retrieval\n")
    f.write(f"Num Questions: {len(per_f1)}\n")
    f.write(f"F1: {overall_f1:.6f}\n")
# Report the file that was actually written (the old message said "results.txt").
print(summary_path)

# Per-question F1
def bow_f1(pred, gt):
    """Score one prediction against its ground truth.

    Both values are stringified, passed through
    ``dq_metrics.normalize_answer`` and split on whitespace; the two token
    lists (prediction first) are handed to ``dq_metrics.token_f1_score``,
    whose result is returned.
    """
    pred_tokens, truth_tokens = (
        dq_metrics.normalize_answer(str(value)).split() for value in (pred, gt)
    )
    return dq_metrics.token_f1_score(pred_tokens, truth_tokens)

# Per-question report: the (already normalised) texts plus an F1 per row.
df_detailed = (
    df_results[["id", "question", "ground_truth", "prediction"]]
    .rename(columns={"prediction": "model_prediction"})
    .copy()
)
df_detailed["f1"] = df_detailed.apply(
    lambda row: bow_f1(row["model_prediction"], row["ground_truth"]),
    axis=1,
)
df_detailed.to_csv("retrieval_detailed_with_f1.csv", index=False)
print("retrieval_detailed_with_f1.csv (columns:id, question, ground_truth, model_prediction, f1)")