In [1]:
!pip install -q beautifulsoup4 requests lxml pdfminer.six PyMuPDF tqdm pandas pytesseract pillow unidecode


In [29]:
import os, re, json, time, hashlib
from urllib.parse import urlparse, urljoin
from collections import deque, defaultdict

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import pandas as pd
from tqdm import tqdm

# ---------- Parameters (tweak if needed) ----------
BASE_URL     = "https://math.mit.edu/research/highschool/primes/materials/"
WHITELIST    = ["/research/highschool/primes/materials/"]
BLACKLIST    = ["#", "mailto:", "facebook.com", "twitter.com"]
USER_AGENT   = "IS688-M1-MathAware/1.0"
POLITE_DELAY = 0.6
MAX_PAGES    = 1000     # raised to improve coverage
MAX_PDFS     = 400     # raised to include more PDFs (esp. solutions)

OUT_DIR = "notebook_outputs"
os.makedirs(OUT_DIR, exist_ok=True)

def slugify(url: str) -> str:
    h = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
    path = urlparse(url).path.strip("/").replace("/", "_") or "index"
    return f"{path}_{h}"

def allowed(u: str) -> bool:
    L = u.lower()
    if any(b in L for b in BLACKLIST): return False
    return any(w in L for w in WHITELIST)

def extract_main_text(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script","style","nav","header","footer"]): tag.decompose()
    return re.sub(r"\s+"," ", soup.get_text(" ", strip=True))


In [31]:
MATH_CHARS = set("∑∫√∞≈≠≤≥±→←↔×÷•·°πθλμσφψωαβγδΔΓΩℝℤℚℕ")

def _normalize_math_text(s: str) -> str:
    if not isinstance(s, str): return ""
    s = s.replace("\u00A0"," ").replace("\u00AD","")
    s = re.sub(r"[–—−]", "-", s)
    s = re.sub(r"\s+\n", "\n", s)
    s = re.sub(r"[ \t]+", " ", s)
    return s

def _page_to_image(page, zoom=3.0):
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

def _ocr_page(page) -> str:
    img = _page_to_image(page, zoom=3.0)
    txt = pytesseract.image_to_string(img, config="--oem 1 --psm 6")
    return _normalize_math_text(txt)

def _mupdf_page_text(page) -> str:
    txt = page.get_text("text") or ""
    if not txt.strip():
        blocks = page.get_text("blocks") or []
        txt = "\n".join(b[4] for b in blocks if len(b) >= 5 and isinstance(b[4], str))
    return _normalize_math_text(txt)

def extract_pdf_text_math_aware(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    out = []
    for page in doc:
        t = _mupdf_page_text(page)
        bad = (len(t.strip()) < 40 or t.count("�") > 2)
        need_ocr = bad or (len(MATH_CHARS.intersection(set(t))) == 0 and page.get_images())
        if need_ocr:
            try:
                ocr_t = _ocr_page(page)
                t = max([t, ocr_t], key=lambda s: (len(s), -s.count("�")))
            except Exception:
                pass
        out.append(t)
    return "\n".join(out).strip()


In [33]:
session = requests.Session()
session.headers.update({"User-Agent": USER_AGENT})
retries = Retry(total=3, backoff_factor=0.5, status_forcelist=[429,500,502,503,504])
session.mount("http://", HTTPAdapter(max_retries=retries))
session.mount("https://", HTTPAdapter(max_retries=retries))

seen, q = set(), deque([BASE_URL])
inventory = []

print("[crawl] starting…")
t0 = time.time()
while q and len(inventory) < MAX_PAGES:
    url = q.popleft()
    if url in seen: continue
    seen.add(url)
    try:
        r = session.get(url, timeout=10)
        ctype = r.headers.get("Content-Type","").lower()
    except Exception:
        continue

    inventory.append({"url": url, "ctype": ctype})
    if "text/html" in ctype or url.endswith("/"):
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select("a[href]"):
            link = urljoin(url, a["href"].strip())
            if allowed(link) and link not in seen:
                q.append(link)
    time.sleep(POLITE_DELAY)

print(f"[crawl] discovered {len(inventory)} resources in {int(time.time()-t0)}s")
len(inventory)


[crawl] starting…
[crawl] discovered 1000 resources in 689s


1000

In [34]:
records = []
pdf_count = 0

for rec in tqdm(inventory, desc="Scraping"):
    url, ctype = rec["url"], rec.get("ctype","")
    try:
        if ("application/pdf" in ctype or url.lower().endswith(".pdf")) and pdf_count < MAX_PDFS:
            r = session.get(url, timeout=20); r.raise_for_status()
            pdf_path = os.path.join(OUT_DIR, slugify(url) + ".pdf")
            with open(pdf_path, "wb") as f: f.write(r.content)
            text = extract_pdf_text_math_aware(pdf_path)
            records.append({"url": url, "type": "pdf", "text": text})
            pdf_count += 1
        elif "text/html" in ctype or url.endswith("/"):
            r = session.get(url, timeout=15); r.raise_for_status()
            text = extract_main_text(r.text)
            records.append({"url": url, "type": "html", "text": text})
    except Exception:
        continue

df = pd.DataFrame.from_records(records)
print("Scraped records:", len(df), "| PDFs:", int((df['type']=='pdf').sum()), "| HTML:", int((df['type']=='html').sum()))
df.head(8)


Scraping: 100%|████████████████████████████████████████████████████████████████████| 1000/1000 [22:39<00:00,  1.36s/it]


Scraped records: 456 | PDFs: 351 | HTML: 105


Unnamed: 0,url,type,text
0,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
1,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
2,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
3,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
4,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
5,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
6,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...
7,https://math.mit.edu/research/highschool/prime...,html,Index of /research/highschool/primes/materials...


In [35]:
df.to_csv(os.path.join(OUT_DIR, "records.csv"), index=False)
with open(os.path.join(OUT_DIR, "records.jsonl"), "w", encoding="utf-8") as f:
    for r in df.to_dict("records"):
        f.write(json.dumps(r, ensure_ascii=False) + "\n")
print("Saved records to", OUT_DIR)

def count_math_symbols(s): return sum(1 for ch in str(s) if ch in MATH_CHARS)
df["math_symbol_count"] = df["text"].map(count_math_symbols)
print("Avg math symbols per record:", float(df["math_symbol_count"].mean() or 0))

all_symbols = set()
for t in df["text"].dropna():
    for ch in t:
        if ch in MATH_CHARS: all_symbols.add(ch)
print("Unique math symbols captured:", len(all_symbols))
print("Symbols:", " ".join(sorted(all_symbols)))


Saved records to notebook_outputs
Avg math symbols per record: 92.31359649122807
Unique math symbols captured: 33
Symbols: ° ± · × Γ Δ α β γ δ θ λ μ π σ φ ψ ω • ℕ ℝ ℤ ← → ↔ ∑ √ ∞ ∫ ≈ ≠ ≤ ≥


In [36]:
# --- robust demo selection: find the rows with the MOST matches ---
best_p_idx, best_p_count = None, -1
best_s_idx, best_s_count = None, -1

for i, row in df.iterrows():
    txt = row.get("text") or ""
    pseg = segment_items(txt, PROB_PATTERNS)
    sseg = segment_items(txt, SOL_PATTERNS)
    if len(pseg) > best_p_count:
        best_p_count, best_p_idx = len(pseg), i
    if len(sseg) > best_s_count:
        best_s_count, best_s_idx = len(sseg), i

demo_prob = df.loc[best_p_idx, "text"] if best_p_idx is not None else ""
demo_sol  = df.loc[best_s_idx, "text"] if best_s_idx is not None else ""

demo_p = segment_items(demo_prob, PROB_PATTERNS) if demo_prob else {}
demo_s = segment_items(demo_sol,  SOL_PATTERNS)  if demo_sol  else {}

print(f"Best problem row: {best_p_idx}  (segments: {best_p_count})")
print(f"Best solution row: {best_s_idx} (segments: {best_s_count})")
print("Found problem IDs sample:", list(demo_p.keys())[:10])
print("Found solution IDs sample:", list(demo_s.keys())[:10])



Best problem row: 53  (segments: 53)
Best solution row: 204 (segments: 2)
Found problem IDs sample: ['K1', 'K2', 'B2', 'X2', 'B1', 'Q11', 'Q12', 'Q21', 'Q22', 'J2']
Found solution IDs sample: ['7', 'G1']


In [41]:
import re, os
from urllib.parse import urljoin

YEAR_START, YEAR_END = 2011, 2024  # adjust if your instructor wants a subset
PROB_RE = re.compile(r"(prob|problem)[^/]*\.pdf$", re.I)
SOL_RE  = re.compile(r"(sol|soln|solutions?|answers?|key)[^/]*\.pdf$", re.I)

def collect_year_pdfs():
    found = {"prob": set(), "sol": set()}
    for y in range(YEAR_START, YEAR_END+1):
        idx = f"https://math.mit.edu/research/highschool/primes/materials/{y}/"
        try:
            r = session.get(idx, timeout=10); r.raise_for_status()
        except:
            continue
        soup = BeautifulSoup(r.text, "lxml")
        for a in soup.select("a[href]"):
            href = (a.get("href") or "").strip()
            absu = urljoin(idx, href)
            if PROB_RE.search(href) or PROB_RE.search(absu): found["prob"].add(absu)
            if SOL_RE.search(href)  or SOL_RE.search(absu):  found["sol"].add(absu)
    return found

found = collect_year_pdfs()
print(f"Found problems: {len(found['prob'])}, solutions: {len(found['sol'])}")

# Download any new PDFs and append to df
already = set(df["url"].astype(str))
to_get = [u for u in (found["prob"] | found["sol"]) if u not in already]

added = 0
for url in to_get:
    try:
        r = session.get(url, timeout=25); r.raise_for_status()
        if "pdf" not in r.headers.get("Content-Type","").lower() and not url.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(OUT_DIR, slugify(url)+".pdf")
        with open(pdf_path, "wb") as f: f.write(r.content)
        text = extract_pdf_text_math_aware(pdf_path)
        df = pd.concat([df, pd.DataFrame([{"url": url, "type":"pdf", "text": text}])], ignore_index=True)
        added += 1
    except:  # keep going if any file fails
        continue

print(f"Added {added} PDFs. df now has {len(df)} rows.")


Found problems: 12, solutions: 15
Added 13 PDFs. df now has 469 rows.


In [43]:
from collections import defaultdict
from urllib.parse import urlparse

def infer_year_from_url(url: str):
    m = re.search(r"(20\d{2})", url or "")
    return m.group(1) if m else None

def stem_for_pairing(url: str):
    path = urlparse(url or "").path
    name = os.path.splitext(os.path.basename(path))[0].lower()
    name = re.sub(r"(prob|problems?|sol|soln|solutions?|answers?|key)", "", name)
    name = re.sub(r"[^a-z0-9]+", "_", name).strip("_")
    if not name:
        parts = [p for p in path.strip("/").split("/") if p]
        if parts:
            name = re.sub(r"[^a-z0-9]+", "_", parts[-1].lower())
    return name or "index"

groups = defaultdict(lambda: {"prob": None, "sol": None})

for _, r in df.iterrows():
    url  = r.get("url"); txt = r.get("text") or ""
    year = infer_year_from_url(url); stem = stem_for_pairing(url)
    pseg = segment_items(txt, PROB_PATTERNS)
    sseg = segment_items(txt, SOL_PATTERNS)
    if pseg and (groups[(year,stem)]["prob"] is None or len(pseg) > len(groups[(year,stem)]["prob"]["seg"])):
        groups[(year,stem)]["prob"] = {"url": url, "seg": pseg}
    if sseg and (groups[(year,stem)]["sol"]  is None or len(sseg) > len(groups[(year,stem)]["sol"]["seg"])):
        groups[(year,stem)]["sol"]  = {"url": url, "seg": sseg}

rows = []
for (year, stem), g in groups.items():
    p = g["prob"]["seg"] if g["prob"] else {}
    s = g["sol"]["seg"]  if g["sol"]  else {}
    all_ids = sorted(set(p.keys()) | set(s.keys()), key=lambda x: (re.sub(r"[A-Z]","",x) or "9999", x))
    for pid in all_ids:
        rows.append({
            "year": year, "pair_stem": stem, "problem_id": pid,
            "problem_text": p.get(pid), "solution_text": s.get(pid),
            "problems_source_url": g["prob"]["url"] if g["prob"] else None,
            "solutions_source_url": g["sol"]["url"] if g["sol"] else None,
        })

pairs_df = pd.DataFrame(rows)
print("Pairs:", len(pairs_df), "| with solutions:", int((pairs_df['solution_text'].fillna('').str.len()>0).sum()))
pairs_df.head(10)


Pairs: 3308 | with solutions: 5


Unnamed: 0,year,pair_stem,problem_id,problem_text,solution_text,problems_source_url,solutions_source_url
0,,algebrafactsheet,A1,a1\n+,,https://math.mit.edu/research/highschool/prime...,
1,,algebrafactsheet,G1,g1\n=\ng\nfor\nall\ng\n∈G.\nNote\nthat\na\nuni...,,https://math.mit.edu/research/highschool/prime...,
2,,algebrafactsheet,M1,m1\n+,,https://math.mit.edu/research/highschool/prime...,
3,,algebrafactsheet,N1,"n1-m⊗n2,\nma⊗n-m⊗an,\nwhere\na\n∈A.\nBy\ndoing...",,https://math.mit.edu/research/highschool/prime...,
4,,algebrafactsheet,A2,"a2)m\n=\na1m\n+\na2m),\nand\nsuch\nthat\n1m\n=...",,https://math.mit.edu/research/highschool/prime...,
5,,algebrafactsheet,M2,"m2)⊗n-m1⊗n-m2⊗n,\nm⊗(n1+",,https://math.mit.edu/research/highschool/prime...,
6,,algebrafactsheet,N2,n2)-m⊗,,https://math.mit.edu/research/highschool/prime...,
7,,algebrafactsheet,X2,x2\n+\n1)\n=\nC.\nLie\nalgebra:\nA\nvector\nsp...,,https://math.mit.edu/research/highschool/prime...,
8,,algebrafactsheet,Z2,Z2\n⊗Z,,https://math.mit.edu/research/highschool/prime...,
9,,algebrafactsheet,Z3,"Z3\n=\n0.\nTheorem:\nIf\nV,\nW\nare\nvector\ns...",,https://math.mit.edu/research/highschool/prime...,


In [45]:
# Full scrape dataset (all records)
records_csv   = os.path.join(OUT_DIR, "records.csv")
records_jsonl = os.path.join(OUT_DIR, "records.jsonl")
df.to_csv(records_csv, index=False)
with open(records_jsonl, "w", encoding="utf-8") as f:
    for r in df.to_dict("records"):
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

# Per-problem pairs
pairs_csv   = os.path.join(OUT_DIR, "problem_solution_pairs.csv")
pairs_jsonl = os.path.join(OUT_DIR, "problem_solution_pairs.jsonl")
pairs_df.to_csv(pairs_csv, index=False)
with open(pairs_jsonl, "w", encoding="utf-8") as f:
    for r in pairs_df.to_dict("records"):
        f.write(json.dumps(r, ensure_ascii=False) + "\n")

print("Saved:\n -", records_csv, "\n -", records_jsonl, "\n -", pairs_csv, "\n -", pairs_jsonl)


Saved:
 - notebook_outputs\records.csv 
 - notebook_outputs\records.jsonl 
 - notebook_outputs\problem_solution_pairs.csv 
 - notebook_outputs\problem_solution_pairs.jsonl


In [47]:
def count_math_symbols(s): return sum(1 for ch in str(s) if ch in MATH_CHARS)

total_records = len(df)
total_pairs   = len(pairs_df)
with_solutions = int((pairs_df['solution_text'].fillna('').str.len()>0).sum())

unique_syms = set()
for t in df["text"].dropna():
    for ch in t:
        if ch in MATH_CHARS: unique_syms.add(ch)

print("TOTAL records:", total_records)
print("TOTAL per-problem rows:", total_pairs)
print("Rows with solutions:", with_solutions)
print("Unique math symbols captured:", len(unique_syms))
print("Symbols:", " ".join(sorted(unique_syms)))
print("\nSample paired row:")
display(pairs_df[pairs_df['solution_text'].fillna('').str.len()>0].head(1))


TOTAL records: 469
TOTAL per-problem rows: 3308
Rows with solutions: 5
Unique math symbols captured: 33
Symbols: ° ± · × Γ Δ α β γ δ θ λ μ π σ φ ψ ω • ℕ ℝ ℤ ← → ↔ ∑ √ ∞ ∫ ≈ ≠ ≤ ≥

Sample paired row:


Unnamed: 0,year,pair_stem,problem_id,problem_text,solution_text,problems_source_url,solutions_source_url
1699,2019,zhangv,G1,"G1\nand\nG2,\nindicating\nthat\nour\ntheoretic...","solution\nto\nG1\nand\nG2\nas\nwell,\nbut\nwe\...",https://math.mit.edu/research/highschool/prime...,https://math.mit.edu/research/highschool/prime...


In [71]:
from datetime import date
today = "9/21/25"  # required date
report_path = os.path.join(OUT_DIR, "Milestone1_Report_2025-09-21.md")

# simple stats used in the narrative
sym_count = len(unique_syms)
avg_syms  = float(df["math_symbol_count"].mean() if "math_symbol_count" in df else 0.0)

md = f"""# IS 688 — Milestone 1 Report  
**Date:** {today}

## 1. Objective
Build a focused crawler to collect MIT PRIMES problems and solutions, extract text (including **mathematical notation**), and store a usable dataset.

## 2. Focused Crawling Strategy
- **Seed:** {BASE_URL}  
- **Scope control:** whitelist to `/research/highschool/primes/materials/`, ignore mailto/#/social links.  
- **Politeness:** requests session with retries and delay ({POLITE_DELAY}s).  
- **Targeted sweep:** additionally enumerated yearly folders (`/materials/YYYY/`) and pulled only `prob*.pdf` and `sol*/solution/answers/key` PDFs to ensure coverage of solutions.

## 3. Content Extraction
- **HTML:** BeautifulSoup → text with boilerplate removed.
- **PDFs (math-aware):** PyMuPDF text first; if the page looked sparse or image-heavy, **OCR fallback (Tesseract)** at 3x DPI.  
- **Normalization:** unify dashes, drop soft hyphens/NBSP, collapse whitespace.

### Math Handling Evidence
- Captured **{sym_count} unique math symbols** across the corpus (e.g., {(' '.join(sorted(list(unique_syms))[:12]) + ' …') if sym_count>12 else ' '.join(sorted(unique_syms))}).  
- Average math symbols per record: **{avg_syms:.1f}**.  
- Sample extractions and equations are shown in the notebook (Cells 7 & F4).

## 4. Data Storage
- **All records:** `records.csv`, `records.jsonl` (fields: url, type, text).  
- **Per-problem pairs:** `problem_solution_pairs.csv`, `problem_solution_pairs.jsonl` (fields: year, pair_stem, problem_id, problem_text, solution_text, source URLs).

## 5. Results
- Total records scraped: **{total_records}**  
- Total per-problem rows: **{total_pairs}**  
- Rows with solutions present: **{with_solutions}**  
- Example IDs detected in demo: see Cell 7.

## 6. Prompts/Assistance Used
Used an LLM to design the math-aware pipeline and robust regex segmenters; documented regex patterns and fallback logic in the notebook.

## 7. Challenges & Mitigations
- **Embedded math/figures:** many PDFs store equations as images → resolved with OCR fallback.  
- **Inline headings:** “General math problems **Problem G1** …” → inserted soft line breaks and tolerant regex (`Problem`, `Prob.`, `G1.` patterns).  
- **Solutions coverage:** some years publish solutions under variants (`sol`, `soln`, `answers`, `key`) → targeted sweep over year directories.

## 8. Repro Steps
1. Run Cells 1–6 to crawl/scrape.  
2. Run **F1** (targeted sweep) to ensure solutions are included.  
3. Run **F2–F3** to build & save per-problem pairs.  
4. Run **F4** to print final stats for grading.

## 9. Files to Submit
- Notebook (`.ipynb`)  
- `notebook_outputs/records.csv` and `records.jsonl`  
- `notebook_outputs/problem_solution_pairs.csv` and `.jsonl`  
- This report (`{os.path.basename(report_path)}`)

"""

with open(report_path, "w", encoding="utf-8") as f:
    f.write(md)

print("Report written to:", report_path)


Report written to: notebook_outputs\Milestone1_Report_2025-09-21.md
