Step 1 — PDF window

Loads the PDF and auto-selects the useful page window (skips front matter and reference blocks).
Uses simple heading/look-ahead rules to pause at “References” and resume if a new chapter starts.
Emits the selected page texts plus their original page indices for downstream steps.

In [1]:
# ===== Step 1) PDF Loader & Smart Windowing with Pause/Resume (Google Colab) =====
!pip -q install pypdf

import re
from pathlib import Path
from pypdf import PdfReader

PDF_PATH = "/content/PhDThesis__Masurel2015.pdf"

assert PDF_PATH is not None and Path(PDF_PATH).exists(), "PDF_PATH not set or file not found."

# === Config ===
MIN_SKIP_PAGES = 17
MAX_SKIP_PAGES_FALLBACK = 19
TAIL_EXCLUDE_PAGES = 5
LOOKAHEAD_PAGES = 6  # how far to look ahead after "References"

INTRO_PATTERNS = [
    r'^\s*(chapter\s+\w+\.?\s+)?introduction\b',
    r'^\s*introduction\b',
]
STOP_HEADINGS = [
    r'^\s*references\b',
    r'^\s*bibliograph(y|ies)\b',
    r'^\s*acknowledg(e)?ments\b',
]
SECTION_HEADINGS = [
    r'^\s*chapter\s+[ivxlcdm]+\b',
    r'^\s*chapter\s+\d+\b',
    r'^\s*\d+\.\s+[A-Z]',
]

# === Helpers ===
def read_pdf_pages_text(pdf_path: str):
    reader = PdfReader(pdf_path)
    return [page.extract_text() or "" for page in reader.pages]

def match_any(text, patterns):
    for pat in patterns:
        if re.search(pat, text, flags=re.IGNORECASE | re.MULTILINE):
            return True
    return False

def first_intro_page(pages_text):
    for i in range(MIN_SKIP_PAGES, len(pages_text)):
        if match_any(pages_text[i][:2000], INTRO_PATTERNS):
            return i
    return MAX_SKIP_PAGES_FALLBACK

def smart_extract_window(pages_text):
    N = len(pages_text)
    start_idx = min(first_intro_page(pages_text), N-1)
    selected, skipping = [], False

    i = start_idx
    while i < N - TAIL_EXCLUDE_PAGES:
        head = (pages_text[i] or "")[:1500]

        if match_any(head, STOP_HEADINGS):
            # pause extraction at refs
            lookahead_text = " ".join(pages_text[i+1:i+LOOKAHEAD_PAGES+1])
            if match_any(lookahead_text, SECTION_HEADINGS):
                # skip this refs block, then resume after lookahead
                print(f"Skipped References block at page {i+1}, resuming later…")
                i += LOOKAHEAD_PAGES
                continue
            else:
                print(f"Final stop at References block on page {i+1}")
                break

        selected.append((i, pages_text[i]))
        i += 1

    return selected, start_idx

# === Run ===
pages_text = read_pdf_pages_text(PDF_PATH)
N = len(pages_text)
print(f"Loaded {N} pages from: {PDF_PATH}")

selected_pairs, start_idx = smart_extract_window(pages_text)
selected_pages_text = [t for _, t in selected_pairs]
selected_indices = [i for i, _ in selected_pairs]

print("\n=== SMART WINDOW SUMMARY ===")
print(f"Total pages: {N}")
print(f"Selected pages: {len(selected_pages_text)}")
print(f"From page {selected_indices[0]+1} to {selected_indices[-1]+1}")

# Peek at first and last selected page
print("\n--- Start page preview ---")
print(selected_pages_text[0][:500].replace("\n", " "))
print("\n--- End page preview ---")
print(selected_pages_text[-1][:500].replace("\n", " "))


Loaded 257 pages from: /content/PhDThesis__Masurel2015.pdf

=== SMART WINDOW SUMMARY ===
Total pages: 257
Selected pages: 234
From page 19 to 252

--- Start page preview ---
    1    Chapter I. Introduction     1. Preamble and knowledge gaps    Paleoproterozoic (i.e. Biri mian) volcano-plutonic belts and sedimentary basins  of West Africa not only  provide a complete record of crustal growth but also host a  number of world-class gold deposits (Abouc hami et al., 1990; Boher et al., 1992). To  date, a large number of studies have focused on the Baoulé-Mossi domain, which covers  portions of Burkina Faso, Côte d’Ivoire, Ghana, Guinea and Mali (Fig. 1). Gold  deposits

--- End page preview ---
    234    3. Future work    Despite the work undertaken by the candi date, a number of questions remain  unanswered, some of which are directly relevant to exploration targeting:     3.1. Source of fluids and metals    Recent research suggests that the divers ity in mineralisation styles and ore  

Step 2 — Block extraction

Splits each selected page into fine-grained blocks (sentences, table rows, captions) with a heuristic block_type.
Assigns stable page_num and block_id for traceable provenance.
Emits df_blocks for later filtering.

In [2]:
# ===== Step 2) Block Extraction: sentences, tables, captions =====
!pip -q install nltk

import re
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True) # Add this line

from nltk.tokenize import sent_tokenize
import pandas as pd

def detect_block_type(text):
    """Heuristic block classifier"""
    # Table-like if many numbers or tab spacing
    if re.search(r"\d", text) and (text.count(" ") > 10 or "\t" in text):
        return "table_row"
    # Caption if starts with Figure/Table
    if re.match(r"^\s*(figure|fig\.|table)\s+\d+", text, re.I):
        return "caption"
    return "sentence"

def extract_blocks(pages_text, page_indices):
    blocks = []
    for local_idx, page_text in enumerate(pages_text):
        page_num = page_indices[local_idx] + 1  # human page number
        # Split page into paragraphs/lines
        chunks = [c.strip() for c in page_text.split("\n") if c.strip()]
        block_id = 0
        for chunk in chunks:
            block_type = detect_block_type(chunk)
            # If sentence mode: split further
            if block_type == "sentence":
                sentences = sent_tokenize(chunk)
                for sent in sentences:
                    blocks.append({
                        "page_num": page_num,
                        "block_id": f"{page_num}-{block_id}",
                        "block_type": "sentence",
                        "text": sent
                    })
                    block_id += 1
            else:
                blocks.append({
                    "page_num": page_num,
                    "block_id": f"{page_num}-{block_id}",
                    "block_type": block_type,
                    "text": chunk
                })
                block_id += 1
    return pd.DataFrame(blocks)

# Run block extraction
df_blocks = extract_blocks(selected_pages_text, selected_indices)

print(f"Extracted {len(df_blocks)} blocks")
display(df_blocks.head(10)) # Use display for better output

Extracted 5825 blocks


Unnamed: 0,page_num,block_id,block_type,text
0,19,19-0,sentence,1
1,19,19-1,sentence,Chapter I.
2,19,19-2,sentence,Introduction
3,19,19-3,sentence,1.
4,19,19-4,sentence,Preamble and knowledge gaps
5,19,19-5,sentence,Paleoproterozoic (i.e.
6,19,19-6,sentence,Biri mian) volcano-plutonic belts and sediment...
7,19,19-7,sentence,of West Africa not only provide a complete re...
8,19,19-8,table_row,number of world-class gold deposits (Abouc ham...
9,19,19-9,sentence,"date, a large number of studies have focused o..."


Step 3 — Candidate finder

Loads a ready-to-use DICT (core geo terms, units, and aliases/normalisers).
Scans each block for domain keywords and quantitative patterns (number+unit, ranges, ±error, oxide %); guards against dates/citations.
Emits a shortlisted DataFrame with provenance + parsed numbers, and saves it to /content/candidate_blocks_step3.csv.

In [3]:
# ===== Step 3) Dictionary & Regex Pre-Filter (tightened) =====
!pip -q install pandas rapidfuzz

import re, json
from collections import Counter
from rapidfuzz import process, fuzz
import pandas as pd

# ---- A) Dictionary (extend anytime) ----
DICT = {
    "rock_type": [
        "limestone","dolomite","dolostone","wacke","arenite","siltstone","argillite",
        "mudstone","shale","sandstone","conglomerate","breccia","granite","monzogranite",
        "granodiorite","diorite","tonalite","gabbro","basalt","andesite","rhyolite",
        "komatiite","trachyte","phonolite","gneiss","schist","quartzite"
    ],
    "minerals": [
        "pyrite","arsenopyrite","chalcopyrite","pyrrhotite","sphalerite","galena",
        "magnetite","hematite","native gold","electrum","muscovite","biotite",
        "chlorite","sericite","albite","epidote","carbonate","quartz","feldspar"
    ],
    "mineralisation": [
        "sulfide","sulphide","stockwork","vein","disseminated","massive","breccia",
        "replacement","porphyry","orogenic"
    ],
    "tectonism_event": [
        "D1","D2","D3","D4","compressional","extensional","transcurrent",
        "sinistral","dextral","thrust","fold","shear zone","shear"
    ],
    "structures": [
        "foliation","lineation","cleavage","vein","veinlet","breccia","schistosity",
        "plunge","strike","dip","bedding","lamination","fault"
    ],
    "methods": [
        "ICP-MS","LA-ICP-MS","EPMA","SEM","XRD","XRF","AAS","fire assay",
        "TIMS","ID-TIMS","MC-ICP-MS","SIMS","LAICPMS","microprobe"
    ],
    "geochronology_terms": [
        "zircon","monazite","baddeleyite","concordia","intercept","weighted mean",
        "MSWD","discordant","206Pb/238U","207Pb/206Pb","U–Pb","U-Pb","age","dated"
    ],
    "assay_elements": [
        "Au","Ag","As","Sb","Cu","Pb","Zn","Ni","Co","Fe","S",
        "SiO2","Al2O3","MgO","CaO","K2O","Na2O","TiO2","P2O5","LOI","Cr2O3","MnO"
    ],
    "units": ["ppm","ppb","wt%","%","g/t","mg/kg","µg/g","ug/g","Ma","Ga","°C","deg C"],
    "stopwords_geo": ["references","bibliography","acknowledgements","appendix"]
}

UNIT_NORMALISE = {"gpt": "g/t", "percent": "%", "ug/g": "µg/g", "deg c": "°C"}
ALIAS_MAP = {
    "gold":"Au","arsenic":"As","antimony":"Sb","copper":"Cu","lead":"Pb","zinc":"Zn",
    "nickel":"Ni","cobalt":"Co","sulfur":"S","sulphur":"S"
}

def normalise_unit(u: str|None) -> str|None:
    if not u: return None
    u = u.strip().lower()
    return UNIT_NORMALISE.get(u, u).replace("ug/g","µg/g")

def alias_text(text: str) -> str:
    low = text.lower()
    for k, v in ALIAS_MAP.items():
        low = re.sub(rf"\b{k}\b", v.lower(), low)
    return low

# ---- B) Numeric/Unit regex ----
NUM = r"\d+(?:[\.,]\d+)?"
RANGE_SEP = r"(?:–|-|to)"
PATTERNS = {
    "num_unit": re.compile(rf"(?P<val>{NUM})\s*(?P<unit>ppm|ppb|wt%|%|g\/t|mg\/kg|µg\/g|ug\/g|Ma|Ga|°C)\b", re.I),
    "range": re.compile(rf"(?P<v1>{NUM})\s*{RANGE_SEP}\s*(?P<v2>{NUM})\s*(?P<unit>ppm|ppb|%|g\/t|Ma|Ga)?\b", re.I),
    "between_age": re.compile(rf"\bbetween\s+(?P<v1>{NUM})\s*(?:–|-|to|and)\s*(?P<v2>{NUM})\s*(?P<unit>Ma|Ga)\b", re.I),
    "plusminus": re.compile(rf"(?P<mean>{NUM})\s*(?:±|\+\/-)\s*(?P<err>{NUM})\s*(?P<unit>Ma|g\/t|ppm|%|°C)?\b", re.I),
    "oxide_pct": re.compile(rf"(?P<oxide>[A-Z][a-z]?(?:\d)?O\d?)\s*(?P<val>{NUM})\s*%", re.I),
}

# ---- C) Noise guards & context checks ----
MONTHS = r"(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|july|august|september|october|november|december)"
CITATION_CUES = r"\b(journal|geological society|special publications|bulletin|v\.|vol\.|issue|no\.|pp?\.|doi:|issn|isbn|proceedings|symposium|conference|abstracts|et al\.)\b"

RANGE_CONTEXT_OK = [
    "ma","ga","age","dated","u–pb","u-pb","zircon","monazite","concordia","intercept",
    "ppm","ppb","g/t","wt%","%","oxide","sio2","feo","tio2","mgo","cao","k2o","na2o","p2o5",
    "mg#","mswd"
]

def has_range_context(text: str) -> bool:
    low = text.lower()
    return any(tok in low for tok in RANGE_CONTEXT_OK)

def looks_like_date_or_citation(text: str) -> bool:
    low = text.lower()
    if re.search(MONTHS, low) and re.search(r"\b\d{1,2}\s*(?:–|-|to)\s*\d{1,2}\b", low):
        return True
    if re.search(CITATION_CUES, low) and re.search(r"\b\d{2,6}\s*(?:–|-|to)\s*\d{2,6}\b", low):
        return True
    return False

def age_signal_present(text: str) -> bool:
    low = text.lower()
    return any(tok in low for tok in ["u–pb","u-pb","concordia","mswd","age","dated"])

# ---- D) Candidate finder over df_blocks ----
def find_candidates_from_blocks(df_blocks, min_keywords=1, require_numeric=True, fuzzy_cutoff=95):
    recs = []
    for _, row in df_blocks.iterrows():
        raw_text = str(row["text"]).strip()
        if not raw_text:
            continue

        low = alias_text(raw_text)

        # Stronger citation/date guard: allow only if explicit unit/± OR clear age signal
        if looks_like_date_or_citation(raw_text):
            if not (PATTERNS["num_unit"].search(raw_text) or PATTERNS["plusminus"].search(raw_text) or age_signal_present(raw_text)):
                continue

        # Keyword categories
        keyword_hits = set()
        for cat, terms in DICT.items():
            if cat in ("units","stopwords_geo"):
                continue
            if any(t.lower() in low for t in terms):
                keyword_hits.add(cat); continue
            best = process.extractOne(low, terms, scorer=fuzz.partial_ratio, score_cutoff=fuzzy_cutoff)
            if best: keyword_hits.add(cat)

        if len(keyword_hits) < min_keywords:
            continue

        # Quantitative patterns
        m_numunit   = list(PATTERNS["num_unit"].finditer(raw_text))
        m_pm        = list(PATTERNS["plusminus"].finditer(raw_text))
        m_oxide     = list(PATTERNS["oxide_pct"].finditer(raw_text))
        m_range     = list(PATTERNS["range"].finditer(raw_text))
        m_between   = list(PATTERNS["between_age"].finditer(raw_text))

        explicit_ok = bool(m_numunit or m_pm or m_oxide)
        valid_numeric = explicit_ok

        # Allow ranges only with context; require stronger geo signal if still unit-less
        if not explicit_ok:
            if (m_range or m_between) and has_range_context(raw_text):
                valid_numeric = len(keyword_hits) >= 2
            else:
                valid_numeric = False

        # Co-occurrence rule for Ma/Ga without ± :
        if explicit_ok and not m_pm:
            # if ONLY Ma/Ga units found (no ppm/%/g/t/°C), demand geochron signal or ≥2 categories
            only_age_units = all((m.group("unit") or "").lower() in ["ma","ga"] for m in m_numunit) if m_numunit else False
            if only_age_units and not (("geochronology_terms" in keyword_hits) or (len(keyword_hits) >= 2)):
                valid_numeric = False

        if require_numeric and not valid_numeric:
            continue

        # Parse numbers
        numbers = []
        for m in m_numunit:
            numbers.append({
                "type":"num_unit",
                "value": float(m.group("val").replace(",", ".")),
                "unit":  normalise_unit(m.group("unit"))
            })
        for m in m_pm:
            numbers.append({
                "type":"plusminus",
                "mean":  float(m.group("mean").replace(",", ".")),
                "error": float(m.group("err").replace(",", ".")),
                "unit":  normalise_unit(m.group("unit"))
            })
        for m in m_oxide:
            numbers.append({
                "type":"oxide_pct",
                "oxide": m.group("oxide"),
                "value": float(m.group("val").replace(",", ".")),
                "unit":  "%"
            })
        # Age ranges
        if m_between:
            for m in m_between:
                numbers.append({
                    "type":"range",
                    "value_min": float(m.group("v1").replace(",", ".")),
                    "value_max": float(m.group("v2").replace(",", ".")),
                    "unit":      normalise_unit(m.group("unit"))
                })
        elif m_range and has_range_context(raw_text) and len(keyword_hits) >= 2:
            for m in m_range:
                numbers.append({
                    "type":"range",
                    "value_min": float(m.group("v1").replace(",", ".")),
                    "value_max": float(m.group("v2").replace(",", ".")),
                    "unit":      normalise_unit(m.group("unit")) if m.group("unit") else None
                })

        if not numbers:
            continue

        recs.append({
            "page_num": row["page_num"],
            "block_id": row["block_id"],
            "block_type": row["block_type"],
            "text": raw_text,
            "keyword_categories": sorted(list(keyword_hits)),
            "numbers_json": json.dumps(numbers, ensure_ascii=False),
            "has_numbers": True
        })

    return pd.DataFrame.from_records(recs)

# ---- E) Run and save ----
df_cands = find_candidates_from_blocks(df_blocks, min_keywords=1, require_numeric=True)

df_cands = df_cands.sort_values(["page_num","block_id"]).reset_index(drop=True)
print(f"Candidate blocks found: {len(df_cands)}")

display_cols = ["page_num","block_id","block_type","keyword_categories","text","numbers_json"]
print(df_cands[display_cols].head(12).to_string(index=False)[:2500])

OUT_PATH = "/content/candidate_blocks_step3.csv"
df_cands.to_csv(OUT_PATH, index=False)
print(f"\nSaved candidates → {OUT_PATH}")

# Quick category stats
cat_counts = Counter()
for cats in df_cands["keyword_categories"]:
    for c in cats:
        cat_counts[c] += 1
print("\nTop categories (rough):")
for c, n in cat_counts.most_common(12):
    print(f"  {c:22s} {n}")


Candidate blocks found: 174
 page_num block_id block_type                               keyword_categories                                                                                    text                                                                                                                                                                                                                               numbers_json
       19    19-18  table_row                                 [assay_elements]  2158 Ma gold at Wassa, Parra-Avila, in press; 2105 ± 2 Ma gold at Ashanti, Oberthür et                                                                 [{"type": "num_unit", "value": 2158.0, "unit": "ma"}, {"type": "num_unit", "value": 2.0, "unit": "ma"}, {"type": "plusminus", "mean": 2105.0, "error": 2.0, "unit": "ma"}]
       19    19-19  table_row                                 [assay_elements]     al., 1998; 2063 ± 9 Ma gold at Damang, Pigois et al., 2003). Nevertheless, the vast  

Step 4 — Offline facts (with tectonism + tags)

Converts each numbers_json hit into one atomic fact with category/attribute/values/units/method.
Auto-classifies: Ma/Ga or age cues → geochronology; age + D1–D4/shear/thrust → tectonism_event (entity = D-stage); element symbols → assay; else geochemistry.
Adds context snippet + confidence and writes /content/extracted_step4_offline.csv.

In [4]:
# ===== Step 4) Offline Structured Extraction (+ tectonism + dict tags) =====
import json, re
import pandas as pd
from pathlib import Path

# --- Inputs ---
CANDS_PATH = "/content/candidate_blocks_step3.csv"        # from Step 3
OUT_JSONL  = "/content/extracted_step4_offline.jsonl"
OUT_CSV    = "/content/extracted_step4_offline.csv"
OUT_PARQ   = "/content/extracted_step4_offline.parquet"

assert Path(CANDS_PATH).exists(), f"Missing {CANDS_PATH}"
df_cands = pd.read_csv(CANDS_PATH)

# --- Minimal fallbacks (use your Step-3 dicts if already defined) ---
try: DICT
except NameError:
    DICT = {
        "assay_elements":["Au","Ag","As","Sb","Cu","Pb","Zn","Ni","Co","Fe","S",
                          "SiO2","Al2O3","MgO","CaO","K2O","Na2O","TiO2","P2O5","LOI","Cr2O3","MnO"],
        "methods":["ICP-MS","LA-ICP-MS","EPMA","SEM","XRD","XRF","AAS","fire assay",
                   "TIMS","ID-TIMS","MC-ICP-MS","SIMS","LAICPMS","microprobe"],
        "geochronology_terms":["zircon","monazite","baddeleyite","concordia","mswd","age","dated","U–Pb","U-Pb"],
        "rock_type":["limestone","dolomite","wacke","sandstone","breccia","granite","tonalite","gabbro","basalt","andesite","rhyolite","gneiss","schist","quartzite"],
        "minerals":["pyrite","arsenopyrite","chalcopyrite","pyrrhotite","sphalerite","galena","magnetite","hematite","muscovite","biotite","chlorite","sericite","albite","epidote","quartz","feldspar","carbonate"],
        "mineralisation":["sulfide","sulphide","stockwork","vein","disseminated","massive","breccia","replacement","porphyry","orogenic"],
        "structures":["foliation","lineation","cleavage","vein","breccia","schistosity","plunge","strike","dip","bedding","lamination","fault"],
        "tectonism_event":["D1","D2","D3","D4","compressional","extensional","transcurrent","sinistral","dextral","thrust","fold","shear zone","shear"]
    }
try: ALIAS_MAP
except NameError:
    ALIAS_MAP = {"gold":"Au","arsenic":"As","antimony":"Sb","copper":"Cu","lead":"Pb","zinc":"Zn","sulfur":"S","sulphur":"S"}
ASSAY_SYMBOLS = set(DICT.get("assay_elements", []))
METHOD_TERMS  = set(t.lower() for t in DICT.get("methods", []))
GEOCHRON_CUES = {t.lower() for t in DICT.get("geochronology_terms", [])} | {"u–pb","u-pb","concordia","mswd","age","dated"}

# --- Helpers ---
def norm_unit(u):
    if not u: return None
    s = str(u).strip().lower().replace("ug/g","µg/g").replace("deg c","°c")
    return {"ma":"Ma","ga":"Ga","°c":"°C","percent":"%","wt%":"%","gpt":"g/t"}.get(s, s)

def find_method(low):  # from text.lower()
    for m in METHOD_TERMS:
        if m in low: return m.upper()
    return None

def find_element_attribute(text):
    # prefer exact symbols (Au, As, …). If none, use ALIAS_MAP words.
    tokens = sorted(list(ASSAY_SYMBOLS), key=len, reverse=True)
    for sym in tokens:
        if re.search(rf"(?<![A-Za-z0-9]){re.escape(sym)}(?![A-Za-z0-9])", text, re.I):
            return sym
    low = text.lower()
    for k,v in ALIAS_MAP.items():
        if re.search(rf"\b{k}\b", low): return v
    return None

def snippet(s, n=240):
    t = " ".join(str(s).split())
    return (t[:n]+"…") if len(t)>n else t

def base_conf(numtype):
    return {"plusminus":0.90,"range":0.80,"oxide_pct":0.85,"num_unit":0.75}.get(numtype,0.60)

# --- Tectonism detection (D1–D4 + tectonic words) ---
EVENT_STAGE_RE  = re.compile(r'\bD\s*([1-4])\b', re.I)
TECTONIC_TERMS  = {t.lower() for t in DICT.get("tectonism_event", [])} - {"d1","d2","d3","d4"}
def detect_stage(low):
    m = EVENT_STAGE_RE.search(low);  return f"D{m.group(1)}" if m else None
def has_tectonic(low):
    return bool(EVENT_STAGE_RE.search(low) or any(w in low for w in TECTONIC_TERMS))

# --- Dictionary tagger (fast boundary-aware) ---
def _compile_terms(terms):
    terms = sorted(set(terms), key=len, reverse=True)
    parts = [r"\b"+re.escape(t).replace(r"\ ", r"\s+")+r"\b" for t in terms]
    return re.compile("|".join(parts), re.I) if parts else None
TAG_CATS = ["rock_type","minerals","mineralisation","structures","methods","geochronology_terms"]
PAT = {c:_compile_terms(DICT[c]) for c in TAG_CATS}
def find_terms(s, cat):
    pat = PAT.get(cat);
    if not pat: return []
    hits = pat.findall(s or "")
    # normalise to canonical spellings
    canon = set(DICT[cat]); out=[]
    for h in hits:
        if isinstance(h,tuple): h=[x for x in h if x][0]
        hc = next((c for c in canon if c.lower()==str(h).lower()), h)
        if hc not in out: out.append(hc)
    return out

# --- Core conversion ---
ALLOWED_CATEGORIES = ["lithology","rock_type","minerals","mineralisation","tectonism_event","structures",
                      "assay","geochemistry","geochronology","magmatism","methods","stratigraphy"]

rows=[]
for _, r in df_cands.iterrows():
    text = str(r["text"]); low = text.lower()
    page = int(r["page_num"]); block_id = str(r["block_id"])
    method = find_method(low)
    try:
        items = json.loads(r["numbers_json"]) if r.get("numbers_json") else []
    except Exception:
        items = []

    # tags once per block (from full text)
    tags = {f"tags_{c}":"; ".join(find_terms(text, c)) for c in TAG_CATS}
    dict_tags = " | ".join(f"{k[5:]}:{v}" for k,v in tags.items() if v)

    for it in items:
        ntype = it.get("type"); conf = base_conf(ntype)
        if ntype == "oxide_pct":
            rows.append({
                "category":"geochemistry","entity":None,"attribute":it.get("oxide"),
                "value":float(it.get("value")) if it.get("value") is not None else None,
                "unit":"%","value_min":None,"value_max":None,"error":None,"error_unit":None,
                "method":method,"mineral_phase":None,"sample_id":None,
                "context_snippet":snippet(text),"page":page,"block_id":block_id,
                "confidence":min(1.0, conf + (0.05 if method else 0)), **tags, "dict_tags":dict_tags
            }); continue

        if ntype == "plusminus":
            unit = norm_unit(it.get("unit")); mean = it.get("mean"); err = it.get("error")
            is_age = (unit in ("Ma","Ga")) or any(c in low for c in GEOCHRON_CUES)
            stage  = detect_stage(low) if is_age and has_tectonic(low) else None
            if is_age and stage:   cat, attr, ent = "tectonism_event","age",stage
            elif is_age:           cat, attr, ent = "geochronology","age",None
            else:
                attr_guess = find_element_attribute(text) or "value"
                cat, attr, ent = "assay", attr_guess, None
            rows.append({
                "category":cat,"entity":ent,"attribute":attr,
                "value":float(mean) if mean is not None else None, "unit":unit,
                "value_min":None,"value_max":None,
                "error":float(err) if err is not None else None,"error_unit":unit,
                "method":method,"mineral_phase":None,"sample_id":None,
                "context_snippet":snippet(text),"page":page,"block_id":block_id,
                "confidence":min(1.0, conf + (0.1 if (is_age or method) else 0)), **tags, "dict_tags":dict_tags
            }); continue

        if ntype == "range":
            vmin = it.get("value_min"); vmax = it.get("value_max"); unit = norm_unit(it.get("unit"))
            is_age = (unit in ("Ma","Ga")) or any(c in low for c in GEOCHRON_CUES)
            stage  = detect_stage(low) if is_age and has_tectonic(low) else None
            if is_age and stage:   cat, attr, ent = "tectonism_event","age",stage
            elif is_age:           cat, attr, ent = "geochronology","age",None
            else:
                attr_guess = find_element_attribute(text) or "value"
                cat = "assay" if unit in ("ppm","ppb","g/t","mg/kg","µg/g","%","wt%") else "geochemistry"
                attr, ent = attr_guess, None
            rows.append({
                "category":cat,"entity":ent,"attribute":attr,
                "value":None,"unit":unit,
                "value_min":float(vmin) if vmin is not None else None,
                "value_max":float(vmax) if vmax is not None else None,
                "error":None,"error_unit":None,
                "method":method,"mineral_phase":None,"sample_id":None,
                "context_snippet":snippet(text),"page":page,"block_id":block_id,
                "confidence":min(1.0, conf + (0.05 if method else 0)), **tags, "dict_tags":dict_tags
            }); continue

        if ntype == "num_unit":
            val = it.get("value"); unit = norm_unit(it.get("unit"))
            is_age = (unit in ("Ma","Ga")) or any(c in low for c in GEOCHRON_CUES)
            stage  = detect_stage(low) if is_age and has_tectonic(low) else None
            if is_age and stage:   cat, attr, ent = "tectonism_event","age",stage
            elif is_age:           cat, attr, ent = "geochronology","age",None
            else:
                attr_guess = find_element_attribute(text)
                if attr_guess:     cat, attr, ent = "assay", attr_guess, None
                else:              cat, attr, ent = "geochemistry","value", None
            rows.append({
                "category":cat,"entity":ent,"attribute":attr,
                "value":float(val) if val is not None else None, "unit":unit,
                "value_min":None,"value_max":None,"error":None,"error_unit":None,
                "method":method,"mineral_phase":None,"sample_id":None,
                "context_snippet":snippet(text),"page":page,"block_id":block_id,
                "confidence":min(1.0, conf + (0.05 if method else 0)), **tags, "dict_tags":dict_tags
            }); continue

cols = ["category","entity","attribute","value","unit","value_min","value_max","error","error_unit",
        "method","mineral_phase","sample_id","context_snippet","page","block_id","confidence"] + \
       [f"tags_{c}" for c in TAG_CATS] + ["dict_tags"]
df4 = pd.DataFrame(rows, columns=cols).dropna(how="all")
# keep allowed categories and numeric-bearing rows
df4 = df4[df4["category"].isin(ALLOWED_CATEGORIES)]
num_mask = df4["value"].notna() | df4["value_min"].notna() | df4["value_max"].notna() | df4["error"].notna()
df4 = df4[num_mask].reset_index(drop=True)

# Save
with open(OUT_JSONL,"w",encoding="utf-8") as f:
    for _,row in df4.iterrows(): f.write(json.dumps(row.to_dict(), ensure_ascii=False)+"\n")
df4.to_csv(OUT_CSV, index=False)
try: df4.to_parquet(OUT_PARQ, index=False)
except Exception: pass

print(f"[Step4] Saved → {OUT_CSV} (rows={len(df4)})")
print(df4.head(8).to_string(index=False))

[Step4] Saved → /content/extracted_step4_offline.csv (rows=298)
     category entity attribute  value unit  value_min  value_max  error error_unit method mineral_phase sample_id                                                                        context_snippet  page block_id  confidence tags_rock_type tags_minerals tags_mineralisation tags_structures tags_methods tags_geochronology_terms                                      dict_tags
geochronology   None       age 2158.0   Ma        NaN        NaN    NaN       None   None          None      None 2158 Ma gold at Wassa, Parra-Avila, in press; 2105 ± 2 Ma gold at Ashanti, Oberthür et    19    19-18        0.75                                                                                                                                                      
geochronology   None       age    2.0   Ma        NaN        NaN    NaN       None   None          None      None 2158 Ma gold at Wassa, Parra-Avila, in press; 2105 ± 2 Ma gold at 

Step 5 — Packaging, lookups & exports (offline)

Normalises units/types and prints quick category/page counts.
Writes per-category CSVs and lookups: ages (converted to Ma, default ±1 Ma if missing error) and assays.
Exports a compact catalog.jsonl for fast search/grepping.

In [5]:
# ===== Step 5) Packaging, Lookups & Exports (offline) =====
import os, json, re
import pandas as pd
from pathlib import Path

SRC = "/content/extracted_step4_offline.csv"  # from Step 4
OUT_DIR = "/content/step5_offline"
Path(OUT_DIR).mkdir(exist_ok=True)
assert Path(SRC).exists(), f"Missing {SRC}"
df = pd.read_csv(SRC)

# type hygiene
for c in ["value","value_min","value_max","error","confidence","page"]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")
def _u(u):
    if pd.isna(u): return None
    s=str(u).strip().replace("ug/g","µg/g").lower()
    return {"ma":"Ma","ga":"Ga","°c":"°C","percent":"%","wt%":"%","gpt":"g/t"}.get(s, s)
for c in ("unit","error_unit"):
    if c in df.columns: df[c]=df[c].apply(_u)

# Summary
print("By category:\n", df["category"].value_counts().to_string())

# Per-category CSVs
CAT_DIR = f"{OUT_DIR}/by_category"; Path(CAT_DIR).mkdir(exist_ok=True)
for cat, sub in df.groupby(df["category"].fillna("uncategorized")):
    cols = [x for x in ["page","block_id","entity","category","attribute","value","value_min","value_max","unit",
                        "error","error_unit","method","mineral_phase","sample_id","context_snippet","confidence",
                        "tags_rock_type","tags_minerals","tags_structures","dict_tags"]
            if x in sub.columns]
    sub.sort_values(["page","block_id","attribute","value"], inplace=True, na_position="last")
    sub[cols].to_csv(f"{CAT_DIR}/{cat}.csv", index=False)
print(f"Saved per-category → {CAT_DIR}")

# Lookups
# Ages (default ±1 Ma window if no explicit error)
ages = df[df["category"].str.lower()=="geochronology"].copy()
def to_Ma(v,u):
    if pd.isna(v): return None
    if u=="Ma": return float(v)
    if u=="Ga": return float(v)*1000.0
    return None
if not ages.empty:
    ages["age_central_Ma"] = [to_Ma(v,u) for v,u in zip(ages.get("value"), ages.get("unit"))]
    def err_Ma(e, eu):
        if pd.isna(e) or pd.isna(eu): return None
        return e if eu=="Ma" else (e*1000.0 if eu=="Ga" else None)
    ages["age_error_Ma"]  = [err_Ma(e,eu) for e,eu in zip(ages.get("error"), ages.get("error_unit"))]
    ages["age_window_Ma"] = ages["age_error_Ma"].where(ages["age_error_Ma"].notna(), 1.0)
    ages["age_lo_Ma"]     = ages["age_central_Ma"] - ages["age_window_Ma"]
    ages["age_hi_Ma"]     = ages["age_central_Ma"] + ages["age_window_Ma"]
    keep = [c for c in ["page","block_id","entity","attribute","value","unit","error","error_unit",
                        "method","mineral_phase","sample_id","age_central_Ma","age_window_Ma","age_lo_Ma","age_hi_Ma",
                        "context_snippet","confidence"] if c in ages.columns]
    ages.sort_values(["age_central_Ma","page","confidence"], inplace=True, ascending=[True,True,False])
    ages.to_csv(f"{OUT_DIR}/lookup_geochronology.csv", index=False)

# Assays
assay = df[df["category"].str.lower()=="assay"].copy()
if not assay.empty:
    keep = [c for c in ["page","block_id","entity","attribute","value","value_min","value_max","unit",
                        "method","sample_id","context_snippet","confidence"] if c in assay.columns]
    assay.sort_values(["attribute","page","value","confidence"], inplace=True, ascending=[True,True,True,False])
    assay[keep].to_csv(f"{OUT_DIR}/lookup_assays.csv", index=False)

# Catalog JSONL
CATALOG = f"{OUT_DIR}/catalog.jsonl"
with open(CATALOG,"w",encoding="utf-8") as f:
    for _,r in df.iterrows():
        f.write(json.dumps({k:r.get(k) for k in
               ["category","attribute","unit","value","value_min","value_max","error","error_unit",
                "page","block_id","entity","confidence","context_snippet","dict_tags"]}, ensure_ascii=False)+"\n")
print("Step 5 done.")


By category:
 category
geochronology      245
geochemistry        37
assay               14
tectonism_event      2
Saved per-category → /content/step5_offline/by_category
Step 5 done.


Step 6 — Noise reduction & confidence gating

Scores each fact with penalties (e.g., age without Ma/Ga, assay without unit, ref-section text) and bonuses (method present, ±error, dict tags).
Computes conf_final and splits rows into clean, review, and dropped sets.
Saves facts_step6_clean.csv, facts_step6_review.csv, and facts_step6_dropped.csv.

In [6]:
# ===== Step 6) Noise Reduction & Confidence Gating =====
import pandas as pd, re
from pathlib import Path

SRC = "/content/extracted_step4_offline.csv"  # from Step 4
assert Path(SRC).exists(), f"Missing {SRC}"
df = pd.read_csv(SRC)

for c in ["value","value_min","value_max","error","confidence","page"]:
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")

# Penalties/bonuses
def penalties(row):
    p=0.0; snip=str(row.get("context_snippet") or "")
    cat=str(row.get("category") or "")
    unit=str(row.get("unit") or "")
    # hard noise words
    if re.search(r'\b(references|bibliograph|acknowledg|appendix)\b', snip, re.I): p+=0.25
    # ages must have Ma/Ga
    if cat=="geochronology" and unit not in ("Ma","Ga"): p+=0.35
    # assays/geochem should have units
    if cat in ("assay","geochemistry") and unit in ("",None): p+=0.25
    # very short context = risk
    if len(snip)<30: p+=0.05
    # suspicious % without oxide/chem words
    if unit=="%" and not re.search(r'(SiO2|Al2O3|FeO|MgO|CaO|Na2O|K2O|TiO2|oxide|wt%)', snip, re.I):
        p+=0.15
    return p

def bonuses(row):
    b=0.0; snip=str(row.get("context_snippet") or "")
    # dictionary tags present → more trust
    if str(row.get("dict_tags") or "")!="": b+=0.05
    # method named
    if str(row.get("method") or "")!="": b+=0.05
    # explicit ± uncertainty
    if pd.notna(row.get("error")): b+=0.08
    return b

base = df.get("confidence", pd.Series([0.6]*len(df)))
df["penalty"]   = df.apply(penalties, axis=1)
df["bonus"]     = df.apply(bonuses, axis=1)
df["conf_final"]= (base - df["penalty"] + df["bonus"]).clip(0.0,1.0)

# Gate
KEEP_THR   = 0.66
REVIEW_THR = 0.40
keep_mask  = df["conf_final"] >= KEEP_THR
review_mask= (df["conf_final"] >= REVIEW_THR) & (~keep_mask)

df_clean  = df[keep_mask].copy()
df_review = df[review_mask].copy()
df_drop   = df[~(keep_mask | review_mask)].copy()

df_clean.to_csv("/content/facts_step6_clean.csv",  index=False)
df_review.to_csv("/content/facts_step6_review.csv", index=False)
df_drop.to_csv("/content/facts_step6_dropped.csv", index=False)

print(f"[Step6] kept={len(df_clean)}  review={len(df_review)}  dropped={len(df_drop)}")


[Step6] kept=273  review=25  dropped=0


Step 7 — Provenance & references (offline)

Reads page text and adds section_title, nearby author–year citations, and figure/table IDs.
Builds stable provenance_id as PDF_NAME#p{page}:{block_id} plus a viewer #page= hint.
Writes the enriched table to /content/facts_step8_provenance_offline.csv.

In [10]:
# ===== Step 7 (OFFLINE): Referencing & Provenance Attachment =====
# Works with outputs from Step 4 (offline) OR Step 6.
# Produces /content/facts_step8_provenance_offline.csv

!pip -q install pypdf pandas

import os, re, json
import pandas as pd
from pathlib import Path
from pypdf import PdfReader

# ---------- Inputs ----------
PDF_PATH = globals().get("PDF_PATH", "/content/PhDThesis__Masurel2015.pdf")
FACTS_CANDIDATES = [
    "/content/facts_step7_postproc.csv",        # if you ran Step 7
    "/content/facts_step6_clean.csv",           # Step 6 clean
    "/content/facts_postproc_master.csv",       # alt name
    "/content/extracted_step4_offline.csv",     # Step 4 (offline)
    "/content/extracted_step4_offline.jsonl",
    "/content/extracted_step4_offline.parquet",
]
assert Path(PDF_PATH).exists(), f"Missing PDF at: {PDF_PATH}"
PDF_NAME = Path(PDF_PATH).name

def load_facts(paths=FACTS_CANDIDATES) -> pd.DataFrame:
    for p in paths:
        pth = Path(p)
        if not pth.exists():
            continue
        if pth.suffix == ".csv":
            df = pd.read_csv(pth)
        elif pth.suffix == ".jsonl":
            df = pd.read_json(pth, lines=True)
        else:
            try:
                df = pd.read_parquet(pth)
            except Exception:
                continue
        if not df.empty:
            print(f"[Step8] Loaded facts: {p} (rows={len(df)})")
            return df
    raise FileNotFoundError("No Step-7/6/4 artifacts found. Run Step 4 (offline) or Step 6 first.")

df = load_facts()

# ---------- Minimal hygiene ----------
for c in ("page","confidence"):
    if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")
for c in ("context_snippet","block_id","pdf_name","provenance_id","figure_table_id"):
    if c not in df.columns: df[c] = ""

# ---------- Read PDF text (per page) ----------
def read_pdf_pages_text(pdf_path: str):
    rdr = PdfReader(pdf_path)
    out = []
    for pg in rdr.pages:
        try:
            t = pg.extract_text() or ""
        except Exception:
            t = ""
        # common encoding fixes so regex works
        t = (t.replace("Â±", "±")
               .replace("\u2013", "-")   # en dash -> hyphen
               .replace("\u2014", "-"))  # em dash -> hyphen
        out.append(t)
    return out

pages_text = read_pdf_pages_text(PDF_PATH)
N = len(pages_text)

# ---------- Heuristics ----------
TITLE_PATS = [
    r'^\s*(chapter\s+[ivxlcdm\d]+\.?\s+.*)$',  # "Chapter I ...", "Chapter 1 ..."
    r'^\s*chapter\s+.*$',                      # loose chapter line
    r'^\s*\d+\.\s+[A-Z].{3,}$',                # "1. Title"
    r'^\s*[A-Z][A-Za-z0-9\s\-\(\),:]{4,}$',    # Title-like single line
]
TITLE_RE = re.compile("|".join(TITLE_PATS), re.IGNORECASE | re.MULTILINE)

def guess_section_title(pg_text: str) -> str|None:
    lines = (pg_text or "").strip().splitlines()
    for line in lines[:25]:
        m = TITLE_RE.search(line.strip())
        if m:
            return (m.group(m.lastindex) if m.lastindex else line).strip()
    return None

FIGTAB_RE = re.compile(r'\b((?:Figure|Fig\.|Table)\s+\d+[A-Za-z]?)\b', re.I)
def extract_figtab(text: str, limit=6):
    found = [m.group(1).strip() for m in FIGTAB_RE.finditer(text or "")]
    # de-dup preserve order
    seen, out = set(), []
    for x in found:
        if x not in seen:
            out.append(x); seen.add(x)
    return out[:limit]

CITE_RE = re.compile(
    r'\(([A-Z][A-Za-z\-]+(?:\s*&\s*[A-Z][A-Za-z\-]+)?|[A-Z][A-Za-z\-]+ et al\.)\s*,\s*(\d{4}[a-z]?)\)',
    re.UNICODE
)
def extract_citations(text: str, limit=8):
    t = (text or "").replace("Â±","±")
    hits = [f"{a} {y}" for a, y in CITE_RE.findall(t)]
    seen, out = set(), []
    for h in hits:
        if h not in seen:
            out.append(h); seen.add(h)
    return out[:limit]

# ---------- Enrich each fact ----------
enriched = []
for _, r in df.iterrows():
    page = int(r.get("page")) if pd.notna(r.get("page")) else 1
    page = 1 if page < 1 else page
    pg_idx = min(max(0, page-1), N-1)
    pg_text = pages_text[pg_idx] if 0 <= pg_idx < N else ""

    section_title = guess_section_title(pg_text) or ""

    ctx = str(r.get("context_snippet") or "")
    cites = list(dict.fromkeys(extract_citations(ctx) + extract_citations(pg_text)))
    figtab = []
    if str(r.get("figure_table_id") or "").strip():
        figtab.append(str(r.get("figure_table_id")).strip())
    figtab += extract_figtab(ctx)
    if not figtab:
        figtab += extract_figtab(pg_text)
    figtab = list(dict.fromkeys(figtab))[:6]
    figure_table_id = "; ".join(figtab)

    prov_existing = str(r.get("provenance_id") or "").strip()
    block_id = str(r.get("block_id") or "").strip()
    provenance_id = prov_existing if prov_existing else f"{PDF_NAME}#p{page}:{block_id}"
    pdf_page_hint = f"#page={page}"
    provenance_note = f"{PDF_NAME} page {page}, block {block_id}"

    row = dict(r)
    row.update({
        "pdf_name": row.get("pdf_name") or PDF_NAME,
        "section_title": section_title,
        "local_citations": "; ".join(cites) if cites else "",
        "figure_table_id": figure_table_id,
        "provenance_id": provenance_id,
        "pdf_page_hint": pdf_page_hint,
        "provenance_note": provenance_note,
    })
    enriched.append(row)

df8 = pd.DataFrame(enriched)

# ---------- Save ----------
OUT = "/content/facts_step8_provenance_offline.csv"
df8.to_csv(OUT, index=False)
print(f"[Step8] Saved with provenance → {OUT}")

# ---------- Peek ----------
cols = [c for c in [
    "category","attribute","value","value_min","value_max","unit","error",
    "page","block_id","section_title","figure_table_id","local_citations",
    "provenance_id","pdf_page_hint"
] if c in df8.columns]
print(df8[cols].head(10).to_string(index=False))


[Step8] Loaded facts: /content/facts_step7_postproc.csv (rows=113)
[Step8] Saved with provenance → /content/facts_step8_provenance_offline.csv
category attribute   value  value_min  value_max unit  error  page block_id                                                                                       section_title                        figure_table_id     local_citations                          provenance_id pdf_page_hint
   assay        As  350.00        NaN        NaN   °C    NaN    67     67-7                          Stage II is characterized by the abunda nce of antimony sulfosalts and the Fig. 11C; Fig. 11D; Fig. 11E; Fig. 11F                        PhDThesis__Masurel2015.pdf#p67:67-7      #page=67
   assay        Au     NaN      1.000        2.0  g/t    NaN   162   162-28 C) Field photograph showing an ENE-trending steep quartz vein (V 2) cut and sinistrally offset by a                      Figure 4; Fig. 5A                     PhDThesis__Masurel2015.pdf#p162:162-28     #pa