# AI Engineering Drawing Inspector v2.0

**Single-File QC Pipeline with Production Upgrades**

This notebook processes one drawing PDF at a time and produces 4 artifacts:
1. `ResolvedPartIdentity.json` - Part identification with confidence
2. `DrawingEvidence.json` - Extracted callouts (validates against schema v1.1.1)
3. `DiffResult.json` - Comparison with SolidWorks truth
4. `QCReport.md` - Human-readable QC report

**Production Upgrades Implemented:**
- **Upgrade 1**: Real OCR bounding boxes (Tesseract + LightOnOCR hybrid)
- **Upgrade 2**: Title-block / notes-region ROI OCR for identity extraction
- **Upgrade 3**: Explicit unit detection + normalization (INCH/MM)
- **Upgrade 4**: Deterministic canonicalization post-processor (regex-based)

**Models:**
- OCR: Tesseract (bboxes) + LightOnOCR-2 (text quality)
- VLM: Qwen2-VL-7B
- Judge: Text LLM

**Version:** 2.0 | **Schema:** DrawingEvidence v1.1.1

---
## Section 0: Configuration

In [None]:
# ============================================================
# SECTION 0A: Install Dependencies
# ============================================================
!pip install -q accelerate qwen-vl-utils pdf2image bitsandbytes
!pip install -q pymupdf opencv-python-headless jsonschema
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q pillow pypdfium2 huggingface_hub
!pip install -q pytesseract  # For real OCR bounding boxes
!apt-get install -y poppler-utils tesseract-ocr > /dev/null 2>&1

print("Dependencies installed!")

In [None]:
# ============================================================
# SECTION 0B: Configuration Variables
# ============================================================
import os
from pathlib import Path
from datetime import datetime

# =========================
# USER CONFIGURATION
# =========================
# Module-level settings shared by the cells below. Relative paths are
# resolved against the notebook's current working directory.

# Input: path of the single drawing PDF to inspect.
DRAWING_PDF_PATH = ""  # Must be set before running; an empty string prints as "(not set)" below

# SolidWorks JSON library folder
SOLIDWORKS_JSON_DIR = "sw_json_library"  # Folder containing part JSONs (searched recursively by the indexer)

# Output directory for artifacts
OUTPUT_DIR = "qc_output"

# Mode: "fast" (first page only) or "full" (all pages)
MODE = "fast"

# Schema file path (JSON Schema used to validate DrawingEvidence)
SCHEMA_PATH = "schemas/drawing_evidence_v1.1.1.schema.json"

# =========================
# DERIVED PATHS
# =========================
# Side effect at cell execution: the output folder is created if missing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Schema version we're using (kept in step with the file name in SCHEMA_PATH)
SCHEMA_VERSION = "1.1.1"

# Echo the effective settings so a run log records what was used.
print(f"Configuration:")
print(f"  DRAWING_PDF_PATH: {DRAWING_PDF_PATH or '(not set)'}")
print(f"  SOLIDWORKS_JSON_DIR: {SOLIDWORKS_JSON_DIR}")
print(f"  OUTPUT_DIR: {OUTPUT_DIR}")
print(f"  MODE: {MODE}")
print(f"  SCHEMA_VERSION: {SCHEMA_VERSION}")

In [None]:
# ============================================================
# SECTION 0C: Import Libraries & GPU Setup
# ============================================================
import json
import re
import gc
import torch
import fitz  # PyMuPDF
import numpy as np
from PIL import Image
from pathlib import Path
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any, Optional, Tuple
from datetime import datetime
import jsonschema

def clear_gpu_memory():
    """Run Python garbage collection, then release cached CUDA memory if a GPU is present.

    On machines without CUDA only the garbage-collection pass happens.
    """
    gc.collect()

    # Nothing GPU-side to release on CPU-only runtimes.
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    # Wait for outstanding GPU work to finish before returning.
    torch.cuda.synchronize()

# Report the runtime so GPU-dependent cells can be sanity-checked before models are loaded.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device 0 is reported; total_memory is divided by 1e9 to show decimal GB.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# ============================================================
# SECTION 0D: Load Schema for Validation
# ============================================================
from google.colab import files

# Check if schema exists, otherwise ask the user to upload it
if not os.path.exists(SCHEMA_PATH):
    schema_dir = os.path.dirname(SCHEMA_PATH)
    if schema_dir:
        # A bare file name has no directory part, and os.makedirs("") raises.
        os.makedirs(schema_dir, exist_ok=True)
    print(f"Schema not found at {SCHEMA_PATH}")
    print("Please upload drawing_evidence_v1.1.1.schema.json:")
    uploaded = files.upload()

    # Use the first uploaded file whose name mentions "schema".
    schema_name = next((name for name in uploaded if 'schema' in name.lower()), None)
    if schema_name is None:
        # Say so explicitly; otherwise the only symptom is the load warning below.
        print("No uploaded file has 'schema' in its name; nothing was saved.")
    else:
        with open(SCHEMA_PATH, 'wb') as f:
            f.write(uploaded[schema_name])
        print(f"Schema saved to {SCHEMA_PATH}")

# Load schema. DRAWING_EVIDENCE_SCHEMA is set to None when loading fails so
# later cells can detect that validation is unavailable.
try:
    # utf-8-sig reads plain UTF-8 and also tolerates a leading BOM.
    with open(SCHEMA_PATH, 'r', encoding='utf-8-sig') as f:
        DRAWING_EVIDENCE_SCHEMA = json.load(f)
    print(f"Schema loaded: {DRAWING_EVIDENCE_SCHEMA.get('title', 'Unknown')}")
    print(f"Schema version: {DRAWING_EVIDENCE_SCHEMA.get('properties', {}).get('schemaVersion', {}).get('const', 'Unknown')}")
except Exception as e:
    print(f"Warning: Could not load schema: {e}")
    DRAWING_EVIDENCE_SCHEMA = None

In [None]:
# ============================================================
# SECTION 0E: HuggingFace Authentication
# ============================================================
from huggingface_hub import login
from google.colab import userdata

# Read the HuggingFace token from Colab Secrets and log in with it.
# hf_token stays defined either way because the OCR model-loading cell
# passes it as `token=`; on any failure it is reset to None.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("Logged in to HuggingFace!")
except Exception as e:
    # Best-effort: report the problem and continue without a token.
    print(f"HF Login failed: {e}")
    print("Set HF_TOKEN in Colab Secrets (key icon in left sidebar)")
    hf_token = None

---
## Section 1: PDF Ingestion -> PageArtifacts

In [None]:
# ============================================================
# SECTION 1A: PageArtifacts Data Structure
# ============================================================

@dataclass
class PageArtifact:
    """One rendered PDF page together with the metadata recorded at render time."""
    # Both page numbers are stored: pageIndex0 is the value passed to the PDF
    # loader, page is the human/schema-facing number (pageIndex0 + 1).
    pageIndex0: int  # 0-indexed page number
    page: int  # 1-indexed page number (for schema)
    image: Image.Image  # Rendered page image (RGB)
    width: int  # Rendered image width in pixels
    height: int  # Rendered image height in pixels
    dpi: int  # Resolution the page was rendered at
    direct_text: Optional[str] = None  # PDF text layer if available


def render_pdf_to_artifacts(pdf_path: str, mode: str = "fast", dpi: int = 300) -> List[PageArtifact]:
    """
    Render PDF pages to PageArtifacts.

    Args:
        pdf_path: Path to PDF file
        mode: "fast" (first page only); any other value renders all pages
        dpi: Resolution for rendering

    Returns:
        List of PageArtifact objects, one per rendered page. An empty list is
        returned if the PDF cannot be opened or any page fails to render
        (the error is printed, not raised).
    """
    artifacts = []

    try:
        # The context manager closes the document even if a page fails to
        # render part-way through.
        with fitz.open(pdf_path) as doc:
            total_pages = len(doc)

            # Determine which pages to render; never ask for page 0 of an
            # empty document.
            if mode == "fast":
                pages_to_render = list(range(min(1, total_pages)))
            else:
                pages_to_render = list(range(total_pages))

            print(f"PDF: {os.path.basename(pdf_path)}")
            print(f"  Total pages: {total_pages}")
            print(f"  Rendering: {len(pages_to_render)} page(s) at {dpi} DPI")

            # PDF user space is 72 units per inch, so dpi / 72 is the scale factor.
            zoom = dpi / 72.0
            mat = fitz.Matrix(zoom, zoom)

            for page_idx in pages_to_render:
                page = doc.load_page(page_idx)

                # Render to an RGB image (no alpha channel)
                pix = page.get_pixmap(matrix=mat, alpha=False)
                img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

                # Extract the text layer; treat an empty or near-empty layer
                # as "no text layer" so callers only have to test for None.
                direct_text = page.get_text("text")
                if not direct_text or len(direct_text.strip()) < 10:
                    direct_text = None

                artifacts.append(PageArtifact(
                    pageIndex0=page_idx,
                    page=page_idx + 1,  # 1-indexed
                    image=img,
                    width=pix.width,
                    height=pix.height,
                    dpi=dpi,
                    direct_text=direct_text
                ))
                print(f"    Page {page_idx + 1}: {pix.width}x{pix.height}px")

    except Exception as e:
        # Best-effort by design: report and return nothing rather than crash the notebook.
        print(f"Error rendering PDF: {e}")
        return []

    return artifacts

# Cell-completion banner: confirms the definitions above were executed.
print("PageArtifact structure defined.")
print("render_pdf_to_artifacts() ready.")

---
## Section 2: SolidWorks JSON Library Indexing

In [None]:
# ============================================================
# SECTION 2A: SolidWorks JSON Library Index (BOM-Robust)
# ============================================================

@dataclass
class SwPartEntry:
    """One indexed SolidWorks part JSON: its identifiers plus the parsed content."""
    json_path: str  # Path of the JSON file this entry was loaded from
    part_number: str  # identity.partNumber from the JSON ('' when absent)
    custom_id: Optional[str] = None  # customProperties.ID
    custom_part_number: Optional[str] = None  # customProperties.PART_NUMBER / Part_Number
    filename_stem: str = ""  # JSON file name without its extension
    data: Dict[str, Any] = field(default_factory=dict)  # Full parsed JSON document


@dataclass
class LoadResult:
    """Summary of one library indexing pass over a directory of JSON files."""
    loaded: int  # Number of files parsed and indexed
    skipped: int  # Number of files that could not be read or processed
    skipped_files: List[Tuple[str, str]]  # (filename, reason)


def load_json_robust(filepath: Path) -> Tuple[Optional[Dict], Optional[str]]:
    """
    Load a JSON file with robust encoding handling.

    The file is read once as bytes and the encoding is chosen from its
    byte-order mark:
    1. UTF-32 / UTF-16 BOM -> decoded with that codec
    2. otherwise utf-8-sig (plain UTF-8, with or without a BOM)
    3. latin-1 as a fallback when the bytes are not valid UTF-8

    Args:
        filepath: Path (or path string) of the JSON file

    Returns:
        (data, error_message) - data is None if failed, error_message is None if success
    """
    try:
        with open(filepath, 'rb') as f:
            raw = f.read()
    except Exception as e:
        return None, f"Unexpected error: {str(e)[:50]}"

    # The UTF-32 LE BOM begins with the UTF-16 LE BOM, so test UTF-32 first.
    if raw.startswith((b'\xff\xfe\x00\x00', b'\x00\x00\xfe\xff')):
        encodings_to_try = ['utf-32']
    elif raw.startswith((b'\xff\xfe', b'\xfe\xff')):
        encodings_to_try = ['utf-16']
    else:
        # utf-8-sig also accepts BOM-less UTF-8, so no separate 'utf-8' pass is needed.
        encodings_to_try = ['utf-8-sig', 'latin-1']

    for encoding in encodings_to_try:
        try:
            text = raw.decode(encoding)
        except UnicodeDecodeError:
            continue

        # Decoding succeeded: a parse failure now is a content problem, not an
        # encoding problem, so report it instead of trying another codec.
        try:
            return json.loads(text), None
        except json.JSONDecodeError as e:
            return None, f"JSON parse error: {str(e)[:50]}"
        except Exception as e:
            return None, f"Unexpected error: {str(e)[:50]}"

    return None, "Failed all encoding attempts"


def normalize_json_to_utf8(filepath: Path, dry_run: bool = False) -> Tuple[bool, str]:
    """
    Strip a UTF-8 byte-order mark from a JSON file, rewriting it as plain UTF-8.

    Files without a BOM are left untouched. A file with a BOM is parsed and
    re-serialized (2-space indent, non-ASCII characters kept as-is).

    Args:
        filepath: Path to JSON file
        dry_run: If True, only report what would be done

    Returns:
        (success, message); any failure is reported as (False, "Error: ...")
    """
    utf8_bom = b'\xef\xbb\xbf'

    try:
        with open(filepath, 'rb') as src:
            payload = src.read()

        # No BOM means there is nothing to do.
        if payload[:len(utf8_bom)] != utf8_bom:
            return True, "Already clean UTF-8"

        if dry_run:
            return True, "Would remove BOM"

        # Parse first so an invalid document is rejected before the file is rewritten.
        body_text = payload[len(utf8_bom):].decode('utf-8')
        parsed = json.loads(body_text)

        with open(filepath, 'w', encoding='utf-8') as dst:
            json.dump(parsed, dst, indent=2, ensure_ascii=False)

        return True, "BOM removed, file normalized"

    except Exception as exc:
        return False, f"Error: {str(exc)[:50]}"


def normalize_json_directory(directory: str, dry_run: bool = True) -> Dict[str, Any]:
    """
    Run normalize_json_to_utf8 over every *.json file under a directory (recursive).

    Args:
        directory: Path to directory
        dry_run: If True, only report what would be done

    Returns:
        Summary dict with the keys total, already_clean, would_fix, fixed,
        errors and details, where details is a list of (filename, message)
        for every file that was not already clean
    """
    targets = list(Path(directory).glob("**/*.json"))

    summary = {
        "total": len(targets),
        "already_clean": 0,
        "would_fix": 0,
        "fixed": 0,
        "errors": 0,
        "details": []
    }

    banner_prefix = '[DRY RUN] ' if dry_run else ''
    print(f"{banner_prefix}Normalizing JSON files in {directory}...")
    print(f"  Found {len(targets)} JSON files")

    for json_file in targets:
        _, outcome = normalize_json_to_utf8(json_file, dry_run=dry_run)

        # Outcomes are classified by message text; clean files are counted but not listed.
        if "Already clean" in outcome:
            summary["already_clean"] += 1
            continue

        if "Would remove" in outcome:
            bucket = "would_fix"
        elif "BOM removed" in outcome:
            bucket = "fixed"
        else:
            bucket = "errors"

        summary[bucket] += 1
        summary["details"].append((json_file.name, outcome))

    if dry_run:
        fix_label, fix_count = 'Would fix', summary['would_fix']
    else:
        fix_label, fix_count = 'Fixed', summary['fixed']

    print(f"  Already clean: {summary['already_clean']}")
    print(f"  {fix_label}: {fix_count}")
    print(f"  Errors: {summary['errors']}")

    return summary


class SwJsonLibrary:
    """Index of SolidWorks JSON files for fast lookup (BOM-robust).

    Each part is indexed under four identifiers (part number, custom ID,
    custom part number, file name stem), both verbatim and in a normalized
    form with dashes, underscores, whitespace and case removed. When two files
    share an identifier, the one loaded later replaces the earlier one.
    """

    def __init__(self):
        self.by_part_number: Dict[str, SwPartEntry] = {}
        self.by_custom_id: Dict[str, SwPartEntry] = {}
        self.by_custom_pn: Dict[str, SwPartEntry] = {}
        self.by_filename: Dict[str, SwPartEntry] = {}
        self.all_entries: List[SwPartEntry] = []
        self.load_result: Optional[LoadResult] = None

    def _normalize(self, s: str) -> str:
        """Return *s* lower-cased with dashes, underscores and whitespace removed."""
        if not s:
            return ""
        return re.sub(r'[-\s_]', '', str(s)).lower()

    def _add_to_index(self, index: Dict[str, Any], key: Any, entry: Any) -> None:
        """Register *entry* in *index* under *key* and under its normalized form."""
        index[key] = entry
        norm = self._normalize(key)
        # An identifier made only of separators normalizes to "", and an
        # empty key would match every separator-only lookup.
        if norm:
            index[norm] = entry

    def load_from_directory(self, directory: str) -> LoadResult:
        """
        Load all JSON files under *directory* (recursive) into the index.

        Handles UTF-8 BOM and other encoding issues gracefully. Files that
        cannot be parsed or are not JSON objects are skipped and reported in
        the returned LoadResult rather than raising. Repeated calls add to the
        existing index.
        """
        loaded = 0
        skipped = 0
        skipped_files = []

        json_files = list(Path(directory).glob("**/*.json"))

        print(f"Indexing SolidWorks JSON library...")
        print(f"  Directory: {directory}")
        print(f"  Found {len(json_files)} JSON files")

        for json_path in json_files:
            # Use robust JSON loader
            data, error = load_json_robust(json_path)

            if data is None:
                skipped += 1
                skipped_files.append((json_path.name, error or "Unknown error"))
                continue

            if not isinstance(data, dict):
                # A top-level list/string/number cannot carry the identity fields.
                skipped += 1
                skipped_files.append((
                    json_path.name,
                    f"Processing error: top-level JSON is {type(data).__name__}, expected object"
                ))
                continue

            try:
                # `or {}` / `or ''` also cover keys that are present but null.
                identity = data.get('identity') or {}
                custom_props = data.get('customProperties') or {}

                part_number = identity.get('partNumber') or ''
                custom_id = custom_props.get('ID') or ''
                # Fall back to the alternate spelling when PART_NUMBER is missing *or* empty.
                custom_pn = custom_props.get('PART_NUMBER') or custom_props.get('Part_Number') or ''
                filename_stem = json_path.stem

                entry = SwPartEntry(
                    json_path=str(json_path),
                    part_number=part_number,
                    custom_id=custom_id,
                    custom_part_number=custom_pn,
                    filename_stem=filename_stem,
                    data=data
                )

                self.all_entries.append(entry)

                # Index by every identifier that is present
                if part_number:
                    self._add_to_index(self.by_part_number, part_number, entry)
                if custom_id:
                    self._add_to_index(self.by_custom_id, custom_id, entry)
                if custom_pn:
                    self._add_to_index(self.by_custom_pn, custom_pn, entry)
                if filename_stem:
                    self._add_to_index(self.by_filename, filename_stem, entry)

                loaded += 1

            except Exception as e:
                skipped += 1
                skipped_files.append((json_path.name, f"Processing error: {str(e)[:50]}"))

        # Store result
        self.load_result = LoadResult(
            loaded=loaded,
            skipped=skipped,
            skipped_files=skipped_files
        )

        # Print summary
        print(f"  Loaded: {loaded} files")
        print(f"  Skipped: {skipped} files")

        if skipped_files:
            print(f"  Skipped file details:")
            for fname, reason in skipped_files[:10]:  # Show first 10
                print(f"    - {fname}: {reason}")
            if len(skipped_files) > 10:
                print(f"    ... and {len(skipped_files) - 10} more")

        print(f"  Index sizes: by_part_number={len(self.by_part_number)}, by_filename={len(self.by_filename)}")

        return self.load_result

    def lookup(self, candidate: str) -> Optional[SwPartEntry]:
        """Look up a part by any identifier.

        Exact matches are tried first across all four indexes (part number,
        custom ID, custom part number, file name), then the normalized form in
        the same order. Returns None when nothing matches.
        """
        if not candidate:
            return None

        indexes = (
            self.by_part_number,
            self.by_custom_id,
            self.by_custom_pn,
            self.by_filename,
        )

        keys = [candidate]
        norm = self._normalize(candidate)
        if norm:  # never probe with an empty normalized key
            keys.append(norm)

        for key in keys:
            for index in indexes:
                if key in index:
                    return index[key]

        return None


# Global library instance. The identity-extraction helpers read this name at
# call time and skip the library lookup while it is None; the library-loading
# cell assigns the real instance.
sw_library: Optional[SwJsonLibrary] = None

print("SwJsonLibrary class defined (BOM-robust).")

In [None]:
# ============================================================
# SECTION 2B: Load SolidWorks Library
# ============================================================
from google.colab import files
import zipfile

# If the library folder is missing, ask for a ZIP upload and extract it there.
if not os.path.exists(SOLIDWORKS_JSON_DIR):
    print(f"SolidWorks JSON directory not found: {SOLIDWORKS_JSON_DIR}")
    print("Please upload a ZIP file containing your SolidWorks JSON files:")
    uploaded = files.upload()

    # Only the first upload whose name ends in lowercase ".zip" is extracted.
    for filename in uploaded:
        if filename.endswith('.zip'):
            print(f"Extracting {filename}...")
            # NOTE(review): the archive is opened by name, which assumes the
            # upload was saved into the working directory - confirm.
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(SOLIDWORKS_JSON_DIR)
            print(f"Extracted to {SOLIDWORKS_JSON_DIR}")
            break

# Initialize and load library. sw_library is always replaced with a fresh
# (possibly empty) instance, even when the directory is still missing.
sw_library = SwJsonLibrary()
if os.path.exists(SOLIDWORKS_JSON_DIR):
    sw_library.load_from_directory(SOLIDWORKS_JSON_DIR)
else:
    print(f"Warning: Could not find {SOLIDWORKS_JSON_DIR}")

---
## Section 3: ResolvePartIdentity

In [None]:
# ============================================================
# SECTION 3A: Part Identity Resolution
# ============================================================

@dataclass
class IdentityCandidate:
    """A candidate part number together with where it came from and whether the library knows it."""
    value: str  # Candidate part-number string
    source: str  # "filename", "ocr_title_block", "ocr_near_label"
    confidence: float  # Heuristic score assigned by the extractor (higher is better)
    match_found: bool = False  # True when the SolidWorks library lookup returned an entry
    json_path: Optional[str] = None  # json_path of the matched library entry, if any


@dataclass
class ResolvedPartIdentity:
    """Result of part identity resolution."""
    resolvedPartNumber: Optional[str]  # Value of the winning candidate; None when there were no candidates
    jsonPath: Optional[str]  # Library JSON path of the winning candidate; None when it was unmatched
    confidence: float  # Final confidence after conflict / no-match penalties
    candidates: Dict[str, List[Dict]]  # Candidate dicts grouped under "fromFilename", "fromOcrTitleBlock", "matchesFound"
    notes: List[str]  # Human-readable explanation of how the result was reached
    needsReview: bool = False  # True when a person should confirm the result


def extract_pn_from_filename(filepath: str) -> List[IdentityCandidate]:
    """
    Extract part number candidates from a drawing file name.

    Only the first space/underscore-separated token of the name is used.
    Handles patterns like:
    - 1013572_01.pdf -> 1013572
    - 101357201-03.pdf -> 1013572 (reached by progressive digit stripping)
    - 314884W_0.pdf -> 314884

    Args:
        filepath: Path or file name of the drawing PDF

    Returns:
        Candidates in the order they were generated (most trusted
        transformations first). Each value appears once; values shorter than
        4 characters are dropped. Each candidate is checked against the global
        ``sw_library`` when one is loaded.
    """
    candidates = []
    filename = os.path.basename(filepath)
    name_no_ext = os.path.splitext(filename)[0]

    # Remove common suffixes
    name_no_ext = re.sub(r'\s*\(\d+\)$', '', name_no_ext)  # Remove (1), (2) etc
    name_no_ext = re.sub(r'[\s_]*PAINT$', '', name_no_ext, flags=re.IGNORECASE)

    # Split by common separators. Empty tokens are dropped so a leading
    # separator ("_1013572.pdf") does not leave an empty base token.
    parts = [p for p in re.split(r'[\s_]+', name_no_ext) if p]
    if not parts:
        return candidates

    base = parts[0]
    tried = set()

    def add_candidate(val: str, conf: float):
        # De-duplicate, enforce the minimum length, and record the library hit if any.
        if val and val not in tried and len(val) >= 4:
            tried.add(val)
            match = sw_library.lookup(val) if sw_library else None
            candidates.append(IdentityCandidate(
                value=val,
                source="filename",
                confidence=conf,
                match_found=match is not None,
                json_path=match.json_path if match else None
            ))

    # Try various transformations
    add_candidate(base, 0.9)
    add_candidate(base.replace('-', ''), 0.85)

    # Remove letter suffix
    if base[-1].isalpha():
        add_candidate(base[:-1], 0.8)
        add_candidate(base[:-1].replace('-', ''), 0.75)

    # Handle revision pattern (1013572-01)
    rev_match = re.match(r'^(.+)-(\d{1,2})$', base)
    if rev_match:
        main_part = rev_match.group(1)
        add_candidate(main_part, 0.85)
        add_candidate(main_part.replace('-', ''), 0.8)

    # Progressive digit stripping: drop one trailing character at a time, at
    # most 8 times and never below 5 characters. Confidences are computed from
    # the step number so they are exactly 0.70, 0.65, ... 0.35 instead of
    # accumulating floating-point drift.
    temp = base.replace('-', '')
    for step in range(8):
        if len(temp) <= 5:
            break
        temp = temp[:-1]
        add_candidate(temp, round(0.7 - 0.05 * step, 2))

    return candidates


def extract_pn_from_ocr(ocr_lines: List[str]) -> List[IdentityCandidate]:
    """
    Extract part number candidates from OCR text.

    Two passes over the joined text:
    1. Values following a label such as PART NO, DWG NO, P/N, ID, ITEM NO
       (source "ocr_near_label", confidence 0.85).
    2. Standalone tokens shaped like part numbers
       (source "ocr_title_block", confidence 0.6).

    Args:
        ocr_lines: OCR text, one string per line

    Returns:
        Candidates in discovery order. Each value appears once; values shorter
        than 4 characters or without any digit are dropped. Each candidate is
        checked against the global ``sw_library`` when one is loaded.
    """
    candidates = []
    tried = set()

    # Labels that typically precede part numbers. The leading lookbehind keeps
    # a label from matching in the middle of a longer word (e.g. the "ID" in
    # "SOLIDWORKS2020"), while still allowing forms such as "PART_ID".
    not_mid_word = r'(?<![A-Za-z0-9])'
    pn_labels = [
        r'PART\s*(?:NO|NUMBER|#)[.:]?\s*',
        r'DWG\s*(?:NO|NUMBER|#)[.:]?\s*',
        r'DRAWING\s*(?:NO|NUMBER|#)[.:]?\s*',
        r'P/?N[.:]?\s*',
        r'ID[.:]?\s*',
        r'ITEM\s*(?:NO|#)[.:]?\s*',
    ]
    labeled_value = r'([A-Z0-9][-A-Z0-9]{3,20})'

    def add_candidate(val: str, source: str, conf: float):
        val = val.strip()
        if val and val not in tried and len(val) >= 4:
            # Check if it looks like a part number (has digits)
            if not any(c.isdigit() for c in val):
                return
            tried.add(val)
            match = sw_library.lookup(val) if sw_library else None
            candidates.append(IdentityCandidate(
                value=val,
                source=source,
                confidence=conf,
                match_found=match is not None,
                json_path=match.json_path if match else None
            ))

    full_text = "\n".join(ocr_lines)

    # Search for labeled part numbers (case-insensitive; the trailing \s* in
    # each label lets the value sit on the line after the label).
    for label in pn_labels:
        pattern = not_mid_word + label + labeled_value
        for m in re.finditer(pattern, full_text, re.IGNORECASE):
            add_candidate(m.group(1), "ocr_near_label", 0.85)

    # Look for standalone part number patterns (case-sensitive)
    pn_patterns = [
        r'\b(\d{6,8})\b',  # 6-8 digit numbers
        r'\b(\d{5,7}[A-Z])\b',  # Digits + letter suffix
        r'\b([A-Z]{1,3}\d{5,8})\b',  # Letter prefix + digits
    ]

    for pattern in pn_patterns:
        for m in re.finditer(pattern, full_text):
            add_candidate(m.group(1), "ocr_title_block", 0.6)

    return candidates


def resolve_part_identity(
    pdf_path: str,
    ocr_lines: Optional[List[str]] = None
) -> ResolvedPartIdentity:
    """
    Resolve part identity from filename and OCR.

    Candidates that match the SolidWorks library win over unmatched ones;
    within each group the highest confidence wins, with filename candidates
    ahead of OCR candidates on ties.

    Args:
        pdf_path: Path of the drawing PDF (only the file name is inspected)
        ocr_lines: Optional OCR text lines used as a second candidate source

    Returns:
        ResolvedPartIdentity with the best match and alternatives. needsReview
        is set when matches point at different library files, when nothing
        matched, or when the final confidence is below 0.5.
    """
    notes = []

    # Get candidates from filename
    filename_candidates = extract_pn_from_filename(pdf_path)

    # Get candidates from OCR
    ocr_candidates = extract_pn_from_ocr(ocr_lines) if ocr_lines else []

    # Combine all candidates
    all_candidates = filename_candidates + ocr_candidates

    # Find best match (prefer matches found in library). sorted() is stable,
    # so equal-confidence candidates keep their filename-before-OCR order.
    matched = sorted(
        (c for c in all_candidates if c.match_found),
        key=lambda c: c.confidence, reverse=True
    )
    unmatched = sorted(
        (c for c in all_candidates if not c.match_found),
        key=lambda c: c.confidence, reverse=True
    )

    best_candidate = None
    confidence = 0.0
    needs_review = False

    if matched:
        best_candidate = matched[0]
        confidence = best_candidate.confidence
        notes.append(f"Matched '{best_candidate.value}' from {best_candidate.source}")

        # Matches conflict only when they resolve to different library files.
        # Different spellings of one part ("10-13572" and "1013572") hit the
        # same JSON and must not be penalized.
        distinct_targets = {c.json_path for c in matched}
        if len(distinct_targets) > 1:
            # Sorted so the note is identical from run to run.
            conflicting_values = sorted({c.value for c in matched})
            notes.append(f"Multiple matches found: {conflicting_values}")
            needs_review = True
            confidence *= 0.7
    else:
        notes.append("No match found in SolidWorks library")
        needs_review = True
        if unmatched:
            best_candidate = unmatched[0]
            confidence = best_candidate.confidence * 0.5  # Reduce confidence
            notes.append(f"Best unmatched candidate: '{best_candidate.value}'")

    # Low confidence threshold
    if confidence < 0.5:
        needs_review = True
        notes.append("Low confidence - needs manual review")

    return ResolvedPartIdentity(
        resolvedPartNumber=best_candidate.value if best_candidate else None,
        jsonPath=best_candidate.json_path if best_candidate else None,
        confidence=confidence,
        candidates={
            "fromFilename": [asdict(c) for c in filename_candidates],
            "fromOcrTitleBlock": [asdict(c) for c in ocr_candidates],
            "matchesFound": [asdict(c) for c in matched]
        },
        notes=notes,
        needsReview=needs_review
    )

# Cell-completion banner: confirms the identity-resolution definitions above were executed.
print("Part identity resolution functions defined.")

---
## Section 4: OCR Evidence Extraction

In [None]:
# ============================================================
# SECTION 4A: Load LightOnOCR-2 Model
# ============================================================
import torch
from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor

# Free cached GPU memory before loading the OCR model.
clear_gpu_memory()

print("Loading LightOnOCR-2-1B...")

# Use bfloat16 on GPU; fall back to float32 on CPU.
ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.bfloat16 if ocr_device == "cuda" else torch.float32

# hf_token comes from the authentication cell and may be None if login failed.
ocr_model = LightOnOcrForConditionalGeneration.from_pretrained(
    "lightonai/LightOnOCR-2-1B",
    torch_dtype=ocr_dtype,
    token=hf_token
).to(ocr_device)

ocr_processor = LightOnOcrProcessor.from_pretrained(
    "lightonai/LightOnOCR-2-1B",
    token=hf_token
)

# get_memory_footprint() is divided by 1e9 to show decimal GB.
print(f"LightOnOCR-2 loaded: {ocr_model.get_memory_footprint() / 1e9:.2f} GB")

In [None]:
# ============================================================
# SECTION 4B: OCR Block Extraction with Real Bounding Boxes
# ============================================================
import pytesseract
from PIL import Image
import numpy as np

@dataclass
class OcrBlock:
    """A piece of OCR text (a word, or a merged line) with its location on the page."""
    text: str  # Recognized text
    confidence: float  # 0-1; the Tesseract confidence divided by 100 (averaged for merged lines)
    pageIndex0: int  # 0-indexed page the text was found on
    bbox: Optional[Dict[str, float]] = None  # Normalized 0-1 coords: x, y, width, height
    line_index: int = 0  # Index of the text line this block belongs to
    rawBbox: Optional[Dict[str, int]] = None  # Pixel coordinates for debugging (None on merged lines)


def run_tesseract_ocr(image: Image.Image, page_index: int) -> List[OcrBlock]:
    """
    Run Tesseract OCR to get text with real bounding boxes.

    Args:
        image: Page image to recognize
        page_index: 0-indexed page number stored on every returned block

    Returns:
        One OcrBlock per recognized word, in Tesseract's output order, with
        both normalized (0-1) and pixel bounding boxes. Words with a
        confidence below 10 are dropped. Returns an empty list if Tesseract
        fails (the error is printed) or the image has no area.
    """
    blocks = []

    # Convert to RGB if needed
    img = image.convert("RGB")
    img_width, img_height = img.size
    if img_width == 0 or img_height == 0:
        # Nothing to recognize, and normalizing would divide by zero.
        return blocks

    # Get OCR data with bounding boxes
    # Output format: dict with 'text', 'conf', 'left', 'top', 'width', 'height', 'level'
    try:
        ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    except Exception as e:
        print(f"    Tesseract error: {e}")
        return blocks

    # Y coordinate of the first word on the line currently being read.
    current_line_y = None
    line_index = 0

    for i, raw_text in enumerate(ocr_data['text']):
        text = raw_text.strip()

        # Depending on the Tesseract / pytesseract versions, 'conf' arrives as
        # an int, a float, or a string such as "96.000000". float() accepts
        # all of them; int() raises on the float-formatted strings.
        try:
            conf = float(ocr_data['conf'][i])
        except (TypeError, ValueError):
            continue

        # Skip empty text (structural rows) or very low confidence
        if not text or conf < 10:
            continue

        # Get pixel coordinates
        x_px = ocr_data['left'][i]
        y_px = ocr_data['top'][i]
        w_px = ocr_data['width'][i]
        h_px = ocr_data['height'][i]

        # Detect line breaks: a vertical jump of more than half the word
        # height relative to the line's first word starts a new line.
        if current_line_y is None:
            current_line_y = y_px
        elif abs(y_px - current_line_y) > h_px * 0.5:
            line_index += 1
            current_line_y = y_px

        blocks.append(OcrBlock(
            text=text,
            confidence=conf / 100.0,  # Normalize to 0-1
            pageIndex0=page_index,
            bbox={
                # Normalize to 0-1 range relative to the image size
                "x": round(x_px / img_width, 4),
                "y": round(y_px / img_height, 4),
                "width": round(w_px / img_width, 4),
                "height": round(h_px / img_height, 4)
            },
            rawBbox={
                "x": x_px,
                "y": y_px,
                "width": w_px,
                "height": h_px
            },
            line_index=line_index
        ))

    return blocks


def merge_words_to_lines(blocks: List[OcrBlock], y_tolerance: float = 0.015) -> List[OcrBlock]:
    """
    Group word blocks into text lines and merge each group into one block.

    Words are ordered top-to-bottom, then left-to-right. A word joins the
    current line when its normalized Y is within ``y_tolerance`` of the Y of
    that line's first word; otherwise it starts a new line. Lines are numbered
    from 0 in top-to-bottom order.
    """
    if not blocks:
        return []

    ordered = sorted(blocks, key=lambda b: (b.bbox['y'], b.bbox['x']))

    # Pass 1: partition the ordered words into per-line groups.
    line_groups = []
    anchor_y = None  # Y of the first word of the line being built
    for word in ordered:
        word_y = word.bbox['y']
        if line_groups and abs(word_y - anchor_y) <= y_tolerance:
            line_groups[-1].append(word)
        else:
            line_groups.append([word])
            anchor_y = word_y

    # Pass 2: collapse each group into a single line block.
    return [merge_line_words(group, idx) for idx, group in enumerate(line_groups)]


def merge_line_words(words: List[OcrBlock], line_idx: int) -> OcrBlock:
    """Collapse the word blocks of one line into a single line block.

    The text is the words joined by single spaces in left-to-right order, the
    confidence is their mean, and the bounding box is the smallest box that
    contains every word. Returns None for an empty list.
    """
    if not words:
        return None

    # Left-to-right reading order
    ordered = sorted(words, key=lambda w: w.bbox['x'])

    line_text = " ".join(w.text for w in ordered)
    mean_conf = sum(w.confidence for w in ordered) / len(ordered)

    # Union of the word boxes, expressed as edges first.
    left = min(w.bbox['x'] for w in ordered)
    top = min(w.bbox['y'] for w in ordered)
    right = max(w.bbox['x'] + w.bbox['width'] for w in ordered)
    bottom = max(w.bbox['y'] + w.bbox['height'] for w in ordered)

    line_bbox = {
        "x": round(left, 4),
        "y": round(top, 4),
        "width": round(right - left, 4),
        "height": round(bottom - top, 4)
    }

    return OcrBlock(
        text=line_text,
        confidence=mean_conf,
        pageIndex0=ordered[0].pageIndex0,
        bbox=line_bbox,
        rawBbox=None,  # Not preserved after merge
        line_index=line_idx
    )


def run_ocr_on_page(page_artifact: PageArtifact, use_lighton: bool = True) -> List[OcrBlock]:
    """
    OCR one page and return line-level OcrBlocks.

    Tesseract supplies the word boxes, which are merged into lines. When
    `use_lighton` is true and the global `ocr_model` is loaded, the page is
    also transcribed by that model and its lines are aligned onto the
    Tesseract boxes. Any failure in that second stage is printed and the
    Tesseract-only lines are returned.
    """
    global ocr_model, ocr_processor, ocr_device, ocr_dtype

    # Stage 1: word boxes from Tesseract
    tesseract_words = run_tesseract_ocr(page_artifact.image, page_artifact.pageIndex0)
    if not tesseract_words:
        print(f"    No text found by Tesseract on page {page_artifact.page}")
        return []

    # Stage 2: group words into lines
    line_blocks = merge_words_to_lines(tesseract_words, y_tolerance=0.012)

    # Stage 3 is optional; bail out early when it is disabled or unavailable
    if not use_lighton or ocr_model is None:
        return line_blocks

    try:
        rgb_image = page_artifact.image.convert("RGB")
        chat = [{"role": "user", "content": [{"type": "image", "image": rgb_image}]}]
        encoded = ocr_processor.apply_chat_template(
            chat, add_generation_prompt=True, tokenize=True,
            return_dict=True, return_tensors="pt"
        )

        # Floating-point tensors are also cast to the model dtype; others only move device
        model_inputs = {}
        for name, tensor in encoded.items():
            if tensor.is_floating_point():
                model_inputs[name] = tensor.to(device=ocr_device, dtype=ocr_dtype)
            else:
                model_inputs[name] = tensor.to(ocr_device)

        with torch.no_grad():
            output_ids = ocr_model.generate(**model_inputs, max_new_tokens=2048)

        # Drop the prompt tokens so only newly generated text is decoded
        prompt_length = model_inputs["input_ids"].shape[1]
        new_token_ids = output_ids[0, prompt_length:]
        decoded = ocr_processor.decode(new_token_ids, skip_special_tokens=True)

        stripped = (raw_line.strip() for raw_line in decoded.split("\n"))
        lighton_lines = [candidate for candidate in stripped if candidate]

        line_blocks = align_lighton_with_tesseract(line_blocks, lighton_lines)

    except Exception as e:
        # Best effort: keep whatever line_blocks holds (Tesseract-only on failure)
        print(f"    LightOnOCR enhancement failed: {e}")

    return line_blocks


def align_lighton_with_tesseract(
    tesseract_blocks: List[OcrBlock],
    lighton_lines: List[str]
) -> List[OcrBlock]:
    """
    Align LightOnOCR text with Tesseract bounding boxes.

    For every Tesseract line the most similar unused LightOnOCR line is
    looked up. When the similarity exceeds 0.7 the LightOnOCR text replaces
    the Tesseract text (bbox, page and line index are kept, confidence is
    raised to at least 0.9); otherwise the Tesseract block is kept unchanged.

    Similarity is `difflib.SequenceMatcher.ratio()` on lower-cased,
    space-stripped text. An order-aware measure is required here: drawing
    text is mostly digits, so a character-membership test rates unrelated
    dimensions such as "12.50" and "50.21" as identical and would attach
    the wrong text to a bounding box.

    Args:
        tesseract_blocks: Line blocks carrying the bounding boxes.
        lighton_lines: Text lines produced by LightOnOCR.

    Returns:
        A new list with one block per input block, in the same order. The
        input list itself is returned when either argument is empty.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from difflib import SequenceMatcher

    if not lighton_lines or not tesseract_blocks:
        return tesseract_blocks

    min_key_length = 3      # shorter keys are too ambiguous to match on
    accept_threshold = 0.7  # minimum similarity to trust the replacement

    def _match_key(text):
        """Normalise text for comparison: case-folded, spaces removed."""
        return text.lower().replace(" ", "")

    # Normalise each LightOnOCR line once instead of once per Tesseract block
    lighton_keys = [_match_key(line) for line in lighton_lines]

    result = []
    used_lighton = set()  # each LightOnOCR line may be assigned only once

    for tess_block in tesseract_blocks:
        tess_key = _match_key(tess_block.text)
        best_idx = -1
        best_score = 0.0

        if len(tess_key) >= min_key_length:
            for i, lo_key in enumerate(lighton_keys):
                if i in used_lighton or len(lo_key) < min_key_length:
                    continue

                # autojunk=False: never discard frequent characters (digits, '.')
                score = SequenceMatcher(None, tess_key, lo_key, autojunk=False).ratio()
                if score > best_score:
                    best_score = score
                    best_idx = i

        if best_idx >= 0 and best_score > accept_threshold:
            # Use LightOnOCR text with Tesseract bbox
            used_lighton.add(best_idx)
            result.append(OcrBlock(
                text=lighton_lines[best_idx],
                confidence=max(tess_block.confidence, 0.9),  # Boost confidence for LightOnOCR
                pageIndex0=tess_block.pageIndex0,
                bbox=tess_block.bbox,
                rawBbox=tess_block.rawBbox,
                line_index=tess_block.line_index
            ))
        else:
            # Keep original Tesseract result
            result.append(tess_block)

    return result


def run_ocr_on_all_pages(page_artifacts: List[PageArtifact]) -> List[OcrBlock]:
    """
    OCR every page in order and return all line blocks in one flat list.

    Prints a progress line per page followed by the number of lines found
    and their mean confidence (0 when the page produced no blocks).
    """
    combined = []

    for page in page_artifacts:
        print(f"  Running OCR on page {page.page}...")
        page_blocks = run_ocr_on_page(page)
        combined += page_blocks

        # Per-page summary; guard the mean against an empty page
        if page_blocks:
            mean_conf = sum(entry.confidence for entry in page_blocks) / len(page_blocks)
        else:
            mean_conf = 0
        print(f"    Found {len(page_blocks)} text lines, avg confidence: {mean_conf:.0%}")

    return combined

# Status message emitted once the OCR helper definitions above have been executed.
print("OCR extraction with real bounding boxes defined.")

In [None]:
# ============================================================
# SECTION 4C: Title Block & Notes Region ROI OCR
# ============================================================
"""
Engineering drawings typically have:
- Title block: bottom-right corner (20-25% width, 15-20% height)
- General notes: left side or top-left (varies)
- Revision block: top-right or adjacent to title block

This section detects these regions and runs targeted OCR for:
- Part number, description, material, finish
- Drawing units (INCHES/MM)
- Revision info
"""

@dataclass
class DrawingRegion:
    """
    A detected region of interest on a drawing.

    The same rectangle is stored twice: as fractions of the page size
    (`bbox_norm`) and in pixels (`bbox_px`). Both dicts use the keys
    "x", "y", "width" and "height".
    """
    region_type: str  # "title_block", "notes", "revision_block"
    bbox_norm: Dict[str, float]  # Normalized 0-1 coords
    bbox_px: Dict[str, int]  # Pixel coords
    confidence: float  # Detection score assigned by the detector that built the region
    ocr_text: List[str] = field(default_factory=list)  # Text of the OCR blocks found inside the region


# Candidate title block locations (normalized coordinates).
# Each entry is a dict with a descriptive "name" plus "x", "y" (top-left corner)
# and "w", "h" (size), all expressed as fractions of the image size.
# The bottom-right candidates are nested inside one another, from large to small.
TITLE_BLOCK_REGIONS = [
    # Standard bottom-right title block (most common)
    {"name": "bottom_right_large", "x": 0.60, "y": 0.75, "w": 0.40, "h": 0.25},
    {"name": "bottom_right_medium", "x": 0.65, "y": 0.80, "w": 0.35, "h": 0.20},
    {"name": "bottom_right_small", "x": 0.70, "y": 0.85, "w": 0.30, "h": 0.15},
    # Full bottom strip (for wide title blocks)
    {"name": "bottom_strip", "x": 0.0, "y": 0.85, "w": 1.0, "h": 0.15},
]

# Candidate general-notes locations, same dict layout as TITLE_BLOCK_REGIONS.
NOTES_REGIONS = [
    # Left side notes column
    {"name": "left_notes", "x": 0.0, "y": 0.0, "w": 0.25, "h": 0.70},
    # Top-left notes area
    {"name": "top_left_notes", "x": 0.0, "y": 0.0, "w": 0.40, "h": 0.25},
    # Right side notes (less common)
    {"name": "right_notes", "x": 0.75, "y": 0.0, "w": 0.25, "h": 0.60},
]


def detect_title_block_region(
    page_artifact: PageArtifact,
    ocr_blocks: List[OcrBlock]
) -> Optional[DrawingRegion]:
    """
    Pick the candidate title block region with the densest keyword cluster.

    Each entry of TITLE_BLOCK_REGIONS is scored as
    (keyword hits) / (OCR blocks whose top-left corner lies in the region),
    considering only blocks on this page. A candidate needs at least three
    keyword hits; the highest-scoring one wins. Returns None when no
    candidate qualifies.
    """
    title_keywords = [
        'PART', 'DWG', 'DRAWING', 'MATERIAL', 'SCALE', 'SHEET',
        'FINISH', 'TITLE', 'REV', 'DATE', 'DRAWN', 'CHECKED',
        'APPROVED', 'SIZE', 'CAGE', 'FSCM', 'P/N', 'UNLESS'
    ]

    img_w, img_h = page_artifact.width, page_artifact.height

    # Blocks from other pages never participate
    page_blocks = [blk for blk in ocr_blocks if blk.pageIndex0 == page_artifact.pageIndex0]

    winner = None
    top_score = 0

    for region_def in TITLE_BLOCK_REGIONS:
        x_lo, y_lo = region_def['x'], region_def['y']
        x_hi = region_def['x'] + region_def['w']
        y_hi = region_def['y'] + region_def['h']

        # Membership is decided by the block's top-left corner only
        region_texts = [
            blk.text for blk in page_blocks
            if x_lo <= blk.bbox.get('x', 0) <= x_hi
            and y_lo <= blk.bbox.get('y', 0) <= y_hi
        ]
        if not region_texts:
            continue

        # Substring hits; one block may contribute several keywords
        keyword_hits = sum(
            1
            for upper_text in (entry.upper() for entry in region_texts)
            for kw in title_keywords
            if kw in upper_text
        )

        density = keyword_hits / len(region_texts)
        if keyword_hits < 3 or density <= top_score:
            continue

        top_score = density
        winner = DrawingRegion(
            region_type="title_block",
            bbox_norm={
                "x": region_def['x'],
                "y": region_def['y'],
                "width": region_def['w'],
                "height": region_def['h']
            },
            bbox_px={
                "x": int(region_def['x'] * img_w),
                "y": int(region_def['y'] * img_h),
                "width": int(region_def['w'] * img_w),
                "height": int(region_def['h'] * img_h)
            },
            confidence=min(density, 0.95),
            ocr_text=region_texts
        )

    return winner


def detect_notes_region(
    page_artifact: PageArtifact,
    ocr_blocks: List[OcrBlock]
) -> Optional[DrawingRegion]:
    """
    Pick the candidate notes region that looks most like a notes list.

    Each entry of NOTES_REGIONS is scored as
    (indicator phrase hits) + 0.5 * (lines starting with a 1-2 digit number
    followed by '.' or ')'), using only blocks on this page whose top-left
    corner lies inside the candidate. A candidate needs at least one
    indicator hit; the highest score wins. Returns None otherwise.
    """
    notes_indicators = [
        'NOTES', 'GENERAL NOTES', 'UNLESS OTHERWISE', 'ALL DIMENSIONS',
        'REMOVE BURRS', 'BREAK EDGES', 'TOLERANCES', 'INTERPRET'
    ]

    img_w, img_h = page_artifact.width, page_artifact.height

    # Numeric list markers only ("1.", "12)"); lettered items are not matched
    numbered_line = re.compile(r'^[\d]{1,2}[.\)]')

    winner = None
    top_score = 0

    for region_def in NOTES_REGIONS:
        x_lo, y_lo = region_def['x'], region_def['y']
        x_hi = region_def['x'] + region_def['w']
        y_hi = region_def['y'] + region_def['h']

        region_texts = []
        for block in ocr_blocks:
            if block.pageIndex0 != page_artifact.pageIndex0:
                continue
            corner_x = block.bbox.get('x', 0)
            corner_y = block.bbox.get('y', 0)
            if x_lo <= corner_x <= x_hi and y_lo <= corner_y <= y_hi:
                region_texts.append(block.text)

        indicator_hits = sum(
            1
            for upper_text in (entry.upper() for entry in region_texts)
            for phrase in notes_indicators
            if phrase in upper_text
        )
        numbered_hits = sum(1 for entry in region_texts if numbered_line.match(entry.strip()))

        score = indicator_hits + (numbered_hits * 0.5)
        if not region_texts or indicator_hits < 1 or score <= top_score:
            continue

        top_score = score
        winner = DrawingRegion(
            region_type="notes",
            bbox_norm={
                "x": region_def['x'],
                "y": region_def['y'],
                "width": region_def['w'],
                "height": region_def['h']
            },
            bbox_px={
                "x": int(region_def['x'] * img_w),
                "y": int(region_def['y'] * img_h),
                "width": int(region_def['w'] * img_w),
                "height": int(region_def['h'] * img_h)
            },
            confidence=min(score / 5, 0.9),
            ocr_text=region_texts
        )

    return winner


def run_roi_ocr(
    page_artifact: PageArtifact,
    region: DrawingRegion
) -> List[OcrBlock]:
    """
    Run targeted Tesseract OCR on one region of the page.

    The region's pixel box is padded by 10 px, clamped to the image, cropped
    and recognised with `--oem 3 --psm 6`. Words with confidence below 20 or
    empty text are dropped. Returned coordinates are mapped back to the full
    page: `bbox` is normalised by the page size, `rawBbox` is in page pixels.

    Args:
        page_artifact: Page whose `image` and `pageIndex0` are used.
        region: Region whose `bbox_px` selects the crop.

    Returns:
        One OcrBlock per accepted word; an empty list if Tesseract fails.
    """
    img = page_artifact.image

    # Crop region (with small padding), clamped to the image bounds
    px = region.bbox_px
    pad = 10
    x1 = max(0, px['x'] - pad)
    y1 = max(0, px['y'] - pad)
    x2 = min(img.width, px['x'] + px['width'] + pad)
    y2 = min(img.height, px['y'] + px['height'] + pad)

    cropped = img.crop((x1, y1, x2, y2))

    try:
        # PSM 6 treats the crop as a single uniform block of text
        custom_config = r'--oem 3 --psm 6'
        ocr_data = pytesseract.image_to_data(
            cropped,
            output_type=pytesseract.Output.DICT,
            config=custom_config
        )
    except Exception as e:
        print(f"    ROI OCR error: {e}")
        return []

    blocks = []
    rows = zip(
        ocr_data['text'], ocr_data['conf'],
        ocr_data['left'], ocr_data['top'],
        ocr_data['width'], ocr_data['height']
    )

    for i, (raw_text, raw_conf, local_x, local_y, w, h) in enumerate(rows):
        text = raw_text.strip()

        # Confidence may arrive as int, float or a float-formatted string
        # (e.g. "96.06"); int() alone raises on the latter.
        try:
            conf = int(float(raw_conf))
        except (TypeError, ValueError):
            continue

        if not text or conf < 20:
            continue

        # Convert crop-relative coords back to full image coords
        global_x = x1 + local_x
        global_y = y1 + local_y

        blocks.append(OcrBlock(
            text=text,
            confidence=conf / 100.0,
            pageIndex0=page_artifact.pageIndex0,
            bbox={
                "x": round(global_x / img.width, 4),
                "y": round(global_y / img.height, 4),
                "width": round(w / img.width, 4),
                "height": round(h / img.height, 4)
            },
            rawBbox={
                "x": global_x,
                "y": global_y,
                "width": w,
                "height": h
            },
            line_index=i  # index of the row in Tesseract's output table
        ))

    return blocks


def extract_identity_from_title_block(
    title_block_region: Optional[DrawingRegion],
    all_ocr_blocks: List[OcrBlock]
) -> Dict[str, Any]:
    """
    Extract structured identity info from title block region.

    The region's OCR lines are joined with spaces, upper-cased and searched
    with ordered regex lists; the first pattern that yields a value wins.

    Args:
        title_block_region: Detected title block, or None.
        all_ocr_blocks: Accepted for interface compatibility; not read here.

    Returns:
        Dict with keys partNumber, description, material, finish, scale,
        revision, sheetInfo (each None when not found; description is never
        populated by this function) and rawTitleBlockText (a copy of the
        region's OCR lines).
    """
    result = {
        "partNumber": None,
        "description": None,
        "material": None,
        "finish": None,
        "scale": None,
        "revision": None,
        "sheetInfo": None,
        "rawTitleBlockText": []
    }

    if not title_block_region:
        return result

    # Copy so callers mutating the result cannot alter the region's list
    title_text_lines = list(title_block_region.ocr_text or [])
    result["rawTitleBlockText"] = title_text_lines

    full_text = " ".join(title_text_lines).upper()

    # Part number patterns (label followed by a 4-21 character token)
    pn_token = r'([A-Z0-9][-A-Z0-9]{3,20})'
    pn_patterns = [
        r'PART\s*(?:NUMBER|NO|#)[.:)]*\s*' + pn_token,
        r'P/?N[.:)]*\s*' + pn_token,
        r'DWG\s*(?:NUMBER|NO|#)?[.:)]*\s*' + pn_token,
        r'DRAWING\s*(?:NUMBER|NO|#)?[.:)]*\s*' + pn_token,
        r'ID[.:)]*\s*(\d{5,10})',
    ]

    for pattern in pn_patterns:
        # Require a digit so neighbouring label words ("NUMBER", "SIZE",
        # "TITLE") are never reported as the part number.
        candidate = next(
            (
                found.group(1).strip()
                for found in re.finditer(pattern, full_text)
                if any(ch.isdigit() for ch in found.group(1))
            ),
            None
        )
        if candidate:
            result["partNumber"] = candidate
            break

    # Material patterns
    material_patterns = [
        r'MATERIAL[.:)]*\s*([A-Z0-9][A-Z0-9\s\-]{2,30}?)(?=\s*(?:FINISH|SCALE|SHEET|$))',
        # Whole-word MAT / MATL / MAT'L / MATERIAL only; an unanchored "MAT"
        # would match the start of "MATERIAL" and capture "ERIAL ...".
        r'\bMAT(?:ERIAL|\'?L)?\b[.:)]*\s*([A-Z0-9][A-Z0-9\s\-]{2,30})',
        r'(?:AISI|SAE|ASTM)\s*[A-Z]?\d{3,5}',
        r'(?:STEEL|ALUMINUM|BRASS|BRONZE|COPPER|PLASTIC|NYLON|DELRIN)',
    ]

    for pattern in material_patterns:
        match = re.search(pattern, full_text)
        if match:
            # The last two patterns have no capture group; use the whole match
            result["material"] = match.group(1).strip() if match.lastindex else match.group(0).strip()
            break

    # Finish patterns
    finish_patterns = [
        r'FINISH[.:)]*\s*([A-Z][A-Z\s\-]{2,30}?)(?=\s*(?:MATERIAL|SCALE|SHEET|$))',
        r'(PAINT\s*\w+)',
        r'(ZINC\s*PLATE)',
        r'(ANODIZE[D]*)',
        r'(POWDER\s*COAT)',
        r'(BLACK\s*OXIDE)',
    ]

    for pattern in finish_patterns:
        match = re.search(pattern, full_text)
        if match:
            result["finish"] = match.group(1).strip()
            break

    # Scale pattern
    scale_match = re.search(r'SCALE[.:)]*\s*(\d+[.:]\d+|\d+/\d+|FULL|NTS)', full_text)
    if scale_match:
        result["scale"] = scale_match.group(1)

    # Revision pattern: whole-word label and whole-token value, so
    # "REVISIONS", "REVIEWED" or "REV DESCRIPTION" do not produce a revision.
    rev_match = re.search(r'\bREV(?:ISION)?\b[.:)]*\s*([A-Z0-9]{1,3})\b', full_text)
    if rev_match:
        result["revision"] = rev_match.group(1)

    # Sheet info
    sheet_match = re.search(r'SHEET\s*(\d+)\s*(?:OF|/)\s*(\d+)', full_text)
    if sheet_match:
        result["sheetInfo"] = f"{sheet_match.group(1)} of {sheet_match.group(2)}"

    return result


def extract_notes_content(
    notes_region: Optional[DrawingRegion],
    all_ocr_blocks: List[OcrBlock]
) -> List[Dict[str, Any]]:
    """
    Extract structured notes from notes region.

    Produces, in this order:
      * at most one "GeneralTolerance" note when the upper-cased region text
        contains "UNLESS OTHERWISE SPECIFIED"; its rawText is up to 200
        characters of that upper-cased text starting at the phrase, so the
        tolerance statement is always included;
      * one "Numbered" note per OCR line that starts with a 1-2 digit number
        followed by '.' or ')'.

    Args:
        notes_region: Detected notes region, or None.
        all_ocr_blocks: Accepted for interface compatibility; not read here.

    Returns:
        List of note dicts (empty when `notes_region` is falsy).
    """
    notes = []

    if not notes_region:
        return notes

    text_lines = notes_region.ocr_text or []
    full_text = " ".join(text_lines).upper()

    # Check for tolerance block
    tolerance_match = re.search(r'UNLESS\s+OTHERWISE\s+SPECIFIED', full_text)
    if tolerance_match:
        start = tolerance_match.start()
        notes.append({
            "noteType": "GeneralTolerance",
            "rawText": full_text[start:start + 200],
            "confidence": 0.8
        })

    # Extract numbered notes
    for line in text_lines:
        numbered_match = re.match(r'^(\d{1,2})[.\)]\s*(.+)', line.strip())
        if numbered_match:
            notes.append({
                "noteType": "Numbered",
                "number": int(numbered_match.group(1)),
                "rawText": numbered_match.group(2),
                "confidence": 0.75
            })

    return notes


# Status message emitted once the title block / notes helpers above have been executed.
print("Title block & notes ROI detection defined.")

In [None]:
# ============================================================
# SECTION 4D: Unit Detection & Normalization
# ============================================================
"""
Engineering drawings specify units in:
- Title block: "UNLESS OTHERWISE SPECIFIED, DIMENSIONS ARE IN INCHES"
- Notes: "ALL DIMENSIONS IN MM"
- Scale annotation: "1:1 METRIC" or "SCALE: 1=1 INCH"

This section:
1. Detects the drawing's native unit system (INCH or MM)
2. Provides conversion functions to normalize all values to MM
"""

from enum import Enum
from typing import NamedTuple

class DrawingUnits(Enum):
    """Unit system of a drawing or of a single parsed dimension."""
    UNKNOWN = "unknown"  # No unit could be determined
    INCHES = "inches"
    MILLIMETERS = "mm"
    MIXED = "mixed"  # Some drawings have both


class UnitDetectionResult(NamedTuple):
    """Outcome of a unit-detection pass, with the reasons that led to it."""
    detected_unit: DrawingUnits  # Unit system the detector settled on
    confidence: float  # Detector's score for that decision
    evidence: List[str]  # Human-readable reasons collected by the detector
    conversion_factor: float  # Multiply raw values by this to get MM


# Conversion constant: millimeters per inch
INCH_TO_MM = 25.4

# Regexes whose match counts as evidence for INCH units.
# They are written in upper case; the text searched must be upper-cased first.
INCH_PATTERNS = [
    r'DIMENSIONS\s+(?:ARE\s+)?IN\s+INCH(?:ES)?',
    r'ALL\s+DIMENSIONS\s+IN\s+INCH(?:ES)?',
    r'UNLESS\s+OTHERWISE\s+SPECIFIED[^.]*INCH(?:ES)?',
    r'SCALE[:\s]+\d+[:\s]*\d*\s*INCH',
    r'\.XXX\s*=\s*[±]?\s*\.00[1-5]',  # Tolerance format typical for inches
    r'FRACTIONAL[:\s]+[±]?\s*1/\d+',   # Fractional tolerances
    r'INTERPRET\s+(?:PER|IAW)\s+ASME\s+Y14\.5',  # US standard usually means inches
]

# Regexes whose match counts as evidence for MM units (same upper-case convention).
MM_PATTERNS = [
    r'DIMENSIONS\s+(?:ARE\s+)?IN\s+(?:MM|MILLIMETER)',
    r'ALL\s+DIMENSIONS\s+IN\s+(?:MM|MILLIMETER)',
    r'UNLESS\s+OTHERWISE\s+SPECIFIED[^.]*(?:MM|MILLIMETER)',
    r'SCALE[:\s]+\d+[:\s]*\d*\s*(?:MM|METRIC)',
    r'\.X\s*=\s*[±]?\s*0\.[1-5]',  # Tolerance format typical for mm
    r'ISO\s+\d{4}',  # ISO standards usually mean metric
    r'DIN\s+\d{4}',  # German standard = metric
]

# Dimension value heuristics (background notes; no code reads these values)
# Typical dimension ranges help distinguish units
# Hole diameters: 0.1" - 2" (2.54mm - 50.8mm)
# Common metric holes: M3=3mm, M4=4mm, M5=5mm, M6=6mm, M8=8mm, M10=10mm
# Imperial drills: #1-#80 (0.228" - 0.0135"), Letter A-Z (0.234" - 0.413")


def detect_units_from_text(ocr_text: List[str]) -> UnitDetectionResult:
    """
    Detect drawing units from OCR text.

    The lines are joined with spaces and upper-cased, then scanned for three
    kinds of evidence: the INCH_PATTERNS / MM_PATTERNS tables, decimal
    tolerance-block formats, and a bare INCH / MM keyword. Each distinct
    signal is counted once per unit system.

    Decision:
      * evidence for one system only -> that system,
        confidence min(0.5 + 0.15 * count, 0.95);
      * evidence for both -> the system with more evidence (MILLIMETERS on a
        tie), confidence 0.5 * its share of all evidence;
      * no evidence -> UNKNOWN, confidence 0.0, conversion factor 1.0.

    Args:
        ocr_text: OCR text lines from the drawing.

    Returns:
        UnitDetectionResult with the evidence strings that were collected.
    """
    full_text = " ".join(ocr_text).upper()

    def _table_evidence(patterns):
        """Return one evidence string per table pattern found in full_text."""
        found = []
        for pattern in patterns:
            match = re.search(pattern, full_text)
            if match:
                found.append(f"Pattern match: {match.group(0)[:50]}")
        return found

    # Check explicit patterns
    inch_evidence = _table_evidence(INCH_PATTERNS)
    mm_evidence = _table_evidence(MM_PATTERNS)

    # Check for tolerance blocks (common format indicators).
    # A regex that is already an entry of the pattern table is skipped here,
    # otherwise the same signal would be counted twice for that unit system.
    # Inch format: .XX = ±.01
    inch_tolerance = r'\.XX\s*=\s*[±]?\s*\.0[1-3]'
    if inch_tolerance not in INCH_PATTERNS and re.search(inch_tolerance, full_text):
        inch_evidence.append("Inch-style decimal tolerance")

    # MM format: .X = ±0.5
    mm_tolerance = r'\.X\s*=\s*[±]?\s*0\.[1-5]'
    if mm_tolerance not in MM_PATTERNS and re.search(mm_tolerance, full_text):
        mm_evidence.append("MM-style decimal tolerance")

    # Look for explicit "INCH" or "MM" anywhere. A keyword only counts when
    # the other system is not mentioned at all; both directions apply that
    # rule, including the standalone "MM" abbreviation.
    has_inch_word = re.search(r'\bINCH(?:ES)?\b', full_text) is not None
    has_mm_word = re.search(r'\b(?:MM|MILLIMETER(?:S)?)\b', full_text) is not None

    if has_inch_word and not has_mm_word and 'MILLIMETER' not in full_text:
        inch_evidence.append("Explicit INCH keyword")

    if has_mm_word and 'INCH' not in full_text:
        mm_evidence.append("Explicit MM keyword")

    # Determine result
    inch_score = len(inch_evidence)
    mm_score = len(mm_evidence)

    if inch_score > 0 and mm_score == 0:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.INCHES,
            confidence=min(0.5 + inch_score * 0.15, 0.95),
            evidence=inch_evidence,
            conversion_factor=INCH_TO_MM
        )
    elif mm_score > 0 and inch_score == 0:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.MILLIMETERS,
            confidence=min(0.5 + mm_score * 0.15, 0.95),
            evidence=mm_evidence,
            conversion_factor=1.0
        )
    elif inch_score > 0 and mm_score > 0:
        # Mixed evidence - go with stronger signal
        if inch_score > mm_score:
            return UnitDetectionResult(
                detected_unit=DrawingUnits.INCHES,
                confidence=0.5 * inch_score / (inch_score + mm_score),
                evidence=inch_evidence + ["Mixed signals detected"],
                conversion_factor=INCH_TO_MM
            )
        else:
            return UnitDetectionResult(
                detected_unit=DrawingUnits.MILLIMETERS,
                confidence=0.5 * mm_score / (inch_score + mm_score),
                evidence=mm_evidence + ["Mixed signals detected"],
                conversion_factor=1.0
            )
    else:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.UNKNOWN,
            confidence=0.0,
            evidence=["No explicit unit indicators found"],
            conversion_factor=1.0  # Default to no conversion
        )


def detect_units_from_dimensions(dimensions: List[float]) -> UnitDetectionResult:
    """
    Heuristic unit detection based on dimension values.

    Rules are tried in order and the first that applies decides:
      1. more than half of the values exceed 50 -> MILLIMETERS (0.6);
      2. more than 30% are within 0.01 of a sub-inch multiple of 1/16
         (0.0625 .. 0.875) -> INCHES (0.6);
      3. at least one value is below 1 and more than 70% are below 10
         -> INCHES (0.5);
      4. otherwise -> UNKNOWN (0.3).
    An empty list yields UNKNOWN with confidence 0.0.

    Args:
        dimensions: Raw numeric dimension values as read from the drawing.

    Returns:
        UnitDetectionResult describing the rule that fired.
    """
    if not dimensions:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.UNKNOWN,
            confidence=0.0,
            evidence=["No dimensions to analyze"],
            conversion_factor=1.0
        )

    total = len(dimensions)

    # Analyze dimension characteristics
    large_values = sum(1 for d in dimensions if d > 50)
    small_values = sum(1 for d in dimensions if d < 10)
    very_small = sum(1 for d in dimensions if d < 1)

    # Common fractions as decimals (eighths and sixteenths of an inch)
    common_fractions = (
        0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875,
        0.0625, 0.1875, 0.3125, 0.4375, 0.5625, 0.6875, 0.8125,
    )
    imperial_fractions = sum(
        1 for d in dimensions
        if any(abs(d - f) < 0.01 for f in common_fractions)
    )

    # If many values > 50, likely MM
    if large_values > total * 0.5:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.MILLIMETERS,
            confidence=0.6,
            evidence=[f"{large_values}/{total} values > 50 (suggests MM)"],
            conversion_factor=1.0
        )

    # If values look like common imperial fractions
    if imperial_fractions > total * 0.3:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.INCHES,
            confidence=0.6,
            evidence=[f"{imperial_fractions}/{total} match imperial fractions"],
            conversion_factor=INCH_TO_MM
        )

    # If many very small values (< 1) and small values, likely INCH
    if very_small > 0 and small_values > total * 0.7:
        return UnitDetectionResult(
            detected_unit=DrawingUnits.INCHES,
            confidence=0.5,
            evidence=["Many small values (< 10) suggest INCH"],
            conversion_factor=INCH_TO_MM
        )

    return UnitDetectionResult(
        detected_unit=DrawingUnits.UNKNOWN,
        confidence=0.3,
        evidence=["Dimension analysis inconclusive"],
        conversion_factor=1.0
    )


def normalize_to_mm(value: float, units: DrawingUnits) -> float:
    """
    Return `value` expressed in millimeters.

    Only INCHES values are scaled (by INCH_TO_MM); values tagged with any
    other unit are returned unchanged.
    """
    scale_needed = units == DrawingUnits.INCHES
    return value * INCH_TO_MM if scale_needed else value


def normalize_callout_to_mm(callout: Dict, units: DrawingUnits) -> Dict:
    """
    Normalize all dimension values in a callout to millimeters.

    For MILLIMETERS or UNKNOWN drawings the callout is returned untouched.
    Otherwise a shallow copy is returned in which the 'hole', 'fillet',
    'chamfer' and 'dimension' sub-dicts are copied and adjusted:

      * a hole with only 'diameterInches' gets 'diameterMm' computed from it;
      * on an INCHES drawing, a '*Mm' field without an inch counterpart is
        assumed to hold inches (a common VLM confusion): the value is moved
        to the matching '*Inches' key and '*Mm' is rescaled by INCH_TO_MM.

    A '*Mm' field whose '*Inches' counterpart is already set is left alone.
    That pair is taken to be already normalized, which makes the function
    safe to apply twice and keeps a callout that arrives with both values
    from being scaled by 25.4 a second time.

    Args:
        callout: Callout dict; never modified in place.
        units: Unit system detected for the drawing.

    Returns:
        The original dict (MM/UNKNOWN) or the adjusted copy.
    """
    if units == DrawingUnits.MILLIMETERS or units == DrawingUnits.UNKNOWN:
        return callout  # Already in MM or unknown, don't convert

    callout = callout.copy()  # Don't modify original

    def _reinterpret_as_inches(section, mm_key, inch_key):
        """Move section[mm_key] to inch_key and rescale mm_key, once."""
        if units != DrawingUnits.INCHES:
            return
        raw_value = section.get(mm_key)
        if not raw_value or section.get(inch_key):
            return  # nothing to convert, or already normalized
        section[inch_key] = raw_value
        section[mm_key] = raw_value * INCH_TO_MM

    # Convert hole dimensions
    if 'hole' in callout and callout['hole']:
        hole = callout['hole'].copy()
        if hole.get('diameterInches') and not hole.get('diameterMm'):
            hole['diameterMm'] = hole['diameterInches'] * INCH_TO_MM
        else:
            _reinterpret_as_inches(hole, 'diameterMm', 'diameterInches')
        _reinterpret_as_inches(hole, 'depthMm', 'depthInches')
        callout['hole'] = hole

    # Convert fillet
    if 'fillet' in callout and callout['fillet']:
        fillet = callout['fillet'].copy()
        _reinterpret_as_inches(fillet, 'radiusMm', 'radiusInches')
        callout['fillet'] = fillet

    # Convert chamfer
    if 'chamfer' in callout and callout['chamfer']:
        chamfer = callout['chamfer'].copy()
        _reinterpret_as_inches(chamfer, 'distance1Mm', 'distance1Inches')
        _reinterpret_as_inches(chamfer, 'distance2Mm', 'distance2Inches')
        callout['chamfer'] = chamfer

    # Convert linear dimension
    if 'dimension' in callout and callout['dimension']:
        dim = callout['dimension'].copy()
        _reinterpret_as_inches(dim, 'valueMm', 'valueInches')
        callout['dimension'] = dim

    return callout


def parse_dimension_with_unit(raw_text: str) -> Tuple[Optional[float], DrawingUnits]:
    """
    Parse a dimension string and detect its unit.

    The text is stripped, upper-cased and cleared of diameter symbols, then
    matched from its start against, in order: a number with an MM suffix, a
    number with an inch suffix (IN, INCH, INCHES or a double-quote mark), a
    fraction with an optional whole part, and finally a bare number.

    Examples:
        '12.7mm'   -> (12.7, MILLIMETERS)
        '0.500 IN' -> (0.5, INCHES)
        '0.500"'   -> (0.5, INCHES)
        '1/2'      -> (0.5, INCHES)      # fractions are imperial
        '11/16'    -> (0.6875, INCHES)
        '1-1/2'    -> (1.5, INCHES)
        'Ø 12.70'  -> (12.7, UNKNOWN)    # unit needs context

    Returns:
        (value, unit); (None, UNKNOWN) when no number can be read, including
        a fraction with a zero denominator.
    """
    text = raw_text.strip().upper()

    # Remove diameter symbol, then any whitespace it leaves at the start
    text = re.sub(r'[Øø∅⌀]', '', text).strip()

    # A well-formed decimal: "12", "12.7", "12." or ".5" (never "." or "1.2.3")
    number = r'(\d+(?:\.\d*)?|\.\d+)'

    # Check for explicit unit suffix
    mm_match = re.match(number + r'\s*MM\b', text)
    if mm_match:
        return float(mm_match.group(1)), DrawingUnits.MILLIMETERS

    # \b keeps "IN" from matching the start of an unrelated word
    inch_match = re.match(number + r'\s*(?:IN(?:CH(?:ES)?)?\b|["″])', text)
    if inch_match:
        return float(inch_match.group(1)), DrawingUnits.INCHES

    # Check for fractions (imperial). The whole part must be separated from
    # the fraction by a space or dash; otherwise "11/16" would be read as
    # whole=1 plus 1/16.
    frac_match = re.match(r'(?:(\d+)[\s-]+)?(\d+)/(\d+)', text)
    if frac_match:
        whole = int(frac_match.group(1) or 0)
        numer = int(frac_match.group(2))
        denom = int(frac_match.group(3))
        if denom == 0:
            return None, DrawingUnits.UNKNOWN
        return whole + (numer / denom), DrawingUnits.INCHES

    # Plain number - unit unknown
    num_match = re.match(number, text)
    if num_match:
        return float(num_match.group(1)), DrawingUnits.UNKNOWN

    return None, DrawingUnits.UNKNOWN


# Status message emitted once the unit detection helpers above have been executed.
print("Unit detection and normalization defined.")

# ============================================================
# SECTION 5B: VLM Evidence Extraction with Upgrades
# ============================================================

# Prompt template sent to the VLM together with the drawing image.
# The only placeholder is the literal token {ocr_text}. Every other brace in
# the template belongs to the JSON example and is NOT escaped, so the
# placeholder has to be filled by plain substitution of that token;
# str.format() would try to interpret the JSON braces as replacement fields.
VLM_EXTRACTION_PROMPT = '''You are an engineering drawing analyzer. Extract all callouts and dimensions from this drawing.

**OUTPUT FORMAT (JSON):**
```json
{
  "foundCallouts": [
    {
      "calloutType": "Hole|TappedHole|Fillet|Chamfer|LinearDimension|DiameterDimension|Note",
      "rawText": "exact text as shown",
      "quantity": 1,
      "quantityRaw": "(2X) or 4 PLCS or null",
      "hole": {"diameterMm": 12.7, "isThrough": true, "depthMm": null},
      "thread": {"standard": "Metric", "nominalMm": 6, "pitch": 1.0},
      "fillet": {"radiusMm": 3.0},
      "chamfer": {"distance1Mm": 0.5, "angleDegrees": 45}
    }
  ],
  "foundNotes": [
    {"rawText": "note text", "noteType": "General|Material|Finish"}
  ]
}
```

**RULES:**
1. Only extract what you can SEE with evidence
2. For dimensions: convert fractions to decimal (1/2 = 0.5 inches = 12.7mm)
3. For holes: note if THRU or BLIND with depth
4. For threads: identify M6x1.0, 1/4-20 UNC, etc.
5. Include quantity (2X, 4 PLCS, etc.)
6. Do NOT say "missing" - only report what you FIND

**OCR TEXT FROM DRAWING:**
{ocr_text}

Extract all callouts and dimensions from the drawing image.
'''


def extract_evidence_with_vlm(
    page_artifacts: List[PageArtifact],
    ocr_blocks: List[OcrBlock],
    resolved_identity: ResolvedPartIdentity,
    detected_units: Optional[UnitDetectionResult] = None,
    title_block_identity: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Use VLM to extract structured evidence from drawing.

    The first page image is sent to the global `vlm_model` together with a
    prompt listing up to 100 OCR lines that contain a digit. The model's
    reply is parsed with `parse_vlm_json_response` and handed, along with
    all other arguments, to `build_drawing_evidence`.

    Args:
        page_artifacts: Rendered pages; only the first one is shown to the VLM.
        ocr_blocks: OCR line blocks for the drawing.
        resolved_identity: Resolved part identity, forwarded unchanged.
        detected_units: Optional unit detection result, forwarded unchanged.
        title_block_identity: Optional title block identity, forwarded unchanged.

    Returns:
        The dict produced by `build_drawing_evidence`, or
        {"error": "No page artifacts"} when `page_artifacts` is empty.
    """
    global vlm_model, vlm_processor

    # Nothing to show the model: bail out before doing any work
    if not page_artifacts:
        return {"error": "No page artifacts"}

    # Prepare OCR text for prompt (numeric lines only, capped at 100)
    ocr_lines = [b.text for b in ocr_blocks if any(c.isdigit() for c in b.text)]
    ocr_text = "\n".join(f"- {line}" for line in ocr_lines[:100])

    # Get first page image
    img = page_artifacts[0].image

    # Build prompt. The template contains literal JSON braces, so str.format()
    # would raise on it; substitute the single placeholder token instead.
    prompt = VLM_EXTRACTION_PROMPT.replace("{ocr_text}", ocr_text)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # Process with VLM
    text_input = vlm_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)

    inputs = vlm_processor(
        text=[text_input],
        images=image_inputs,
        return_tensors="pt",
        padding=True
    ).to(vlm_model.device)

    print(f"  VLM input tokens: {inputs.input_ids.shape[1]}")

    with torch.no_grad():
        output_ids = vlm_model.generate(**inputs, max_new_tokens=2000, do_sample=False)

    # Keep only the tokens generated after the prompt
    generated_ids = output_ids[:, inputs.input_ids.shape[1]:]
    response = vlm_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Parse JSON from response
    vlm_data = parse_vlm_json_response(response)

    # Build DrawingEvidence structure with all upgrades
    evidence = build_drawing_evidence(
        vlm_data=vlm_data,
        ocr_blocks=ocr_blocks,
        resolved_identity=resolved_identity,
        page_artifacts=page_artifacts,
        detected_units=detected_units,
        title_block_identity=title_block_identity
    )

    return evidence


def parse_vlm_json_response(response: str) -> Dict[str, Any]:
    """
    Extract the JSON payload from a raw VLM response.

    Strategies, tried in order:
    1. Markdown code fences (```json ... ``` or an untagged ``` ... ```);
       the first fence whose content parses as JSON wins.
    2. The first JSON object embedded anywhere in the text that carries a
       "foundCallouts" key. json.JSONDecoder.raw_decode is used instead of
       a brace-matching regex so that nested objects/arrays (every real
       callout is a nested dict) are handled correctly.

    Returns the parsed payload, or an empty
    {"foundCallouts": [], "foundNotes": []} structure when nothing parses.
    """
    # 1. Fenced code blocks
    for fenced in re.finditer(r'```(?:json)?\s*(.+?)\s*```', response, re.DOTALL):
        try:
            return json.loads(fenced.group(1))
        except json.JSONDecodeError:
            continue

    # 2. Raw JSON object somewhere in free text: try every '{' as a
    #    potential start and let the decoder find the matching end.
    decoder = json.JSONDecoder()
    start = response.find('{')
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict) and "foundCallouts" in candidate:
            return candidate
        start = response.find('{', start + 1)

    # Return empty structure
    print("  Warning: Could not parse VLM JSON response")
    return {"foundCallouts": [], "foundNotes": []}


def build_drawing_evidence(
    vlm_data: Dict[str, Any],
    ocr_blocks: List[OcrBlock],
    resolved_identity: ResolvedPartIdentity,
    page_artifacts: List[PageArtifact],
    detected_units: Optional[UnitDetectionResult] = None,
    title_block_identity: Optional[Dict] = None
) -> Dict[str, Any]:
    """
    Build the DrawingEvidence document from parsed VLM output.

    Steps:
    1. Load the SolidWorks JSON (when resolved_identity.jsonPath exists)
       to use as an identity fallback.
    2. Merge identity fields: title-block OCR value first, SW value second
       (part number additionally falls back to the filename resolution).
    3. Run every VLM callout through process_vlm_callout, optional
       inch->mm normalization, and canonicalize_callout.
    4. Collect notes, unit-detection info and the validation summary.

    VLM output is untrusted JSON: null lists, non-dict callouts and
    plain-string notes are tolerated instead of raising.

    Args:
        vlm_data: Parsed VLM payload ("foundCallouts" / "foundNotes").
        ocr_blocks: Not read by this function; kept for interface
            compatibility with the caller.
        resolved_identity: Supplies jsonPath, resolvedPartNumber, confidence.
        page_artifacts: Only its length is used (pageCount).
        detected_units: Optional unit detection result.
        title_block_identity: Optional dict of title-block OCR fields.

    Returns:
        The evidence dict (schemaVersion, identity, foundCallouts, ...).
    """
    # Get SW data for fallback (using BOM-robust loader)
    sw_data = {}
    if resolved_identity.jsonPath and os.path.exists(resolved_identity.jsonPath):
        sw_data, load_error = load_json_robust(Path(resolved_identity.jsonPath))
        if not sw_data:
            print(f"  Warning: Could not load SW data: {load_error}")
            sw_data = {}

    # "identity": null in the SW file must behave like a missing key
    sw_identity = sw_data.get('identity') or {}

    # Merge identity sources: OCR title block > filename resolution > SW fallback
    tb = title_block_identity or {}

    tb_part_number = tb.get("partNumber")
    if tb_part_number:
        part_number_source = "ocr_title_block"
    elif resolved_identity.resolvedPartNumber:
        part_number_source = "ocr_filename"
    else:
        part_number_source = "sw_fallback"

    identity = {
        "partNumber": {
            "ocrValue": tb_part_number or resolved_identity.resolvedPartNumber,
            "swFallback": sw_identity.get('partNumber'),
            "resolved": tb_part_number or resolved_identity.resolvedPartNumber or sw_identity.get('partNumber'),
            "ocrConfidence": resolved_identity.confidence,
            "source": part_number_source
        },
        "description": _merge_identity_field(tb, sw_identity, "description"),
        "material": _merge_identity_field(tb, sw_identity, "material"),
        "finish": _merge_identity_field(tb, sw_identity, "finish"),
        "revision": _merge_identity_field(tb, sw_identity, "revision")
    }

    # A fenced JSON block may legally decode to a list/scalar; treat that as empty
    if not isinstance(vlm_data, dict):
        vlm_data = {}

    # Process VLM callouts with post-processing
    # NOTE(review): canonicalize_callout back-fills missing *Mm fields from
    # rawText AFTER the inch->mm normalization below, so back-filled values on
    # inch drawings are not converted here - confirm downstream handling.
    found_callouts = []
    for callout in vlm_data.get('foundCallouts') or []:
        if not isinstance(callout, dict):
            continue
        processed = process_vlm_callout(callout)
        if processed:
            # Apply unit normalization if inches detected
            if detected_units and detected_units.detected_unit == DrawingUnits.INCHES:
                processed = normalize_callout_to_mm(processed, DrawingUnits.INCHES)

            # Apply deterministic canonicalization
            processed = canonicalize_callout(processed)

            found_callouts.append(processed)

    # Process notes (a bare string is accepted as the note's rawText)
    found_notes = []
    for note in vlm_data.get('foundNotes') or []:
        if isinstance(note, str):
            note = {"rawText": note}
        elif not isinstance(note, dict):
            continue
        found_notes.append({
            "rawText": note.get('rawText', ''),
            "noteType": note.get('noteType', 'General'),
            "confidence": 0.8
        })

    # Build unit detection info
    unit_info = None
    if detected_units:
        unit_info = {
            "detectedUnit": detected_units.detected_unit.value,
            "confidence": detected_units.confidence,
            "evidence": detected_units.evidence,
            "conversionApplied": detected_units.detected_unit == DrawingUnits.INCHES
        }

    # Build evidence document
    evidence = {
        "schemaVersion": SCHEMA_VERSION,
        "extractionTime": datetime.now().isoformat(),
        "sourceFile": DRAWING_PDF_PATH,
        "pageCount": len(page_artifacts),
        "overallConfidence": calculate_overall_confidence(found_callouts),
        "identity": identity,
        "foundCallouts": found_callouts,
        "foundNotes": found_notes,
        "views": [],
        "unitDetection": unit_info,
        "validationSummary": generate_validation_summary(found_callouts)
    }

    return evidence


def _merge_identity_field(tb: Dict, sw_identity: Dict, key: str) -> Dict[str, Any]:
    """Merge one identity field: title-block OCR value wins, SW value is the fallback."""
    ocr_value = tb.get(key)
    sw_value = sw_identity.get(key)
    return {
        "ocrValue": ocr_value,
        "swFallback": sw_value,
        "resolved": ocr_value or sw_value,
        "source": "ocr_title_block" if ocr_value else "sw_fallback"
    }


def process_vlm_callout(callout: Dict) -> Optional[Dict]:
    """
    Process a single VLM callout into schema format.

    The input comes from model-generated JSON, so it is treated as
    untrusted: explicit nulls for sub-objects ("hole": null), quantity or
    confidence are replaced by the same defaults used for missing keys
    instead of raising or leaking None into later numeric comparisons.

    Args:
        callout: One entry of the VLM "foundCallouts" list.

    Returns:
        The schema-shaped callout dict, or None when the input is not a
        dict or has no rawText.
    """
    if not isinstance(callout, dict):
        return None

    callout_type = callout.get('calloutType') or 'Unknown'
    raw_text = callout.get('rawText', '')

    if not raw_text:
        return None

    def section(name: str) -> Dict:
        """Return callout[name] when it is a dict, otherwise an empty dict."""
        value = callout.get(name)
        return value if isinstance(value, dict) else {}

    def pick(source: Dict, keys) -> Dict:
        """Copy the given keys from source, using None for absent ones."""
        return {key: source.get(key) for key in keys}

    confidence = callout.get('confidence')
    quantity = callout.get('quantity')

    processed = {
        "calloutType": callout_type,
        "rawText": raw_text,
        "canonical": None,  # Will be set by canonicalizer
        "confidence": 0.8 if confidence is None else confidence,
        "location": {
            "page": 1,
            "pageIndex0": 0,
            "bbox": callout.get('bbox')  # May have real bbox now
        },
        "quantity": 1 if quantity is None else quantity,
        "quantityRaw": callout.get('quantityRaw'),
        "plcs": None,
        "isTypical": None,
        "validationWarnings": []
    }

    # Add type-specific data
    if callout_type in ['Hole', 'TappedHole', 'Counterbore', 'Countersink']:
        processed['hole'] = pick(
            section('hole'),
            ("diameterMm", "diameterInches", "diameterRaw", "depthMm", "isThrough")
        )
        if callout_type == 'TappedHole':
            processed['thread'] = pick(
                section('thread'),
                ("rawText", "standard", "nominalMm", "pitch", "tpi")
            )

    elif callout_type == 'Fillet':
        processed['fillet'] = pick(section('fillet'), ("radiusMm", "radiusRaw"))

    elif callout_type == 'Chamfer':
        chamfer_data = section('chamfer')
        processed['chamfer'] = {
            "chamferType": chamfer_data.get('chamferType', 'AngleDistance'),
            "distance1Mm": chamfer_data.get('distance1Mm'),
            "angleDegrees": chamfer_data.get('angleDegrees', 45)
        }

    elif callout_type in ['LinearDimension', 'DiameterDimension', 'RadiusDimension']:
        processed['dimension'] = pick(
            section('dimension'),
            ("valueMm", "valueInches", "valueRaw")
        )

    return processed


def calculate_overall_confidence(callouts: List[Dict]) -> float:
    """
    Return the mean callout confidence.

    Callouts without a 'confidence' key count as 0.5; an empty (or falsy)
    callout list yields 0.0.
    """
    if callouts:
        total = sum(entry.get('confidence', 0.5) for entry in callouts)
        return total / len(callouts)
    return 0.0


def generate_validation_summary(callouts: List[Dict]) -> Dict:
    """
    Aggregate per-callout validation data into summary counters.

    Each warning string is bucketed by the first keyword group it contains
    (case-insensitive); warnings matching no group are counted as "other".
    A callout counts as low-confidence below 0.5 (missing confidence is
    treated as 1.0) and as un-located when its location has no bbox.
    """
    # Order matters: the first category with a matching keyword wins.
    categories = (
        ("missingDiameter", ("diameter",)),
        ("missingDepth", ("depth",)),
        ("incompleteThread", ("thread", "pitch")),
        ("lowConfidence", ("confidence",)),
        ("missingRadius", ("radius",)),
        ("missingChamfer", ("chamfer",)),
    )

    def classify(message):
        lowered = message.lower()
        for label, keywords in categories:
            if any(word in lowered for word in keywords):
                return label
        return "other"

    tally = {}
    flagged = 0
    weak = 0
    unlocated = 0

    for entry in callouts:
        messages = entry.get('validationWarnings', [])
        flagged += 1 if messages else 0
        for message in messages:
            label = classify(message)
            tally[label] = tally.get(label, 0) + 1

        weak += 1 if entry.get('confidence', 1.0) < 0.5 else 0
        unlocated += 0 if entry.get('location', {}).get('bbox') else 1

    return {
        "totalCallouts": len(callouts),
        "calloutsWithWarnings": flagged,
        "warningCounts": tally,
        "lowConfidenceCount": weak,
        "noLocationCount": unlocated
    }

print("VLM evidence extraction with upgrades defined.")

In [None]:
# ============================================================
# SECTION 5A: Load Qwen2-VL Model
# ============================================================
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info

# Release unreferenced Python objects and PyTorch's cached CUDA blocks
# before the model is loaded.
gc.collect()
torch.cuda.empty_cache()

# Hugging Face model id of the vision-language model used for extraction
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

print(f"Loading {MODEL_ID}...")
print(f"GPU memory before: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

# 4-bit quantization: NF4 weights with double quantization, float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# vlm_model / vlm_processor are module-level globals; the VLM extraction
# function earlier in the notebook reads them via `global`.
# hf_token is not defined in this cell - presumably set in an earlier cell.
# NOTE(review): trust_remote_code=True permits code from the model repo to
# run locally - confirm it is actually required for this model.
vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=hf_token,
    torch_dtype=torch.float16
)

# Processor paired with the model (used for chat templating, input tensor
# preparation and decoding in the extraction function).
vlm_processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=hf_token
)

print(f"Qwen2-VL-7B loaded!")
print(f"GPU memory after: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

In [None]:
# ============================================================
# SECTION 6A: Deterministic Canonicalization Post-Processor
# ============================================================
"""
This section provides DETERMINISTIC regex-based canonicalization that
runs AFTER VLM extraction. It ensures consistent formatting regardless
of how the VLM interpreted the raw text.

Key transformations:
- Diameter symbols: ⌀, Ø, ø, 0/ -> Ø (U+00D8)
- Fractions -> decimals: 1/2 -> 0.5
- THRU/THROUGH/T standardization
- DEEP/DP standardization
- Quantity parsing: (4X), 4 PLCS, 4 HOLES, TYP
- Thread normalization: M6x1.0, 1/4-20 UNC
"""

# Standard symbols
# These are read at call time by the canonicalization helpers in this cell
# (e.g. canonicalize_diameter_symbol, generate_canonical_form).
DIAMETER_SYMBOL = "\u00d8"  # Ø (U+00D8) - canonical diameter symbol
DEGREE_SYMBOL = "\u00b0"    # ° (U+00B0) - degree symbol
PLUS_MINUS = "\u00b1"       # ± (U+00B1) - plus/minus

# Common fractions lookup (for speed)
# Keys are "numerator/denominator" strings in lowest terms covering every
# multiple of 1/64 from 1/64 to 63/64; values are the decimal equivalents.
COMMON_FRACTIONS = {
    "1/64": 0.015625, "1/32": 0.03125, "3/64": 0.046875, "1/16": 0.0625,
    "5/64": 0.078125, "3/32": 0.09375, "7/64": 0.109375, "1/8": 0.125,
    "9/64": 0.140625, "5/32": 0.15625, "11/64": 0.171875, "3/16": 0.1875,
    "13/64": 0.203125, "7/32": 0.21875, "15/64": 0.234375, "1/4": 0.25,
    "17/64": 0.265625, "9/32": 0.28125, "19/64": 0.296875, "5/16": 0.3125,
    "21/64": 0.328125, "11/32": 0.34375, "23/64": 0.359375, "3/8": 0.375,
    "25/64": 0.390625, "13/32": 0.40625, "27/64": 0.421875, "7/16": 0.4375,
    "29/64": 0.453125, "15/32": 0.46875, "31/64": 0.484375, "1/2": 0.5,
    "33/64": 0.515625, "17/32": 0.53125, "35/64": 0.546875, "9/16": 0.5625,
    "37/64": 0.578125, "19/32": 0.59375, "39/64": 0.609375, "5/8": 0.625,
    "41/64": 0.640625, "21/32": 0.65625, "43/64": 0.671875, "11/16": 0.6875,
    "45/64": 0.703125, "23/32": 0.71875, "47/64": 0.734375, "3/4": 0.75,
    "49/64": 0.765625, "25/32": 0.78125, "51/64": 0.796875, "13/16": 0.8125,
    "53/64": 0.828125, "27/32": 0.84375, "55/64": 0.859375, "7/8": 0.875,
    "57/64": 0.890625, "29/32": 0.90625, "59/64": 0.921875, "15/16": 0.9375,
    "61/64": 0.953125, "31/32": 0.96875, "63/64": 0.984375,
}

# Matching tolerances (from sw_to_evidence_mapping.json)
# Consumed by values_match(): "absolute" is compared against |val1 - val2|,
# "percent" against the difference relative to val1, and "exact": True
# switches to plain equality.
TOLERANCES = {
    "diameter_mm": {"absolute": 0.15, "percent": 0.5},
    "depth_mm": {"absolute": 0.5, "percent": 2.0},
    "radius_mm": {"absolute": 0.05, "percent": 5.0},
    "chamfer_mm": {"absolute": 0.1, "percent": 5.0},
    "angle_deg": {"absolute": 1.0},
    "thread_pitch": {"exact": True},
    "quantity": {"exact": True}
}


def canonicalize_diameter_symbol(text: str) -> str:
    """
    Replace all diameter symbol variants with the canonical DIAMETER_SYMBOL.

    Handles:
    - Unicode look-alikes: ⌀ (U+2300), ø (U+00F8), Ø (U+00D8), ∅ (U+2205)
    - OCR misreads "0/" and "O/" when they start a token and are followed
      by a number (so fractions such as "10/32" are left alone)
    - The words DIA, DIA., DIAM, DIAMETER (case-insensitive, whole word)

    For the OCR and word forms, whitespace between the marker and the
    following text is dropped so the result reads "Ø12.7".
    """
    # Unicode diameter symbols (whichever one is canonical is left untouched)
    for variant in ("\u2300", "\u00f8", "\u00d8", "\u2205"):
        if variant != DIAMETER_SYMBOL:
            text = text.replace(variant, DIAMETER_SYMBOL)

    # OCR misreads: only at the start of a token and directly before a
    # number, otherwise "10/32" would be rewritten to "1Ø32".
    text = re.sub(r'(?<![0-9A-Za-z.])[0O]/\s*(?=\.?\d)', DIAMETER_SYMBOL, text)

    # DIA / DIAM / DIAMETER as one alternation. The previous two-step version
    # matched the "DIA" prefix first and turned "DIAMETER" into "ØMETER".
    # The lookahead rejects longer words (e.g. DIAGRAM); a trailing period
    # is consumed only when whitespace follows, so "DIA.500" keeps its
    # decimal point.
    text = re.sub(
        r'\bDIA(?:M(?:ETER)?)?(?![A-Za-z])(?:\.(?=\s))?\s*',
        DIAMETER_SYMBOL,
        text,
        flags=re.IGNORECASE
    )

    return text


def fraction_to_decimal(text: str) -> str:
    """
    Convert fraction strings embedded in text to decimals.

    Handles: "1/2", "1-1/2", "1 1/2" -> "0.5", "1.5", "1.5"

    Rules:
    - A whole-number part is only recognised when it is separated from the
      fraction by whitespace or a hyphen ("11/64" is eleven sixty-fourths,
      not 1 + 1/64).
    - Text around the fraction (including leading whitespace) is preserved.
    - Tokens that are not plain fractions are left unchanged: anything
      adjacent to other digits, dots or slashes ("1.5/2", "12/25/2024")
      and zero denominators ("1/0").
    - Output keeps up to 6 decimals so every 1/64 multiple is exact.
    """
    def replace_fraction(match):
        whole = match.group(1)
        numer = int(match.group(2))
        denom = int(match.group(3))

        if denom == 0:
            # Not a usable fraction; leave the original text in place
            return match.group(0)

        value = numer / denom
        if whole:
            value += int(whole)
        return f"{value:.6f}".rstrip('0').rstrip('.')

    # Optional "<whole><space or hyphen>" prefix, then numerator/denominator.
    # Lookarounds keep the match from starting or ending inside a longer
    # number or a slash-separated sequence.
    pattern = r'(?<![\d./])(?:(\d+)[\s-]+)?(\d+)/(\d+)(?![\d/]|\.\d)'
    return re.sub(pattern, replace_fraction, text)


def canonicalize_through_hole(text: str) -> str:
    """
    Standardize THROUGH hole notation.

    THRU, THROUGH, T/H, THROUGH ALL, THRU ALL -> THRU

    Matching is case-insensitive and whole-word, and the replacement is
    always upper-case "THRU" (so "thru all" also becomes "THRU").
    Longer words such as "THROUGHOUT" are not touched.
    """
    return re.sub(
        r'\b(?:THROUGH|THRU)(?:\s+ALL)?\b|\bT/H\b',
        'THRU',
        text,
        flags=re.IGNORECASE
    )


def canonicalize_depth(text: str) -> str:
    """
    Standardize DEPTH notation.

    Whole-word DP and D/P (any case) are rewritten to DEEP; all other text,
    including an existing DEEP, is returned unchanged. No numeric value is
    extracted here.
    """
    for abbreviation in (r'\bDP\b', r'\bD/P\b'):
        text = re.sub(abbreviation, 'DEEP', text, flags=re.IGNORECASE)
    return text


def parse_quantity(text: str) -> Dict[str, Any]:
    """
    Extract quantity information from callout text (case-insensitive).

    Recognised forms, in priority order (first hit wins):
    - "(4X)" anywhere           -> quantity=4, quantityRaw as written
    - "4X" at the start         -> quantity=4, quantityRaw="(4X)"
    - "4 PLCS" / "4 PLACES"     -> quantity=4, plcs=4, quantityRaw="4 PLCS"
    - "4 HOLES"                 -> quantity=4, quantityRaw="4 HOLES"

    Independently, TYP/TYPICAL sets isTypical=True and quantityRaw="TYP"
    (quantityRaw is overwritten if one of the forms above also matches).

    Returns dict with: quantity, plcs, quantityRaw, isTypical
    """
    info = {
        "quantity": 1,
        "plcs": None,
        "quantityRaw": None,
        "isTypical": None
    }

    upper = text.upper()

    if re.search(r'\bTYP(?:ICAL)?\b', upper):
        info["isTypical"] = True
        info["quantityRaw"] = "TYP"

    # (pattern, how to render quantityRaw, whether the hit also sets plcs)
    rules = (
        (r'\((\d+)\s*X\)', lambda hit: hit.group(0), False),
        (r'^(\d+)\s*X\b', lambda hit: f"({hit.group(1)}X)", False),
        (r'(\d+)\s*(?:PLCS?|PLACES?)', lambda hit: f"{hit.group(1)} PLCS", True),
        (r'(\d+)\s*HOLES?', lambda hit: f"{hit.group(1)} HOLES", False),
    )

    for pattern, render_raw, sets_plcs in rules:
        hit = re.search(pattern, upper)
        if hit is None:
            continue
        count = int(hit.group(1))
        info["quantity"] = count
        if sets_plcs:
            info["plcs"] = count
        info["quantityRaw"] = render_raw(hit)
        break

    return info


def parse_hole_callout(text: str) -> Dict[str, Any]:
    """
    Parse a hole callout string into structured data.

    The text is first canonicalized (diameter symbol, fractions, THRU,
    DEEP), then quantity, through-flag, diameter and depth are extracted.

    Numeric values are stored exactly as written in the text: no unit
    conversion happens here, so on an inch drawing "diameterMm"/"depthMm"
    hold inch values until a later step converts them.

    Examples:
    - "Ø12.70 THRU" -> {diameterMm: 12.7, isThrough: True}
    - "Ø.500 x .750 DEEP" -> {diameterMm: 0.5, depthMm: 0.75}
    - "Ø12.70 x 25.4 DEEP (4X)" -> {diameterMm: 12.7, depthMm: 25.4, quantity: 4}
    """
    result = {
        "diameterMm": None,
        "diameterRaw": None,
        "depthMm": None,
        "depthRaw": None,
        "isThrough": False,
        "quantity": 1,
        "plcs": None,
        "quantityRaw": None,
        "isTypical": None
    }

    # Canonicalize first
    text = canonicalize_diameter_symbol(text)
    text = fraction_to_decimal(text)
    text = canonicalize_through_hole(text)
    text = canonicalize_depth(text)

    # Extract quantity
    qty_info = parse_quantity(text)
    result.update(qty_info)

    # Check for THRU
    if re.search(r'\bTHRU\b', text, re.IGNORECASE):
        result["isThrough"] = True

    # A well-formed decimal ("12", "12.70", ".500"). The former [\d.]+
    # also captured OCR noise such as "12.7." or ".", on which float()
    # raised ValueError.
    number = r'(\d+(?:\.\d+)?|\.\d+)'

    # Extract diameter (after Ø symbol, optional whitespace in between)
    dia_match = re.search(rf'[Øø\u2300]\s*{number}', text)
    if dia_match:
        result["diameterMm"] = float(dia_match.group(1))
        result["diameterRaw"] = f"{DIAMETER_SYMBOL}{dia_match.group(1)}"

    # Extract depth (before DEEP)
    depth_match = re.search(rf'[xX×]\s*{number}\s*(?:mm)?\s*DEEP', text, re.IGNORECASE)
    if depth_match:
        result["depthMm"] = float(depth_match.group(1))
        result["depthRaw"] = f"{depth_match.group(1)} DEEP"

    return result


def parse_thread_callout(text: str) -> Dict[str, Any]:
    """
    Parse a thread callout string into structured data.

    Recognised forms, in priority order:
    1. Metric with pitch:  "M6x1.0"      -> Metric, nominalMm 6, pitch 1.0
       ("M6 x 12 DEEP" is read as M6 with a depth, not a pitch of 12.)
    2. Metric, no pitch:   "M6"          -> Metric, nominalMm 6
    3. Numbered unified:   "#10-32 UNF"  -> UNF, nominalMm 4.83, tpi 32
       (standard is "Unified" when no series suffix follows.)
    4. Inch unified:       "1/4-20 UNC"  -> UNC, nominalMm 6.35, tpi 20

    Numbered sizes are checked before inch sizes; otherwise "#10-32 UNF"
    would be read as a 10-inch nominal diameter.

    Returns dict with: standard, nominalMm, nominalRaw, pitch, tpi,
    thread_class (thread_class is never populated by this parser).
    """
    result = {
        "standard": None,
        "nominalMm": None,
        "nominalRaw": None,
        "pitch": None,
        "tpi": None,
        "thread_class": None
    }

    text = fraction_to_decimal(text)
    text_upper = text.upper()

    # Longest alternatives first so "UNEF"/"UNJC" are not cut to "UN"/"UNJ"
    unified_series = r'UN(?:EF|JC|JF|C|F|J|S)?'

    # Metric thread: M6x1.0, M8x1.25
    metric_match = re.search(r'M(\d+(?:\.\d+)?)\s*[xX×]\s*(\d+(?:\.\d+)?)', text_upper)
    if metric_match:
        result["standard"] = "Metric"
        result["nominalMm"] = float(metric_match.group(1))
        result["nominalRaw"] = f"M{metric_match.group(1)}"
        # "M6 x 12 DEEP": the number after the x is a depth, not a pitch
        tail = text_upper[metric_match.end():]
        if not re.match(r'\s*(?:MM)?\s*(?:DEEP|DP)\b', tail):
            result["pitch"] = float(metric_match.group(2))
        return result

    # Metric thread without pitch: M6
    metric_simple = re.search(r'M(\d+(?:\.\d+)?)\b', text_upper)
    if metric_simple:
        result["standard"] = "Metric"
        result["nominalMm"] = float(metric_simple.group(1))
        result["nominalRaw"] = f"M{metric_simple.group(1)}"
        return result

    # Numbered threads: #10-32, #6-32 UNC
    numbered_match = re.search(rf'#(\d+)-(\d+)(?:\s*({unified_series})\b)?', text_upper)
    if numbered_match:
        num = int(numbered_match.group(1))
        # Convert number to diameter: D = 0.013*N + 0.060 (inches)
        diameter_inch = 0.013 * num + 0.060
        result["standard"] = numbered_match.group(3) or "Unified"
        result["nominalMm"] = diameter_inch * 25.4
        result["nominalRaw"] = f"#{num}"
        result["tpi"] = int(numbered_match.group(2))
        return result

    # Unified thread: 1/4-20 UNC, .250-20 UNC
    # The nominal must be a well-formed decimal that does not start inside
    # another number or directly after '#'.
    unified_match = re.search(
        rf'(?<![#\d.])(\d+(?:\.\d+)?|\.\d+)-(\d+)\s*({unified_series})',
        text_upper
    )
    if unified_match:
        nominal_inch = float(unified_match.group(1))
        result["standard"] = unified_match.group(3)
        result["nominalMm"] = nominal_inch * 25.4
        result["nominalRaw"] = unified_match.group(1)
        result["tpi"] = int(unified_match.group(2))
        return result

    return result


def parse_fillet_callout(text: str) -> Dict[str, Any]:
    """
    Parse a fillet/radius callout.

    The first "R" (any case) followed by a well-formed decimal is taken as
    the radius. The value is stored exactly as written: no unit conversion
    is done here, so on an inch drawing "radiusMm" holds an inch value
    until a later step converts it.

    Examples:
    - "R3.0" -> {radiusMm: 3.0, radiusRaw: "R3.0"}
    - "R.125" -> {radiusMm: 0.125, radiusRaw: "R.125"}
    - "FILLET R0.030" -> {radiusMm: 0.03, radiusRaw: "R0.030"}
    """
    result = {
        "radiusMm": None,
        "radiusRaw": None
    }

    text = fraction_to_decimal(text)

    # Pattern: R followed by number. Requires a well-formed decimal; the
    # former [\d.]+ also captured "." or "..125" and float() raised
    # ValueError on such OCR noise.
    r_match = re.search(r'R\s*(\d+(?:\.\d+)?|\.\d+)', text, re.IGNORECASE)
    if r_match:
        result["radiusMm"] = float(r_match.group(1))
        result["radiusRaw"] = f"R{r_match.group(1)}"

    return result


def parse_chamfer_callout(text: str) -> Dict[str, Any]:
    """
    Parse a chamfer callout.

    Supported forms:
    - distance x angle:    "0.030 x 45°", "CHAMFER .030 X 45 DEG"
    - angle x distance:    "45° X .030"
    - distance x distance: "1.0 x 1.0"

    The second value of a "a x b" pair is treated as an angle when it
    carries an explicit degree marker, or when it is exactly 30, 45 or 60.
    A bare leading quantity ("2X 1.0 X 45°") is skipped. Distances are
    stored as written; no unit conversion happens here.

    Returns dict with: chamferType, distance1Mm, distance2Mm, angleDegrees
    """
    result = {
        "chamferType": "AngleDistance",
        "distance1Mm": None,
        "distance2Mm": None,
        "angleDegrees": None
    }

    text = fraction_to_decimal(text)
    # Degree signs (plus the look-alike ordinal indicator) -> " DEG "
    for mark in ("°", DEGREE_SYMBOL, "\u00ba"):
        text = text.replace(mark, " DEG ")

    # Well-formed decimal; avoids float() errors on stray dots
    num = r'(\d+(?:\.\d+)?|\.\d+)'

    # Drop a leading quantity such as "2X " when a full "a x b" pair
    # follows it, so the quantity is not mistaken for the first distance.
    text = re.sub(
        r'^\s*\d+\s*[xX]\s+(?=(?:\d+(?:\.\d+)?|\.\d+)\s*(?:DEG\s*)?[xX×])',
        '',
        text,
        flags=re.IGNORECASE
    )

    # Pattern: distance x angle (45°, 45 DEG) or distance x distance
    pair_match = re.search(rf'{num}\s*[xX×]\s*{num}\s*(DEG)?', text, re.IGNORECASE)
    if pair_match:
        val1 = float(pair_match.group(1))
        val2 = float(pair_match.group(2))

        # Explicit degree marker, or one of the conventional chamfer angles
        if pair_match.group(3) or val2 in [30, 45, 60]:
            result["distance1Mm"] = val1
            result["angleDegrees"] = val2
        else:
            # Two distances
            result["chamferType"] = "TwoDistances"
            result["distance1Mm"] = val1
            result["distance2Mm"] = val2
        return result

    # Pattern: angle x distance ("45 DEG X .030")
    angle_first = re.search(rf'{num}\s*DEG\s*[xX×]\s*{num}', text, re.IGNORECASE)
    if angle_first:
        result["angleDegrees"] = float(angle_first.group(1))
        result["distance1Mm"] = float(angle_first.group(2))

    return result


def generate_canonical_form(callout: Dict) -> str:
    """
    Generate canonical text form for a callout.

    Output format examples:
    - Hole: "Ø12.70mm THRU (4X)"
    - Tapped hole (metric): "M6x1.0 x 12.0mm DEEP", "M2.5x0.45 THRU"
    - Fillet: "R3.00mm"
    - Chamfer: "0.76mm x 45°"

    A metric tapped hole is rendered from its thread data even when no
    hole diameter is known. Any callout that lacks the data needed for its
    canonical form (and any other callout type) falls back to rawText.
    """
    callout_type = callout.get("calloutType", "")

    if callout_type in ["Hole", "TappedHole"]:
        hole = callout.get("hole") or {}

        designation = None
        if callout_type == "TappedHole":
            thread = callout.get("thread") or {}
            nom = thread.get("nominalMm")
            pitch = thread.get("pitch")
            if thread.get("standard") == "Metric" and nom and pitch:
                # :g keeps half sizes ("M2.5"); :.0f rendered M2.5 as "M2"
                designation = f"M{nom:g}x{pitch}"

        if designation is None:
            dia = hole.get("diameterMm")
            if not dia:
                return callout.get("rawText", "")
            designation = f"{DIAMETER_SYMBOL}{dia:.2f}mm"

        parts = [designation]

        if hole.get("isThrough"):
            parts.append("THRU")
        elif hole.get("depthMm"):
            parts.append(f"x {hole['depthMm']:.1f}mm DEEP")

        # An explicit null quantity counts as 1
        qty = callout.get("quantity") or 1
        if qty > 1:
            parts.append(f"({qty}X)")

        return " ".join(parts)

    elif callout_type == "Fillet":
        fillet = callout.get("fillet") or {}
        radius = fillet.get("radiusMm")
        if radius:
            return f"R{radius:.2f}mm"
        return callout.get("rawText", "")

    elif callout_type == "Chamfer":
        chamfer = callout.get("chamfer") or {}
        d1 = chamfer.get("distance1Mm")
        angle = chamfer.get("angleDegrees")
        d2 = chamfer.get("distance2Mm")

        if d1 and angle:
            return f"{d1:.2f}mm x {angle:.0f}{DEGREE_SYMBOL}"
        elif d1 and d2:
            return f"{d1:.2f}mm x {d2:.2f}mm"
        return callout.get("rawText", "")

    return callout.get("rawText", "")


def canonicalize_callout(callout: Dict) -> Dict:
    """
    Apply full canonicalization to a callout.

    1. Parse rawText to extract values
    2. Fill in missing fields (existing truthy values are never overwritten)
    3. Generate canonical form
    4. Add validation warnings

    The input is not modified: the top-level dict and the nested
    hole/thread/fillet/chamfer dict that gets back-filled are copied
    before any write.

    Returns:
        A new callout dict with back-filled fields, "canonical" and
        "validationWarnings" set.
    """
    callout = callout.copy()
    raw_text = callout.get("rawText") or ""
    callout_type = callout.get("calloutType", "")

    def own_section(name: str) -> Dict:
        """Attach a private copy of callout[name] (or a new dict) and return it."""
        section = dict(callout.get(name) or {})
        callout[name] = section
        return section

    def fill_missing(target: Dict, parsed: Dict, keys) -> None:
        """Copy parsed[key] into target for keys that are unset/falsy in target."""
        for key in keys:
            if not target.get(key) and parsed[key]:
                target[key] = parsed[key]

    # Parse based on type
    if callout_type in ["Hole", "TappedHole", "Counterbore", "Countersink"]:
        parsed = parse_hole_callout(raw_text)

        hole = own_section("hole")
        fill_missing(hole, parsed, ("diameterMm", "depthMm"))
        # isThrough is tri-state: only an unknown (None) value is back-filled
        if hole.get("isThrough") is None:
            hole["isThrough"] = parsed["isThrough"]

        # Update quantity (a missing or null quantity counts as the default 1)
        if (callout.get("quantity") or 1) == 1 and parsed["quantity"] > 1:
            callout["quantity"] = parsed["quantity"]
            callout["quantityRaw"] = parsed["quantityRaw"]
        if parsed["plcs"]:
            callout["plcs"] = parsed["plcs"]
        if parsed["isTypical"]:
            callout["isTypical"] = parsed["isTypical"]

        # Thread parsing for tapped holes
        if callout_type == "TappedHole":
            thread_parsed = parse_thread_callout(raw_text)
            thread = own_section("thread")
            fill_missing(thread, thread_parsed, ("standard", "nominalMm", "pitch", "tpi"))

    elif callout_type == "Fillet":
        parsed = parse_fillet_callout(raw_text)
        fill_missing(own_section("fillet"), parsed, ("radiusMm",))

    elif callout_type == "Chamfer":
        parsed = parse_chamfer_callout(raw_text)
        fill_missing(
            own_section("chamfer"),
            parsed,
            ("distance1Mm", "angleDegrees", "distance2Mm")
        )

    # Generate canonical form
    callout["canonical"] = generate_canonical_form(callout)

    # Add validation warnings
    callout["validationWarnings"] = generate_callout_warnings(callout)

    return callout


def generate_callout_warnings(callout: Dict) -> List[str]:
    """
    Collect soft validation warnings for one callout.

    Checks performed by calloutType:
    - Hole / TappedHole: diameter present; depth present unless through
    - TappedHole additionally: thread standard and pitch-or-TPI present
    - Fillet: radius present
    - Chamfer: first distance present

    Every callout with confidence below 0.5 also gets a low-confidence
    warning (a missing confidence is treated as 1.0).
    """
    kind = callout.get("calloutType", "")
    issues: List[str] = []

    if kind == "Fillet":
        if not callout.get("fillet", {}).get("radiusMm"):
            issues.append("Missing radius")

    elif kind == "Chamfer":
        if not callout.get("chamfer", {}).get("distance1Mm"):
            issues.append("Missing chamfer distance")

    elif kind in ("Hole", "TappedHole"):
        hole_info = callout.get("hole", {})
        if not hole_info.get("diameterMm"):
            issues.append("Missing diameter")
        if not (hole_info.get("isThrough") or hole_info.get("depthMm")):
            issues.append("Blind hole without depth")

        if kind == "TappedHole":
            thread_info = callout.get("thread", {})
            if not thread_info.get("standard"):
                issues.append("Thread standard not identified")
            if not (thread_info.get("pitch") or thread_info.get("tpi")):
                issues.append("Thread pitch/TPI not specified")

    # Low confidence warning
    score = callout.get("confidence", 1.0)
    if score < 0.5:
        issues.append(f"Low confidence: {score:.2f}")

    return issues


def canonicalize_all_callouts(callouts: List[Dict]) -> List[Dict]:
    """
    Run canonicalize_callout over every callout, preserving order.

    Returns a new list holding the canonicalized callouts.
    """
    canonicalized = []
    for entry in callouts:
        canonicalized.append(canonicalize_callout(entry))
    return canonicalized


def values_match(val1: float, val2: float, tolerance_key: str) -> Tuple[bool, float]:
    """
    Compare two values using the TOLERANCES entry named by tolerance_key.

    - Either value None: (False, 0.0).
    - "exact" entries: plain equality, score 1.0 or 0.0.
    - Otherwise the values match when the difference is within the
      absolute tolerance (default 0.1) OR within the percent tolerance
      relative to val1 (default 1.0; a non-positive val1 never passes the
      percent test). The score falls linearly with the difference and is
      clamped to the range [0.5, 1.0].

    Returns (matches, score).
    """
    if val1 is None or val2 is None:
        return False, 0.0

    rule = TOLERANCES.get(tolerance_key, {})

    if rule.get('exact'):
        is_equal = val1 == val2
        return is_equal, (1.0 if is_equal else 0.0)

    delta = abs(val1 - val2)
    absolute_limit = rule.get('absolute', 0.1)
    percent_limit = rule.get('percent', 1.0)

    if val1 > 0:
        relative = delta / val1 * 100
    else:
        relative = 100

    # Passing either the absolute or the percentage test is sufficient
    within_tolerance = delta <= absolute_limit or relative <= percent_limit
    if not within_tolerance:
        return False, 0.0

    closeness = 1.0 - (delta / (absolute_limit * 2))
    return True, max(0.5, min(1.0, closeness))


print("Deterministic canonicalization post-processor defined.")

---
## Section 6: Rule Matcher -> DiffResult

In [None]:
# ============================================================
# SECTION 6A: Canonicalization & Matching Rules
# ============================================================

# Constants for canonicalization
INCH_TO_MM = 25.4
# The canonicalization helpers defined in the earlier cell read
# DIAMETER_SYMBOL, COMMON_FRACTIONS and TOLERANCES at call time, so the values
# bound here replace the earlier ones once this cell has run. They are kept
# identical to (or a superset of) the earlier definitions: the canonical
# diameter symbol is upper-case Ø (U+00D8), not ø (U+00F8).
DIAMETER_SYMBOL = "\u00d8"  # Ø (U+00D8) - canonical diameter symbol
DEGREE_SYMBOL = "\u00b0"    # ° (U+00B0)

# Common fractions: every multiple of 1/64 from 1/64 to 63/64, lowest terms
COMMON_FRACTIONS = {
    "1/64": 0.015625, "1/32": 0.03125, "3/64": 0.046875, "1/16": 0.0625,
    "5/64": 0.078125, "3/32": 0.09375, "7/64": 0.109375, "1/8": 0.125,
    "9/64": 0.140625, "5/32": 0.15625, "11/64": 0.171875, "3/16": 0.1875,
    "13/64": 0.203125, "7/32": 0.21875, "15/64": 0.234375, "1/4": 0.25,
    "17/64": 0.265625, "9/32": 0.28125, "19/64": 0.296875, "5/16": 0.3125,
    "21/64": 0.328125, "11/32": 0.34375, "23/64": 0.359375, "3/8": 0.375,
    "25/64": 0.390625, "13/32": 0.40625, "27/64": 0.421875, "7/16": 0.4375,
    "29/64": 0.453125, "15/32": 0.46875, "31/64": 0.484375, "1/2": 0.5,
    "33/64": 0.515625, "17/32": 0.53125, "35/64": 0.546875, "9/16": 0.5625,
    "37/64": 0.578125, "19/32": 0.59375, "39/64": 0.609375, "5/8": 0.625,
    "41/64": 0.640625, "21/32": 0.65625, "43/64": 0.671875, "11/16": 0.6875,
    "45/64": 0.703125, "23/32": 0.71875, "47/64": 0.734375, "3/4": 0.75,
    "49/64": 0.765625, "25/32": 0.78125, "51/64": 0.796875, "13/16": 0.8125,
    "53/64": 0.828125, "27/32": 0.84375, "55/64": 0.859375, "7/8": 0.875,
    "57/64": 0.890625, "29/32": 0.90625, "59/64": 0.921875, "15/16": 0.9375,
    "61/64": 0.953125, "31/32": 0.96875, "63/64": 0.984375,
}

# Matching tolerances (from schema)
# "absolute" is compared against |val1 - val2|, "percent" against the
# difference relative to val1, "exact": True means plain equality.
TOLERANCES = {
    "diameter_mm": {"absolute": 0.15, "percent": 0.5},
    "depth_mm": {"absolute": 0.5, "percent": 2.0},
    "radius_mm": {"absolute": 0.05, "percent": 5.0},
    "chamfer_mm": {"absolute": 0.1, "percent": 5.0},
    "angle_deg": {"absolute": 1.0},
    "thread_pitch": {"exact": True},
    "quantity": {"exact": True}
}


def values_match(val1: float, val2: float, tolerance_key: str) -> Tuple[bool, float]:
    """
    Decide whether two numeric values agree under a named tolerance.

    The tolerance is looked up in TOLERANCES by tolerance_key:
    - a None operand never matches;
    - an "exact" tolerance requires equality (score 1.0, else 0.0);
    - otherwise a match needs the difference to be within the absolute
      tolerance (default 0.1) or within the percent tolerance measured
      against val1 (default 1.0; when val1 is not positive the percent
      difference is taken as 100).

    Returns (matches, score); the score of a tolerance match is
    1 - diff / (2 * absolute tolerance), clamped to [0.5, 1.0].
    """
    if val1 is None or val2 is None:
        return False, 0.0

    spec = TOLERANCES.get(tolerance_key, {})

    if spec.get('exact'):
        same = val1 == val2
        return same, (1.0 if same else 0.0)

    gap = abs(val1 - val2)
    abs_limit = spec.get('absolute', 0.1)
    pct_limit = spec.get('percent', 1.0)

    gap_percent = 100
    if val1 > 0:
        gap_percent = gap / val1 * 100

    # Either criterion alone is enough for a match
    if not (gap <= abs_limit or gap_percent <= pct_limit):
        return False, 0.0

    # Score based on how close the values are
    raw_score = 1.0 - (gap / (abs_limit * 2))
    return True, max(0.5, min(1.0, raw_score))


def normalize_thread_callout(raw: str) -> str:
    """Return a thread callout in a canonical form suitable for comparison.

    The text is upper-cased and trimmed; every ``X`` (with any surrounding
    whitespace) is collapsed to a lowercase ``x`` and whitespace around
    hyphens is removed. Note that this applies to every ``X`` in the text,
    not only those acting as separators.
    """
    cleaned = raw.upper().strip()
    # Applied in this order: separator "x" first, then hyphens.
    for pattern, replacement in ((r'\s*[Xx]\s*', 'x'), (r'\s*-\s*', '-')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


def normalize_hole_callout(callout: Dict) -> str:
    """Generate the canonical text form of a hole callout.

    Args:
        callout: Callout dict. Reads ``hole.diameterMm``, ``hole.isThrough``,
            ``hole.depthMm``, ``quantity`` and ``rawText``. Keys that are
            absent or explicitly ``None`` are treated the same way.

    Returns:
        ``"<dia>mm"`` prefixed with the diameter symbol, followed by
        ``" THRU"`` for through holes or ``" x <depth>mm DEEP"`` when a depth
        is present, and ``" (<qty>X)"`` when the quantity exceeds one. When
        no diameter is available the callout's raw text (or an empty string)
        is returned instead.
    """
    # `or {}` also covers an explicit null from the extractor, which the
    # plain `.get('hole', {})` default does not.
    hole = callout.get('hole') or {}

    dia_mm = hole.get('diameterMm')
    if dia_mm is None:
        return callout.get('rawText') or ''

    canonical = f"{DIAMETER_SYMBOL}{dia_mm:.2f}mm"

    if hole.get('isThrough'):
        canonical += " THRU"
    elif hole.get('depthMm'):
        canonical += f" x {hole['depthMm']:.1f}mm DEEP"

    # A null quantity is treated as a single instance.
    qty = callout.get('quantity') or 1
    if qty > 1:
        canonical += f" ({qty}X)"

    return canonical

# Progress message emitted once the definitions above have been executed.
print("Canonicalization functions defined.")

In [None]:
# ============================================================
# SECTION 6B: DiffResult Generation
# ============================================================

@dataclass
class MatchResult:
    """Result of matching a SW requirement to drawing evidence."""
    # The SolidWorks-side requirement that was looked up (e.g. one hole group).
    sw_item: Dict[str, Any]
    # The drawing callout selected for it, or None when nothing was found.
    evidence_item: Optional[Dict[str, Any]]
    status: str  # "matched", "missing", "conflict", "ambiguous"
    # Match quality; match_holes stores 0.0 here for "missing" results.
    score: float
    # Free-text remarks such as quantity differences or the reason for a miss.
    notes: List[str] = field(default_factory=list)


def match_holes(sw_holes: List[Dict], evidence_callouts: List[Dict]) -> List[MatchResult]:
    """Match SolidWorks holes to drawing evidence.

    Each SW hole is paired, in input order, with the best-scoring callout of
    type ``Hole``/``TappedHole`` that has not already been claimed by an
    earlier SW hole. A candidate must agree on diameter (within the
    ``diameter_mm`` tolerance), on through/blind type and, for blind holes
    where both sides give a depth, on depth (``depth_mm`` tolerance). A
    quantity difference does not prevent a match; it scales the score by 0.8
    and is recorded in the result notes.

    Args:
        sw_holes: SW hole dicts. Reads ``diameter_mm``/``diameterMm``,
            ``depth_mm``/``depthMm``, ``type`` (defaults to through) and
            ``count`` (defaults to 1).
        evidence_callouts: Drawing callout dicts; only hole callouts are used.

    Returns:
        One ``MatchResult`` per SW hole, with status ``"matched"`` or
        ``"missing"``.
    """
    results = []
    evidence_holes = [
        c for c in evidence_callouts
        if c.get('calloutType') in ('Hole', 'TappedHole')
    ]
    used_evidence = set()

    for sw_hole in sw_holes:
        sw_dia = sw_hole.get('diameter_mm') or sw_hole.get('diameterMm')
        # `or` also covers an explicit null, which would otherwise crash on .lower().
        sw_type = sw_hole.get('type') or 'Through'
        sw_depth = sw_hole.get('depth_mm') or sw_hole.get('depthMm')
        sw_count = sw_hole.get('count', 1)
        # Invariant for this SW hole, so computed once outside the candidate loop.
        sw_is_through = str(sw_type).lower() in ('through', 'thru')

        best_match = None
        best_score = 0.0
        best_idx = -1

        for idx, ev_hole in enumerate(evidence_holes):
            if idx in used_evidence:
                continue

            hole_data = ev_hole.get('hole') or {}
            ev_dia = hole_data.get('diameterMm')
            # Coerce so a null flag reads as "not through" instead of never
            # comparing equal to either boolean.
            ev_through = bool(hole_data.get('isThrough', False))
            ev_depth = hole_data.get('depthMm')
            ev_count = ev_hole.get('quantity', 1)

            # Check diameter match
            dia_match, dia_score = values_match(sw_dia, ev_dia, 'diameter_mm')
            if not dia_match:
                continue

            # Check type match
            if sw_is_through != ev_through:
                continue

            # Check depth for blind holes (only when both sides specify one)
            depth_score = 1.0
            if not sw_is_through and sw_depth and ev_depth:
                depth_match, depth_score = values_match(sw_depth, ev_depth, 'depth_mm')
                if not depth_match:
                    continue

            # Overall score, with a penalty when the quantities disagree
            score = (dia_score + depth_score) / 2
            if sw_count != ev_count:
                score *= 0.8

            if score > best_score:
                best_score = score
                best_match = ev_hole
                best_idx = idx

        if best_match is not None:
            used_evidence.add(best_idx)
            notes = []
            ev_count = best_match.get('quantity', 1)
            if sw_count != ev_count:
                notes.append(f"Quantity: SW={sw_count}, Drawing={ev_count}")
            results.append(MatchResult(
                sw_item=sw_hole,
                evidence_item=best_match,
                status="matched",
                score=best_score,
                notes=notes
            ))
        else:
            # A SW hole without a diameter can never match; describe it
            # without attempting to format None as a float.
            if sw_dia is None:
                miss_note = "No matching hole found (SW hole has no diameter)"
            else:
                miss_note = f"No matching hole found for {DIAMETER_SYMBOL}{sw_dia:.2f}mm"
            results.append(MatchResult(
                sw_item=sw_hole,
                evidence_item=None,
                status="missing",
                score=0.0,
                notes=[miss_note]
            ))

    return results


def generate_diff_result(
    drawing_evidence: Dict[str, Any],
    sw_data: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Compare DrawingEvidence to SolidWorks truth and generate DiffResult.

    Holes are matched via ``match_holes`` against ``comparison.holeGroups``.
    Fillets from ``features.fillets`` are matched by radius against callouts
    of type ``Fillet``; the first evidence fillet within tolerance wins and
    an evidence fillet may satisfy more than one SW fillet.

    Args:
        drawing_evidence: Evidence dict; reads ``foundCallouts``.
        sw_data: SolidWorks data; reads ``comparison.holeGroups``,
            ``features.fillets`` and ``features.chamfers``. Sections that are
            missing or ``None`` are treated as empty.

    Returns:
        Dict with ``matched``, ``missing``, ``conflicts``, ``ambiguous`` and
        ``extras`` lists plus a ``summary`` of counts and ``matchRate``
        (matched items divided by total SW requirements, 0 when there are
        none).
    """
    matched = []
    missing = []
    conflicts = []
    ambiguous = []
    extras = []

    # Get SW requirements (`or` guards against explicit nulls in the JSON)
    sw_comparison = sw_data.get('comparison') or {}
    sw_hole_groups = sw_comparison.get('holeGroups') or []
    sw_features = sw_data.get('features') or {}
    sw_fillets = sw_features.get('fillets') or []
    sw_chamfers = sw_features.get('chamfers') or []

    # Get drawing evidence
    evidence_callouts = drawing_evidence.get('foundCallouts') or []

    # Match holes
    if sw_hole_groups:
        print(f"  Matching {len(sw_hole_groups)} hole groups...")
        buckets = {
            "matched": matched,
            "missing": missing,
            "conflict": conflicts,
            "ambiguous": ambiguous,
        }
        for r in match_holes(sw_hole_groups, evidence_callouts):
            bucket = buckets.get(r.status)
            if bucket is None:
                continue
            bucket.append({
                "type": "hole",
                "swRequirement": r.sw_item,
                "drawingEvidence": r.evidence_item,
                "score": r.score,
                "notes": r.notes
            })

    # Match fillets (simplified)
    evidence_fillets = [c for c in evidence_callouts if c.get('calloutType') == 'Fillet']
    for sw_fillet in sw_fillets:
        sw_radius = sw_fillet.get('radius_mm') or sw_fillet.get('radiusMm')
        found = False
        for ev_fillet in evidence_fillets:
            ev_radius = (ev_fillet.get('fillet') or {}).get('radiusMm')
            match, score = values_match(sw_radius, ev_radius, 'radius_mm')
            if match:
                matched.append({
                    "type": "fillet",
                    "swRequirement": sw_fillet,
                    "drawingEvidence": ev_fillet,
                    "score": score,
                    "notes": []
                })
                found = True
                break
        if not found:
            # A fillet without a radius cannot be formatted as a float.
            if sw_radius is None:
                miss_note = "Fillet with unspecified radius not found"
            else:
                miss_note = f"Fillet R{sw_radius:.2f}mm not found"
            missing.append({
                "type": "fillet",
                "swRequirement": sw_fillet,
                "drawingEvidence": None,
                "score": 0.0,
                "notes": [miss_note]
            })

    # Summary
    # NOTE(review): chamfers are counted as requirements here but are never
    # compared above, so they lower matchRate without appearing in `missing`.
    # Confirm whether that is intended.
    total_sw = len(sw_hole_groups) + len(sw_fillets) + len(sw_chamfers)

    diff_result = {
        "matched": matched,
        "missing": missing,
        "conflicts": conflicts,
        "ambiguous": ambiguous,
        "extras": extras,
        "summary": {
            "totalSwRequirements": total_sw,
            "matchedCount": len(matched),
            "missingCount": len(missing),
            "conflictCount": len(conflicts),
            "ambiguousCount": len(ambiguous),
            "extraCount": len(extras),
            "matchRate": len(matched) / max(total_sw, 1)
        }
    }

    return diff_result

# Progress message emitted once the definitions above have been executed.
print("DiffResult generation functions defined.")

---
## Section 7: Text LLM Judge -> QCReport

In [None]:
# ============================================================
# SECTION 7A: QC Report Generation
# ============================================================

def _md_cell(text: Any) -> str:
    """Make *text* safe to place inside a single Markdown table cell."""
    return str(text).replace('\r', ' ').replace('\n', ' ').replace('|', '\\|')


def _sw_requirement_label(item_type: str, sw_req: Dict[str, Any]) -> str:
    """Return a short description of one SW requirement for a report table."""
    def first_number(*keys):
        # First numeric value among the alternative key spellings, else None.
        for key in keys:
            value = sw_req.get(key)
            if isinstance(value, (int, float)):
                return value
        return None

    if item_type == 'hole':
        dia = first_number('diameter_mm', 'diameterMm')
        if dia is None:
            return f"{DIAMETER_SYMBOL} unspecified"
        return f"{DIAMETER_SYMBOL}{dia:.2f}mm"
    if item_type == 'fillet':
        rad = first_number('radius_mm', 'radiusMm')
        if rad is None:
            return "R unspecified"
        return f"R{rad:.2f}mm"
    return _md_cell(str(sw_req)[:30])


def generate_qc_report(
    resolved_identity: ResolvedPartIdentity,
    drawing_evidence: Dict[str, Any],
    diff_result: Dict[str, Any],
    sw_data: Dict[str, Any]
) -> str:
    """
    Generate human-readable QC report in Markdown format.

    The verdict is rule-based (no model is invoked here):
      * PASS          - match rate >= 95% and no conflicts
      * NEEDS_REVIEW  - match rate >= 70% and no conflicts
      * FAIL          - anything else

    Args:
        resolved_identity: Resolved part identity; reads
            ``resolvedPartNumber``, ``jsonPath``, ``confidence`` and
            ``needsReview``.
        drawing_evidence: Extracted drawing evidence (accepted for interface
            compatibility; not read by this function).
        diff_result: Output of the diff step; reads ``summary``, ``matched``,
            ``missing`` and ``conflicts``.
        sw_data: SolidWorks data; reads ``identity.description`` and
            ``identity.material``.

    Returns:
        The report as a Markdown string. The verdict appears on a line of the
        form ``### **<VERDICT>**``.
    """
    summary = diff_result.get('summary', {})
    sw_identity = sw_data.get('identity', {})

    # Determine verdict based on match rate
    match_rate = summary.get('matchRate', 0)
    missing_count = summary.get('missingCount', 0)
    conflict_count = summary.get('conflictCount', 0)
    total_requirements = summary.get('totalSwRequirements', 0)

    if match_rate >= 0.95 and conflict_count == 0:
        verdict = "PASS"
        verdict_reason = "All critical requirements verified"
    elif match_rate >= 0.7 and conflict_count == 0:
        verdict = "NEEDS_REVIEW"
        verdict_reason = f"{missing_count} items could not be verified"
    else:
        verdict = "FAIL"
        if conflict_count > 0:
            verdict_reason = f"{conflict_count} conflicting dimensions found"
        elif total_requirements == 0:
            # Nothing was compared, so "0 required items missing" would be
            # misleading; state the real reason instead.
            verdict_reason = "No SolidWorks requirements were available for comparison"
        else:
            verdict_reason = f"{missing_count} required items missing from drawing"

    # Build report; pieces are collected and joined once at the end.
    parts = [f"""# QC Inspection Report

**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Part Identification

| Field | Value |
|-------|-------|
| Part Number | {resolved_identity.resolvedPartNumber or 'Unknown'} |
| Description | {sw_identity.get('description', 'Unknown')} |
| Material | {sw_identity.get('material', 'Unknown')} |
| Drawing File | {os.path.basename(DRAWING_PDF_PATH)} |
| SW JSON | {os.path.basename(resolved_identity.jsonPath or 'Not found')} |
| ID Confidence | {resolved_identity.confidence:.0%} |

## Verdict

### **{verdict}**

{verdict_reason}

## Summary

| Metric | Count |
|--------|-------|
| Total SW Requirements | {summary.get('totalSwRequirements', 0)} |
| Matched | {summary.get('matchedCount', 0)} |
| Missing | {summary.get('missingCount', 0)} |
| Conflicts | {summary.get('conflictCount', 0)} |
| **Match Rate** | **{summary.get('matchRate', 0):.1%}** |

## Matched Items

| Type | SW Requirement | Drawing Evidence | Score |
|------|----------------|------------------|-------|
"""]

    # Only the first 20 matched items are listed to keep the report short.
    for item in diff_result.get('matched', [])[:20]:
        sw_req = item.get('swRequirement') or {}
        ev = item.get('drawingEvidence') or {}
        item_type = item.get('type', 'unknown')
        score = item.get('score', 0)

        sw_str = _sw_requirement_label(item_type, sw_req)
        # rawText may be absent or null; truncate first, then escape.
        ev_str = _md_cell((ev.get('rawText') or '')[:30]) if ev else 'N/A'

        parts.append(f"| {item_type} | {sw_str} | {ev_str} | {score:.0%} |\n")

    parts.append("\n## Missing Items\n\n")

    missing_items = diff_result.get('missing')
    if missing_items:
        parts.append("| Type | SW Requirement | Notes |\n")
        parts.append("|------|----------------|-------|\n")

        for item in missing_items:
            sw_req = item.get('swRequirement') or {}
            item_type = item.get('type', 'unknown')
            notes = _md_cell('; '.join(item.get('notes', [])))
            sw_str = _sw_requirement_label(item_type, sw_req)

            parts.append(f"| {item_type} | {sw_str} | {notes} |\n")
    else:
        parts.append("*No missing items*\n")

    conflict_items = diff_result.get('conflicts')
    if conflict_items:
        parts.append("\n## Conflicts\n\n")
        parts.append("| Type | SW Requirement | Drawing Evidence | Issue |\n")
        parts.append("|------|----------------|------------------|-------|\n")

        for item in conflict_items:
            issue = _md_cell('; '.join(str(n) for n in item.get('notes', [])))
            parts.append(f"| {item.get('type')} | ... | ... | {issue} |\n")

    parts.append("\n## Recommendations\n\n")

    if verdict == "PASS":
        parts.append("- Drawing appears complete and matches SW model\n")
        parts.append("- No action required\n")
    elif verdict == "NEEDS_REVIEW":
        parts.append("- Manual review recommended for unverified items\n")
        parts.append("- Check if missing callouts are on different views/sheets\n")
        parts.append("- Verify OCR accuracy for complex dimensions\n")
    else:
        parts.append("- Drawing requires updates before release\n")
        parts.append("- Add missing callouts as listed above\n")
        parts.append("- Resolve any dimension conflicts\n")

    if resolved_identity.needsReview:
        parts.append(
            f"\n**Note:** Part identification has low confidence ({resolved_identity.confidence:.0%}). "
        )
        parts.append("Verify the correct part was matched.\n")

    parts.append("\n---\n*Generated by AI Drawing Inspector v2.0*\n")

    return "".join(parts)

# Progress message emitted once the definitions above have been executed.
print("QC report generation function defined.")

# ============================================================
# SECTION 8B: Main Pipeline Function (with All Upgrades)
# ============================================================

def run_qc_pipeline(pdf_path: str, mode: str = "fast") -> Dict[str, Any]:
    """
    Run the complete QC pipeline on a single drawing.

    Integrates all 4 upgrades:
    1. Real OCR bounding boxes (Tesseract + LightOnOCR hybrid)
    2. Title-block / notes-region ROI OCR
    3. Explicit unit detection + normalization
    4. Deterministic canonicalization post-processor

    The steps run in a fixed order: render PDF -> OCR -> ROI detection ->
    unit detection -> part identity resolution (+ SW JSON load) -> VLM
    evidence extraction -> diff against SW data -> QC report -> save outputs.
    Progress is printed to stdout throughout.

    Args:
        pdf_path: Path to drawing PDF
        mode: "fast" (first page) or "full" (all pages); forwarded to
            render_pdf_to_artifacts

    Returns:
        Dict with all artifacts and file paths: keys ``verdict``,
        ``resolved_identity``, ``drawing_evidence``, ``diff_result``,
        ``qc_report``, ``saved_paths`` and ``upgrade_status``.
        If the PDF could not be rendered, returns ``{"error": ...}`` only.

    NOTE(review): outputs go to the module-level OUTPUT_DIR, and save_outputs
    derives file names from the module-level DRAWING_PDF_PATH rather than
    from ``pdf_path`` - confirm both globals agree with the arguments used.
    """
    print("="*60)
    print("AI DRAWING INSPECTOR v2.0 (with Upgrades)")
    print("="*60)
    print(f"Input: {pdf_path}")
    print(f"Mode: {mode}")
    print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*60)

    # Step 1: PDF Ingestion
    print("\n[1/8] PDF Ingestion...")
    page_artifacts = render_pdf_to_artifacts(pdf_path, mode=mode)
    if not page_artifacts:
        # Nothing to process; callers detect this via the "error" key.
        return {"error": "Failed to render PDF"}

    # Step 2: OCR Extraction (Upgrade 1: Real bounding boxes)
    print("\n[2/8] OCR Extraction (with real bounding boxes)...")
    ocr_blocks = run_ocr_on_all_pages(page_artifacts)
    # Plain text of every block, used by unit detection and identity resolution.
    ocr_lines = [b.text for b in ocr_blocks]
    print(f"  Total OCR blocks: {len(ocr_blocks)}")

    # Count blocks with real bboxes (a bbox with a truthy 'width')
    blocks_with_bbox = sum(1 for b in ocr_blocks if b.bbox and b.bbox.get('width'))
    print(f"  Blocks with real bboxes: {blocks_with_bbox}/{len(ocr_blocks)}")

    # Step 3: ROI Detection (Upgrade 2: Title block & notes regions)
    print("\n[3/8] ROI Detection (title block & notes regions)...")
    title_block_region = None
    notes_region = None
    title_block_identity = {}

    # Always true at this point because of the early return above.
    if page_artifacts:
        # Detect regions on first page
        first_page = page_artifacts[0]

        title_block_region = detect_title_block_region(first_page, ocr_blocks)
        if title_block_region:
            print(f"  Title block found at y={title_block_region.bbox_norm['y']:.2f}")
            print(f"    Region text samples: {title_block_region.ocr_text[:3]}...")

            # Extract structured identity from title block
            title_block_identity = extract_identity_from_title_block(title_block_region, ocr_blocks)
            if title_block_identity.get('partNumber'):
                print(f"    Part Number (from title): {title_block_identity['partNumber']}")
            if title_block_identity.get('material'):
                print(f"    Material (from title): {title_block_identity['material']}")
        else:
            print("  No title block detected")

        notes_region = detect_notes_region(first_page, ocr_blocks)
        if notes_region:
            print(f"  Notes region found at x={notes_region.bbox_norm['x']:.2f}")
            # The extracted notes are only counted for the log line below.
            notes_content = extract_notes_content(notes_region, ocr_blocks)
            print(f"    Found {len(notes_content)} structured notes")
        else:
            print("  No notes region detected")

    # Step 4: Unit Detection (Upgrade 3)
    print("\n[4/8] Unit Detection...")
    detected_units = detect_units_from_text(ocr_lines)
    print(f"  Detected unit: {detected_units.detected_unit.value}")
    print(f"  Confidence: {detected_units.confidence:.0%}")
    if detected_units.evidence:
        print(f"  Evidence: {detected_units.evidence[:2]}")

    # Step 5: Resolve Part Identity
    print("\n[5/8] Resolving Part Identity...")
    resolved_identity = resolve_part_identity(pdf_path, ocr_lines)

    # Merge with title block identity: only a fallback, used when the primary
    # resolution produced no part number.
    if title_block_identity.get('partNumber') and not resolved_identity.resolvedPartNumber:
        # Try to find match for title block part number
        match = sw_library.lookup(title_block_identity['partNumber']) if sw_library else None
        if match:
            # Replace the identity, keeping the earlier candidates and notes;
            # a title-block match gets a fixed confidence of 0.85.
            resolved_identity = ResolvedPartIdentity(
                resolvedPartNumber=title_block_identity['partNumber'],
                jsonPath=match.json_path,
                confidence=0.85,
                candidates=resolved_identity.candidates,
                notes=["Matched from title block OCR"] + resolved_identity.notes,
                needsReview=False
            )

    print(f"  Resolved: {resolved_identity.resolvedPartNumber}")
    print(f"  Confidence: {resolved_identity.confidence:.0%}")
    print(f"  JSON Path: {resolved_identity.jsonPath or 'Not found'}")
    if resolved_identity.needsReview:
        print(f"  WARNING: Needs review - {resolved_identity.notes}")

    # Load SW data (using BOM-robust loader)
    # sw_data stays an empty dict when no JSON is found or loading fails;
    # step 7 uses its truthiness to decide whether a comparison is possible.
    sw_data = {}
    if resolved_identity.jsonPath and os.path.exists(resolved_identity.jsonPath):
        sw_data, load_error = load_json_robust(Path(resolved_identity.jsonPath))
        if sw_data:
            print(f"  SW data loaded: {len(sw_data.get('comparison', {}).get('holeGroups', []))} hole groups")
        else:
            print(f"  Warning: Could not load SW data: {load_error}")
            sw_data = {}

    # Step 6: VLM Evidence Extraction (with Upgrade 4: Canonicalization)
    print("\n[6/8] VLM Evidence Extraction (with canonicalization)...")
    drawing_evidence = extract_evidence_with_vlm(
        page_artifacts,
        ocr_blocks,
        resolved_identity,
        detected_units=detected_units,
        title_block_identity=title_block_identity
    )
    print(f"  Found callouts: {len(drawing_evidence.get('foundCallouts', []))}")
    print(f"  Found notes: {len(drawing_evidence.get('foundNotes', []))}")

    # Show canonicalization results (callouts carrying a truthy 'canonical')
    callouts = drawing_evidence.get('foundCallouts', [])
    canonical_count = sum(1 for c in callouts if c.get('canonical'))
    print(f"  Callouts with canonical form: {canonical_count}/{len(callouts)}")

    # Show unit conversion status
    if drawing_evidence.get('unitDetection', {}).get('conversionApplied'):
        print("  Unit conversion: INCHES -> MM applied")

    # Step 7: Diff Generation
    print("\n[7/8] Generating Diff...")
    if sw_data:
        diff_result = generate_diff_result(drawing_evidence, sw_data)
        summary = diff_result.get('summary', {})
        print(f"  Matched: {summary.get('matchedCount', 0)}")
        print(f"  Missing: {summary.get('missingCount', 0)}")
        print(f"  Match Rate: {summary.get('matchRate', 0):.1%}")
    else:
        # No SW truth: emit an empty diff with the same shape so the report
        # and save steps can run unchanged.
        diff_result = {
            "matched": [], "missing": [], "conflicts": [],
            "ambiguous": [], "extras": [],
            "summary": {"totalSwRequirements": 0, "matchedCount": 0,
                       "missingCount": 0, "conflictCount": 0,
                       "ambiguousCount": 0, "extraCount": 0, "matchRate": 0}
        }
        print("  No SW data available for comparison")

    # Step 8: Generate QC Report
    print("\n[8/8] Generating QC Report...")
    qc_report = generate_qc_report(resolved_identity, drawing_evidence, diff_result, sw_data)

    # Determine verdict from report by looking for the verdict heading that
    # generate_qc_report writes ("### **<VERDICT>**"); anything else is FAIL.
    if "### **PASS**" in qc_report:
        verdict = "PASS"
    elif "### **NEEDS_REVIEW**" in qc_report:
        verdict = "NEEDS_REVIEW"
    else:
        verdict = "FAIL"
    print(f"  Verdict: {verdict}")

    # Save Outputs
    print("\n[SAVE] Saving Outputs...")
    saved_paths = save_outputs(
        resolved_identity, drawing_evidence, diff_result, qc_report, OUTPUT_DIR
    )

    # Print upgrade summary
    print("\n" + "="*60)
    print("PIPELINE COMPLETE - UPGRADE STATUS")
    print("="*60)
    print(f"[✓] Upgrade 1: Real OCR bboxes ({blocks_with_bbox}/{len(ocr_blocks)} blocks)")
    print(f"[✓] Upgrade 2: ROI Detection (title_block={title_block_region is not None}, notes={notes_region is not None})")
    print(f"[✓] Upgrade 3: Unit Detection ({detected_units.detected_unit.value}, {detected_units.confidence:.0%})")
    print(f"[✓] Upgrade 4: Canonicalization ({canonical_count}/{len(callouts)} callouts)")
    print("="*60)
    print(f"Verdict: {verdict}")
    print(f"Output directory: {OUTPUT_DIR}")

    return {
        "verdict": verdict,
        "resolved_identity": asdict(resolved_identity),
        "drawing_evidence": drawing_evidence,
        "diff_result": diff_result,
        "qc_report": qc_report,
        "saved_paths": saved_paths,
        "upgrade_status": {
            "real_bboxes": {"blocks_with_bbox": blocks_with_bbox, "total_blocks": len(ocr_blocks)},
            "roi_detection": {
                "title_block_found": title_block_region is not None,
                "notes_region_found": notes_region is not None
            },
            "unit_detection": {
                "detected_unit": detected_units.detected_unit.value,
                "confidence": detected_units.confidence,
                # Derived from the detected unit, not from the
                # 'conversionApplied' flag in drawing_evidence.
                "conversion_applied": detected_units.detected_unit == DrawingUnits.INCHES
            },
            "canonicalization": {"callouts_canonicalized": canonical_count, "total_callouts": len(callouts)}
        }
    }

# Progress message plus a usage hint, emitted once run_qc_pipeline is defined.
print("Main pipeline function with all upgrades defined.")
print("\nUsage: result = run_qc_pipeline(DRAWING_PDF_PATH, mode=MODE)")

In [None]:
# ============================================================
# SECTION 8A: Output Saving Functions
# ============================================================

def save_outputs(
    resolved_identity: ResolvedPartIdentity,
    drawing_evidence: Dict[str, Any],
    diff_result: Dict[str, Any],
    qc_report: str,
    output_dir: str
) -> Dict[str, str]:
    """
    Write the four pipeline artifacts into ``output_dir``.

    Files are named ``<partNumber>_<drawingName>_<Artifact>`` where the
    drawing name comes from the module-level ``DRAWING_PDF_PATH`` and any
    character outside ``[a-zA-Z0-9_-]`` in the prefix is replaced by ``_``.
    After the evidence file is written, the evidence is validated against
    ``DRAWING_EVIDENCE_SCHEMA`` (when one is set); a validation failure is
    only reported on stdout.

    Returns a dict mapping ``identity``, ``evidence``, ``diff`` and
    ``report`` to the saved file paths.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Filename prefix: part number + drawing stem, sanitised for the filesystem.
    part_number = resolved_identity.resolvedPartNumber or "unknown"
    drawing_stem, _ = os.path.splitext(os.path.basename(DRAWING_PDF_PATH))
    prefix = re.sub(r'[^a-zA-Z0-9_-]', '_', f"{part_number}_{drawing_stem}")

    paths = {}

    def persist(key, suffix, emit):
        # Open the target, let `emit` write into it, then record and announce it.
        target = os.path.join(output_dir, f"{prefix}_{suffix}")
        with open(target, 'w', encoding='utf-8') as handle:
            emit(handle)
        paths[key] = target
        print(f"  Saved: {target}")

    # 1. ResolvedPartIdentity.json
    persist(
        'identity', "ResolvedPartIdentity.json",
        lambda fh: json.dump(asdict(resolved_identity), fh, indent=2),
    )

    # 2. DrawingEvidence.json
    persist(
        'evidence', "DrawingEvidence.json",
        lambda fh: json.dump(drawing_evidence, fh, indent=2),
    )

    # Schema check runs right after the evidence file is written.
    if DRAWING_EVIDENCE_SCHEMA:
        try:
            jsonschema.validate(drawing_evidence, DRAWING_EVIDENCE_SCHEMA)
        except jsonschema.ValidationError as err:
            print(f"  Schema validation: FAILED - {err.message}")
        else:
            print("  Schema validation: PASSED")

    # 3. DiffResult.json
    persist(
        'diff', "DiffResult.json",
        lambda fh: json.dump(diff_result, fh, indent=2),
    )

    # 4. QCReport.md
    persist('report', "QCReport.md", lambda fh: fh.write(qc_report))

    return paths

# Progress message emitted once the definitions above have been executed.
print("Output saving function defined.")

In [None]:
# ============================================================
# SECTION 8C: Run Pipeline (Upload and Execute)
# ============================================================
from google.colab import files
from IPython.display import display, Markdown

# Upload PDF if not set: when DRAWING_PDF_PATH is empty or does not exist,
# prompt for an upload and use the first uploaded file.
if not DRAWING_PDF_PATH or not os.path.exists(DRAWING_PDF_PATH):
    print("Upload a drawing PDF to inspect:")
    uploaded = files.upload()

    if uploaded:
        DRAWING_PDF_PATH = next(iter(uploaded))
        print(f"\nUsing: {DRAWING_PDF_PATH}")
    else:
        print("No file uploaded.")

# Run pipeline
if DRAWING_PDF_PATH and os.path.exists(DRAWING_PDF_PATH):
    result = run_qc_pipeline(DRAWING_PDF_PATH, mode=MODE)

    # Display report
    if 'qc_report' in result:
        print("\n" + "="*60)
        print("QC REPORT")
        print("="*60)
        display(Markdown(result['qc_report']))
    elif 'error' in result:
        # The pipeline returns {"error": ...} instead of a report when it
        # cannot proceed; surface that rather than finishing silently.
        print(f"\nPipeline failed: {result['error']}")
else:
    print("No PDF file available. Set DRAWING_PDF_PATH or upload a file.")