# AI Engineering Drawing Inspector (Single File)

Streamlined inspector using:
- **Qwen2-VL-7B** for visual reasoning (smaller, faster)
- **LightOnOCR-2** for structured text extraction
- **RAG** retrieval from the ASME Y14.5 standard


## 1. Setup

In [None]:
# ============================================================
# CELL 1A: Install Dependencies & HuggingFace Login
# ============================================================
# Core ML dependencies
!pip install -q accelerate
!pip install -q qwen-vl-utils
!pip install -q pdf2image
!pip install -q faiss-cpu sentence-transformers
!pip install -q bitsandbytes
!apt-get install -y poppler-utils > /dev/null 2>&1

# Production Pipeline Dependencies
!pip install -q pymupdf opencv-python-headless

# LightOnOCR-2 requires transformers from source
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q pillow pypdfium2 huggingface_hub

print("All packages installed!")

# HuggingFace Authentication
from huggingface_hub import login
from google.colab import userdata

# Authenticate against HuggingFace with the HF_TOKEN Colab secret.
# Any failure is reported together with a setup hint; the notebook keeps going.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
except Exception as login_error:
    for _line in (
        f"HF Login failed: {login_error}",
        "Set HF_TOKEN in Colab Secrets (key icon in left sidebar)",
    ):
        print(_line)
else:
    print("Logged in to HuggingFace!")


In [None]:
# ============================================================
# CELL 1B: Import Libraries
# ============================================================
import os
import json
import re
import pickle
import gc
import torch
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
from qwen_vl_utils import process_vision_info

# Production Pipeline Imports
import fitz  # PyMuPDF
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any

def clear_gpu_memory():
    """Run the garbage collector, then empty and synchronize the CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()

# Report the runtime environment; GPU details are printed only when CUDA is usable.
_cuda_ok = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {_cuda_ok}")
if _cuda_ok:
    _gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {_gpu_props.total_memory / 1e9:.1f} GB")


## 2. Load Models

In [None]:
# ============================================================
# CELL 2A: Load LightOnOCR-2 (Small OCR Model - Load First)
# ============================================================
import torch
from transformers import LightOnOcrForConditionalGeneration, LightOnOcrProcessor
from PIL import Image
import numpy as np
from google.colab import userdata

# Clear any leftover GPU memory
clear_gpu_memory()

# Get HF token. A missing or inaccessible secret is not fatal: fall back to
# anonymous access. Catch Exception (not a bare ``except``) so that
# KeyboardInterrupt / SystemExit still propagate, and say why the token is absent.
try:
    hf_token = userdata.get('HF_TOKEN')
except Exception as token_error:
    print(f"HF_TOKEN unavailable ({token_error}); continuing without a token.")
    hf_token = None

print("Loading LightOnOCR-2-1B (VLM-based OCR)...")

# bfloat16 on GPU, float32 on CPU
ocr_device = "cuda" if torch.cuda.is_available() else "cpu"
ocr_dtype = torch.bfloat16 if ocr_device == "cuda" else torch.float32

ocr_model = LightOnOcrForConditionalGeneration.from_pretrained(
    "lightonai/LightOnOCR-2-1B",
    torch_dtype=ocr_dtype,
    token=hf_token
).to(ocr_device)

ocr_processor = LightOnOcrProcessor.from_pretrained(
    "lightonai/LightOnOCR-2-1B",
    token=hf_token
)

print(f"LightOnOCR-2 loaded: {ocr_model.get_memory_footprint() / 1e9:.2f} GB")

def get_drawing_text_ocr(image_input):
    """Transcribe a drawing with LightOnOCR-2.

    Accepts a NumPy array or an object with a PIL-style ``convert`` method and
    returns the stripped, non-empty lines of the decoded model output. Any
    failure is printed and produces an empty list.
    """
    try:
        pil_img = Image.fromarray(image_input) if isinstance(image_input, np.ndarray) else image_input
        pil_img = pil_img.convert("RGB")

        chat = [{"role": "user", "content": [{"type": "image", "image": pil_img}]}]
        encoded = ocr_processor.apply_chat_template(
            chat,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )

        # Every tensor moves to the OCR device; only floating-point tensors
        # are also cast to the model dtype.
        model_inputs = {}
        for key, tensor in encoded.items():
            if tensor.is_floating_point():
                model_inputs[key] = tensor.to(device=ocr_device, dtype=ocr_dtype)
            else:
                model_inputs[key] = tensor.to(ocr_device)

        with torch.no_grad():
            sequences = ocr_model.generate(**model_inputs, max_new_tokens=2048)

        # Drop the prompt tokens so only newly generated text is decoded.
        prompt_len = model_inputs["input_ids"].shape[1]
        decoded = ocr_processor.decode(sequences[0, prompt_len:], skip_special_tokens=True)

        text_lines = []
        for raw_line in decoded.split("\n"):
            cleaned = raw_line.strip()
            if cleaned:
                text_lines.append(cleaned)
        return text_lines
    except Exception as e:
        print(f"LightOnOCR Error: {e}")
        return []

print("LightOnOCR-2 Ready!")


In [None]:
# ============================================================
# CELL 2B: Load Qwen2-VL Model (7B - Faster & Lighter)
# ============================================================
import torch
import gc
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from google.colab import userdata

# Clear GPU memory
gc.collect()
torch.cuda.empty_cache()

# A missing or inaccessible secret is not fatal: fall back to anonymous access.
# Catch Exception (not a bare ``except``) so KeyboardInterrupt / SystemExit
# still propagate, and say why the token is absent.
try:
    hf_token = userdata.get('HF_TOKEN')
except Exception as token_error:
    print(f"HF_TOKEN unavailable ({token_error}); continuing without a token.")
    hf_token = None

# Using 7B model - much faster to download and load
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

print(f"Loading {MODEL_ID}...")
print(f"GPU memory before: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

# 4-bit quantization config (NF4 weights, float16 compute, double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=hf_token,
    torch_dtype=torch.float16
)

processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=hf_token
)

print("Qwen2-VL-7B loaded!")
print(f"GPU memory after: {torch.cuda.memory_allocated() / 1e9:.2f} GB")


## 3. Load Context Databases

In [None]:
# ============================================================
# CELL 3A: Upload Configuration Files
# ============================================================
import os
import zipfile
from google.colab import files

MAPPING_FILE = "400S_file_part_mapping.json"
STRUCTURE_FILE = "400S_detailed_structure_fixed.json"
RAG_INDEX_FILE = "asme_visual_index.pkl"

print("="*60)
print("STEP 1: Upload Configuration Files")
print("="*60)

def locate_file(filename):
    """Return the absolute path of *filename*, or None when it cannot be found.

    The name is checked as given first, then inside the ``rag_data`` folder.
    """
    for candidate in (filename, os.path.join("rag_data", filename)):
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None

# Resolve each configuration file (as given first, then under rag_data/).
FILE_MAPPING_PATH = locate_file(MAPPING_FILE)
STRUCTURE_PATH = locate_file(STRUCTURE_FILE)
RAG_INDEX_PATH = locate_file(RAG_INDEX_FILE)

missing_files = [
    required_name
    for required_name, resolved_path in (
        (MAPPING_FILE, FILE_MAPPING_PATH),
        (STRUCTURE_FILE, STRUCTURE_PATH),
        (RAG_INDEX_FILE, RAG_INDEX_PATH),
    )
    if not resolved_path
]

if missing_files:
    print(f"\nMissing files: {', '.join(missing_files)}")
    print("\nPlease upload the required files (or a ZIP containing them):")
    uploaded = files.upload()

    # Only the first ZIP among the uploads is unpacked (into rag_data/).
    for filename in uploaded:
        if not filename.lower().endswith('.zip'):
            continue
        print(f"\nExtracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall("rag_data")
        print("Extraction complete.")
        break

    # Re-resolve after the upload. The two JSON paths fall back to the
    # working-directory location; the RAG index stays None when absent.
    FILE_MAPPING_PATH = locate_file(MAPPING_FILE) or os.path.abspath(MAPPING_FILE)
    STRUCTURE_PATH = locate_file(STRUCTURE_FILE) or os.path.abspath(STRUCTURE_FILE)
    RAG_INDEX_PATH = locate_file(RAG_INDEX_FILE)

DATA_DIR = os.path.dirname(FILE_MAPPING_PATH) if FILE_MAPPING_PATH else "/content"

print("\n" + "="*60)
print("FILE STATUS:")
print("="*60)
for _label, _path, _missing_mark in (
    ("File Mapping:  ", FILE_MAPPING_PATH, '❌ MISSING'),
    ("Structure:     ", STRUCTURE_PATH, '❌ MISSING'),
    ("RAG Index:     ", RAG_INDEX_PATH, '⚠️ MISSING'),
):
    _status = '✅ OK' if _path and os.path.exists(_path) else _missing_mark
    print(f"{_label}{_status}")
print(f"\nData directory: {DATA_DIR}")

In [None]:
# ============================================================
# CELL 3B: Load Part Context Databases
# ============================================================

def normalize_pn(pn):
    """Build a lookup key from a part number: hyphens and whitespace removed, lower-cased."""
    as_text = str(pn)
    without_separators = re.sub(r'[-\s]', '', as_text)
    return without_separators.lower()

def load_context_databases():
    """Load the file→part-number mapping and build the part context database.

    Reads the JSON files at ``FILE_MAPPING_PATH`` and ``STRUCTURE_PATH``.

    Returns:
        tuple: ``(filename_to_pn, part_context_db)`` where

        * ``filename_to_pn`` maps each mapped file name (as listed, and with
          ``.pdf`` / ``.PDF`` appended) to its part number; entries with an
          empty or missing part number are skipped.
        * ``part_context_db`` maps both the normalized and the raw part number
          to one shared dict with keys ``pn``, ``description``, ``assembly``,
          ``siblings`` (display string, at most 12 entries) and
          ``siblings_list`` (all sibling part numbers).
    """
    print("Loading file mapping...")
    with open(FILE_MAPPING_PATH, 'r', encoding='utf-8') as f:
        file_mapping_list = json.load(f)

    filename_to_pn = {}
    for entry in file_mapping_list:
        filename = entry['file']
        pn = entry.get('pn')
        if pn:
            filename_to_pn[filename] = pn
            filename_to_pn[filename + '.pdf'] = pn
            filename_to_pn[filename + '.PDF'] = pn

    print(f"  Loaded {len(file_mapping_list)} file mappings")

    print("Loading part structure...")
    with open(STRUCTURE_PATH, 'r', encoding='utf-8') as f:
        structure_data = json.load(f)

    print("Building part context database...")
    part_context_db = {}
    # Count parts by normalized key. len(db) // 2 would undercount whenever a
    # part number is already in normalized form (raw key == normalized key).
    unique_keys = set()

    for assembly_name, parts_list in structure_data.items():
        # Format every part's label once per assembly instead of once per pair.
        labelled_parts = []
        for part in parts_list:
            safe_desc = str(part['desc']).replace('"', "'")
            labelled_parts.append((part['pn'], f"{part['pn']} ({safe_desc})"))

        for part in parts_list:
            pn = part['pn']
            desc = part['desc']

            # Siblings are all other parts of the same assembly.
            siblings_list = [label for sib_pn, label in labelled_parts if sib_pn != pn]
            siblings_pns = [sib_pn for sib_pn, _ in labelled_parts if sib_pn != pn]

            siblings_str = "; ".join(siblings_list[:12])
            if len(siblings_list) > 12:
                siblings_str += f"... and {len(siblings_list) - 12} more"

            lookup_key = normalize_pn(pn)
            unique_keys.add(lookup_key)

            part_context_db[lookup_key] = {
                'pn': pn,
                'description': desc,
                'assembly': assembly_name,
                'siblings': siblings_str,
                'siblings_list': siblings_pns
            }
            # The raw part number is an alias for the same record.
            part_context_db[pn] = part_context_db[lookup_key]

    print(f"  Built context for {len(unique_keys)} unique parts")
    return filename_to_pn, part_context_db

filename_to_pn, part_context_db = load_context_databases()
print("\n✅ Context databases loaded successfully!")

In [None]:
# ============================================================
# CELL 3E: Load RAG Index & Visual Database
# ============================================================
import os
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np

rag_data = []
rag_embeddings = None
rag_available = False
RAG_IMAGE_DIR = None

print("="*60)
print("RAG SYSTEM SETUP")
print("="*60)

print("\n[STEP 1/3] Loading CLIP model...")
search_model = SentenceTransformer('clip-ViT-B-32')
print("  ✅ CLIP model loaded!")

print("\n[STEP 2/3] Loading RAG Index...")
index_loaded = False

# SECURITY NOTE: pickle.load() can execute arbitrary code embedded in the file.
# Only load an index you built yourself or received from a trusted source.

# Check multiple locations for the index file; a path already resolved by an
# earlier cell is tried first.
index_locations = [
    "/content/asme_visual_index.pkl",
    "/content/rag_data/asme_visual_index.pkl",
    "asme_visual_index.pkl",
]
if 'RAG_INDEX_PATH' in dir() and RAG_INDEX_PATH:
    index_locations.insert(0, RAG_INDEX_PATH)

for idx_path in index_locations:
    if idx_path and os.path.exists(idx_path):
        print(f"  ✅ Found: {idx_path}")
        with open(idx_path, 'rb') as f:
            rag_data = pickle.load(f)
        RAG_INDEX_PATH = idx_path
        index_loaded = True
        break

if not index_loaded:
    print("  ❌ No index found. Please upload asme_visual_index.pkl:")
    from google.colab import files
    try:
        uploaded = files.upload()
        for filename in uploaded:
            if filename.lower().endswith('.pkl'):
                with open(filename, 'rb') as f:
                    rag_data = pickle.load(f)
                # Record where the index came from, as the search loop above does.
                RAG_INDEX_PATH = os.path.abspath(filename)
                index_loaded = True
                print(f"  ✅ Loaded: {filename}")
                break
    except Exception as upload_error:
        # Best effort: report the reason instead of hiding it; the summary
        # below then shows NOT READY.
        print(f"  ⚠️ Index upload failed: {upload_error}")

print("\n[STEP 3/3] Looking for RAG Visual Database...")

# Check multiple locations for the image folder
image_locations = [
    "/content/rag_visual_db",
    "/content/rag_data/rag_visual_db",
    "rag_visual_db",
]
if 'DATA_DIR' in dir() and DATA_DIR:
    image_locations.insert(0, os.path.join(DATA_DIR, "rag_visual_db"))

found_images = False
for loc in image_locations:
    if loc and os.path.exists(loc) and os.path.isdir(loc):
        # Count images
        img_files = [f for f in os.listdir(loc) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
        if len(img_files) > 0:
            RAG_IMAGE_DIR = os.path.abspath(loc)
            found_images = True
            print(f"  ✅ Found: {RAG_IMAGE_DIR} ({len(img_files)} images)")
            break

if not found_images:
    print("  ❌ No images found. Please upload rag_visual_db.zip:")
    from google.colab import files
    import zipfile, shutil
    try:
        uploaded = files.upload()
        for filename in uploaded:
            if filename.lower().endswith('.zip'):
                RAG_IMAGE_DIR = "/content/rag_visual_db"
                # Start from an empty folder so stale images are not mixed in.
                if os.path.exists(RAG_IMAGE_DIR):
                    shutil.rmtree(RAG_IMAGE_DIR)
                os.makedirs(RAG_IMAGE_DIR, exist_ok=True)
                with zipfile.ZipFile(filename, 'r') as zf:
                    zf.extractall(RAG_IMAGE_DIR)
                found_images = True
                print(f"  ✅ Extracted to {RAG_IMAGE_DIR}")
                break
    except Exception as upload_error:
        print(f"  ⚠️ Image upload failed: {upload_error}")
        # Keep the default location (presumably so images placed there later
        # are still picked up).
        RAG_IMAGE_DIR = "/content/rag_visual_db"

# Build search index
print("\n" + "="*60)
if index_loaded and len(rag_data) > 0:
    embeddings_list = [item['embedding'] for item in rag_data]
    rag_embeddings = np.array(embeddings_list).astype('float32')
    rag_available = True
    print("✅ RAG SYSTEM: READY")
    print(f"  Index: {len(rag_data)} entries")
    print(f"  Images: {RAG_IMAGE_DIR}")
    if not found_images:
        print("  ⚠️ No page images were located; retrieval may return no images.")
else:
    print("❌ RAG SYSTEM: NOT READY")
print("="*60)

## 4. Helper Functions

In [None]:
# ============================================================
# CELL 4A: Core Helper Functions
# ============================================================
import os
import re
from pdf2image import convert_from_path

def extract_filename_key(filepath):
    """Derive the lookup key from a drawing path.

    Drops the directory and extension, then a trailing copy marker such as
    "(1)", then a trailing "Paint" suffix (any case, with leading spaces or
    underscores), and finally surrounding whitespace.
    """
    stem, _extension = os.path.splitext(os.path.basename(filepath))
    without_copy_marker = re.sub(r'\s*\(\d+\)$', '', stem)  # Remove (1), (2) etc
    without_paint = re.sub(
        r'[\s_]*(Paint|PAINT)$', '', without_copy_marker, flags=re.IGNORECASE
    )
    return without_paint.strip()

def get_part_context(filepath):
    """Resolve a drawing path to ``(part_number, context_dict)``.

    The filename key is tried as-is, then with ".pdf" and ".PDF" appended. A
    candidate is accepted only when it is in ``filename_to_pn`` and its
    normalized part number is in ``part_context_db``; otherwise the result is
    ``(None, None)``.
    """
    base_key = extract_filename_key(filepath)

    for candidate in (base_key, base_key + '.pdf', base_key + '.PDF'):
        if candidate not in filename_to_pn:
            continue
        part_number = filename_to_pn[candidate]
        db_key = normalize_pn(part_number)
        if db_key in part_context_db:
            return part_number, part_context_db[db_key]

    return None, None

def build_context_string(pn, context):
    """Compose the four-line context paragraph for the inspection prompt.

    A ``None`` context yields a fixed "Unknown Part" line. Otherwise the
    paragraph names the part, its assembly and its mating parts, using
    placeholder text for any key missing from *context*.
    """
    if context is None:
        return "CONTEXT: Unknown Part (General Syntax Check Only)."

    part_desc = context.get('description', 'Unknown')
    parent_assembly = context.get('assembly', 'Unknown Assembly')
    mating_parts = context.get('siblings', 'None listed')

    prompt_lines = [
        f"CONTEXT: This is Part {pn} ({part_desc}).",
        f"It belongs to the {parent_assembly}.",
        f"It must assemble with these mating parts: {mating_parts}.",
        f"CRITICAL: Check for mating tolerances suitable for a {part_desc}.",
    ]
    return "\n".join(prompt_lines)

def pdf_to_image(pdf_path, dpi=150):
    """Rasterize page 1 of a PDF via pdf2image; None when no page comes back."""
    rendered = convert_from_path(pdf_path, dpi=dpi, first_page=1, last_page=1)
    if not rendered:
        return None
    return rendered[0]

print("✅ Core helper functions defined.")

In [None]:
# ============================================================
# CELL 4B: Model Query Function
# ============================================================
import torch
from qwen_vl_utils import process_vision_info

def query_model(messages, max_tokens=1024):
    """Run one chat turn through Qwen2-VL and return the stripped reply text.

    Raises RuntimeError when the module-level ``model`` or ``processor`` has
    not been created yet.
    """
    loaded_names = globals()
    if not ('model' in loaded_names and 'processor' in loaded_names):
        raise RuntimeError("⚠️ Model not loaded. Run Cell 2 first.")

    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    images, videos = process_vision_info(messages)

    batch = processor(
        text=[prompt],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt"
    )
    batch = batch.to(model.device)

    with torch.no_grad():
        sequences = model.generate(**batch, max_new_tokens=max_tokens, do_sample=False)

    # Keep only the tokens produced after the prompt.
    prompt_len = batch.input_ids.shape[1]
    new_tokens = sequences[:, prompt_len:]
    decoded = processor.batch_decode(
        new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0].strip()

print("✅ Model query function defined.")

In [None]:
# ============================================================
# CELL 4C: RAG Retrieval Function
# ============================================================

def retrieve_asme_pages(keywords, top_k=2):
    """Retrieve the ASME standard page images that best match *keywords*.

    Args:
        keywords: Query text; encoded with ``search_model``.
        top_k: Maximum number of index entries to consider. Values <= 0
            return an empty list.

    Returns:
        list: RGB PIL images for the top-scoring entries whose image file
        could be opened (possibly fewer than ``top_k``). Empty when the RAG
        system is unavailable, ``RAG_IMAGE_DIR`` is unset, or an error occurs.
    """
    if not rag_available or rag_embeddings is None:
        print("  ⚠️ RAG system not available")
        return []

    if RAG_IMAGE_DIR is None:
        print("  ⚠️ RAG_IMAGE_DIR not set")
        return []

    # Guard explicitly: a slice of [-0:] would select EVERY entry, not none.
    if top_k <= 0:
        return []

    try:
        query_vector = search_model.encode([keywords])
        # NOTE(review): raw dot product, not cosine similarity. The ranking only
        # equals cosine ranking if the indexed embeddings are unit-normalized;
        # confirm how the index was built.
        scores = np.dot(query_vector, rag_embeddings.T).flatten()
        top_indices = np.argsort(scores)[-top_k:][::-1]

        retrieved_images = []
        print(f"  RAG Search: '{keywords[:50]}...'")

        for idx in top_indices:
            item = rag_data[idx]
            # Index paths may use Windows separators.
            rel_path = item['path'].replace('\\', '/')

            # Try the stored relative path first, then the bare file name.
            # dict.fromkeys drops the duplicate when the path has no folder part.
            paths_to_try = list(dict.fromkeys([
                os.path.join(RAG_IMAGE_DIR, rel_path),
                os.path.join(RAG_IMAGE_DIR, os.path.basename(rel_path)),
            ]))

            print(f"    - {os.path.basename(rel_path)} (Score: {scores[idx]:.3f})")

            for try_path in paths_to_try:
                if os.path.exists(try_path):
                    try:
                        # Close the file handle promptly; convert() returns an
                        # independent in-memory image.
                        with Image.open(try_path) as source_img:
                            retrieved_images.append(source_img.convert('RGB'))
                        break
                    except Exception as e:
                        print(f"      Error: {e}")
            else:
                # No candidate path produced an image for this hit.
                print(f"      Image not found under {RAG_IMAGE_DIR}")

        return retrieved_images

    except Exception as e:
        print(f"  RAG error: {e}")
        return []

print("✅ RAG retrieval function defined.")

In [None]:
# ============================================================
# CELL 4D: Production Pipeline Helpers
# ============================================================
import fitz  # PyMuPDF
from PIL import Image
import numpy as np
from typing import List, Tuple

print("Initializing Production Pipeline...")

def render_pdf_page(pdf_path: str, dpi: int = 300) -> "Image.Image | None":
    """Render the first page of a PDF to an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Target resolution; the page is scaled by ``dpi / 72``.

    Returns:
        The rendered page, or ``None`` if the file cannot be opened or
        rendered (the error is printed).
    """
    try:
        # The context manager closes the document even when rendering raises.
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(0)
            zoom = dpi / 72.0
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            # Copy the pixels into a PIL image while the document is still open.
            return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    except Exception as e:
        print(f"Rendering Error: {e}")
        return None

def run_lighton_ocr(img: Image.Image) -> List[str]:
    """Run LightOnOCR-2 on an image.

    Delegates to get_drawing_text_ocr() and returns its result unchanged.
    """
    return get_drawing_text_ocr(img)

# Alias for backwards compatibility
run_tesseract_ocr = run_lighton_ocr

def make_overlapping_tiles(full_img: Image.Image) -> List[Tuple[str, Image.Image]]:
    """Cut the image into four named corner tiles that overlap in the middle.

    Each tile spans half the width/height plus a margin equal to 15% of the
    shorter image side. Returns ``(name, tile)`` pairs in the order Top-Left,
    Top-Right, Bottom-Left, Bottom-Right.
    """
    width, height = full_img.size
    margin = int(min(width, height) * 0.15)
    span_x = width // 2 + margin
    span_y = height // 2 + margin

    tiles = []
    for label, box in (
        ("Top-Left", (0, 0, span_x, span_y)),
        ("Top-Right", (width - span_x, 0, width, span_y)),
        ("Bottom-Left", (0, height - span_y, span_x, height)),
        ("Bottom-Right", (width - span_x, height - span_y, width, height)),
    ):
        tiles.append((label, full_img.crop(box)))
    return tiles

print("Production Pipeline Ready.")


## 5. Main Inspection Function

In [None]:
# ============================================================
# CELL 5B: UNIVERSAL MISMATCH INSPECTOR (Batch-Ready)
# ============================================================

def inspect_drawing_universal(pdf_path):
    """
    Universal inspector that dynamically loads context for each file.
    Batch-ready with robust error handling.

    Steps: look up the part context for the file, render page 1 at 300 DPI,
    OCR it, keep OCR lines containing a digit (first 120 go into the prompt),
    then ask the vision model for a compatibility truth table.

    Args:
        pdf_path: Path of the drawing PDF.

    Returns:
        dict: on success, keys ``response``, ``part_number``, ``part_name``,
        ``ocr_total``, ``ocr_filtered`` and ``token_count``; when the PDF
        cannot be rendered, ``{'error': ..., 'part_number': ...}``.
    """
    # 1. Get Dynamic Context for THIS specific file.
    # Defaults are used when the part is unknown or the lookup itself fails.
    pn = None
    ctx = None
    context_str = "No Mating Parts Found in Database."
    part_name = "Unknown Part"
    try:
        pn, ctx = get_part_context(pdf_path)
    except Exception as e:
        print(f"⚠️ Context Lookup Failed for {pdf_path}: {e}")
    else:
        if ctx is None:
            print(f"⚠️ Context Lookup Failed for {pdf_path}")
        else:
            # An empty sibling string or description also falls back to the default.
            context_str = ctx.get('siblings') or context_str
            part_name = ctx.get('description') or part_name

    print(f"\n{'='*60}\nUNIVERSAL INSPECTION: {part_name}\n{'='*60}")

    # --- Phase A: Perception ---
    print("[1/3] Reading Drawing (LightOnOCR-2)...")
    full_img = render_pdf_page(pdf_path, dpi=300)
    if full_img is None:
        return {'error': 'Failed to render PDF', 'part_number': pn}

    # Run OCR (run_tesseract_ocr is a legacy alias of the LightOnOCR-2 wrapper)
    ocr_texts = run_tesseract_ocr(full_img)
    # Filter for relevant engineering text (numbers/dims)
    filtered_ocr = [t for t in ocr_texts if any(char.isdigit() for char in t)]
    ocr_block = "\n".join([f"- {t}" for t in filtered_ocr[:120]])

    print(f"  > Evidence: {len(filtered_ocr)} relevant lines found.")

    # --- Phase B: Reasoning ---
    print("[2/3] Generating Dynamic Truth Table...")

    system_prompt = """You are a Universal Engineering Auditor.

**YOUR GOAL:**
Cross-reference "LIST A" (Requirements) against "LIST B" (Drawing Evidence).

**LOGIC PROTOCOL:**
1. Read LIST A to identify the Mating Parts.
2. For EACH Mating Part, search LIST B for a corresponding feature (Thread, Hole, Diameter).
3. **STRICTLY** compare dimensions.
   - If List A says "3/4-16" and List B says "M10", output NO.
   - If List A says "0.750" and List B says "0.500", output NO.
4. If the Mating Part is generic (e.g. "WASHER"), and you see *any* washer dimension, you may output "LIKELY MATCH".
5. If you cannot find any matching text in List B, output "NOT FOUND"."""

    user_text = f"""**LIST A (THE REQUIREMENTS for Part {pn}):**
{context_str}

**LIST B (THE DRAWING TEXT):**
{ocr_block}

**TASK:**
Create a Truth Table checking the compatibility of the Mating Parts in List A against the Evidence in List B.

| Mating Part (from List A) | Found Feature in List B | Compatible? (YES/NO/NOT FOUND) |
| :--- | :--- | :--- |
| [Name/Spec of Part 1] | [Text from Drawing] | [Verdict] |
| [Name/Spec of Part 2] | [Text from Drawing] | [Verdict] |

**FINAL VERDICT:**
PASS if all critical features match.
FAIL if there is a direct contradiction (Metric vs Imperial)."""

    content_payload = [
        {'type': 'image', 'image': full_img},
        {'type': 'text', 'text': user_text}
    ]

    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': content_payload}
    ]

    # Inference
    print("[3/3] Running inference...")
    text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text_input],
        images=[full_img],
        return_tensors="pt",
        padding=True
    ).to(model.device)

    token_count = inputs.input_ids.shape[1]
    print(f"  Token count: {token_count}")

    # Greedy decoding without gradient tracking, matching query_model().
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=600, do_sample=False)

    # Decode only the newly generated tokens. Splitting the decoded text on
    # "assistant\n" would truncate any reply that contains that marker itself.
    new_token_ids = generated_ids[:, token_count:]
    response = processor.batch_decode(new_token_ids, skip_special_tokens=True)[0].strip()

    print(f"\n{'='*60}\nRESULT:\n{'='*60}")
    print(response)

    return {
        'response': response,
        'part_number': pn,
        'part_name': part_name,
        'ocr_total': len(ocr_texts),
        'ocr_filtered': len(filtered_ocr),
        'token_count': token_count
    }

# Aliases for compatibility
inspect_drawing_strict_optimized = inspect_drawing_universal
inspect_drawing_production = inspect_drawing_universal

print("✅ Universal Inspector Loaded (Batch-Ready).")

In [None]:
# ============================================================
# CELL 5D: MASTER PIPELINE - Full Inspection (Stage 1 + Stage 2)
# ============================================================
from datetime import datetime
import re

def parse_verdict(response_text):
    """
    Parse the truth table response to determine PASS/FAIL.

    Only verdicts that stand alone are counted: a whole table cell
    (``| YES |``) or the last word on a line (``Compatible: YES``), optionally
    wrapped in markdown bold. Text that merely mentions the words, such as the
    echoed table header ``Compatible? (YES/NO/NOT FOUND)``, is ignored.

    Args:
        response_text: Model reply; may be empty or None.

    Returns:
        tuple: ``(verdict, reason)`` where verdict is
        ``'ERROR'`` (empty reply), ``'FAIL'`` (any NO or NOT FOUND verdict),
        ``'PASS'`` (at least one YES and no NO / NOT FOUND), or
        ``'REVIEW'`` (no verdict recognised).
    """
    if not response_text:
        return 'ERROR', 'No response'

    text_upper = response_text.upper()

    # NOT FOUND is listed first so it is never read as a bare NO.
    verdict_token = r'\**(NOT FOUND|YES|NO)\**'
    # Whole-cell verdict. The closing pipe is a lookahead so that adjacent
    # cells (| YES | NO |) are both counted.
    cell_pattern = r'\|[ \t]*' + verdict_token + r'[ \t]*(?=\|)'
    # Verdict as the last word of a line. The lookbehind rejects words that
    # merely end in NO / YES.
    eol_pattern = r'(?<![A-Z0-9])' + verdict_token + r'[ \t\r]*$'

    counts = {'YES': 0, 'NO': 0, 'NOT FOUND': 0}
    for pattern in (cell_pattern, eol_pattern):
        for match in re.finditer(pattern, text_upper, re.MULTILINE):
            counts[match.group(1)] += 1

    yes_count = counts['YES']
    no_count = counts['NO']
    not_found = counts['NOT FOUND'] > 0

    # Determine verdict
    if no_count > 0 or not_found:
        issues = []
        if no_count > 0:
            issues.append(f"{no_count} mismatches")
        if not_found:
            issues.append("missing specs")
        return 'FAIL', '; '.join(issues)
    if yes_count > 0:
        return 'PASS', f"{yes_count} specs verified"
    return 'REVIEW', 'Could not parse verdict'


def run_full_inspection(pdf_path, skip_stage2_on_fail=True):
    """
    Master pipeline that runs both inspection stages:

    Stage 1 (Gatekeeper): Strict mismatch detection
    Stage 2 (Consultant): Improvement suggestions (only if Stage 1 passes)

    Args:
        pdf_path: Path of the drawing PDF.
        skip_stage2_on_fail: When True (default), a Stage 1 FAIL skips Stage 2.
            When False, Stage 2 still runs after a FAIL and the final verdict
            becomes 'REVIEW'.

    Returns:
        dict with keys 'file', 'path', 'timestamp', 'stage1', 'stage2' and
        'final_verdict'. 'final_verdict' is one of 'ERROR', 'FAIL',
        'PASS_WITH_SUGGESTIONS' or 'REVIEW'. When the part is not in the
        context database the function returns early with 'ERROR' and
        'stage2' left as None.
    """
    print(f"\n{'#'*60}")
    print(f"# FULL ENGINEERING INSPECTION PIPELINE")
    print(f"# File: {os.path.basename(pdf_path)}")
    print(f"# Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"{'#'*60}")

    # Result skeleton; the stage entries are filled in as each stage finishes.
    result = {
        'file': os.path.basename(pdf_path),
        'path': pdf_path,
        'timestamp': datetime.now().isoformat(),
        'stage1': None,
        'stage2': None,
        'final_verdict': None
    }

    # STAGE 1: THE GATEKEEPER
    print(f"\n{'='*60}")
    print("STAGE 1: THE GATEKEEPER (Mismatch Detection)")
    print(f"{'='*60}")

    pn, ctx = get_part_context(pdf_path)

    # Without database context there is nothing to cross-check: stop here.
    if not ctx:
        print(f"⚠️ No context found for {pdf_path}")
        result['stage1'] = {'verdict': 'ERROR', 'reason': 'Part not in database'}
        result['final_verdict'] = 'ERROR'
        return result

    # Context subset handed to Stage 2 and used in the final summary.
    context_data = {
        'pn': pn,
        'description': ctx.get('description', 'Unknown'),
        'assembly': ctx.get('assembly', 'Unknown'),
        'siblings': ctx.get('siblings', 'No mating parts')
    }

    stage1_result = inspect_drawing_universal(pdf_path)

    # An 'error' key means the inspector could not produce a response at all.
    if isinstance(stage1_result, dict) and 'error' in stage1_result:
        stage1_verdict = 'ERROR'
        stage1_reason = stage1_result['error']
        response = ''
    else:
        response = stage1_result.get('response', '') if isinstance(stage1_result, dict) else str(stage1_result)
        stage1_verdict, stage1_reason = parse_verdict(response)

    # Only the first 1000 characters of the model response are stored.
    result['stage1'] = {
        'verdict': stage1_verdict,
        'reason': stage1_reason,
        'part_number': pn,
        'part_name': context_data['description'],
        'ocr_count': stage1_result.get('ocr_filtered', 0) if isinstance(stage1_result, dict) else 0,
        'response': response[:1000]
    }

    print(f"\n>>> STAGE 1 VERDICT: {stage1_verdict} - {stage1_reason}")

    # STAGE 2: THE CONSULTANT
    if stage1_verdict == 'FAIL' and skip_stage2_on_fail:
        print(f"\n{'='*60}")
        print("STAGE 2: SKIPPED (Stage 1 Failed)")
        print(f"{'='*60}")
        print("Fix the Stage 1 issues before requesting improvement suggestions.")
        result['stage2'] = {'status': 'SKIPPED', 'reason': 'Stage 1 failed'}
        result['final_verdict'] = 'FAIL'

    elif stage1_verdict == 'ERROR':
        print(f"\n{'='*60}")
        print("STAGE 2: SKIPPED (Stage 1 Error)")
        print(f"{'='*60}")
        result['stage2'] = {'status': 'SKIPPED', 'reason': 'Stage 1 error'}
        result['final_verdict'] = 'ERROR'

    else:
        # Reached for PASS and REVIEW, and for FAIL when skip_stage2_on_fail is False.
        print(f"\n{'='*60}")
        print("STAGE 2: THE CONSULTANT (Proceeding...)")
        print(f"{'='*60}")

        # Stage 1 does not return its OCR lines, so the page is rendered again
        # (at 200 DPI) and OCR is re-run; lines without a digit are dropped.
        ocr_text_list = []
        if isinstance(stage1_result, dict):
            full_img = render_pdf_page(pdf_path, dpi=200)
            if full_img:
                ocr_text_list = run_tesseract_ocr(full_img)
                ocr_text_list = [t for t in ocr_text_list if any(c.isdigit() for c in t)]

        # NOTE(review): suggest_improvements_stage2 is not defined in the visible
        # part of this file. If it is missing at run time, the resulting
        # NameError is caught below and recorded as a Stage 2 error.
        try:
            stage2_result = suggest_improvements_stage2(
                pdf_path=pdf_path,
                context_data=context_data,
                ocr_text_list=ocr_text_list
            )

            result['stage2'] = {
                'status': 'COMPLETED',
                'suggestions': stage2_result.get('suggestions', ''),
                'asme_pages_used': stage2_result.get('asme_pages_used', 0),
                'search_queries': stage2_result.get('search_queries', [])
            }

        except Exception as e:
            print(f"⚠️ Stage 2 Error: {e}")
            result['stage2'] = {'status': 'ERROR', 'reason': str(e)}

        # The final verdict depends only on Stage 1; a Stage 2 error does not change it.
        result['final_verdict'] = 'PASS_WITH_SUGGESTIONS' if stage1_verdict == 'PASS' else 'REVIEW'

    # FINAL SUMMARY
    print(f"\n{'#'*60}")
    print("# FINAL INSPECTION SUMMARY")
    print(f"{'#'*60}")
    print(f"Part: {pn} ({context_data['description']})")
    print(f"Stage 1 (Gatekeeper): {result['stage1']['verdict']}")
    print(f"Stage 2 (Consultant): {result['stage2'].get('status', 'N/A')}")
    print(f"Final Verdict: {result['final_verdict']}")
    print(f"{'#'*60}\n")

    return result


def run_full_inspection_batch(drawing_folder, output_file=None, limit=None):
    """Run full inspection on all PDFs in a folder.

    Args:
        drawing_folder: Folder searched recursively for ``*.pdf`` / ``*.PDF``.
        output_file: Path of the JSON results file. Defaults to a timestamped
            ``full_inspection_YYYYmmdd_HHMMSS.json`` in the working directory.
        limit: When truthy, only the first ``limit`` files (sorted by path)
            are inspected.

    Returns:
        list: one result dict per file. A file whose inspection raised gets
        ``{'file': ..., 'final_verdict': 'ERROR', 'error': ...}``.
    """
    # glob is not imported at file level; import it here so the function is
    # self-contained.
    import glob

    # A set removes duplicates on case-insensitive filesystems, where both
    # patterns match the same file.
    found_paths = set()
    for pattern in ("**/*.pdf", "**/*.PDF"):
        found_paths.update(glob.glob(os.path.join(drawing_folder, pattern), recursive=True))
    pdf_files = sorted(found_paths)

    if limit:
        pdf_files = pdf_files[:limit]

    print(f"{'#'*60}")
    print(f"FULL INSPECTION BATCH: {len(pdf_files)} files")
    print(f"{'#'*60}\n")

    # The progress bar is optional: tqdm.notebook raises ImportError both when
    # tqdm is missing and when ipywidgets is unavailable at construction time.
    try:
        from tqdm.notebook import tqdm
        progress = tqdm(pdf_files, desc="Full Inspection")
    except ImportError:
        progress = pdf_files

    results = []

    for pdf_path in progress:
        try:
            results.append(run_full_inspection(pdf_path))
        except Exception as e:
            # One bad drawing must not abort the batch; record it and move on.
            results.append({'file': os.path.basename(pdf_path), 'final_verdict': 'ERROR', 'error': str(e)})

    if not output_file:
        output_file = f"full_inspection_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    verdicts = [r.get('final_verdict', 'ERROR') for r in results]
    print(f"\n{'='*60}")
    print(f"BATCH SUMMARY: {len(results)} files")
    print(f"PASS: {verdicts.count('PASS_WITH_SUGGESTIONS')} | FAIL: {verdicts.count('FAIL')} | ERROR: {verdicts.count('ERROR')}")
    print(f"Results: {output_file}")

    return results

print("✅ parse_verdict() defined.")
print("✅ run_full_inspection() defined.")
print("✅ run_full_inspection_batch() defined.")


## 6. Inspect a Drawing

In [None]:
# Upload and inspect a single PDF drawing
from google.colab import files

print("Upload a PDF drawing to inspect:")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is inspected.
    pdf_path = next(iter(uploaded))
    print(f"\nInspecting: {pdf_path}")
    result = run_full_inspection(pdf_path)

    # Display result summary
    print("\n" + "="*60)
    print("INSPECTION COMPLETE")
    print("="*60)
    # run_full_inspection() stores its outcome under 'final_verdict' and the
    # Stage 1 explanation under result['stage1']['reason'].
    print(f"Verdict: {result.get('final_verdict')}")
    stage1_info = result.get('stage1') or {}
    if stage1_info.get('reason'):
        print(f"Reason: {stage1_info['reason']}")
else:
    print("No file uploaded.")
