# Test Full Pipeline: End-to-End (M11)

This notebook runs the complete YOLO-OBB pipeline end-to-end:

```
image -> detect -> crop -> rotate+OCR -> parse -> normalize -> validate -> expand -> match -> score
```

Uses `YOLOPipeline` which orchestrates all 10 stages in a single `run()` call.

**Runtime requirement:** GPU with at least 8 GB VRAM (T4 or better).

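**Inputs requested in Cell 2:**
- Sample page image (PNG/JPG) — required, uploaded via the file dialog
- SolidWorks JSON export (single `.json` or `.zip` library) — uploaded via the file dialog; optional, the pipeline runs without SW comparison data if skipped
- HuggingFace token for LightOnOCR-2 — read from the Colab `HF_TOKEN` secret if present, otherwise entered via prompt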

In [None]:
# Cell 1: Install all dependencies + clone and install ai_inspector package
# NOTE: Set your runtime to GPU before running (Runtime > Change runtime type > T4 or better)
# NOTE(review): the packages below are installed without version pins, so behaviour may
# drift as upstream releases change -- consider pinning once a known-good set is identified.
%pip install ultralytics transformers torch pillow accelerate --quiet

# Clone the repo (or pull latest if already cloned)
# stderr of the clone is discarded; when the clone command fails, the fallback runs
# `git pull` inside the existing checkout at /content/AI-Drawing-Inspector.
!git clone https://github.com/skaumbdoallsaws-coder/AI-Drawing-Inspector.git /content/AI-Drawing-Inspector 2>/dev/null || \
    (cd /content/AI-Drawing-Inspector && git pull)

# Install ai_inspector as an editable package so relative imports work
%pip install -e /content/AI-Drawing-Inspector --quiet

# These messages are printed unconditionally; they do not verify that the installs succeeded.
print('Dependencies installed.')
print('ai_inspector package installed.')

In [None]:
# Cell 2: Upload files + set paths (no Drive mount required)
import os
import re
import json
import zipfile
from pathlib import Path

# ---- Colab file-upload helper ----
# Detect whether the notebook runs inside Google Colab. If the import fails for any
# reason, fall back to "local" mode: IN_COLAB is False and both helpers are None, so
# the sections below prompt for a token/paths instead of opening an upload dialog.
try:
    from google.colab import userdata, files  # type: ignore
    IN_COLAB = True
except Exception:
    # Reset both names so a partially completed import cannot leave one helper bound.
    IN_COLAB = False
    userdata = None
    files = None

# Directory the upload sections below write received files into.
UPLOAD_DIR = '/content/uploads'
os.makedirs(UPLOAD_DIR, exist_ok=True)

# ---- HuggingFace token ----
# Resolution order: the Colab secret named HF_TOKEN first, then a hidden interactive
# prompt. A failure at either step leaves the token unset instead of aborting the cell.
HF_TOKEN = None

if IN_COLAB and userdata is not None:
    try:
        HF_TOKEN = userdata.get('HF_TOKEN')
    except Exception:
        # Could not read the secret: continue as if none was configured.
        HF_TOKEN = None

if not HF_TOKEN:
    try:
        from getpass import getpass
        entered = getpass('Enter HF_TOKEN (leave blank to skip): ').strip()
    except Exception:
        HF_TOKEN = None
    else:
        # A blank answer means "skip".
        HF_TOKEN = entered if entered else None

if not HF_TOKEN:
    print('WARNING: HF_TOKEN not set. LightOnOCR model loading will fail.')
else:
    # Export the token through the environment as well as the HF_TOKEN variable.
    os.environ['HF_TOKEN'] = HF_TOKEN
    print(f'HF_TOKEN set (length={len(HF_TOKEN)})')

# ---- Upload drawing image ----
# Colab: open the upload dialog, store every received file under UPLOAD_DIR and keep
# the last one with an image extension. Elsewhere: ask for a local path.
print('\n--- Upload your drawing image (PNG/JPG) ---')
SAMPLE_IMAGE = ''
if not (IN_COLAB and files is not None):
    # Local fallback: enter path manually
    SAMPLE_IMAGE = input('Enter local path to drawing image: ').strip()
else:
    uploaded = files.upload()
    for fname in uploaded:
        dest = os.path.join(UPLOAD_DIR, fname)
        Path(dest).write_bytes(uploaded[fname])
        looks_like_image = fname.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp'))
        if looks_like_image:
            SAMPLE_IMAGE = dest
            print(f'Drawing image saved: {dest}')

# Stop here when nothing usable was provided; later cells need a real image file.
if not (SAMPLE_IMAGE and os.path.exists(SAMPLE_IMAGE)):
    raise FileNotFoundError(
        f'No valid drawing image found. Got: "{SAMPLE_IMAGE}"\n'
        'Re-run this cell and upload a PNG/JPG drawing page.'
    )

# ---- Upload SW JSON or ZIP ----
# Optional input: a single SolidWorks JSON export or a ZIP library. Both path
# variables stay empty when the user skips this step.
print('\n--- Upload SolidWorks data (.json or .zip) ---')
print('(You may upload a single JSON file or a zip library. Press Cancel/skip if none.)')
SW_JSON_PATH = ''
SW_ZIP_PATH = ''
if not (IN_COLAB and files is not None):
    # Local fallback: a path ending in .zip is a library, anything else is a JSON file.
    sw_input = input('Enter local path to SW JSON or ZIP (blank to skip): ').strip()
    if sw_input.lower().endswith('.zip'):
        SW_ZIP_PATH = sw_input
    elif sw_input:
        SW_JSON_PATH = sw_input
else:
    try:
        uploaded_sw = files.upload()
        for fname in uploaded_sw:
            dest = os.path.join(UPLOAD_DIR, fname)
            Path(dest).write_bytes(uploaded_sw[fname])
            lowered = fname.lower()
            if lowered.endswith('.json'):
                SW_JSON_PATH = dest
                print(f'SW JSON saved: {dest}')
            elif lowered.endswith('.zip'):
                SW_ZIP_PATH = dest
                print(f'SW ZIP saved: {dest}')
    except Exception:
        # Best effort: any failure during the optional upload is treated as a skip.
        print('SW upload skipped. Pipeline will run without SW comparison data.')

# ---- Paths ----
# Detector weights reference handed to YOLOPipeline(model_path=...) in Cell 4.
# NOTE(review): the 'hf://' prefix presumably refers to a HuggingFace Hub file --
# confirm how YOLOPipeline resolves it.
MODEL_PATH = 'hf://shadrack20s/ai-inspector-callout-detection/callout_v2_yolo11s-obb_best.pt'
# Directory passed to pipeline.run(output_dir=...) in Cell 5 and listed in Cell 9.
OUTPUT_DIR = '/content/debug/pipeline_run'
# Text passed to pipeline.run(title_block_text=...) in Cell 5.
# NOTE(review): hard-coded to an inch-based title block; presumably this should match
# the uploaded drawing -- confirm how the pipeline uses it.
TITLE_BLOCK_TEXT = 'UNLESS OTHERWISE SPECIFIED DIMENSIONS ARE IN INCHES'
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ---- Auto-resolve SW JSON from ZIP if needed ----
def _normalize(s: str) -> str:
    return re.sub(r'[-\s_]', '', str(s or '')).lower()


def _extract_candidates(filename: str):
    stem = Path(filename).stem
    stem = re.sub(r"\s*\(\d+\)$", '', stem)
    base = stem
    candidates = [base, base.replace('-', '')]
    for tok in [' Paint', '_Paint', '-Paint', ' PAINT', '_PAINT', '-PAINT']:
        if base.endswith(tok):
            b2 = base[: -len(tok)]
            candidates.extend([b2, b2.replace('-', '')])
    m = re.match(r'^(.+?)[-_](\d{1,2})$', base)
    if m:
        candidates.extend([m.group(1), m.group(1).replace('-', '')])
    m = re.match(r'^(.+?)[-_]?REV[-_]?[A-Z0-9]*$', base, re.IGNORECASE)
    if m:
        candidates.extend([m.group(1), m.group(1).replace('-', '')])
    peeled = base.replace('-', '')
    while len(peeled) > 5:
        candidates.append(peeled)
        peeled = peeled[:-1]
    out, seen = [], set()
    for c in candidates:
        if c and c not in seen:
            seen.add(c)
            out.append(c)
    return out


def _resolve_sw_json_from_zip(sample_image_path, zip_path, extract_dir='/content/sw_json_library'):
    if not os.path.exists(zip_path):
        return None, 'zip_not_found'
    os.makedirs(extract_dir, exist_ok=True)
    idx_path = os.path.join(extract_dir, '_batch_index.json')
    with zipfile.ZipFile(zip_path, 'r') as zf:
        names = set(zf.namelist())
        if '_batch_index.json' in names and not os.path.exists(idx_path):
            zf.extract('_batch_index.json', extract_dir)
        if not os.path.exists(idx_path):
            candidates = _extract_candidates(os.path.basename(sample_image_path))
            for c in candidates:
                direct = f'{c}.json'
                if direct in names:
                    zf.extract(direct, extract_dir)
                    return os.path.join(extract_dir, direct), 'zip_direct_filename'
            return None, 'index_missing'
    # Use utf-8-sig to silently strip the BOM that Windows/.NET tools
    # (e.g. the SolidWorks extractor) prepend to JSON files.
    with open(idx_path, 'r', encoding='utf-8-sig') as f:
        idx = json.load(f)
    records = idx.get('records', [])
    if not records:
        return None, 'index_no_records'
    candidates = _extract_candidates(os.path.basename(sample_image_path))
    norm_cands = [_normalize(c) for c in candidates]
    by_pn, by_json_stem = {}, {}
    for r in records:
        pn = str(r.get('partNumber', '') or '')
        jf = str(r.get('jsonFileName', '') or '')
        stem = Path(jf).stem if jf else ''
        if pn:
            by_pn[_normalize(pn)] = jf
        if stem:
            by_json_stem[_normalize(stem)] = jf
    chosen, reason = None, None
    for c in norm_cands:
        if c in by_pn:
            chosen, reason = by_pn[c], f'index_partNumber:{c}'
            break
    if not chosen:
        for c in norm_cands:
            if c in by_json_stem:
                chosen, reason = by_json_stem[c], f'index_json_stem:{c}'
                break
    if not chosen:
        return None, f'no_match_for_candidates:{candidates[:5]}'
    out_path = os.path.join(extract_dir, chosen)
    if not os.path.exists(out_path):
        with zipfile.ZipFile(zip_path, 'r') as zf:
            zf.extract(chosen, extract_dir)
    if os.path.exists(out_path):
        return out_path, reason
    return None, 'extract_failed'


# Auto-resolve SW JSON from ZIP if we got a ZIP but no direct JSON
if SW_ZIP_PATH and not SW_JSON_PATH:
    resolved_sw, why = _resolve_sw_json_from_zip(SAMPLE_IMAGE, SW_ZIP_PATH)
    if not resolved_sw:
        print(f'WARNING: Could not auto-match SW JSON from ZIP ({why}).')
    else:
        SW_JSON_PATH = resolved_sw
        print(f'Auto-matched SW JSON: {SW_JSON_PATH} ({why})')

# Summary of the paths the following cells will use.
image_exists = os.path.exists(SAMPLE_IMAGE)
sw_shown = SW_JSON_PATH or "<none>"
sw_exists = bool(SW_JSON_PATH) and os.path.exists(SW_JSON_PATH)
print(f'\nModel:       {MODEL_PATH}')
print(f'Image:       {SAMPLE_IMAGE}  exists={image_exists}')
print(f'SW JSON:     {sw_shown}  exists={sw_exists}')
print(f'Output dir:  {OUTPUT_DIR}')

In [None]:
# Cell 3: Import YOLOPipeline
from ai_inspector.pipeline.yolo_pipeline import YOLOPipeline, PipelineResult

# Confirm the import worked and show which fields a PipelineResult carries.
print('YOLOPipeline imported successfully.')
# __dataclass_fields__ is keyed by field name, so its keys are the names themselves.
result_field_names = list(PipelineResult.__dataclass_fields__)
print(f'PipelineResult fields: {result_field_names}')

In [None]:
# Cell 4: Create and load pipeline (show GPU memory usage)
import torch

def gpu_mem():
    """Describe current CUDA memory usage on device 0 as a one-line string.

    Returns:
        str: ``'<a> GB allocated / <r> GB reserved / <t> GB total'`` (sizes in
        units of 1e9 bytes), or ``'No CUDA'`` when CUDA is unavailable.
    """
    if not torch.cuda.is_available():
        return 'No CUDA'
    bytes_per_gb = 1e9
    allocated = torch.cuda.memory_allocated() / bytes_per_gb
    reserved = torch.cuda.memory_reserved() / bytes_per_gb
    total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
    return f'{allocated:.2f} GB allocated / {reserved:.2f} GB reserved / {total:.1f} GB total'

# Report the GPU and its memory state before any model weights are loaded.
device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "N/A"
print(f'GPU: {device_name}')
print(f'Before load: {gpu_mem()}')

# Pipeline settings gathered in one place so they are easy to tweak.
pipeline_kwargs = dict(
    model_path=MODEL_PATH,
    hf_token=HF_TOKEN,
    confidence_threshold=0.25,
    device='cuda',
)
pipeline = YOLOPipeline(**pipeline_kwargs)

# Load the models, then show how much GPU memory that consumed.
pipeline.load()
print(f'Pipeline loaded: {pipeline.is_loaded}')
print(f'After load:  {gpu_mem()}')

In [None]:
# Cell 5: Run pipeline on a test page
import time

# Time the run with perf_counter(): it is monotonic and high-resolution, so the
# measured duration cannot be skewed by system clock adjustments the way
# time.time() differences can.
start = time.perf_counter()

# NOTE(review): SW_JSON_PATH is an empty string when no SolidWorks data was
# provided -- confirm that YOLOPipeline.run treats '' as "no SW comparison".
result = pipeline.run(
    image_path=SAMPLE_IMAGE,
    sw_json_path=SW_JSON_PATH,
    title_block_text=TITLE_BLOCK_TEXT,
    page_id='test_page_0',
    output_dir=OUTPUT_DIR,
    save_crops=True,
)

elapsed = time.perf_counter() - start
print(f'Pipeline completed in {elapsed:.1f}s')
print(f'Packets: {len(result.packets)}')
print(f'Match results: {len(result.match_results)}')

In [None]:
# Cell 6: Display results: scores, expansion summary, validation stats
import json

# Scores are printed one per line with the key padded to a fixed width.
print('=== SCORES ===')
for score_name, score_value in result.scores.items():
    print(f'  {score_name:25s}: {score_value}')

# The remaining sections are each dumped as indented JSON under a heading.
json_sections = (
    ('\n=== EXPANSION SUMMARY ===', 'expansion_summary'),
    ('\n=== VALIDATION STATS ===', 'validation_stats'),
    ('\n=== PACKET SUMMARY ===', 'packet_summary'),
)
for heading, attr_name in json_sections:
    print(heading)
    print(json.dumps(getattr(result, attr_name), indent=2))

In [None]:
# Cell 7: Display match results table
from ai_inspector.comparison.matcher import MatchStatus

# Plain-text marker per status (no emoji). Built once rather than on every row.
status_markers = {
    MatchStatus.MATCHED: '[OK]',
    MatchStatus.MISSING: '[MISS]',
    MatchStatus.EXTRA: '[EXTRA]',
    MatchStatus.TOLERANCE_FAIL: '[TOL]',
    MatchStatus.SKIPPED: '[SKIP]',
}

print(f'{"#":>3s} {"Status":15s} {"Type":18s} {"Delta":>10s} {"Notes"}')
print('=' * 90)

for i, r in enumerate(result.match_results):
    # Determine type from whichever side is present
    callout_type = ''
    if r.drawing_callout:
        callout_type = r.drawing_callout.get('calloutType', '')
    elif r.sw_feature:
        callout_type = r.sw_feature.feature_type
    # A stored None would make the ':18s' format below raise TypeError.
    # NOTE(review): assumes None is the only non-formattable value here -- confirm.
    if callout_type is None:
        callout_type = ''

    delta_str = f'{r.delta:+.4f}' if r.delta is not None else 'N/A'

    # Unknown statuses fall back to '[?]'.
    status_marker = status_markers.get(r.status, '[?]')

    # Guard against missing notes before truncating to 50 characters.
    notes = r.notes or ''

    print(f'{i:3d} {status_marker + " " + r.status.value:15s} '
          f'{callout_type:18s} {delta_str:>10s} {notes[:50]}')

In [None]:
# Cell 8: Display packet provenance for first 5 detections
from ai_inspector.schemas.callout_packet import packet_to_dict

# Print a per-stage provenance report for the first five packets. Each stage
# section is skipped when the corresponding packet attribute is falsy.
print('=== Packet Provenance (first 5) ===')
print()

for i, pkt in enumerate(result.packets[:5]):
    print(f'--- Packet {i}: {pkt.det_id} ---')

    # Detection
    if pkt.detection:
        print(f'  Detection: class={pkt.detection.class_name}, '
              f'conf={pkt.detection.confidence:.3f}')

    # Crop
    if pkt.crop:
        meta = pkt.crop.meta
        # Missing size keys print as "?"; a missing rotation angle prints as 0.0.
        print(f'  Crop: {meta.get("crop_w", "?")}x{meta.get("crop_h", "?")}px, '
              f'angle={meta.get("rotation_angle", 0):.1f}deg')

    # Rotation
    if pkt.rotation:
        print(f'  Rotation: {pkt.rotation.rotation_used}deg, '
              f'quality={pkt.rotation.quality_score:.2f}')

    # Reader
    if pkt.reader:
        print(f'  Reader: type={pkt.reader.callout_type}, '
              f'source={pkt.reader.source}, '
              f'ocr_conf={pkt.reader.ocr_confidence:.2f}')
        # Raw text is truncated to its first 60 characters.
        print(f'  Raw: "{pkt.reader.raw[:60]}"')
        # Keys starting with "_" are left out of the listing.
        # NOTE(review): presumably those are internal metadata keys -- confirm.
        parsed_keys = [k for k in pkt.reader.parsed.keys() if not k.startswith('_')]
        print(f'  Parsed fields: {parsed_keys}')

    # Normalization
    if pkt.normalized:
        # Both metadata keys fall back to "?" when absent.
        method = pkt.normalized.get('_normalization_method', '?')
        units = pkt.normalized.get('_detected_units', '?')
        print(f'  Normalization: method={method}, detected_units={units}')

    # Validation
    # The error text is appended only when validation_error is truthy.
    print(f'  Validated: {pkt.validated}'
          + (f', error="{pkt.validation_error}"' if pkt.validation_error else ''))

    print()

In [None]:
# Cell 9: Save all artifacts to output dir
import json
from pathlib import Path

out = Path(OUTPUT_DIR)
# Make sure the directory exists even if this cell is run before the pipeline wrote to it.
out.mkdir(parents=True, exist_ok=True)

# Save the full result dict first so the listing below includes it.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 rather than left to the platform default.
result_dict = result.to_dict()
summary_path = out / 'pipeline_summary.json'
with open(summary_path, 'w', encoding='utf-8') as summary_file:
    json.dump(result_dict, summary_file, indent=2, ensure_ascii=False)

print(f'Artifacts saved to: {OUTPUT_DIR}/')
print()

# List saved files (recursively), with sizes in KB
for artifact in sorted(out.rglob('*')):
    if artifact.is_file():
        size_kb = artifact.stat().st_size / 1024
        print(f'  {artifact.relative_to(out)}  ({size_kb:.1f} KB)')

print(f'\nPipeline summary saved to: {summary_path}')

In [None]:
# Cell 10: Unload pipeline, show memory freed
import torch

# gpu_mem() is already defined in Cell 4, which must have run to create `pipeline`;
# it is reused here instead of being redefined, so the notebook has a single
# definition to maintain.
import gc

print(f'Before unload: {gpu_mem()}')

pipeline.unload()

# Force garbage collection, then release cached CUDA blocks, so the figures
# printed below reflect the unloaded state.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print(f'After unload:  {gpu_mem()}')
print(f'Pipeline loaded: {pipeline.is_loaded}')
print('\nDone. Pipeline unloaded and memory freed.')