# Test Finetuning Prep: Evaluation (M12) + YOLO Training Setup

This notebook tests the evaluation harness and prepares for YOLO-OBB finetuning:

- **M12 -- Evaluator**: Compares pipeline predictions against ground truth sidecar
  annotations. Provides stage-by-stage metrics (detection F1, transcription CER/WER,
  parsing accuracy) and identifies the bottleneck stage.
- **YOLO Finetuning Config**: Placeholder for A100-optimized YOLO training setup
  using ultralytics.

**Runtime requirement:** GPU (A100 preferred). The evaluation itself is CPU-based,
but the finetuning configuration cell requires A100 to be meaningful.

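**Required files** (only needed to evaluate real data -- as written, Cell 4 uses
synthetic ground truth and predictions, so the notebook runs without them):
- Ground truth sidecar YAML/JSON files
- Pipeline prediction outputs (or run pipeline inline)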

In [None]:
# Cell 1: Install dependencies
# NOTE: Set your runtime to GPU (A100) for the finetuning config cell.
# Versions are not pinned, so each fresh runtime gets whatever is current.
%pip install ultralytics pillow pyyaml --quiet

# Clone the repo
# If the clone command fails (for example because the target directory already
# exists from an earlier run), fall back to `git pull` inside that directory.
# stderr of the clone is discarded, so its error message is never shown.
!git clone https://github.com/skaumbdoallsaws-coder/AI-Drawing-Inspector.git /content/AI-Drawing-Inspector 2>/dev/null || \
    (cd /content/AI-Drawing-Inspector && git pull)

import sys
# Put the checkout at the front of the import path.
# NOTE(review): presumably the `ai_inspector` package imported in Cell 3 sits at
# the repo root -- confirm. Re-running this cell inserts the path again.
sys.path.insert(0, '/content/AI-Drawing-Inspector')

# Printed unconditionally; it does not check that the install or clone succeeded.
print('Dependencies installed.')

In [None]:
# Cell 2: Check GPU info (should be A100)
import torch


def gpu_report_lines(gpu_name, props):
    """Build the printable report for one CUDA device.

    Args:
        gpu_name: Device name, as returned by ``torch.cuda.get_device_name``.
        props: Device properties exposing ``major``, ``minor``,
            ``total_memory`` and ``multi_processor_count``, as returned by
            ``torch.cuda.get_device_properties``.

    Returns:
        A list of strings: four descriptive lines followed by one verdict
        line about suitability for finetuning.
    """
    lines = [
        f'GPU name: {gpu_name}',
        f'Compute capability: {props.major}.{props.minor}',
        # The attribute is `total_memory`; the previous `total_mem` does not
        # exist and raised AttributeError whenever a GPU was present.
        f'Total VRAM: {props.total_memory / 1e9:.1f} GB',
        f'SM count: {props.multi_processor_count}',
    ]

    # The verdict is chosen by substring match on the device name.
    if 'A100' in gpu_name:
        lines.append('\n[OK] A100 GPU detected. Full finetuning is supported.')
    elif 'T4' in gpu_name:
        lines.append('\n[WARN] T4 GPU detected. Finetuning will be slower; consider A100.')
    else:
        lines.append(f'\n[INFO] GPU: {gpu_name}. Check VRAM is sufficient for finetuning.')
    return lines


print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')

if torch.cuda.is_available():
    # Only device 0 is inspected.
    gpu_name = torch.cuda.get_device_name(0)
    props = torch.cuda.get_device_properties(0)
    for line in gpu_report_lines(gpu_name, props):
        print(line)
else:
    print('\n[ERROR] No GPU detected. Set Runtime > Change runtime type > GPU.')

In [None]:
# Cell 3: Import evaluation modules
# Depends on Cell 1 having added the repo checkout to sys.path.
# Of these names, only evaluate_page and print_evaluation_table are called in
# the cells below; load_sidecars appears only in commented-out code in Cell 4.
from ai_inspector.fine_tuning.evaluate import (
    evaluate_page,
    evaluate_batch,
    print_evaluation_table,
    load_sidecar,
    load_sidecars,
    pair_detections_iou,
    compute_cer,
    compute_wer,
    compute_parsing_accuracy,
)

# Reaching this line means every name above was importable.
print('Evaluation modules imported successfully.')

In [None]:
# Cell 4: Load ground truth sidecar annotations
# Option A: Load from Drive
# GT_DIR = '/content/drive/MyDrive/ai_inspector_data/ground_truth/'
# sidecars = load_sidecars(GT_DIR)

# Option B: Use synthetic ground truth for testing
print('Creating synthetic ground truth for evaluation testing...')


def _make_callout(callout_class, corners, text, parsed_fields=None):
    """Assemble one annotation record.

    The class label is stored under 'class' and repeated as the first entry
    of the 'parsed' dict ('calloutType'); ``parsed_fields`` supplies any
    further parsed entries, kept in the order given.
    """
    parsed = {'calloutType': callout_class}
    parsed.update(parsed_fields or {})
    return {
        'class': callout_class,
        'obb_points': corners,
        'text': text,
        'parsed': parsed,
    }


# Five reference callouts, each with four [x, y] corner points.
ground_truth = [
    _make_callout(
        'Hole', [[100, 100], [300, 100], [300, 180], [100, 180]],
        '\u2300.500 THRU', {'diameter': '.500', 'depth': 'THRU'},
    ),
    _make_callout(
        'Hole', [[400, 200], [600, 200], [600, 280], [400, 280]],
        '\u2300.250 DEEP .500', {'diameter': '.250', 'depth': '.500'},
    ),
    _make_callout(
        'TappedHole', [[700, 300], [950, 300], [950, 380], [700, 380]],
        'M10x1.5 THRU', {'threadSize': 'M10x1.5'},
    ),
    _make_callout(
        'Fillet', [[100, 400], [250, 400], [250, 460], [100, 460]],
        'R.125 TYP.', {'radius': '.125'},
    ),
    _make_callout(
        'Chamfer', [[300, 500], [500, 500], [500, 560], [300, 560]],
        '.045 x 45\u00b0', {'size': '.045', 'angle': '45'},
    ),
]

# Simulated predictions (imperfect -- some errors for realistic eval).
# The Chamfer is deliberately absent (false negative -- not detected).
predictions = [
    # Correct detection + correct OCR
    _make_callout(
        'Hole', [[102, 98], [298, 102], [296, 182], [100, 178]],
        '\u2300.500 THRU', {'diameter': '.500', 'depth': 'THRU'},
    ),
    # Correct detection, slight OCR error: S instead of 5
    _make_callout(
        'Hole', [[402, 198], [598, 202], [596, 282], [400, 278]],
        '\u2300.250 DEEP .S00', {'diameter': '.250', 'depth': '.S00'},
    ),
    # Correct detection + correct OCR
    _make_callout(
        'TappedHole', [[698, 302], [952, 298], [954, 378], [700, 382]],
        'M10x1.5 THRU', {'threadSize': 'M10x1.5'},
    ),
    # Correct detection, correct OCR
    _make_callout(
        'Fillet', [[98, 402], [252, 398], [254, 458], [100, 462]],
        'R.125 TYP.', {'radius': '.125'},
    ),
    # False positive (no matching GT)
    _make_callout(
        'Dimension', [[800, 600], [1000, 600], [1000, 660], [800, 660]],
        '3.500',
    ),
]

for label, records in (
    ('Ground truth annotations', ground_truth),
    ('Predictions', predictions),
):
    print(f'{label}: {len(records)}')
print('Expected: 4 TP, 1 FP (Dimension), 1 FN (Chamfer)')

In [None]:
# Cell 5: Run evaluate_page() on sample
import json

# IoU threshold passed to evaluate_page for this run.
IOU_THRESHOLD = 0.3

# Score the predictions from Cell 4 against the ground truth from Cell 4.
eval_results = evaluate_page(
    predictions=predictions,
    ground_truth=ground_truth,
    iou_threshold=IOU_THRESHOLD,
)

# Dump the full result structure before the formatted views in later cells.
print('Raw evaluation results:')
raw_results_json = json.dumps(eval_results, indent=2)
print(raw_results_json)

In [None]:
# Cell 6: Print evaluation table
print_evaluation_table(eval_results)


def _print_section(heading, section_key, rows):
    """Print a heading, then one 'label value' line per (label, metric) row.

    Values are read from eval_results[section_key]; that sub-dict is returned
    so the caller can keep a reference to it.
    """
    print(heading)
    section = eval_results[section_key]
    for label, metric in rows:
        print(f'{label}{section[metric]}')
    return section


# Additional detail
det = _print_section('\n--- Detection Detail ---', 'detection', (
    ('True positives:  ', 'true_positives'),
    ('False positives: ', 'false_positives'),
    ('False negatives: ', 'false_negatives'),
    ('Mean IoU:        ', 'mean_iou'),
))

trans = _print_section('\n--- Transcription Detail ---', 'transcription', (
    ('Evaluated pairs: ', 'evaluated_count'),
    ('Mean CER:        ', 'mean_cer'),
    ('Mean WER:        ', 'mean_wer'),
))

parse = _print_section('\n--- Parsing Detail ---', 'parsing', (
    ('Fields correct:  ', 'fields_correct'),
    ('Fields total:    ', 'fields_total'),
    ('Accuracy:        ', 'accuracy'),
))

# The class breakdown is optional: a missing key prints the heading only.
print('\n--- Class Breakdown ---')
class_breakdown = eval_results.get('class_breakdown', {})
for class_name, stats in class_breakdown.items():
    print(f'  {class_name}: TP={stats["tp"]}, class_match={stats["class_match"]}')

In [None]:
# Cell 7: Identify bottleneck stage
summary = eval_results['summary']

print('=== Pipeline Bottleneck Analysis ===')
print()
# Each row: label, summary key, optional trailing note. All values use 4 decimals.
for label, metric, note in (
    ('Detection F1:        ', 'detection_f1', ''),
    ('Mean CER:            ', 'mean_cer', '  (lower is better)'),
    ('Parsing accuracy:    ', 'parsing_accuracy', ''),
):
    print(f'{label}{summary[metric]:.4f}{note}')
print()
print(f'>>> Bottleneck stage: {summary["bottleneck"].upper()}')
print()

# Recommendations based on bottleneck, keyed by the stage name reported in the
# summary. A stage name without an entry prints nothing.
bottleneck = summary['bottleneck']
recommendations = {
    'detection': (
        'Recommendation: Finetune YOLO-OBB model on more annotated drawings.',
        '  - Collect more OBB annotations (CVAT or Label Studio)',
        '  - Increase training epochs or adjust augmentation',
        '  - See Cell 8 for A100 finetuning configuration',
    ),
    'transcription': (
        'Recommendation: Improve OCR stage.',
        '  - Check rotation selector quality scores',
        '  - Consider finetuning LightOnOCR-2 on engineering text',
        '  - Add more canonicalization rules for common OCR errors',
    ),
    'parsing': (
        'Recommendation: Improve regex patterns or add VLM fallback.',
        '  - Review missed parses in crop_reader debug output',
        '  - Add regex patterns for new callout formats',
        '  - Enable VLM fallback (Qwen) for low-confidence OCR results',
    ),
}
for advice_line in recommendations.get(bottleneck, ()):
    print(advice_line)

In [None]:
# Cell 8: Placeholder for YOLO finetuning configuration (A100 optimized)
# Nothing is trained here: the cell only prints guidance. The training call
# below stays commented out until annotated data is available.
print(
    '=== YOLO-OBB Finetuning Configuration (A100) ===',
    '',
    'This cell provides a ready-to-run finetuning setup for YOLO11-OBB.',
    'Uncomment and run when you have annotated training data.\n',
    sep='\n',
)

# ---------- YOLO Finetuning Config (uncomment to run) ----------

# from ultralytics import YOLO
#
# # Load pretrained model
# model = YOLO('yolo11n-obb.pt')  # or your current best.pt
#
# # A100-optimized training configuration
# results = model.train(
#     # Dataset
#     data='/content/drive/MyDrive/ai_inspector_data/dataset.yaml',
#     task='obb',
#
#     # Training params (A100 optimized)
#     epochs=100,
#     batch=32,              # A100 can handle large batch with OBB
#     imgsz=1024,            # High res for engineering drawings
#     device='cuda',
#
#     # Optimization
#     optimizer='AdamW',
#     lr0=0.001,
#     lrf=0.01,              # Final LR = lr0 * lrf
#     warmup_epochs=3,
#     weight_decay=0.0005,
#
#     # Augmentation (conservative for technical drawings)
#     hsv_h=0.0,            # No hue shift (drawings are B&W)
#     hsv_s=0.0,            # No saturation shift
#     hsv_v=0.2,            # Slight brightness variation
#     degrees=5.0,          # Small rotation (callouts have orientation)
#     translate=0.1,
#     scale=0.3,
#     flipud=0.0,           # No vertical flip (text would be upside down)
#     fliplr=0.0,           # No horizontal flip (text would be mirrored)
#     mosaic=0.5,           # Reduced mosaic (preserve drawing context)
#     mixup=0.0,            # No mixup (drawings don't blend well)
#
#     # Output
#     project='/content/drive/MyDrive/ai_inspector_models/finetune',
#     name='yolo11_obb_v1',
#     save=True,
#     save_period=10,
#     plots=True,
#     verbose=True,
# )
#
# print('Training complete.')
# print(f'Best model: {results.save_dir}/weights/best.pt')

# Example dataset.yaml, shown to the reader as text (it is not written to disk).
dataset_yaml_example = '''
# dataset.yaml
path: /content/drive/MyDrive/ai_inspector_data/obb_dataset
train: images/train
val: images/val
test: images/test

names:
  0: Hole
  1: TappedHole
  2: CounterboreHole
  3: CountersinkHole
  4: Fillet
  5: Chamfer
  6: Thread
  7: Slot
  8: Bend
  9: GDT
  10: SurfaceFinish
  11: Dimension
  12: Tolerance
  13: Note
'''
print(
    '----------',
    'dataset.yaml format for OBB training:',
    dataset_yaml_example,
    sep='\n',
)

# Label-file layout and rough cost figures, printed as static reference text.
print(
    'OBB label format (per image .txt):',
    '  class_id x1 y1 x2 y2 x3 y3 x4 y4',
    '  (all values normalized 0-1)',
    '',
    'A100 training estimates:',
    '  - 100 epochs, batch=32, imgsz=1024: ~2-4 hours',
    '  - VRAM usage: ~15-20 GB (plenty of headroom on 40 GB A100)',
    sep='\n',
)