# Model Evaluation and Results Analysis

This notebook evaluates trained models, analyzes results, and visualizes predictions.

## 1. Environment Setup and Imports

In [None]:
# Standard library imports
import os
import warnings
from collections import defaultdict
from pathlib import Path

warnings.filterwarnings('ignore')

# Visualization and numerical processing
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Deep learning
import torch
from matplotlib.patches import Rectangle
from PIL import Image
from sklearn.metrics import confusion_matrix

# YOLO
from ultralytics import YOLO

# Configure plotting
# Select the Matplotlib style sheet and seaborn colour palette used by every
# figure drawn later in this notebook.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# Matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): this cell only runs inside a
# Jupyter/IPython kernel.
%matplotlib inline

# Report the runtime so that results can be tied to a PyTorch build and to
# whether CUDA was usable when the notebook ran.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

## 2. Configuration and Model Loading

In [None]:
# Filesystem layout: everything is resolved relative to the project root.
PROJECT_ROOT = Path('/home/user/qontinui-finetune')
RUNS_DIR = PROJECT_ROOT.joinpath('runs')
CHECKPOINTS_DIR = RUNS_DIR.joinpath('checkpoints')
# Update with actual dataset path
DATASET_PATH = PROJECT_ROOT.joinpath('data', 'dataset')

# YAML file describing the dataset, passed to model.val() later on.
dataset_yaml_path = PROJECT_ROOT.joinpath('dataset.yaml')

# Use GPU index 0 when CUDA is usable, otherwise run on the CPU.
_inference_device = 0 if torch.cuda.is_available() else 'cpu'

# Settings shared by the evaluation, prediction and benchmarking cells.
evaluation_config = dict(
    conf_threshold=0.25,  # confidence threshold for detection
    iou_threshold=0.45,   # IoU threshold for NMS
    imgsz=640,
    batch_size=16,
    device=_inference_device,
)

print("Evaluation Configuration:")
print("\n".join(
    f"  {name:20s}: {setting}" for name, setting in evaluation_config.items()
))

## 3. Load Trained Model

In [None]:
# Find and load best model
# Candidates are every 'best_*.pt' file directly inside CHECKPOINTS_DIR.
checkpoint_files = list(CHECKPOINTS_DIR.glob('best_*.pt'))

if checkpoint_files:
    # Use the most recently *written* checkpoint. Modification time is used
    # rather than getctime(): on Unix, ctime is the last metadata change
    # (chmod, rename, copy), not the time the weights were saved, so it can
    # pick a stale checkpoint.
    model_path = max(checkpoint_files, key=os.path.getmtime)
    print(f"Loading model: {model_path}")
    model = YOLO(str(model_path))
    print("Model loaded successfully!")

    # Display model info
    print(f"\nModel device: {next(model.model.parameters()).device}")
    print(f"Model parameters: {sum(p.numel() for p in model.model.parameters()):,}")
else:
    # Later cells check `model is None` and skip their work in that case.
    print("No checkpoint found. Please train a model first.")
    model_path = None
    model = None

## 4. Run Evaluation on Test Set

In [None]:
# Run evaluation on test set
# Each skip reason is reported explicitly so that a missing model or dataset
# YAML does not silently leave `eval_results` undefined for later cells.
if model is None:
    print("Skipping evaluation: no model is loaded.")
elif not dataset_yaml_path.exists():
    print(f"Skipping evaluation: dataset YAML not found: {dataset_yaml_path}")
else:
    print("Running evaluation on test set...")

    # split='test' is passed explicitly: without it the validator uses its
    # default split, so the metrics would not describe the test set that
    # this cell announces.
    # NOTE(review): this requires the dataset YAML to define a `test:` entry
    # — confirm against dataset.yaml.
    # NOTE(review): mAP is commonly computed with a very low confidence
    # threshold; conf=0.25 may lower the reported values — confirm intended.
    eval_results = model.val(
        data=str(dataset_yaml_path),
        split='test',
        imgsz=evaluation_config['imgsz'],
        batch=evaluation_config['batch_size'],
        conf=evaluation_config['conf_threshold'],
        iou=evaluation_config['iou_threshold'],
        device=evaluation_config['device'],
        verbose=True,
    )

    print("\nEvaluation completed!")
    print("\nKey Metrics:")
    print(f"  mAP@0.5: {eval_results.box.map50:.4f}")
    print(f"  mAP@0.5:0.95: {eval_results.box.map:.4f}")

## 5. Per-Class Performance Analysis

In [None]:
def analyze_per_class_performance(results):
    """Pull the per-class AP data out of an evaluation result object.

    Args:
        results: Object expected to expose ``results.box.ap_class_index`` and
            ``results.box.ap``.

    Returns:
        ``(class_indices, ap_values)`` taken from ``results.box`` when both
        attributes are present and ``ap`` is not ``None``; otherwise
        ``(None, None)``.
    """
    unavailable = (None, None)

    # Guard clauses: bail out as soon as an expected attribute is missing.
    if not hasattr(results, 'box'):
        return unavailable

    box_metrics = results.box
    if not hasattr(box_metrics, 'ap_class_index'):
        return unavailable

    indices = box_metrics.ap_class_index
    per_class_ap = getattr(box_metrics, 'ap', None)
    if per_class_ap is None:
        return unavailable

    return indices, per_class_ap

# Print one AP line per class, provided the evaluation cell produced results.
if model is not None and 'eval_results' in locals():
    per_class = analyze_per_class_performance(eval_results)
    class_indices, ap_values = per_class

    if class_indices is None or ap_values is None:
        print("Per-class metrics not available in current format")
    else:
        print("Per-Class Performance:")
        # strict=False: stop at the shorter sequence if the lengths differ.
        for idx, ap in zip(class_indices, ap_values, strict=False):
            print(f"  Class {idx}: AP = {ap:.4f}")

## 6. Detection Predictions on Test Images

In [None]:
def run_predictions(model, test_images_dir, conf_threshold=0.25):
    """Run the model on every ``*.jpg`` and ``*.png`` file in a directory.

    Args:
        model: Object exposing ``predict(source=..., conf=..., imgsz=...,
            device=..., verbose=...)``.
        test_images_dir: ``pathlib.Path`` of the directory to scan
            (non-recursive).
        conf_threshold: Confidence threshold forwarded to ``model.predict``.

    Returns:
        A list with one dict per image: ``'image_path'`` is the image path and
        ``'predictions'`` is the first element of the ``predict`` output, or
        ``None`` when that output is empty.
    """
    jpg_files = list(test_images_dir.glob('*.jpg'))
    png_files = list(test_images_dir.glob('*.png'))

    collected = []
    for image_file in jpg_files + png_files:
        # Image size and device come from the notebook-level configuration.
        outputs = model.predict(
            source=str(image_file),
            conf=conf_threshold,
            imgsz=evaluation_config['imgsz'],
            device=evaluation_config['device'],
            verbose=False
        )

        first_output = outputs[0] if outputs else None
        collected.append({'image_path': image_file, 'predictions': first_output})

    return collected

# Run the model over the test images and keep the results in `predictions`.
if model is not None:
    test_images_dir = DATASET_PATH / 'images' / 'test'

    if not test_images_dir.exists():
        print(f"Test images directory not found: {test_images_dir}")
    else:
        print(f"Running predictions on test set: {test_images_dir}")

        # Count the same file patterns that run_predictions() scans.
        jpg_files = list(test_images_dir.glob('*.jpg'))
        png_files = list(test_images_dir.glob('*.png'))
        print(f"Number of test images: {len(jpg_files + png_files)}")

        predictions = run_predictions(
            model,
            test_images_dir,
            evaluation_config['conf_threshold'],
        )
        print(f"Predictions completed for {len(predictions)} images")

## 7. Visualize Predictions

In [None]:
def plot_prediction(pred_result, class_names=None):
    """Draw one image with its predicted bounding boxes and labels.

    Args:
        pred_result: Dict with ``'image_path'`` (a ``pathlib.Path``) and
            ``'predictions'`` (an object with a ``boxes`` attribute, or
            ``None``).
        class_names: Optional mapping from class id to display name. Ids that
            are missing from it, or every id when it is empty/``None``, are
            shown as ``'Class <id>'``.

    Returns:
        The Matplotlib figure, or ``None`` when ``'predictions'`` is ``None``.
        The figure is not shown or closed here; that is left to the caller.
    """
    pred = pred_result['predictions']
    if pred is None:
        return None

    img_path = pred_result['image_path']

    # Read image. The context manager releases the file handle right away,
    # and converting to RGB keeps grayscale/palette images from being drawn
    # through imshow's colormap.
    with Image.open(img_path) as img:
        img_array = np.array(img.convert('RGB'))

    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.imshow(img_array)

    # Draw bounding boxes
    boxes = getattr(pred, 'boxes', None)
    if boxes is not None:
        for box in boxes:
            # Box corners in xyxy format; each iterated item holds one box.
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            conf = float(box.conf[0])
            cls = int(box.cls[0])

            # Draw rectangle
            rect = Rectangle((x1, y1), x2 - x1, y2 - y1,
                             linewidth=2, edgecolor='lime', facecolor='none')
            ax.add_patch(rect)

            # Draw label just above the top-left corner of the box.
            class_name = class_names.get(cls, f'Class {cls}') if class_names else f'Class {cls}'
            label_text = f'{class_name}: {conf:.2f}'
            ax.text(x1, y1 - 8, label_text, fontsize=10,
                    color='white', bbox={'facecolor': 'lime', 'alpha': 0.8})

    ax.set_title(f"Predictions: {img_path.name}", fontsize=12, fontweight='bold')
    ax.axis('off')

    return fig

# Show the first few prediction results, one figure per image.
if 'predictions' in locals():
    sample_predictions = predictions[:5]  # at most five samples
    for pred in sample_predictions:
        plot_prediction(pred)
        plt.show()

## 8. Confusion Matrix Visualization

In [None]:
def create_confusion_matrix(predictions, labels_dir, num_classes=2):
    """Build a class confusion matrix from predictions and label files.

    For every prediction entry, the ground-truth class ids are read from
    ``labels_dir / '<image stem>.txt'`` (first whitespace-separated field of
    each non-blank line) and paired *by position* with the predicted class
    ids. No box/IoU matching is performed, so the matrix only reflects this
    order-based pairing.

    Args:
        predictions: Iterable of dicts with ``'image_path'`` (a
            ``pathlib.Path``) and ``'predictions'`` (object with ``boxes``,
            or ``None``).
        labels_dir: ``pathlib.Path`` of the directory holding label files.
        num_classes: Number of classes; the matrix is indexed by
            ``range(num_classes)``.

    Returns:
        The matrix returned by ``confusion_matrix``, or ``None`` when no
        (true, predicted) pairs could be formed.
    """
    all_true_labels = []
    all_pred_labels = []

    for pred_result in predictions:
        img_path = pred_result['image_path']
        pred = pred_result['predictions']

        # Get true labels from file
        label_file = labels_dir / (img_path.stem + '.txt')
        true_classes = []

        if label_file.exists():
            with open(label_file, encoding='utf-8') as f:
                for line in f:
                    fields = line.split()
                    # Skip blank lines (e.g. a trailing newline) instead of
                    # failing on an empty split.
                    if fields:
                        true_classes.append(int(fields[0]))

        # Get predicted labels
        pred_classes = []
        if hasattr(pred, 'boxes') and pred.boxes is not None:
            pred_classes = [int(box.cls[0].cpu().numpy()) for box in pred.boxes]

        # Simple positional matching. Both sides are truncated to the same
        # length; otherwise an image with fewer predictions than labels
        # leaves the two lists misaligned and confusion_matrix() rejects them.
        paired = min(len(true_classes), len(pred_classes))
        all_true_labels.extend(true_classes[:paired])
        all_pred_labels.extend(pred_classes[:paired])

    # Create confusion matrix
    if all_true_labels and all_pred_labels:
        return confusion_matrix(all_true_labels, all_pred_labels,
                                labels=list(range(num_classes)))

    return None

# Build the confusion matrix for the test labels and render it as a heatmap.
if 'predictions' in locals():
    labels_dir = DATASET_PATH.joinpath('labels', 'test')
    cm = create_confusion_matrix(predictions, labels_dir)

    if cm is not None:
        # Both axes use the class indices 0..n-1 as tick labels.
        class_ticks = list(range(cm.shape[0]))

        fig, ax = plt.subplots(figsize=(8, 7))
        sns.heatmap(
            cm,
            annot=True,
            fmt='d',
            cmap='Blues',
            cbar=True,
            xticklabels=class_ticks,
            yticklabels=list(class_ticks),
            ax=ax,
        )
        ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
        ax.set_xlabel('Predicted Label', fontsize=12)
        ax.set_ylabel('True Label', fontsize=12)
        plt.tight_layout()
        plt.show()

## 9. Detection Confidence Analysis

In [None]:
def analyze_confidence_scores(predictions):
    """Collect detection confidences overall and grouped by class id.

    Args:
        predictions: Iterable of dicts whose ``'predictions'`` value is an
            object with a ``boxes`` attribute (or ``None``).

    Returns:
        ``(all_confidences, confidence_by_class)``: a flat list of every
        confidence value, and a ``defaultdict(list)`` mapping integer class id
        to the confidences of that class. Entries without boxes add nothing.
    """
    scores = []
    scores_per_class = defaultdict(list)

    for entry in predictions:
        result = entry['predictions']

        # Skip entries that carry no detections container.
        if not hasattr(result, 'boxes') or result.boxes is None:
            continue

        for detection in result.boxes:
            score = detection.conf[0].cpu().numpy()
            label = int(detection.cls[0].cpu().numpy())

            scores.append(score)
            scores_per_class[label].append(score)

    return scores, scores_per_class

# Analyze confidence scores
if 'predictions' in locals():
    all_confidences, conf_by_class = analyze_confidence_scores(predictions)

    # Everything below needs at least one detection.
    if all_confidences:
        # Computed once and reused by the plot and the printed summary.
        mean_confidence = np.mean(all_confidences)

        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Overall confidence distribution
        axes[0].hist(all_confidences, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
        axes[0].axvline(mean_confidence, color='red', linestyle='--',
                        linewidth=2, label=f'Mean: {mean_confidence:.3f}')
        axes[0].set_xlabel('Confidence Score', fontsize=11)
        axes[0].set_ylabel('Frequency', fontsize=11)
        axes[0].set_title('Detection Confidence Distribution', fontsize=12, fontweight='bold')
        axes[0].legend()
        axes[0].grid(axis='y', alpha=0.3)

        # Confidence by class (box plot)
        class_labels = sorted(conf_by_class.keys())
        class_confidences = [conf_by_class[c] for c in class_labels]

        bp = axes[1].boxplot(class_confidences, patch_artist=True)
        # Tick labels are set on the axis instead of through boxplot's
        # `labels=` keyword, which newer Matplotlib releases deprecate in
        # favour of `tick_labels=`; this form works with both. Boxes are
        # drawn at the default positions 1..N.
        axes[1].set_xticks(range(1, len(class_labels) + 1))
        axes[1].set_xticklabels([f'Class {c}' for c in class_labels])
        for patch in bp['boxes']:
            patch.set_facecolor('lightblue')
        axes[1].set_ylabel('Confidence Score', fontsize=11)
        axes[1].set_title('Confidence by Class', fontsize=12, fontweight='bold')
        axes[1].grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Print statistics
        print("Overall Confidence Statistics:")
        print(f"  Mean: {mean_confidence:.4f}")
        print(f"  Std: {np.std(all_confidences):.4f}")
        print(f"  Min: {np.min(all_confidences):.4f}")
        print(f"  Max: {np.max(all_confidences):.4f}")

## 10. Export Model for Deployment

In [None]:
# Export model to different formats
import shutil

if model is not None:
    export_dir = PROJECT_ROOT / 'exports'
    export_dir.mkdir(parents=True, exist_ok=True)

    print("Exporting model to different formats...\n")

    # Formats actually exported by this cell (export format -> display name).
    export_formats = {
        'onnx': 'ONNX',
        'torchscript': 'TorchScript',
    }

    # Destination path of every export that succeeded, keyed by format.
    exported_paths = {}

    for export_format, display_name in export_formats.items():
        print(f"Exporting to {display_name} format...")
        try:
            exported_file = Path(
                model.export(format=export_format, imgsz=evaluation_config['imgsz'])
            )
            # The exporter chooses its own output location, so the artifact
            # is copied into export_dir; otherwise the summary line below
            # would point at a directory that holds nothing.
            # NOTE(review): assumes each of these formats yields a single
            # file — confirm before adding directory-based formats.
            destination = export_dir / exported_file.name
            shutil.copy2(exported_file, destination)
        except Exception as e:
            # Best effort: one failed format must not stop the others.
            print(f"  Failed: {e}\n")
        else:
            exported_paths[export_format] = destination
            print(f"  Success: {destination}\n")

    print(f"Exports saved to: {export_dir}")

## 11. Inference Speed Benchmarking

In [None]:
import time


def benchmark_inference(model, test_images_dir, num_images=10, warmup=1):
    """Time single-image inference over a sample of test images.

    Args:
        model: Object exposing ``predict(source=..., conf=..., imgsz=...,
            device=..., verbose=...)``.
        test_images_dir: ``pathlib.Path`` of the directory scanned for
            ``*.jpg`` and ``*.png`` files (non-recursive).
        num_images: Maximum number of images to time.
        warmup: Number of untimed ``predict`` calls made on the first image
            before timing starts, so one-time setup work on the first call
            does not distort the statistics. Values <= 0 disable warm-up.

    Returns:
        List of per-image elapsed times in seconds, one entry per timed
        image; empty when no matching image is found.
    """
    image_files = list(test_images_dir.glob('*.jpg')) + list(test_images_dir.glob('*.png'))
    # glob() order is not guaranteed; sorting makes the benchmarked subset
    # reproducible between runs.
    image_files = sorted(image_files)[:num_images]

    print(f"Benchmarking inference on {len(image_files)} images...\n")

    if not image_files:
        return []

    def _predict(path):
        # Same inference settings as the rest of the notebook.
        return model.predict(
            source=str(path),
            conf=evaluation_config['conf_threshold'],
            imgsz=evaluation_config['imgsz'],
            device=evaluation_config['device'],
            verbose=False
        )

    for _ in range(max(warmup, 0)):
        _predict(image_files[0])

    inference_times = []
    for img_path in image_files:
        # perf_counter() is monotonic and high-resolution, unlike the
        # wall-clock time.time().
        start = time.perf_counter()
        _predict(img_path)
        inference_times.append(time.perf_counter() - start)

    return inference_times

# Run benchmark
if model is not None:
    test_images_dir = DATASET_PATH / 'images' / 'test'

    if not test_images_dir.exists():
        print(f"Test images directory not found: {test_images_dir}")
    else:
        inference_times = benchmark_inference(model, test_images_dir, num_images=10)

        if inference_times:
            # Times are in seconds; reported in milliseconds.
            mean_time = np.mean(inference_times)

            print("\nInference Speed Statistics:")
            print(f"  Mean: {mean_time*1000:.2f} ms")
            print(f"  Std: {np.std(inference_times)*1000:.2f} ms")
            print(f"  Min: {np.min(inference_times)*1000:.2f} ms")
            print(f"  Max: {np.max(inference_times)*1000:.2f} ms")
            print(f"  FPS: {1/mean_time:.2f}")
        else:
            # Without this guard np.min/np.max raise on an empty list and the
            # mean/FPS lines would print nan.
            print(f"No .jpg/.png images found in {test_images_dir}; nothing to benchmark.")

## 12. Comprehensive Evaluation Report

In [None]:
# Generate comprehensive evaluation report
# Each section is printed only when the cell that produces its data has run
# AND produced at least one value, so the report never shows nan statistics.
if model is not None:
    print("="*70)
    print("MODEL EVALUATION REPORT".center(70))
    print("="*70)

    print(f"\nModel: {model_path.name if model_path else 'Unknown'}")
    print(f"Device: {evaluation_config['device']}")
    print(f"Input Size: {evaluation_config['imgsz']}x{evaluation_config['imgsz']}")

    if 'eval_results' in locals():
        print("\n[Evaluation Results]")
        print(f"  mAP@0.5: {eval_results.box.map50:.4f}")
        print(f"  mAP@0.5:0.95: {eval_results.box.map:.4f}")

    if 'inference_times' in locals() and len(inference_times) > 0:
        # Times are in seconds; throughput is the reciprocal of the mean.
        mean_inference_time = np.mean(inference_times)
        print("\n[Inference Performance]")
        print(f"  Mean Inference Time: {mean_inference_time*1000:.2f} ms")
        print(f"  Throughput: {1/mean_inference_time:.2f} FPS")

    if 'all_confidences' in locals() and len(all_confidences) > 0:
        print("\n[Detection Confidence]")
        print(f"  Mean Confidence: {np.mean(all_confidences):.4f}")
        print(f"  Std: {np.std(all_confidences):.4f}")

    print(f"\nExports: {PROJECT_ROOT / 'exports'}")
    print("="*70)