# Model Results Visualization

This notebook loads a trained model and visualizes its performance on the test dataset.

**Setup:**
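1. Update `MODEL_PATH` and `DATASET_DIR` in the first code cell to point to your trained model and dataset. The dataset directory must contain a `test/` subfolder with one subdirectory per class.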
2. Run all cells to see visualizations


In [None]:
import os
import sys
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

MODEL_PATH = "models/cnn_model.keras"  # specify path to model
DATASET_DIR = "dataset"  # specify path to dataset directory
IMAGE_SIZE = 224
BATCH_SIZE = 32

plt.style.use("seaborn-v0_8")
sns.set_palette("husl")
%matplotlib inline


## 1. Load Model and Dataset


In [None]:
# Load the saved Keras model from MODEL_PATH and print its summary so the
# reader can confirm which network is being evaluated.
print(f"Loading model from: {MODEL_PATH}")
model = tf.keras.models.load_model(MODEL_PATH)
model.summary()


In [None]:
# Build the evaluation dataset from the "test" subfolder of DATASET_DIR.
test_dir = Path(DATASET_DIR) / "test"
print(f"Loading test dataset from: {test_dir}")

# shuffle=False so the dataset is read in a deterministic order on every run.
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    batch_size=BATCH_SIZE,
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    shuffle=False,
)

class_names = test_ds.class_names
print(f"\nFound {len(class_names)} classes: {class_names}")

# Pull every batch out of the tf.data pipeline as NumPy arrays, then join the
# batches into one image array and one label array.
image_batches = []
label_batches = []
for batch_images, batch_labels in test_ds:
    image_batches.append(batch_images.numpy())
    label_batches.append(batch_labels.numpy())

test_images = np.concatenate(image_batches, axis=0)
test_labels = np.concatenate(label_batches, axis=0)

# The per-batch copies are no longer needed once concatenated.
del image_batches, label_batches

print(f"Test set size: {len(test_labels)} images")


## 2. Make Predictions


In [None]:
# Run the model over the whole test set; y_pred_prob holds one row of scores
# per image and y_pred the index of the highest-scoring column.
print("Making predictions...")
y_pred_prob = model.predict(test_images, verbose=1)

# Every output column must correspond to one entry of class_names. If the
# widths differ (for example a single-unit output), the argmax indices below
# would not line up with the dataset's integer labels and every metric in the
# notebook would be silently wrong, so fail loudly here instead.
if y_pred_prob.ndim != 2 or y_pred_prob.shape[1] != len(class_names):
    raise ValueError(
        f"Model output shape {y_pred_prob.shape} does not match the "
        f"{len(class_names)} classes found in the test set: {class_names}"
    )

y_pred = np.argmax(y_pred_prob, axis=1)

print(f"\nPredictions complete. Shape: {y_pred_prob.shape}")


## 3. Overall Metrics


In [None]:
# Headline scores for the whole test set. Precision, recall and F1 are
# macro-averaged; zero_division=0 scores an undefined per-class value as 0.
macro_options = {"average": "macro", "zero_division": 0}
acc = accuracy_score(test_labels, y_pred)
prec = precision_score(test_labels, y_pred, **macro_options)
rec = recall_score(test_labels, y_pred, **macro_options)
f1 = f1_score(test_labels, y_pred, **macro_options)

metrics = ["Accuracy", "Precision", "Recall", "F1-Score"]
values = [acc, prec, rec, f1]

print("Overall Performance Metrics:")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Bar chart of the same four numbers, each bar annotated with its value.
bar_colors = ["#3498db", "#2ecc71", "#e74c3c", "#f39c12"]
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(metrics, values, color=bar_colors)
ax.set_ylim([0, 1])
ax.set_ylabel("Score", fontsize=12)
ax.set_title("Overall Model Performance", fontsize=14, fontweight="bold")
for rect, score in zip(bars, values):
    # Place the label just above the top-centre of its bar.
    label_x = rect.get_x() + rect.get_width() / 2.0
    label_y = rect.get_height() + 0.01
    ax.text(label_x, label_y, f"{score:.3f}", ha="center", va="bottom", fontsize=11)
plt.tight_layout()
plt.show()


## 4. Confusion Matrix


In [None]:
def _plot_confusion_heatmap(matrix, fmt, cbar_label, title):
    """Draw one annotated confusion-matrix heatmap.

    Args:
        matrix: Square array with one row/column per entry of class_names.
        fmt: Format string for the cell annotations (e.g. "d" or ".2f").
        cbar_label: Text shown next to the colour bar.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        matrix,
        annot=True,
        fmt=fmt,
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={"label": cbar_label},
        ax=ax,
    )
    ax.set_xlabel("Predicted Label", fontsize=12)
    ax.set_ylabel("True Label", fontsize=12)
    ax.set_title(title, fontsize=14, fontweight="bold")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    plt.show()


# Pass the full label set explicitly so the matrix always has one row and one
# column per class name, even if a class never occurs in the labels or the
# predictions; otherwise the tick labels would be attached to the wrong rows.
cm = confusion_matrix(test_labels, y_pred, labels=np.arange(len(class_names)))

_plot_confusion_heatmap(cm, fmt="d", cbar_label="Count", title="Confusion Matrix")

# Normalized confusion matrix: each row is divided by its number of true
# samples. A class with no true samples has an all-zero row; dividing by 1
# there keeps the row at 0 instead of producing NaN from 0/0.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm_normalized = cm.astype("float") / np.maximum(row_totals, 1)

_plot_confusion_heatmap(
    cm_normalized,
    fmt=".2f",
    cbar_label="Proportion",
    title="Normalized Confusion Matrix",
)


In [None]:
# Options shared by the dict and the text version of the report.
# - labels: pins the report to every class index so target_names always has
#   the matching length, even when a class is absent from the test labels and
#   the predictions.
# - zero_division=0: scores an undefined precision/recall as 0, matching the
#   overall-metrics calculation, instead of emitting warnings.
report_options = {
    "labels": np.arange(len(class_names)),
    "target_names": class_names,
    "zero_division": 0,
}
report = classification_report(test_labels, y_pred, output_dict=True, **report_options)

print("Detailed Classification Report:")
print(classification_report(test_labels, y_pred, **report_options))

# Extract per-class metrics (the report dict also holds aggregate rows, which
# are skipped by only looking up the class names).
classes = [c for c in class_names if c in report]
precisions = [report[c]["precision"] for c in classes]
recalls = [report[c]["recall"] for c in classes]
f1_scores = [report[c]["f1-score"] for c in classes]

# Visualize per-class metrics as three side-by-side bars per class.
x = np.arange(len(classes))
width = 0.25

fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(x - width, precisions, width, label="Precision", color="#3498db")
ax.bar(x, recalls, width, label="Recall", color="#2ecc71")
ax.bar(x + width, f1_scores, width, label="F1-Score", color="#e74c3c")

ax.set_xlabel("Class", fontsize=12)
ax.set_ylabel("Score", fontsize=12)
ax.set_title("Per-Class Performance Metrics", fontsize=14, fontweight="bold")
ax.set_xticks(x)
ax.set_xticklabels(classes, rotation=45, ha="right")
ax.set_ylim([0, 1])
ax.legend()
ax.grid(axis="y", alpha=0.3)
plt.tight_layout()
plt.show()


## 6. Sample Predictions Visualization


In [None]:
def visualize_predictions(images, true_labels, pred_labels, pred_probs, class_names, num_samples=16, title="Predictions"):
    """Visualize a grid of randomly chosen images with their predictions.

    Each tile is titled with the true class, the predicted class and the
    score the model assigned to the predicted class; the title is green when
    the two classes agree and red otherwise.

    Args:
        images: Indexable collection of images; each image is cast to uint8
            for display (assumes pixel values on a 0-255 scale - confirm
            against how the images were loaded).
        true_labels: Integer class index per image.
        pred_labels: Predicted integer class index per image.
        pred_probs: Per-image score vector, indexed by class index.
        class_names: Sequence mapping class index to display name.
        num_samples: Maximum number of images to draw. The grid grows in rows
            of four to fit, so values above 16 are supported.
        title: Figure title.
    """
    n_show = min(num_samples, len(images))
    if n_show <= 0:
        # Nothing to draw; avoid creating an empty figure.
        print(f"{title}: no images to display.")
        return

    # Four columns, as many rows as needed (16 samples -> the usual 4x4 grid).
    n_cols = 4
    n_rows = -(-n_show // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.ravel()

    indices = np.random.choice(len(images), n_show, replace=False)

    for ax, idx in zip(axes, indices):
        true_label = class_names[true_labels[idx]]
        pred_label = class_names[pred_labels[idx]]
        conf = pred_probs[idx][pred_labels[idx]]

        # Clip before the cast so out-of-range values do not wrap around.
        ax.imshow(np.clip(images[idx], 0, 255).astype("uint8"))
        ax.axis("off")

        color = "green" if true_label == pred_label else "red"
        ax.set_title(
            f"True: {true_label}\nPred: {pred_label} ({conf:.2f})",
            color=color,
            fontsize=10
        )

    # Hide grid cells left over when n_show is not a multiple of n_cols.
    for ax in axes[n_show:]:
        ax.axis("off")

    fig.suptitle(title, fontsize=16, fontweight="bold")
    fig.tight_layout()
    plt.show()

# Show random samples
visualize_predictions(test_images, test_labels, y_pred, y_pred_prob, class_names, num_samples=16, title="Random Sample Predictions")


## 7. Correct vs Incorrect Predictions


In [None]:
# Separate correct and incorrect predictions
correct_mask = test_labels == y_pred
incorrect_mask = ~correct_mask

correct_images = test_images[correct_mask]
correct_true = test_labels[correct_mask]
correct_pred = y_pred[correct_mask]
correct_probs = y_pred_prob[correct_mask]

incorrect_images = test_images[incorrect_mask]
incorrect_true = test_labels[incorrect_mask]
incorrect_pred = y_pred[incorrect_mask]
incorrect_probs = y_pred_prob[incorrect_mask]

print(f"Correct predictions: {len(correct_images)} ({len(correct_images)/len(test_labels)*100:.1f}%)")
print(f"Incorrect predictions: {len(incorrect_images)} ({len(incorrect_images)/len(test_labels)*100:.1f}%)")

# Show correct predictions
if len(correct_images) > 0:
    visualize_predictions(
        correct_images, correct_true, correct_pred, correct_probs, class_names,
        num_samples=min(16, len(correct_images)),
        title="Correct Predictions (Green)"
    )

# Show incorrect predictions
if len(incorrect_images) > 0:
    visualize_predictions(
        incorrect_images, incorrect_true, incorrect_pred, incorrect_probs, class_names,
        num_samples=min(16, len(incorrect_images)),
        title="Incorrect Predictions (Red)"
    )


## 8. Confidence Distribution


In [None]:
# Get confidence scores for predictions
confidence_scores = np.max(y_pred_prob, axis=1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of confidence scores
axes[0].hist(confidence_scores, bins=30, edgecolor="black", alpha=0.7)
axes[0].set_xlabel("Confidence Score", fontsize=12)
axes[0].set_ylabel("Frequency", fontsize=12)
axes[0].set_title("Distribution of Prediction Confidence", fontsize=12, fontweight="bold")
axes[0].axvline(np.mean(confidence_scores), color="red", linestyle="--", label=f"Mean: {np.mean(confidence_scores):.3f}")
axes[0].legend()
axes[0].grid(alpha=0.3)

# Confidence by correctness
axes[1].hist(confidence_scores[correct_mask], bins=30, alpha=0.6, label="Correct", color="green", edgecolor="black")
axes[1].hist(confidence_scores[incorrect_mask], bins=30, alpha=0.6, label="Incorrect", color="red", edgecolor="black")
axes[1].set_xlabel("Confidence Score", fontsize=12)
axes[1].set_ylabel("Frequency", fontsize=12)
axes[1].set_title("Confidence: Correct vs Incorrect", fontsize=12, fontweight="bold")
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Average confidence for correct predictions: {np.mean(confidence_scores[correct_mask]):.3f}")
print(f"Average confidence for incorrect predictions: {np.mean(confidence_scores[incorrect_mask]):.3f}")


## 9. Class Distribution Analysis

In [None]:
# Count true and predicted class distributions
unique_true, counts_true = np.unique(test_labels, return_counts=True)
unique_pred, counts_pred = np.unique(y_pred, return_counts=True)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# True distribution
axes[0].bar([class_names[i] for i in unique_true], counts_true, color="#3498db", edgecolor="black")
axes[0].set_xlabel("Class", fontsize=12)
axes[0].set_ylabel("Count", fontsize=12)
axes[0].set_title("True Class Distribution", fontsize=12, fontweight="bold")
axes[0].tick_params(axis="x", rotation=45)
axes[0].grid(axis="y", alpha=0.3)

# Predicted distribution
pred_names = [class_names[i] for i in unique_pred]
axes[1].bar(pred_names, counts_pred, color="#2ecc71", edgecolor="black")
axes[1].set_xlabel("Class", fontsize=12)
axes[1].set_ylabel("Count", fontsize=12)
axes[1].set_title("Predicted Class Distribution", fontsize=12, fontweight="bold")
axes[1].tick_params(axis="x", rotation=45)
axes[1].grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()
