In [1]:
"""
SqueezeNet FPGA Inference on CIFAR-10 Test Images
Loads trained weights, runs inference on test images, displays results
"""

from pynq import Overlay, allocate
import numpy as np
import time
from PIL import Image
from pathlib import Path

# ============================================================================
# Configuration
# ============================================================================

# Class labels; the list index is the class id used throughout this script
CIFAR10_CLASSES = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

# Buffer sizes in elements (must match HLS design)
INPUT_SIZE = 3 * 224 * 224      # 3 channels x 224 x 224 = 150,528 elements
OUTPUT_SIZE = 10                # one score per class
WEIGHTS_SIZE = 740_000          # capacity of the weights buffer
BIASES_SIZE = 3_000             # capacity of the biases buffer
DDR_BUFFER_SIZE = 1_605_632     # size of each of the two DDR scratch buffers

# File paths
BITSTREAM_PATH = "design_1.bit"
WEIGHTS_PATH = "weights.bin"
BIASES_PATH = "biases.bin"
IMAGES_DIR = "."  # Current directory, adjust as needed

# ============================================================================
# Q3.4 Fixed-Point Conversion
# ============================================================================

def float_to_q34(x):
    """Quantize float value(s) to signed 8-bit Q3.4 fixed point.

    The input is first saturated to [-8.0, 7.9375], then scaled by 16
    (4 fractional bits), rounded and cast to int8.
    """
    saturated = np.clip(x, -8.0, 7.9375)
    scaled = saturated * 16.0
    return np.round(scaled).astype(np.int8)

def q34_to_float(x):
    """Decode Q3.4 fixed-point value(s) to float32 by dividing by 16."""
    as_float = x.astype(np.float32)
    return as_float / 16.0

# ============================================================================
# Image Preprocessing
# ============================================================================

def preprocess_image(image_path):
    """
    Turn an image file into the flattened Q3.4 input for SqueezeNet.

    Args:
        image_path: Path to the input image file

    Returns:
        Flattened int8 array in Q3.4 format, laid out channels-first (CHW)
    """
    # Decode as RGB and resample to the 224x224 SqueezeNet input size
    rgb = Image.open(image_path).convert('RGB')
    rgb = rgb.resize((224, 224), Image.BILINEAR)

    # HWC pixel values scaled to [0, 1]
    pixels = np.array(rgb).astype(np.float32) / 255.0

    # CIFAR-10 per-channel normalization (MUST match training!)
    channel_mean = np.array([0.4914, 0.4822, 0.4465])
    channel_std = np.array([0.2023, 0.1994, 0.2010])
    normalized = (pixels - channel_mean) / channel_std

    # HWC -> CHW, flatten, then quantize to Q3.4
    chw_flat = np.transpose(normalized, (2, 0, 1)).flatten()
    return float_to_q34(chw_flat)

# ============================================================================
# Accelerator Control
# ============================================================================

class SqueezeNetAccelerator:
    """Wrapper class for SqueezeNet FPGA accelerator.

    Loads the bitstream, allocates the int8 buffers whose physical addresses
    are handed to the IP core, uploads weights/biases and runs single-image
    inference by driving the IP's register map.
    """

    # Attribute names of the buffers created in __init__ (allocation order);
    # used by __del__ so cleanup also works on a partially built instance.
    _BUFFER_NAMES = (
        'input_buffer',
        'output_buffer',
        'weights_buffer',
        'biases_buffer',
        'ddr_buffer_a',
        'ddr_buffer_b',
    )

    def __init__(self, bitstream_path):
        """Initialize accelerator and allocate buffers.

        Args:
            bitstream_path: Path of the bitstream passed to ``Overlay``
        """

        print("="*70)
        print("SQUEEZENET FPGA ACCELERATOR")
        print("="*70)

        # Load bitstream
        print(f"\nLoading bitstream: {bitstream_path}")
        self.overlay = Overlay(bitstream_path)
        self.ip = self.overlay.squeezenet_top_0
        print(f"✓ Bitstream loaded (base address: 0x{self.ip.mmio.base_addr:08X})")

        # Allocate buffers
        print("\nAllocating buffers...")
        self.input_buffer = allocate(shape=(INPUT_SIZE,), dtype=np.int8)
        self.output_buffer = allocate(shape=(OUTPUT_SIZE,), dtype=np.int8)
        self.weights_buffer = allocate(shape=(WEIGHTS_SIZE,), dtype=np.int8)
        self.biases_buffer = allocate(shape=(BIASES_SIZE,), dtype=np.int8)
        self.ddr_buffer_a = allocate(shape=(DDR_BUFFER_SIZE,), dtype=np.int8)
        self.ddr_buffer_b = allocate(shape=(DDR_BUFFER_SIZE,), dtype=np.int8)

        print(f"  Input:        {self.input_buffer.nbytes:,} bytes")
        print(f"  Output:       {self.output_buffer.nbytes:,} bytes")
        print(f"  Weights:      {self.weights_buffer.nbytes:,} bytes")
        print(f"  Biases:       {self.biases_buffer.nbytes:,} bytes")
        print(f"  DDR Buffer A: {self.ddr_buffer_a.nbytes:,} bytes")
        print(f"  DDR Buffer B: {self.ddr_buffer_b.nbytes:,} bytes")
        print("✓ Buffers allocated")

        # Clear DDR buffers
        self.ddr_buffer_a[:] = 0
        self.ddr_buffer_b[:] = 0
        self.ddr_buffer_a.sync_to_device()
        self.ddr_buffer_b.sync_to_device()

    def load_weights(self, weights_path, biases_path):
        """Load trained weights and biases from binary files.

        Both files are read as raw int8 values, copied to the start of their
        buffer (the remainder is zero-padded) and synced to the device.

        Args:
            weights_path: Path to the int8 weights file
            biases_path: Path to the int8 biases file

        Raises:
            FileNotFoundError: If either file does not exist
            ValueError: If a file holds more elements than its buffer
        """

        print("\nLoading trained weights and biases...")

        # Load weights
        if not Path(weights_path).exists():
            raise FileNotFoundError(f"Weights file not found: {weights_path}")

        weights = np.fromfile(weights_path, dtype=np.int8)
        print(f"  Weights: {len(weights):,} elements loaded")

        if len(weights) > WEIGHTS_SIZE:
            raise ValueError(f"Weights too large: {len(weights)} > {WEIGHTS_SIZE}")

        self.weights_buffer[:len(weights)] = weights
        self.weights_buffer[len(weights):] = 0  # Zero-pad

        # Load biases
        if not Path(biases_path).exists():
            raise FileNotFoundError(f"Biases file not found: {biases_path}")

        biases = np.fromfile(biases_path, dtype=np.int8)
        print(f"  Biases:  {len(biases):,} elements loaded")

        if len(biases) > BIASES_SIZE:
            raise ValueError(f"Biases too large: {len(biases)} > {BIASES_SIZE}")

        self.biases_buffer[:len(biases)] = biases
        self.biases_buffer[len(biases):] = 0  # Zero-pad

        # Sync to device
        self.weights_buffer.sync_to_device()
        self.biases_buffer.sync_to_device()

        print("✓ Weights and biases loaded")

    def _write_pointer(self, register_base_name, physical_address):
        """Write 64-bit address to split register pair.

        The low 32 bits go to ``<register_base_name>_1`` and the high 32 bits
        to ``<register_base_name>_2`` on the IP's register map.
        """
        lower_32 = physical_address & 0xFFFFFFFF
        upper_32 = (physical_address >> 32) & 0xFFFFFFFF
        setattr(self.ip.register_map, f"{register_base_name}_1", lower_32)
        setattr(self.ip.register_map, f"{register_base_name}_2", upper_32)

    def run_inference(self, input_data, timeout=300.0):
        """
        Run inference on input data.

        Args:
            input_data: Preprocessed input (Q3.4 format); must contain exactly
                as many elements as the input buffer
            timeout: Maximum number of seconds to wait for AP_DONE, or None
                to wait indefinitely

        Returns:
            output_scores: Classification scores (float)
            inference_time: Time taken (seconds)

        Raises:
            ValueError: If input_data has the wrong number of elements
            TimeoutError: If the accelerator does not signal completion
                within ``timeout`` seconds
        """

        # Reject wrongly sized input up front; a plain slice assignment would
        # silently broadcast a single-element input over the whole buffer.
        input_data = np.asarray(input_data)
        expected = self.input_buffer.size
        if input_data.size != expected:
            raise ValueError(
                f"Input has {input_data.size} elements, expected {expected}"
            )

        # Copy input data
        self.input_buffer[:] = input_data.reshape(-1)
        self.output_buffer[:] = 0

        # Sync input
        self.input_buffer.sync_to_device()
        self.output_buffer.sync_to_device()

        # Write buffer addresses
        self._write_pointer("input_r", self.input_buffer.physical_address)
        self._write_pointer("output_r", self.output_buffer.physical_address)
        self._write_pointer("weights", self.weights_buffer.physical_address)
        self._write_pointer("biases", self.biases_buffer.physical_address)
        self._write_pointer("ddr_buffer_a", self.ddr_buffer_a.physical_address)
        self._write_pointer("ddr_buffer_b", self.ddr_buffer_b.physical_address)

        # Set start parameter to 1
        self.ip.register_map.start_r = 1

        # Start accelerator (ap_start); perf_counter is monotonic, so the
        # measured interval is immune to wall-clock adjustments.
        start_time = time.perf_counter()
        self.ip.register_map.CTRL = 0x01

        # Wait for completion (ap_done), giving up after `timeout` seconds so
        # a stalled IP cannot hang the caller forever.
        while True:
            ctrl = int(self.ip.register_map.CTRL)
            if (ctrl >> 1) & 0x1:  # AP_DONE bit
                break
            if timeout is not None and time.perf_counter() - start_time >= timeout:
                raise TimeoutError(
                    f"Accelerator did not signal AP_DONE within {timeout} seconds"
                )
            time.sleep(0.001)

        inference_time = time.perf_counter() - start_time

        # Read output
        self.output_buffer.sync_from_device()
        output_scores = q34_to_float(self.output_buffer)

        return output_scores, inference_time

    def __del__(self):
        """Cleanup buffers.

        Drops this object's reference to every buffer that was actually
        allocated, so a failure part-way through __init__ does not raise
        AttributeError here.
        """
        # NOTE(review): presumably PYNQ releases the underlying memory once
        # the last reference to a buffer is gone - confirm.
        for name in self._BUFFER_NAMES:
            self.__dict__.pop(name, None)

# ============================================================================
# Main Testing Function
# ============================================================================

def test_all_classes():
    """Test inference on all 10 CIFAR-10 classes.

    For each class id ``i`` the image ``class_<i>.png`` inside IMAGES_DIR is
    preprocessed and classified on the accelerator; a missing image is
    reported and skipped. Per-image predictions, a summary, a per-class table
    and a confusion matrix are printed.

    Returns:
        List with one dict per processed image, holding the keys
        'true_class', 'predicted_class', 'correct', 'confidence',
        'inference_time' and 'scores'. Empty if no image was found.
    """

    print("\n" + "="*70)
    print("TESTING SQUEEZENET ON CIFAR-10")
    print("="*70)

    # Initialize accelerator
    accel = SqueezeNetAccelerator(BITSTREAM_PATH)

    # Load trained weights
    accel.load_weights(WEIGHTS_PATH, BIASES_PATH)

    # Test each class
    print("\n" + "="*70)
    print("RUNNING INFERENCE ON TEST IMAGES")
    print("="*70)

    num_classes = len(CIFAR10_CLASSES)
    results = []
    total_time = 0

    for class_id in range(num_classes):
        image_path = Path(IMAGES_DIR) / f"class_{class_id}.png"

        if not image_path.exists():
            print(f"\n✗ Image not found: {image_path}")
            continue

        print(f"\n[Class {class_id}: {CIFAR10_CLASSES[class_id].upper()}]")
        print(f"  Image: {image_path}")

        # Preprocess image
        input_data = preprocess_image(image_path)
        print(f"  Preprocessed: {len(input_data)} elements (Q3.4)")

        # Run inference
        print("  Running inference...")
        output_scores, inference_time = accel.run_inference(input_data)

        total_time += inference_time

        # Class indices ordered from highest to lowest score
        sorted_indices = np.argsort(output_scores)[::-1]

        print(f"  ✓ Inference completed in {inference_time:.3f} seconds")
        print(f"\n  Top 5 predictions:")
        for rank, idx in enumerate(sorted_indices[:5], start=1):
            score = output_scores[idx]
            correct_mark = "✓" if idx == class_id else " "
            print(f"    {rank}. {correct_mark} {CIFAR10_CLASSES[idx]:12s} : {score:7.4f}")

        predicted_class = sorted_indices[0]
        is_correct = (predicted_class == class_id)

        results.append({
            'true_class': class_id,
            'predicted_class': predicted_class,
            'correct': is_correct,
            'confidence': output_scores[predicted_class],
            'inference_time': inference_time,
            'scores': output_scores.copy()
        })

        if is_correct:
            print(f"\n  ✓ CORRECT! Predicted: {CIFAR10_CLASSES[predicted_class]}")
        else:
            print(f"\n  ✗ INCORRECT! Predicted: {CIFAR10_CLASSES[predicted_class]}, Expected: {CIFAR10_CLASSES[class_id]}")

    # ========================================================================
    # Summary Statistics
    # ========================================================================

    print("\n" + "="*70)
    print("RESULTS SUMMARY")
    print("="*70)

    # Without any processed image the averages below would divide by zero.
    if not results:
        print("\n✗ No test images were processed - nothing to summarize")
        return results

    num_correct = sum(r['correct'] for r in results)
    accuracy = 100.0 * num_correct / len(results)
    avg_time = total_time / len(results)
    # Guard against a zero average (timer resolution) when deriving FPS.
    fps = 1.0 / avg_time if avg_time > 0 else float('inf')

    print(f"\nAccuracy: {num_correct}/{len(results)} = {accuracy:.1f}%")
    print(f"Average inference time: {avg_time:.3f} seconds")
    print(f"Throughput: {fps:.2f} FPS")

    print("\nPer-class results:")
    print("  " + "-"*60)
    print(f"  {'Class':<12} {'True Label':<12} {'Prediction':<12} {'Correct'}")
    print("  " + "-"*60)

    for r in results:
        true_label = CIFAR10_CLASSES[r['true_class']]
        pred_label = CIFAR10_CLASSES[r['predicted_class']]
        correct_str = "✓" if r['correct'] else "✗"
        print(f"  {r['true_class']:<12} {true_label:<12} {pred_label:<12} {correct_str}")

    print("  " + "-"*60)

    # ========================================================================
    # Confusion Matrix
    # ========================================================================

    print("\nConfusion Matrix:")
    print("  (rows=true, cols=predicted)")
    print("\n      ", end="")
    for i in range(num_classes):
        print(f"{i:3}", end="")
    print()

    confusion = np.zeros((num_classes, num_classes), dtype=int)
    for r in results:
        confusion[r['true_class'], r['predicted_class']] += 1

    for i in range(num_classes):
        print(f"  {i:2}: ", end="")
        for j in range(num_classes):
            if confusion[i, j] > 0:
                print(f"{confusion[i, j]:3}", end="")
            else:
                print("  .", end="")
        print()

    print("\n" + "="*70)
    print("TESTING COMPLETE!")
    print("="*70)

    return results

# ============================================================================
# Main Entry Point
# ============================================================================

if __name__ == "__main__":
    import traceback

    try:
        results = test_all_classes()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
    except Exception as exc:
        # Top-level boundary: report the failure together with its traceback
        print(f"\n\n✗ Error: {exc}")
        traceback.print_exc()


TESTING SQUEEZENET ON CIFAR-10
SQUEEZENET FPGA ACCELERATOR

Loading bitstream: design_1.bit


✓ Bitstream loaded (base address: 0xA0000000)

Allocating buffers...
  Input:        150,528 bytes
  Output:       10 bytes
  Weights:      740,000 bytes
  Biases:       3,000 bytes
  DDR Buffer A: 1,605,632 bytes
  DDR Buffer B: 1,605,632 bytes
✓ Buffers allocated

Loading trained weights and biases...
  Weights: 737,568 elements loaded
  Biases:  2,986 elements loaded
✓ Weights and biases loaded

RUNNING INFERENCE ON TEST IMAGES

[Class 0: AIRPLANE]
  Image: class_0.png
  Preprocessed: 150528 elements (Q3.4)
  Running inference...
  ✓ Inference completed in 29.983 seconds

  Top 5 predictions:
    1. ✓ airplane     :  4.2500
    2.   automobile   :  3.3125
    3.   ship         :  3.0625
    4.   bird         :  1.9375
    5.   truck        :  1.8125

  ✓ CORRECT! Predicted: airplane

[Class 1: AUTOMOBILE]
  Image: class_1.png
  Preprocessed: 150528 elements (Q3.4)
  Running inference...
  ✓ Inference completed in 29.983 seconds

  Top 5 predictions:
    1. ✓ automobile   :  4.5625
 

In [2]:
import numpy as np

# Interpret the raw parameter files as float32 values
# NOTE(review): other cells in this notebook read these same files with
# dtype=np.int8 - confirm which format is actually stored on disk.
weights_float = np.fromfile('weights.bin', dtype=np.float32)
biases_float = np.fromfile('biases.bin', dtype=np.float32)

# Quantize to int8 (Q3.4 format): scale by 16, round, saturate to [-128, 127]
weights_int8 = np.round(weights_float * 16).clip(-128, 127).astype(np.int8)
biases_int8 = np.round(biases_float * 16).clip(-128, 127).astype(np.int8)

# Writing the converted arrays back to disk is currently disabled
#weights_int8.tofile('weights_int8.bin')
#biases_int8.tofile('biases_int8.bin')

print(f"Weights: {weights_int8.size:,} elements ({weights_int8.size / 1024:.1f} KB)")
print(f"Biases:  {biases_int8.size:,} elements ({biases_int8.size / 1024:.1f} KB)")


Weights: 184,392 elements (180.1 KB)
Biases:  746 elements (0.7 KB)


  weights_int8 = np.clip(np.round(weights_float * 16), -128, 127).astype(np.int8)
  weights_int8 = np.clip(np.round(weights_float * 16), -128, 127).astype(np.int8)
  biases_int8 = np.clip(np.round(biases_float * 16), -128, 127).astype(np.int8)
  biases_int8 = np.clip(np.round(biases_float * 16), -128, 127).astype(np.int8)


In [7]:
#!/usr/bin/env python3
"""
SqueezeNet FPGA Debug Script (Fixed)
====================================
Run this ON THE FPGA to identify register names and debug issues.
"""

from pynq import Overlay, allocate
import numpy as np
from pathlib import Path

# Files inspected by this debug script: the bitstream is handed to Overlay,
# the two .bin files are sized and read by debug_weight_stats.
BITSTREAM_PATH = "design_1.bit"
WEIGHTS_PATH = "weights.bin"
BIASES_PATH = "biases.bin"

def debug_registers(overlay):
    """List the public attribute names exposed by the IP's register map.

    Every name returned by ``dir()`` on the register map that does not start
    with an underscore is printed; if reading the attribute raises, the error
    text is shown next to the name.
    """
    print("\n" + "="*60)
    print("DEBUG 1: IP REGISTER NAMES")
    print("="*60)

    accel_ip = overlay.squeezenet_top_0

    print(f"\nBase Address: 0x{accel_ip.mmio.base_addr:08X}")

    print("\nRegister Map attributes:")
    registers = accel_ip.register_map

    for name in dir(registers):
        if name.startswith('_'):
            continue
        try:
            # Access only to find out whether the attribute is readable
            getattr(registers, name)
        except Exception as err:
            print(f"  {name} (error: {err})")
        else:
            print(f"  {name}")

    print("\n" + "-"*60)
    print("Look for pointer register patterns like:")
    print("  input_1/input_2 or input_r_1/input_r_2")
    print("  weights_1/weights_2")
    print("  ddr_buffer_a_1/ddr_buffer_a_2")
    print("-"*60)


def _load_as_q34(path, expected_count):
    """Load a parameter file as int8 Q3.4 values, converting float32 if needed.

    Prints the file name, its size and the detected format. The format is
    inferred from the file size alone: ``expected_count`` bytes means int8,
    four times that means float32 (scaled by 16, rounded and saturated).

    Returns:
        The int8 array, or None when the size matches neither layout.
    """
    n_bytes = Path(path).stat().st_size

    print(f"\nFile: {path}")
    print(f"Size: {n_bytes:,} bytes")

    if n_bytes == expected_count:
        print("Format: INT8 (correct)")
        return np.fromfile(path, dtype=np.int8)
    if n_bytes == expected_count * 4:
        print("Format: FLOAT32 (will convert)")
        as_float = np.fromfile(path, dtype=np.float32)
        return np.clip(np.round(as_float * 16), -128, 127).astype(np.int8)

    print("ERROR: Unexpected file size!")
    return None


def debug_weight_stats(weights_path, biases_path,
                       expected_weights=737568, expected_biases=2986):
    """Check weight and bias statistics.

    Args:
        weights_path: Path to the weights file (int8 or float32)
        biases_path: Path to the biases file (int8 or float32)
        expected_weights: Number of weight elements the file must hold
        expected_biases: Number of bias elements the file must hold

    Returns:
        Tuple ``(weights, biases)`` of int8 arrays. ``(None, None)`` if the
        weights file has an unexpected size, ``(weights, None)`` if only the
        biases file does.

    Raises:
        FileNotFoundError: If either file does not exist
    """
    print("\n" + "="*60)
    print("DEBUG 2: WEIGHT/BIAS STATISTICS")
    print("="*60)

    weights = _load_as_q34(weights_path, expected_weights)
    if weights is None:
        return None, None

    print(f"\nWeight Statistics:")
    print(f"  Count: {len(weights):,}")
    if len(weights) > 0:
        # min/max/mean are undefined for an empty array, so only report
        # them when there is data.
        zero_count = int(np.count_nonzero(weights == 0))
        print(f"  Min:   {weights.min()}")
        print(f"  Max:   {weights.max()}")
        print(f"  Mean:  {weights.astype(float).mean():.2f}")
        print(f"  Zeros: {zero_count} ({100*zero_count/len(weights):.1f}%)")
    print(f"  Conv1 weights (first 20): {weights[:20]}")

    # Load biases
    biases = _load_as_q34(biases_path, expected_biases)
    if biases is None:
        return weights, None

    print(f"\nBias Statistics:")
    print(f"  Count: {len(biases):,}")
    print(f"  Conv10 biases (CLASSIFIER - last 10):")
    for i, raw in enumerate(biases[-10:]):
        b = int(raw)
        print(f"    Class {i}: {b:4d}  ({b/16:.4f} in float)")

    return weights, biases


# ============================================================================
# Main
# ============================================================================

if __name__ == "__main__":
    print("="*60)
    print("SQUEEZENET FPGA DEBUG SCRIPT")
    print("="*60)

    print(f"\nLoading bitstream: {BITSTREAM_PATH}")
    overlay = Overlay(BITSTREAM_PATH)
    print("✓ Bitstream loaded")

    # Debug 1: dump the register names of the IP
    debug_registers(overlay)

    # Debug 2: weight/bias statistics, only when both files are present
    if not (Path(WEIGHTS_PATH).exists() and Path(BIASES_PATH).exists()):
        print("\n✗ Weight files not found!")
    else:
        weights, biases = debug_weight_stats(WEIGHTS_PATH, BIASES_PATH)

    print("\n" + "="*60)
    print("DEBUG COMPLETE - Share the output above")
    print("="*60)

SQUEEZENET FPGA DEBUG SCRIPT

Loading bitstream: design_1.bit
✓ Bitstream loaded

DEBUG 1: IP REGISTER NAMES

Base Address: 0xA0000000

Register Map attributes:
  CTRL
  GIER
  IP_IER
  IP_ISR
  biases_1
  biases_2
  create_subclass
  ddr_buffer_a_1
  ddr_buffer_a_2
  ddr_buffer_b_1
  ddr_buffer_b_2
  done
  done_ctrl
  input_r_1
  input_r_2
  output_r_1
  output_r_2
  start_r
  weights_1
  weights_2

------------------------------------------------------------
Look for pointer register patterns like:
  input_1/input_2 or input_r_1/input_r_2
  weights_1/weights_2
  ddr_buffer_a_1/ddr_buffer_a_2
------------------------------------------------------------

DEBUG 2: WEIGHT/BIAS STATISTICS

File: weights.bin
Size: 737,568 bytes
Format: INT8 (correct)

Weight Statistics:
  Count: 737,568
  Min:   -18
  Max:   24
  Mean:  -0.12
  Zeros: 349308 (47.4%)
  Conv1 weights (first 20): [1 2 3 3 3 2 1 2 3 4 4 4 3 2 2 3 4 4 4 3]

File: biases.bin
Size: 2,986 bytes
Format: INT8 (correct)

Bias Statis

In [8]:
def test_gradient_input(tolerance=0.2):
    """Test with exact same gradient input as C simulation.

    Builds the synthetic gradient image, runs it through the accelerator and
    compares the ten output scores with the hard-coded C simulation scores.

    Args:
        tolerance: Largest absolute per-class difference still counted as a
            match between FPGA and C simulation
    """

    print("="*60)
    print("TEST: Gradient Input (Same as C Simulation)")
    print("="*60)

    # Create gradient input (same as C simulation), identical for all three
    # channels. gradient_fill formula: (((y/h + x/w) / 2) * 2 - 1) * scale
    # Vectorized: same float64 arithmetic per pixel as a Python loop, then
    # cast to float32.
    axis = np.arange(224) / 224.0
    plane = ((axis[:, None] + axis[None, :]) / 2.0 * 2.0 - 1.0) * 0.5
    input_float = np.tile(plane.astype(np.float32), (3, 1, 1))

    # Convert to Q3.4 int8 with the shared quantizer
    input_q34 = float_to_q34(input_float.flatten())

    print(f"Input shape: {input_q34.shape}")
    print(f"Input range: [{input_q34.min()}, {input_q34.max()}]")
    print(f"First 10 values: {input_q34[:10]}")

    # Initialize accelerator
    accel = SqueezeNetAccelerator(BITSTREAM_PATH)
    accel.load_weights(WEIGHTS_PATH, BIASES_PATH)

    # Run inference
    print("\nRunning inference...")
    output_scores, inference_time = accel.run_inference(input_q34)

    print(f"\nFPGA Output Scores:")
    for i in range(10):
        print(f"  Class {i}: {output_scores[i]:.4f}")

    # C Simulation expected output (from your test):
    csim_output = [1.75, 1.3125, 1.375, 0.75, 0.6875, 0.0, 0.75, 0.5, 1.8125, 0.875]

    print(f"\nC Simulation Output (expected):")
    for i, expected in enumerate(csim_output):
        print(f"  Class {i}: {expected:.4f}")

    print(f"\nComparison:")
    print(f"{'Class':<8} {'FPGA':<10} {'C Sim':<10} {'Diff':<10} {'Match?'}")
    print("-" * 50)

    max_diff = 0
    for i, expected in enumerate(csim_output):
        diff = abs(output_scores[i] - expected)
        max_diff = max(max_diff, diff)
        match = "✓" if diff < tolerance else "✗"
        print(f"{i:<8} {output_scores[i]:<10.4f} {expected:<10.4f} {diff:<10.4f} {match}")

    print(f"\nMax difference: {max_diff:.4f}")

    fpga_pred = np.argmax(output_scores)
    csim_pred = np.argmax(csim_output)

    print(f"\nFPGA prediction:  Class {fpga_pred}")
    print(f"C Sim prediction: Class {csim_pred}")

    if max_diff < tolerance:
        print("\n✓ FPGA matches C Simulation - Host code is CORRECT!")
        print("  The issue must be elsewhere (bitstream matches)")
    else:
        print("\n✗ FPGA does NOT match C Simulation")
        print("  Possible causes:")
        print("    1. Bitstream is out of date (most likely)")
        print("    2. Weight loading issue")
        print("    3. Pointer address issue")

# Run the FPGA-vs-C-simulation comparison when executed as the main module
if __name__ == "__main__":
    test_gradient_input()

TEST: Gradient Input (Same as C Simulation)
Input shape: (150528,)
Input range: [-8, 8]
First 10 values: [-8 -8 -8 -8 -8 -8 -8 -8 -8 -8]
SQUEEZENET FPGA ACCELERATOR

Loading bitstream: design_1.bit
✓ Bitstream loaded (base address: 0xA0000000)

Allocating buffers...
  Input:        150,528 bytes
  Output:       10 bytes
  Weights:      740,000 bytes
  Biases:       3,000 bytes
  DDR Buffer A: 1,605,632 bytes
  DDR Buffer B: 1,605,632 bytes
✓ Buffers allocated

Loading trained weights and biases...
  Weights: 737,568 elements loaded
  Biases:  2,986 elements loaded
✓ Weights and biases loaded

Running inference...

FPGA Output Scores:
  Class 0: 1.8125
  Class 1: 1.3125
  Class 2: 1.8125
  Class 3: 1.0625
  Class 4: 0.6875
  Class 5: 0.0000
  Class 6: 0.8125
  Class 7: 0.4375
  Class 8: 1.5000
  Class 9: 0.8125

C Simulation Output (expected):
  Class 0: 1.7500
  Class 1: 1.3125
  Class 2: 1.3750
  Class 3: 0.7500
  Class 4: 0.6875
  Class 5: 0.0000
  Class 6: 0.7500
  Class 7: 0.5000
 

In [10]:
import numpy as np
from PIL import Image

def debug_preprocessing(image_path="class_0.jpeg"):
    """Debug image preprocessing.

    Prints the Q3.4 value range of the synthetic gradient input next to that
    of a real image after resize, CIFAR-10 normalization and quantization.

    Args:
        image_path: Image file to analyse
    """

    print("="*60)
    print("DEBUG: Image Preprocessing Analysis")
    print("="*60)

    # 1. Gradient input (what works), identical for all three channels.
    # Vectorized: same float64 arithmetic per pixel as a Python loop, then
    # cast to float32.
    axis = np.arange(224) / 224.0
    plane = ((axis[:, None] + axis[None, :]) / 2.0 * 2.0 - 1.0) * 0.5
    gradient = np.tile(plane.astype(np.float32), (3, 1, 1))

    gradient_q34 = np.clip(np.round(gradient.flatten() * 16), -128, 127).astype(np.int8)

    print("\n[Gradient Input - WORKS]")
    print(f"  Float range: [{gradient.min():.4f}, {gradient.max():.4f}]")
    print(f"  Int8 range:  [{gradient_q34.min()}, {gradient_q34.max()}]")
    print(f"  Int8 mean:   {gradient_q34.astype(float).mean():.2f}")

    # 2. Image input
    img = Image.open(image_path).convert('RGB')
    img = img.resize((224, 224), Image.BILINEAR)
    img_array = np.array(img).astype(np.float32) / 255.0

    # CIFAR-10 normalization
    mean = np.array([0.4914, 0.4822, 0.4465])
    std = np.array([0.2023, 0.1994, 0.2010])
    img_normalized = (img_array - mean) / std

    print("\n[Image After Normalization]")
    print(f"  Float range: [{img_normalized.min():.4f}, {img_normalized.max():.4f}]")

    # Convert to CHW and Q3.4
    img_chw = np.transpose(img_normalized, (2, 0, 1))
    img_q34 = np.clip(np.round(img_chw.flatten() * 16), -128, 127).astype(np.int8)

    print("\n[Image as Q3.4 Int8]")
    print(f"  Range: [{img_q34.min()}, {img_q34.max()}]")
    print(f"  Mean:  {img_q34.astype(float).mean():.2f}")

    # Check saturation: count values sitting on either clip limit. Comparing
    # directly avoids np.abs on int8, where abs(-128) wraps back to -128 and
    # would hide every negatively saturated value.
    saturated = int(np.count_nonzero((img_q34 == -128) | (img_q34 == 127)))
    print(f"  Saturated: {saturated} ({100*saturated/len(img_q34):.1f}%)")

    print("\n" + "="*60)
    print("COMPARISON")
    print("="*60)
    print(f"  {'Metric':<15} {'Gradient':<12} {'Image':<12}")
    print(f"  {'-'*40}")
    print(f"  {'Min':<15} {gradient_q34.min():<12} {img_q34.min():<12}")
    print(f"  {'Max':<15} {gradient_q34.max():<12} {img_q34.max():<12}")
    print(f"  {'Mean':<15} {gradient_q34.mean():<12.1f} {img_q34.mean():<12.1f}")
    
# Run the preprocessing analysis as soon as this cell executes
debug_preprocessing()

DEBUG: Image Preprocessing Analysis

[Gradient Input - WORKS]
  Float range: [-0.5000, 0.4955]
  Int8 range:  [-8, 8]
  Int8 mean:   -0.04

[Image After Normalization]
  Float range: [-2.4291, 2.4611]

[Image as Q3.4 Int8]
  Range: [-39, 39]
  Mean:  8.99
  Saturated: 0 (0.0%)

COMPARISON
  Metric          Gradient     Image       
  ----------------------------------------
  Min             -8           -39         
  Max             8            39          
  Mean            -0.0         9.0         
