# Understanding Convolution Layer  

In [12]:
import numpy as np

def conv2d(image, kernel, bias=None, stride=1, padding=0):
    """
    Performs a 2D convolution operation on the input image.

    Args:
        image: A 4D NumPy array representing the input image with shape (batch_size, height, width, channels).
        kernel: A 4D NumPy array representing the convolutional kernel with shape (kernel_height, kernel_width, channels_in, channels_out).
        bias: An optional 1D NumPy array representing the bias to be added to the output.
        stride: The stride of the convolution operation. Default is 1.
        padding: The amount of padding to add to the input image. Default is 0.

    Returns:
        A 4D NumPy array representing the output of the convolution operation.
    """

    # Calculate output dimensions
    output_height = int((image.shape[1] - kernel.shape[0] + 2 * padding) / stride) + 1
    output_width = int((image.shape[2] - kernel.shape[1] + 2 * padding) / stride) + 1

    # Pad the input image if necessary
    if padding > 0:
        image = np.pad(image, ((0, 0), (padding, padding), (padding, padding), (0, 0)), mode='constant')

    # Initialize output array
    output = np.zeros((image.shape[0], output_height, output_width, kernel.shape[3]))

    # Perform convolution
    for i in range(output_height):
        for j in range(output_width):
            for k in range(kernel.shape[3]):  # Loop over output channels
                img = image[:, i * stride:i * stride + kernel.shape[0], j * stride:j * stride + kernel.shape[1], :]
                ker = kernel[:, :, :, k]
                out = img * ker
                out_sum = np.sum(
                    out,
                    axis=(1, 2, 3)
                )
                output[:, i, j, k] = out_sum
                print(img.shape)
                print(ker.shape)
                print(out.shape)
                print(out_sum.shape)

    # Add bias if provided
    if bias is not None:
        output += bias

    return output


In [13]:
# Sample image
image = np.random.rand(1, 4, 4, 3)
#print(image)  # Output: (1, 4, 4, 3)

# Sample kernel
kernel = np.random.rand(3, 3, 3, 2)

# Perform convolution
output = conv2d(image, kernel)

print(output.shape)  # Output: (1, 2, 2, 2)
print(output)  # Output: [[[[ 4.02495814  3.73717332]


(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 3, 3, 3)
(3, 3, 3)
(1, 3, 3, 3)
(1,)
(1, 2, 2, 2)
[[[[5.17805817 6.09919783]
   [5.80320233 6.68575538]]

  [[6.23592444 7.96160916]
   [5.88585523 7.01857965]]]]


# My understanding

* In above functiom, operation inside the for loop is equivalent to [y = mx]
* where x = all the input image cells that fit in the kernel FOR ALL the channels (for every image)
* so for an image with 3 channels and shape 28x28 a 3x3 kernel will start from 0,0 (unless paddning > 0 then image is padded 1st)
* The kernel is kernel[:, :, :, k] where k is output channel. so there are 9 learnable parameters per output channel in this case that is m
* x will be 3x3 cells and 3 layers so same 3x3x3. We multiply each of this cell with corresponding m value. Then we sum them up to a single y value.
* so in the end for every kernel overlap we get one y value (per output channel) then we slide right and get another one, when we hit end of width we slide down.

* Looks very complicated. we are asking NN to learn a lot. No doubt we need a lot of Conv layers.