Many image processing libraries and frameworks actually implement cross-correlation when they say "convolution".
This is because for symmetric filters (like Gaussian filters), convolution and cross-correlation give identical results.
For asymmetric filters, you might need to be careful about which operation you're actually using.

When you do true convolution with an impulse:

1. You first flip the filter (both horizontally and vertically)
1. Then slide it over the impulse
1. Because of the mathematical properties of convolution with an impulse, the result you get is the original filter (not the flipped version)


If you did cross-correlation with an impulse:

1. You'd slide the filter directly (no flipping)
1. The result would be the flipped version of the filter

Cross-correlation is when you directly slide the filter over the image, multiplying and summing. In one dimension, with $f$ the filter and $g$ the image, the operation looks like this:
$$
(f \star g)[n] = \sum_{m} f[m] \, g[m+n]
$$

Convolution is similar but involves flipping the filter first (both horizontally and vertically in 2-D; in 1-D the flip shows up as the index $n-m$). It is conventionally written with $*$:
$$
(f * g)[n] = \sum_{m} f[m] \, g[n-m]
$$

In [1]:
import numpy as np
from scipy import signal

In [2]:
# 3x3 identity matrix: ones on the main diagonal, zeros elsewhere.
_identity = np.identity(3)
_identity

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [3]:
# Three flips of the identity, shown side by side:
#   np.flip(a)          reverses every axis (rows and columns); the identity is unchanged by it.
#   np.flip(a, axis=0)  reverses the row order (a vertical, up/down flip).
#   np.flip(a, axis=1)  reverses the column order (a horizontal, left/right flip).
# For the identity, either single-axis flip yields the same anti-diagonal matrix.
np.flip(_identity), np.flip(_identity, axis=0), np.flip(_identity, axis=1)

(array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]),
 array([[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.]]),
 array([[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.]]))

In [4]:
# Reversing the rows of the identity gives the anti-diagonal ("reverse identity") matrix.
_reverse_identity = np.flip(_identity, axis=0)
_reverse_identity

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [5]:
# The reverse identity is symmetric, so its transpose is itself.
_reverse_identity.T

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [6]:
# The identity matrix is symmetric, so its transpose is itself.
_identity.T

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [7]:
# Test filter holding 0..8 in row-major order, i.e.
#   [[0, 1, 2],
#    [3, 4, 5],
#    [6, 7, 8]]
# Every entry is distinct, so any flip is easy to spot in the outputs.
img_filter = np.arange(9).reshape(3, 3)
img_filter, img_filter.shape, img_filter.dtype

(array([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]),
 (3, 3),
 dtype('int64'))

In [8]:
# Vertical flip: left-multiplying by the reverse identity reverses the order of the rows.
# The result is float because _reverse_identity is a float array.
np.matmul(_reverse_identity, img_filter)

array([[6., 7., 8.],
       [3., 4., 5.],
       [0., 1., 2.]])

In [9]:
# Vertical flip.
# Same product as above, written with the `@` operator instead of np.matmul.
# The `.T` changes nothing here because the reverse identity is symmetric.
_reverse_identity.T @ img_filter

array([[6., 7., 8.],
       [3., 4., 5.],
       [0., 1., 2.]])

In [10]:
# Horizontal flip: right-multiplying by the reverse identity reverses the order of the columns.
img_filter @ _reverse_identity

array([[2., 1., 0.],
       [5., 4., 3.],
       [8., 7., 6.]])

In [11]:
# Double flip: the left factor reverses the rows and the right factor reverses the columns,
# i.e. the filter is rotated by 180 degrees.
_reverse_identity.T @ img_filter @ _reverse_identity

array([[8., 7., 6.],
       [5., 4., 3.],
       [2., 1., 0.]])

In [12]:
# 3x3 unit impulse: a single 1 at the centre ('mid') and 0 everywhere else.
_impulse_img = signal.unit_impulse((3, 3), 'mid')
_impulse_img

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.]])

In [13]:
# Double-flipped copy of img_filter (rows and columns both reversed).
# NOTE: despite its name, `_convolve` holds the flipped kernel, not the result of a convolution.
_convolve = _reverse_identity.T @ img_filter @ _reverse_identity
_convolve

array([[8., 7., 6.],
       [5., 4., 3.],
       [2., 1., 0.]])

In [14]:
# The convention in image processing/computer vision is typically:
#   - the first argument is the "image" or "signal";
#   - the second argument is the "kernel" or "filter".
# Convolution double-flips img_filter and then slides it over the image like a correlation.
# Convolving with a centred impulse therefore reproduces the original (unflipped) filter,
# surrounded by zeros in the 5x5 output of the default 'full' mode.
signal.convolve(_impulse_img, img_filter)

array([[0., 0., 0., 0., 0.],
       [0., 0., 1., 2., 0.],
       [0., 3., 4., 5., 0.],
       [0., 6., 7., 8., 0.],
       [0., 0., 0., 0., 0.]])

In [15]:
# See!
# Here we double-flip the filter ourselves (`_convolve`) and then only correlate,
# which gives the same result as signal.convolve with the unflipped filter.
signal.correlate(_impulse_img, _convolve)

array([[0., 0., 0., 0., 0.],
       [0., 0., 1., 2., 0.],
       [0., 3., 4., 5., 0.],
       [0., 6., 7., 8., 0.],
       [0., 0., 0., 0., 0.]])

In [16]:
# And just for kicks: convolving the impulse with the already-flipped filter
# returns that flipped filter unchanged (surrounded by zeros).
signal.convolve(_impulse_img, _convolve)

array([[0., 0., 0., 0., 0.],
       [0., 8., 7., 6., 0.],
       [0., 5., 4., 3., 0.],
       [0., 2., 1., 0., 0.],
       [0., 0., 0., 0., 0.]])

## Manual Cross-Correlation

In [17]:
"""
4x4 input:        Possible 2x2 kernel positions:
[ ][ ][ ][ ]      [K][K][ ][ ]     [ ][K][K][ ]     [ ][ ][K][K]
[ ][ ][ ][ ]      [K][K][ ][ ]     [ ][K][K][ ]     [ ][ ][K][K]
[ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]     [ ][ ][ ][ ]
[ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]     [ ][ ][ ][ ]

                  [ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]
                  [K][K][ ][ ]      [ ][K][K][ ]     [ ][ ][K][K]
                  [K][K][ ][ ]      [ ][K][K][ ]     [ ][ ][K][K]
                  [ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]

                  [ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]
                  [ ][ ][ ][ ]      [ ][ ][ ][ ]     [ ][ ][ ][ ]
                  [K][K][ ][ ]      [ ][K][K][ ]     [ ][ ][K][K]
                  [K][K][ ][ ]      [ ][K][K][ ]     [ ][ ][K][K]
"""
# Manual cross-correlation in "valid" mode: slide the kernel over every position
# where it fits entirely inside the image, with no flipping and no padding.
# Create a simple 4x4 "image".
image = np.array([
    [1,  2,  3,  4],
    [5,  6,  7,  8],
    [9,  10, 11, 12],
    [13, 14, 15, 16]
])

# Create a 2x2 kernel.
kernel = np.identity(2)

# Derive every size from the arrays themselves so the loop stays correct
# if `image` or `kernel` is swapped for a different shape.
image_h, image_w = image.shape
kernel_h, kernel_w = kernel.shape

# output_size = input_size - kernel_size + 1 -> 4 - 2 + 1 = 3 per axis here.
out_h = image_h - kernel_h + 1
out_w = image_w - kernel_w + 1
result = np.zeros((out_h, out_w))

for i in range(out_h):
    for j in range(out_w):
        # Window of the image currently covered by the kernel.
        region = image[i:i + kernel_h, j:j + kernel_w]
        # `region * kernel` is the Hadamard (element-wise) product, not a matrix
        # multiplication; summing it gives the correlation value at (i, j).
        value = np.sum(region * kernel)
        result[i, j] = value

result

array([[ 7.,  9., 11.],
       [15., 17., 19.],
       [23., 25., 27.]])

In [18]:
"""
Hadamard product:
[[1, 2],     [[1, 0],     [[1*1, 2*0],     [[1, 0],
 [5, 6]]  *   [0, 1]]  =   [5*0, 6*1]]  =  [0, 6]]
"""
# Top-left 2x2 patch of the image combined with the kernel in two ways:
# first the Hadamard (element-wise) product, which is what correlation sums up;
# second the matrix product, which with an identity kernel just returns the patch.
image[0:2, 0:2] * kernel, image[0:2, 0:2] @ kernel

(array([[1., 0.],
        [0., 6.]]),
 array([[1., 2.],
        [5., 6.]]))

In [19]:
# mode='valid' keeps only the positions where the kernel lies fully inside the image,
# giving the same 3x3 result as the manual loop.
# mode='full' (scipy's default) treats everything outside the image as zero and returns
# every position with any overlap, giving a 5x5 result.
signal.correlate(image, kernel, mode='valid'), signal.correlate(image, kernel, mode='full')

(array([[ 7.,  9., 11.],
        [15., 17., 19.],
        [23., 25., 27.]]),
 array([[ 1.,  2.,  3.,  4.,  0.],
        [ 5.,  7.,  9., 11.,  4.],
        [ 9., 15., 17., 19.,  8.],
        [13., 23., 25., 27., 12.],
        [ 0., 13., 14., 15., 16.]]))