In [1]:
from convolutions import *
from scipy import signal

## small N
For small inputs, direct matrix multiplication is faster than the FFT-based methods, and SciPy's `signal.convolve` also chooses the direct method to compute the convolution.

In [2]:
# Small problem: a 25x25 image convolved with a 3x3 kernel in 'full' mode.
np.random.seed(0)  # fixed seed so the benchmark inputs are reproducible on re-run
image = np.random.randn(25, 25)
kernel = np.random.randn(3, 3)
mode = 'full'

In [3]:
# direct mat. mul. method
result_direct=convolve_direct(image, kernel,mode=mode)[0]
%timeit result_direct=convolve_direct(image, kernel,mode=mode)[0]

171 µs ± 4.87 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
# fft method
result_fft=convolve_fft(image, kernel,mode=mode)
%timeit result_fft=convolve_fft(image, kernel,mode=mode)

622 µs ± 7.11 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [5]:
# overlap add fft method
result_oa=convolve_oa(image, kernel,mode=mode)[0]
%timeit result_oa=convolve_oa(image, kernel,mode=mode)[0]

791 µs ± 26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [6]:
# scipy signal convolve
result_true=signal.convolve(image,kernel,mode=mode)
%timeit result_true=signal.convolve(image,kernel,mode=mode)
print(f'scipy uses the "{signal.choose_conv_method(image,kernel)}" method to compute this convolution.\n')

182 µs ± 772 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
scipy uses the "direct" method to compute this convolution.



In [7]:
# Compare every implementation with the SciPy reference result.
# Prints one True/False per method, in the order: direct, fft, overlap-add.
for candidate in (result_direct, result_fft, result_oa):
    print(np.allclose(result_true, candidate))

True
True
True


## large N (output size a bit smaller than a power of 2)
The FFT method is faster here, because the cost of the direct matrix-multiplication method grows as O(n^2), whereas FFT-based convolution grows as O(n log n). Note that the size of a full convolution is (image size + kernel size - 1). The FFT pads with zeros up to the next power of 2. For example, if the size is 1000, the FFT pads it to 1024.

In [8]:
# Output size will be (823,809) + (123,109) - (1,1) = (945,917), which is just
# below the next power of 2 (1024) in both dimensions, so little FFT padding is wasted.
np.random.seed(0)  # fixed seed so the benchmark inputs are reproducible on re-run
image = np.random.randn(823, 809)
kernel = np.random.randn(123, 109)
mode = 'full'

In [9]:
result_direct=convolve_direct(image, kernel,mode=mode)[0]
%timeit -r1 -n1 result_direct=convolve_direct(image, kernel,mode=mode)[0]

16.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
result_fft=convolve_fft(image, kernel,mode=mode)
%timeit result_fft=convolve_fft(image, kernel,mode=mode)

1.52 s ± 6.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
result_oa=convolve_oa(image, kernel,mode=mode)[0]
%timeit result_oa=convolve_oa(image, kernel,mode=mode)[0]

1.56 s ± 5.54 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
result_true=signal.convolve(image,kernel,mode=mode)
%timeit result_true=signal.convolve(image,kernel,mode=mode)
print(f'scipy uses the "{signal.choose_conv_method(image,kernel)}" method to compute this convolution.\n')

33.8 ms ± 309 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
scipy uses the "fft" method to compute this convolution.



In [13]:
# Compare every implementation with the SciPy reference result.
# Prints one True/False per method, in the order: direct, fft, overlap-add.
for candidate in (result_direct, result_fft, result_oa):
    print(np.allclose(result_true, candidate))

True
True
True


## larger N (output size a bit larger than a power of 2)
If the output size is 1025, the FFT pads it to 2048, which makes the 2D transform about 4x more expensive than necessary. A better approach is to divide the image into chunks such that (chunk size + kernel size - 1) equals a power of 2 (e.g. 1024), then overlap the partial results and add them together.

The chunk size (passed as `chunk_thres` in the cell below) should be set to the largest power of 2 that is smaller than the output shape. For example, if the output shape is (1045,1027), the chunk size should be (1024,1024). The chunk size can also be set lower to save memory.

In [14]:
# Output size will be (823,809) + (223,219) - (1,1) = (1045,1027), which is just
# above 1024, so a plain FFT pads to (2048,2048) and is thus about 4x slower.
np.random.seed(0)  # fixed seed so the benchmark inputs are reproducible on re-run
image = np.random.randn(823, 809)
kernel = np.random.randn(223, 219)
mode = 'full'

In [15]:
result_direct=convolve_direct(image, kernel,mode=mode)[0]
%timeit -r1 -n1 result_direct=convolve_direct(image, kernel,mode=mode)[0]

1min 15s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [16]:
result_fft=convolve_fft(image, kernel,mode=mode)
%timeit -r1 -n1 result_fft=convolve_fft(image, kernel,mode=mode)

13.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
result_oa=convolve_oa(image, kernel,mode=mode,chunk_thres=(1024,1024))[0]
%timeit result_oa=convolve_oa(image, kernel,mode=mode,chunk_thres=(1024,1024))[0]

2.35 s ± 34.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
result_true=signal.convolve(image,kernel,mode=mode)
%timeit result_true=signal.convolve(image,kernel,mode=mode)
print(f'scipy uses the "{signal.choose_conv_method(image,kernel)}" method to compute this convolution.\n')

56.3 ms ± 250 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
scipy uses the "fft" method to compute this convolution.



In [19]:
# Compare every implementation with the SciPy reference result.
# Prints one True/False per method, in the order: direct, fft, overlap-add.
for candidate in (result_direct, result_fft, result_oa):
    print(np.allclose(result_true, candidate))

True
True
True
