In [1]:
# check which kernel we are using
#!jupyter kernelspec list 

In [220]:
import numpy as np
from numpy.lib.stride_tricks import as_strided
from numba import jit
from skimage.restoration import denoise_nl_means, estimate_sigma
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [178]:
s = 7
#a = np.arange(s**2).reshape(s,s)
#b = np.arange(s**2).reshape(s,s)-2
a = np.arange(s**2).reshape(s,s)
b = np.arange(s**2).reshape(s,s)-2
w = 0.5 * np.ones((3,3))
DISTANCE_CUTOFF = 5.0

In [25]:
a

array([[ 0,  1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27],
       [28, 29, 30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39, 40, 41],
       [42, 43, 44, 45, 46, 47, 48]])

In [26]:
b

array([[-2, -1,  0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17, 18],
       [19, 20, 21, 22, 23, 24, 25],
       [26, 27, 28, 29, 30, 31, 32],
       [33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46]])

In [27]:
w

array([[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]])

Standard python implementation

In [4]:
def get_dist_loop(s,p1,p2,w,var):
    distance = 0
    for i in range(s):
        for j in range(s):
            tmp_diff = p1[i, j] - p2[i, j]
            distance += w[i, j] * (tmp_diff * tmp_diff - 2 * var)
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance


def get_dist_op(s,p1,p2,w,var):
    distance = np.sum(w * ((p1 - p2)**2 - 2 * var))
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance

In [5]:
%timeit get_dist_loop(s,a,b,w,0.)
%timeit get_dist_op(s,a,b,w,0.)

147 µs ± 8.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
10.9 µs ± 197 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Cython implementation

In [6]:
%load_ext cython

In [19]:
%%cython
import numpy as np
cimport numpy as np
#cdef extern from "fast_exp.h":
#    double fast_exp(double y) nogil
def get_dist_loopx(int s, np.int32_t [:, :] p1, np.int32_t [:, :] p2, np.float64_t [:, :] w, float var):
    cdef double distance = 0., tmp_diff = 0.
    for i in range(s):
        for j in range(s):
            tmp_diff = p1[i, j] - p2[i, j]
            distance += w[i, j] * (tmp_diff * tmp_diff - 2 * var)
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance

In [28]:
%%cython
import numpy as np
cimport numpy as np
def get_dist_opx(int s, np.int32_t [:, :] p1, np.int32_t [:, :] p2, np.float64_t [:, :] w, float var):
    cdef double distance = 0.
    distance = np.sum(np.multiply(w,(np.square(np.subtract(p1,p2)) - 2 * var)))
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance

In [30]:
%timeit get_dist_loopx(s,a,b,w,0.)
%timeit get_dist_opx(s,a,b,w,0.)
%timeit get_dist_loop(s,a,b,w,0.)
%timeit get_dist_op(s,a,b,w,0.)

3.37 µs ± 78.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
20.5 µs ± 459 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
150 µs ± 2.27 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
11.3 µs ± 55.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Numba, parallelization

In [22]:
from numba import jit

In [32]:
get_dist_loopn = jit(get_dist_loop)
get_dist_opn = jit(get_dist_op)

In [34]:
%timeit get_dist_loopn(s,a,b,w,0.)
%timeit get_dist_loopx(s,a,b,w,0.)
# %timeit get_dist_op(s,a,b,w,0.)
# %timeit get_dist_loop(s,a,b,w,0.)
timeit get_dist_opn(s,a,b,w,0.)

845 ns ± 34.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
3.36 µs ± 46.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
The slowest run took 14.25 times longer than the fastest. This could mean that an intermediate result is being cached.
3.66 µs ± 5.49 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# matrix optimization (numpy strides)

In [179]:
s = 5
a = np.arange(s**2+s).reshape(s,s+1)
a_padded = np.pad(a, pad_width=2, mode='constant')
b = np.arange(s**2).reshape(s,s)-2
#w = 0.5 * np.ones_like(a)
DISTANCE_CUTOFF = 5.0

In [161]:
a_padded

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  2,  3,  4,  5,  0,  0],
       [ 0,  0,  6,  7,  8,  9, 10, 11,  0,  0],
       [ 0,  0, 12, 13, 14, 15, 16, 17,  0,  0],
       [ 0,  0, 18, 19, 20, 21, 22, 23,  0,  0],
       [ 0,  0, 24, 25, 26, 27, 28, 29,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [37]:
b

array([[-2, -1,  0,  1,  2],
       [ 3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22]])

In [38]:
w

array([[0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5, 0.5, 0.5]])

In [123]:
def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return as_strided(a, shape=shape, strides=strides)


def rolling_block(A, block=(3, 3)):
    shape = (A.shape[0] - block[0] + 1, A.shape[1] - block[1] + 1) + block
    strides = (A.strides[0], A.strides[1]) + A.strides
    return as_strided(A, shape=shape, strides=strides)


def rolling_patch_sets(A, blocks = (3,3), block = (3,3)):
    shape = (A.shape[0] - block[0] + 1 - blocks[0] + 1,
             A.shape[1] - block[1] + 1 - blocks[1] + 1) + blocks + block
    strides = 3*A.strides
    return as_strided(A, shape=shape, strides=strides)

In [34]:
%timeit np.mean(rolling_block(a),-1)

27.2 µs ± 2.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
def rolling_apply(fun, a, w):
    r = np.empty(a.shape)
    r.fill(np.nan)
    for i in range(w - 1, a.shape[0]):
        r[i] = fun(a[(i-w+1):i+1])
    return r

In [29]:
%timeit rolling_apply(np.mean,a,3)

30.7 µs ± 2.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [223]:
a.dtype

dtype('int32')

In [181]:
w

array([[0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5],
       [0.5, 0.5, 0.5]])

In [238]:
def nlmeans_one_loop(a,w,var):
    b = np.zeros(a.shape, dtype='float')
    a_padded = np.pad(a, pad_width=2, mode='reflect').astype('float')
    rps = rolling_patch_sets(a_padded)
    for index in np.ndindex(a.shape):
        #import pdb; pdb.set_trace()
        weight = np.sum( w * (np.square(rps[index] - rps[index+(1,1)]) - 2 * var), axis = (-1,-2) )
        weight = np.maximum(weight, 0)
        weight_sum = np.sum( np.exp(-weight) )
        patch_centers = rps[index[0],index[1],:,:,1,1]
        b[index] = np.sum( weight * patch_centers ) / weight_sum
    return b

In [239]:
denoise_nl_means(a,h=0.1,fast_mode=False,patch_size=3,patch_distance=2,multichannel=False)

array([[ 0.,  1.,  2.,  3.,  4.,  5.],
       [ 6.,  7.,  8.,  9., 10., 11.],
       [12., 13., 14., 15., 16., 17.],
       [18., 19., 20., 21., 22., 23.],
       [24., 25., 26., 27., 28., 29.]])

In [240]:
nlmeans_one_loop(a,w,0)

array([[ 6635.570931  ,  6999.48548006,  7835.90198397,  8813.18863591,
         9532.21334979,  9802.21446684],
       [ 6970.13753256,  7334.05208163,  7888.72828948,  8866.01494142,
         9303.29935925,  9573.3004763 ],
       [12810.37908616, 13127.33691922, 13682.01312707, 14659.299779  ,
        15213.97598686, 15530.93381992],
       [18087.14004765, 18357.1411647 , 19475.29796466, 20452.58461659,
        21688.13320657, 22052.04775564],
       [17858.2260571 , 18128.22717416, 19528.12427017, 20505.4109221 ,
        22022.69980814, 22386.61435721]])