In [1]:
# check which kernel we are using
#!jupyter kernelspec list 

In [31]:
import numpy as np
from numpy.lib.stride_tricks import as_strided
from numba import jit
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [67]:
# Benchmark inputs: two s-by-s integer grids that differ by a constant 2,
# plus a uniform float weight of 0.5 for every entry.
s = 7
a = np.arange(s * s).reshape((s, s))
b = a - 2
w = np.full(a.shape, 0.5)
# NOTE(review): not referenced by any cell visible in this notebook excerpt.
DISTANCE_CUTOFF = 5.0

In [51]:
a

array([[ 0,  1,  2,  3,  4,  5,  6],
       [ 7,  8,  9, 10, 11, 12, 13],
       [14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27],
       [28, 29, 30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39, 40, 41],
       [42, 43, 44, 45, 46, 47, 48]])

Standard Python implementation

In [53]:
def get_dist_loop(s, p1, p2, w, var):
    """Weighted, variance-corrected squared difference of two patches, as exp(-d).

    Sums ``w[i, j] * ((p1[i, j] - p2[i, j])**2 - 2 * var)`` over the top-left
    ``s`` x ``s`` entries with an explicit double loop, clamps the sum at
    zero from below and returns ``np.exp`` of its negation.

    Parameters
    ----------
    s : int
        Number of rows and columns to visit.
    p1, p2 : 2-D indexable with ``[i, j]`` access
        The two patches being compared.
    w : 2-D indexable with ``[i, j]`` access
        Per-entry weights.
    var : number
        Value whose double is subtracted from every squared difference.

    Returns
    -------
    float
        ``exp(-max(sum, 0))``.
    """
    # 2 * var does not depend on the loop position, so compute it once.
    var_offset = 2 * var
    total = 0
    for row in range(s):
        for col in range(s):
            delta = p1[row, col] - p2[row, col]
            total += w[row, col] * (delta * delta - var_offset)
    # A negative total is treated as zero distance before exponentiating.
    return np.exp(-max(total, 0))


def get_dist_op(s, p1, p2, w, var):
    """Vectorised form of the weighted, variance-corrected patch distance.

    Evaluates ``sum(w * ((p1 - p2)**2 - 2 * var))`` with whole-array NumPy
    operations, clamps the sum at zero from below and returns ``np.exp`` of
    its negation.

    Parameters
    ----------
    s : int
        Accepted so the signature matches the loop version; not used here,
        the full arrays are always reduced.
    p1, p2 : array_like
        The two patches being compared.
    w : array_like
        Weights, broadcast against ``p1 - p2``.
    var : number
        Value whose double is subtracted from every squared difference.

    Returns
    -------
    float
        ``exp(-max(sum, 0))``.
    """
    residual = p1 - p2
    corrected = residual**2 - 2 * var
    total = np.sum(w * corrected)
    # A negative total is treated as zero distance before exponentiating.
    return np.exp(-max(total, 0))

In [56]:
# Baseline timings on identical inputs with var = 0: the explicit Python
# double loop first, then the vectorised NumPy expression.
%timeit get_dist_loop(s,a,b,w,0.)
%timeit get_dist_op(s,a,b,w,0.)

141 µs ± 360 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
13.1 µs ± 497 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Cython implementation

In [3]:
%load_ext cython

In [57]:
%%cython
import numpy as np
cimport numpy as np
#cdef extern from "fast_exp.h":
#    double fast_exp(double y) nogil
def get_dist_loopx(int s, np.int64_t [:, :] p1, np.int64_t [:, :] p2, np.float64_t [:, :] w, double var):
    """Cython loop version of the weighted, variance-corrected patch distance.

    Sums ``w[i, j] * ((p1[i, j] - p2[i, j])**2 - 2 * var)`` over the top-left
    ``s`` x ``s`` entries, clamps the sum at zero from below and returns
    ``exp`` of its negation.

    Parameters
    ----------
    s : int
        Number of rows and columns to visit.
    p1, p2 : 2-D int64 memoryviews
        The two patches being compared.
    w : 2-D float64 memoryview
        Per-entry weights.
    var : double
        Value whose double is subtracted from every squared difference.
        Declared ``double`` (not C ``float``) so the caller's value is not
        narrowed to single precision before entering the double accumulation.

    Returns
    -------
    float
        ``exp(-max(sum, 0))``.
    """
    cdef double distance = 0., tmp_diff = 0.
    # Explicit C index types so memoryview indexing does not depend on
    # Cython's type inference for the loop variables.
    cdef Py_ssize_t i, j
    for i in range(s):
        for j in range(s):
            tmp_diff = p1[i, j] - p2[i, j]
            distance += w[i, j] * (tmp_diff * tmp_diff - 2 * var)
    # A negative total is treated as zero distance before exponentiating.
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance

In [58]:
%%cython
import numpy as np
cimport numpy as np
def get_dist_opx(int s, np.int64_t [:, :] p1, np.int64_t [:, :] p2, np.float64_t [:, :] w, double var):
    """Cython-compiled, NumPy-vectorised weighted patch distance.

    Evaluates ``sum(w * ((p1 - p2)**2 - 2 * var))`` with whole-array NumPy
    operations, clamps the sum at zero from below and returns ``exp`` of its
    negation, i.e. the same quantity as the loop implementation.

    Parameters
    ----------
    s : int
        Accepted so the signature matches the loop version; not used here,
        the full arrays are always reduced.
    p1, p2 : 2-D int64 memoryviews
        The two patches being compared.
    w : 2-D float64 memoryview
        Per-entry weights.
    var : double
        Value whose double is subtracted from every squared difference.
        Declared ``double`` so it is not narrowed to single precision.

    Returns
    -------
    float
        ``exp(-max(sum, 0))``.
    """
    cdef double distance = 0.
    # Wrap the memoryviews as ndarrays (no copy) so NumPy arithmetic applies.
    diff = np.asarray(p1) - np.asarray(p2)
    # Squared *difference* of the patches; the element-wise product p1 * p2
    # is not a distance and disagrees with the loop implementation.
    distance = np.sum(np.asarray(w) * (diff * diff - 2 * var))
    # A negative total is treated as zero distance before exponentiating.
    distance = max(distance, 0)
    distance = np.exp(-distance)
    return distance

In [60]:
# Cython loop timed against the two pure-Python baselines, on identical
# inputs with var = 0.
%timeit get_dist_loopx(s,a,b,w,0.)
# The get_dist_opx timing is currently disabled.
#%timeit get_dist_opx(s,a,b,w,0.)
%timeit get_dist_loop(s,a,b,w,0.)
%timeit get_dist_op(s,a,b,w,0.)

3.76 µs ± 247 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
151 µs ± 581 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
13.7 µs ± 620 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Numba, parallelization

In [61]:
from numba import jit

In [62]:
# Numba-wrapped counterpart of get_dist_loop; it is called with the same
# arguments as the original in the timing cell that follows.
get_dist_loopn = jit(get_dist_loop)
# The vectorised get_dist_op is not wrapped; its line is left disabled.
#get_dist_opn = jit(get_dist_op)

In [68]:
# Numba-wrapped loop timed against the Cython loop, on identical inputs
# with var = 0. The remaining comparisons below are disabled.
%timeit get_dist_loopn(s,a,b,w,0.)
%timeit get_dist_loopx(s,a,b,w,0.)
# %timeit get_dist_op(s,a,b,w,0.)
# %timeit get_dist_loop(s,a,b,w,0.)
# %timeit get_dist_opn(s,a,b,w,0.)
# %timeit get_dist_loopn(s,a,b,w,0.)

854 ns ± 35.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
3.88 µs ± 273 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


# Matrix optimization (NumPy strides)

In [8]:
# Smaller 5 x 5 inputs: two integer grids that differ by a constant 2, plus
# a uniform float weight of 0.5 for every entry.
s = 5
a = np.arange(s * s).reshape((s, s))
b = a - 2
w = np.full(a.shape, 0.5)
# NOTE(review): not referenced by any cell visible in this notebook excerpt.
DISTANCE_CUTOFF = 5.0

In [32]:
def rolling_window(a, window):
    """Return a strided view of ``a`` with overlapping windows along the last axis.

    The last axis of length ``n`` is replaced by two axes of lengths
    ``n - window + 1`` and ``window``; entry ``[..., k, :]`` of the result is
    ``a[..., k:k + window]``. The view is built with ``as_strided``, so no
    data is copied and the result shares memory with ``a``.

    Parameters
    ----------
    a : numpy.ndarray
        Input array with at least one dimension.
    window : int
        Length of each window along the last axis.

    Returns
    -------
    numpy.ndarray
        View of shape ``a.shape[:-1] + (n - window + 1, window)``.
    """
    n_windows = a.shape[-1] - window + 1
    view_shape = (*a.shape[:-1], n_windows, window)
    # Stepping to the next window and stepping inside a window both advance
    # by one element of the last axis, hence the repeated final stride.
    view_strides = (*a.strides, a.strides[-1])
    return as_strided(a, shape=view_shape, strides=view_strides)


def rolling_block(A, block=(3, 3)):
    """Return a strided view of every ``block``-shaped sub-array of a 2-D array.

    Entry ``[r, c]`` of the result is the sub-array of ``A`` whose top-left
    corner sits at row ``r``, column ``c``. The view is built with
    ``as_strided``, so no data is copied and the result shares memory with
    ``A``.

    Parameters
    ----------
    A : numpy.ndarray
        Input array; its first two axes are the ones being tiled.
    block : tuple of int, optional
        Block height and width. It is concatenated onto the output shape, so
        it has to be a tuple.

    Returns
    -------
    numpy.ndarray
        View of shape
        ``(A.shape[0] - block[0] + 1, A.shape[1] - block[1] + 1) + block``.
    """
    # Number of distinct top-left corners along each of the two axes.
    corner_counts = (A.shape[0] - block[0] + 1, A.shape[1] - block[1] + 1)
    view_shape = corner_counts + block
    # Moving the corner and moving inside a block use the same element steps.
    corner_strides = (A.strides[0], A.strides[1])
    view_strides = corner_strides + A.strides
    return as_strided(A, shape=view_shape, strides=view_strides)

In [34]:
# NOTE(review): with the default block, rolling_block returns a 4-D view and
# axis -1 reduces only its innermost axis, giving one mean per block row
# rather than one mean per block. Confirm this is the intended comparison.
%timeit np.mean(rolling_block(a),-1)

27.2 µs ± 2.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
def rolling_apply(fun, a, w):
    """Apply ``fun`` to each run of ``w`` consecutive leading-axis entries of ``a``.

    The output has the shape of ``a`` and is filled with NaN. For every
    index ``i`` from ``w - 1`` up to the end of the first axis, ``out[i]`` is
    set to ``fun(a[i - w + 1:i + 1])``; earlier positions keep their NaN.

    Parameters
    ----------
    fun : callable
        Called with a slice of ``a``; its result is assigned to ``out[i]``.
    a : numpy.ndarray
        Input array, windowed along its first axis.
    w : int
        Window length.

    Returns
    -------
    numpy.ndarray
        Float array shaped like ``a``.
    """
    out = np.full(a.shape, np.nan)
    # `stop` is the exclusive end of the window; the result lands on the
    # window's last element, index stop - 1.
    for stop in range(w, a.shape[0] + 1):
        out[stop - 1] = fun(a[stop - w:stop])
    return out

In [29]:
# Loop-based counterpart: np.mean is called on each run of 3 consecutive
# leading-axis entries of `a`.
%timeit rolling_apply(np.mean,a,3)

30.7 µs ± 2.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
