In [28]:
%load_ext Cython
import numpy as np
import theano as th
import theano.tensor as T
import tensorflow as tf
import numba as nb
import math
import time
import cv2
import socket

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [39]:
%%cython
from cython cimport boundscheck, wraparound
from libc.stdlib cimport malloc
from cython.parallel cimport prange
cimport numpy as cnp
import numpy as np

cpdef float[:] cython_mul(float[:,:] a, float[:] b):
    cdef:
        int i, j, k, n, m
        float *mardas
    n = a.shape[0]
    m = a.shape[1]
    mardas = <float*> malloc(n * sizeof(float))
    with boundscheck(False), wraparound(False):
        for j in prange(n, nogil=True):
            mardas[j] = 0
    with boundscheck(False), wraparound(False):
        for j in prange(n, nogil=True):
            for i in range(m):
                mardas[j] += a[j, i] * b[i]
    return <float[:n]> mardas

In [40]:
N = 10000
a = np.random.rand(N, N).astype('float32')
b = np.random.rand(N).astype('float32')

In [41]:
np.array(cython_mul(a, b)) - a.dot(b)

array([-0.00561523, -0.0090332 ,  0.00488281, ...,  0.00341797,
       -0.0012207 , -0.00244141], dtype=float32)

In [42]:
%timeit cython_mul(a, b)

111 ms ± 569 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [51]:
th.config.openmp = True
ath = th.tensor.matrix()
bth = th.tensor.matrix()
out = ath.dot(bth)
f = th.function([ath, bth], out)

In [53]:
bb = b.reshape(-1, 1)
%timeit f(a, bb)

425 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [49]:
atf = tf.placeholder(shape=[None, None], dtype=tf.float32)
btf = tf.placeholder(shape=[None, 1], dtype=tf.float32)
out = tf.matmul(atf, btf)
sess= tf.Session()
feed_dict = {atf: a, btf: b.reshape(-1, 1)}

In [50]:
%timeit sess.run(out, feed_dict=feed_dict)

44.3 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [54]:
bb = b.reshape(-1, 1)
%timeit a.dot(b)

41.9 ms ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [56]:
@nb.vectorize(['float32(float32, float32)'])
def mul_numba(a, b):
    return a * b

In [57]:
mul_numba(a, b)

array([[0.30815125, 0.03520628, 0.294531  , ..., 0.82657033, 0.04556881,
        0.03632921],
       [0.30391175, 0.48041525, 0.12499926, ..., 0.10415496, 0.21428944,
        0.01982157],
       [0.06418305, 0.11020334, 0.1917314 , ..., 0.8691895 , 0.04590769,
        0.03038564],
       ...,
       [0.3571819 , 0.8580767 , 0.26147088, ..., 0.6585892 , 0.11944653,
        0.04103302],
       [0.25646242, 0.6363841 , 0.18894507, ..., 0.40003654, 0.07482015,
        0.03396439],
       [0.28920603, 0.09723116, 0.20632674, ..., 0.72940606, 0.06852794,
        0.06087457]], dtype=float32)

In [58]:
@nb.vectorize(['float32(float32, float32)'], target='cuda')
def expon_gpu(x, y):
    return math.exp(x) + math.exp(y)

In [59]:
@nb.jit("float32(float32, float32)")
def mul_jit(a, b):
    n = a.shape[0]
    m = a.shape[1]
    out = np.zeros(n)
    for i in range(n):
        for j in range(m):
            out[i] += a[i, j] * b[j]
    return out
            

In [60]:
c = np.zeros(N)
%timeit mul_jit(a, b)

127 ms ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
mul_jit(a, b) - a.dot(b)

array([ 5.99537385e-04, -1.51222142e-04, -3.62341161e-04, ...,
       -2.59238186e-06, -5.31115625e-04, -5.42285883e-04])