In [8]:
%load_ext Cython
import numpy as np
import theano as th
import theano.tensor as T
import numba as nb
import math
import time

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [39]:
%%cython
from cython cimport boundscheck, wraparound
from cython.parallel cimport prange

cpdef void forLoop(float[:] f):
    """Add 1 to every element of ``f`` one hundred times, in place.

    ``f`` is a 1-D single-precision memoryview; it is modified directly and
    nothing is returned.

    The outer loop over elements is a ``prange`` executed without the GIL; the
    inner loop performs the 100 unit increments for one element.
    NOTE(review): the ``%%cython`` magic for this cell passes no OpenMP
    compile/link flags, in which case ``prange`` presumably runs as an
    ordinary serial loop -- confirm before reading the timing as parallel.
    """
    cdef:
        # Memoryview extents are Py_ssize_t; a C int would truncate the length
        # of very large buffers.
        Py_ssize_t j, n
        int i
    n = f.shape[0]
    # j is always within [0, n), so bounds checks and negative-index
    # wraparound handling can be switched off for the hot loop.
    with boundscheck(False), wraparound(False):
        for j in prange(n, nogil=True):
            for i in range(100):
                f[j] = f[j] + 1

In [46]:
# Benchmark input: 1000 single-precision ones; the timing cells that follow
# work on copies of it.
a = np.ones(shape=1000, dtype="float32")

In [47]:
# forLoop writes into its argument, so time it on a copy and leave `a` intact.
b = a.copy()
# NOTE: %timeit calls forLoop(b) many times on this same buffer, so the values
# left in `b` afterwards are not the result of a single call.
%timeit forLoop(b)

14.4 µs ± 87.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [48]:
# Correctness check: a single call on a fresh copy of `a`.
b = a.copy()
forLoop(b)
# Each element has 1 added 100 times, so an all-ones input should display as 101.
b

array([101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 10

In [49]:
# Symbolic input vector for the Theano version of the benchmark.
ath = T.vector()
# Build the graph by chaining 100 "+ 1" nodes; `out` is rebound to a new
# symbolic expression on every pass.
out = ath
for i in range(100):
    out = out + 1
# Compile the graph into a callable that takes one vector.
f = th.function(inputs=[ath], outputs=out)

In [50]:
# Time the compiled Theano function `f` on a copy of `a`.
b = a.copy()
%timeit f(b)

7.23 µs ± 115 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [51]:
# Evaluate the compiled Theano function once on `a` to inspect its result.
f(a)

array([101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 101., 101., 101., 101.,
       101., 101., 101., 101., 101., 101., 101., 10

In [52]:
def forNumpy(a):
    """Apply ``a += 1`` one hundred times and return ``a``.

    For a NumPy array the augmented assignment updates the caller's buffer in
    place, so the returned object is the same array that was passed in.
    """
    for _ in range(100):
        a += 1
    return a

In [53]:
# forNumpy updates its array argument in place, so time it on a copy of `a`.
# NOTE: every %timeit iteration adds another 100 to the same `b`.
b = a.copy()
%timeit forNumpy(b)

168 µs ± 1.35 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [14]:
# Two symbolic vectors and their element-wise sum, compiled into `f(a, b)`.
# NOTE(review): this rebinds `a`, `out` and `f`, which earlier cells use for the
# NumPy input and the 100-increment graph.
a, b = T.vector(), T.vector()
out = a + b
f = th.function(inputs=[a, b], outputs=out)

In [15]:
@nb.vectorize(target='cpu')
def expon_cpu(x, y):
    """Scalar kernel returning ``exp(x) + exp(y)``.

    No signatures are given to ``nb.vectorize``; the first call in the timing
    cell is marked there as the one that triggers compilation.
    """
    exp_x = math.exp(x)
    exp_y = math.exp(y)
    return exp_x + exp_y

In [16]:
@nb.vectorize(['float32(float32, float32)'], target='cuda')
def expon_gpu(x, y):
    """Scalar kernel returning ``exp(x) + exp(y)``, declared for the CUDA target.

    Compiled for the explicit ``float32(float32, float32)`` signature.
    """
    exp_x = math.exp(x)
    exp_y = math.exp(y)
    return exp_x + exp_y

In [17]:
# Benchmark configuration: vector length and number of timed repetitions.
N = 1000000
niter = 100
# Random float32 inputs; these rebind `a` and `b` to NumPy arrays.
a = np.random.rand(N).astype('float32')
b = np.random.rand(N).astype('float32')
# Trigger compilation so the timed loop below does not include it.
expon_cpu(a, b)
# expon_gpu(a, b)  (CUDA variant is disabled in this run)
## Timing
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
for i in range(niter):
    expon_cpu(a, b)
print("CPU:", time.perf_counter() - start)
# NOTE(review): the "GPU:" figure times the Theano function `f`, not expon_gpu.
# `f` is built from `a + b`, whereas expon_cpu computes exp(x) + exp(y), so the
# two numbers do not measure the same work -- confirm this is intended.
start = time.perf_counter()
for i in range(niter):
    f(a, b)
print("GPU:", time.perf_counter() - start)

CPU: 2.132935047149658
GPU: 0.9638705253601074
