In [1]:
%load_ext Cython
import cython

import numba as nb
import math

import tensorflow as tf

import pycuda.driver as drv
import pycuda.tools
import pycuda.autoinit
import numpy
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import pycuda.cumath
from pycuda.elementwise import ElementwiseKernel

blocks = 64
block_size = 128 * 8
nbr_values = blocks * block_size

print("Using nbr_values ==", nbr_values)

# Number of iterations for the calculations,
# 100 is very quick, 2000000 will take a while
n_iter = 100000
print("Calculating %d iterations" % (n_iter))

# create two timers so we can speed-test each approach
start = drv.Event()
end = drv.Event()

ModuleNotFoundError: No module named 'tensorflow'

In [128]:
######################
# SourceModele SECTION
# We write the C code and the indexing and we have lots of control

mod = SourceModule("""
__global__ void gpusin(float *dest, float *a, int n_iter)
{
  const int i = blockDim.x*blockIdx.x + threadIdx.x;
  for(int n = 0; n < n_iter; n++) {
    a[i] = sin(a[i]);
  }
  dest[i] = a[i];
}
""")

gpusin = mod.get_function("gpusin")

# create an array of 1s
a = numpy.ones(nbr_values).astype(numpy.float32)
# create a destination array that will receive the result
dest = numpy.zeros_like(a)

start.record() # start timing
gpusin(drv.Out(dest), drv.In(a), numpy.int32(n_iter), grid=(blocks,1), block=(block_size,1,1) )
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("SourceModule time and first three results:")
print("%fs, %s" % (secs, str(dest[:3])))


SourceModule time and first three results:
0.673472s, [0.005477 0.005477 0.005477]


In [129]:
#####################
# Elementwise SECTION
# use an ElementwiseKernel with sin in a for loop all in C call from Python
kernel = ElementwiseKernel(
   "float *a, int n_iter",
   "for(int n = 0; n < n_iter; n++) { a[i] = sin(a[i]);}",
   "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
kernel(a_gpu, numpy.int(n_iter))
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("Elementwise time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


Elementwise time and first three results:
0.890893s, [0.005477 0.005477 0.005477]


In [108]:
####################################
# Elementwise Python looping SECTION
# as Elementwise but the for loop is in Python, not in C
kernel = ElementwiseKernel(
   "float *a",
   "a[i] = sin(a[i]);",
   "gpusin")

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
for i in range(n_iter):
    kernel(a_gpu)
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("Elementwise Python looping time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))

Elementwise Python looping time and first three results:
2.559730s, [0.005477 0.005477 0.005477]


In [134]:
##################
# GPUArray SECTION
# The result is copied back to main memory on each iteration, this is a bottleneck

a = numpy.ones(nbr_values).astype(numpy.float32)
a_gpu = gpuarray.to_gpu(a)
start.record() # start timing
for i in range(n_iter):
    a_gpu = pycuda.cumath.sin(a_gpu)
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("GPUArray time and first three results:")
print("%fs, %s" % (secs, str(a_gpu.get()[:3])))


GPUArray time and first three results:
6.921821s, [0.005477 0.005477 0.005477]


In [133]:
#############
# CPU SECTION
# use numpy the calculate the result on the CPU for reference

a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()

for i in range(n_iter):
    a = numpy.sin(a)

end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(a[:3])))

CPU time and first three results:
2.457248s, [0.005477 0.005477 0.005477]


In [110]:
%%cython
#############
# Cython SECTION
# use numpy the calculate the result on the CPU for reference
from libc.math cimport sin
from libc.stdlib cimport malloc
from cython cimport wraparound, boundscheck
from cython.parallel cimport prange
import numpy as np
cimport numpy as np
cpdef cython_sin(float[:] a, int n_iter):
    cdef int n = a.shape[0]
    cdef int i, t
    with wraparound(False), boundscheck(False):
        for t in range(n_iter):
            for i in prange(n, nogil=True):
                a[i] = sin(a[i])
def cython_sin2(float[:] a, int n_iter):        
    cdef int i
    for i in range(n_iter):
        a = np.sin(a)
    return a

cpdef cython_sin3(float[:] a):
    cdef int n = a.shape[0]
    cdef int i
    with wraparound(False), boundscheck(False):
        for i in prange(n, nogil=True):
            a[i] = sin(a[i])
            
cpdef cython_sin4(float[:] a):
    cdef int n = a.shape[0]
    cdef int i
    cdef float* b = <float*> malloc(n * sizeof(float))
    with wraparound(False), boundscheck(False):
        for i in prange(n, nogil=True):
            b[i] = sin(a[i])           
    return <float[:n]> b

In [111]:
a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()
cython_sin(a, n_iter)
end.record() # end timing
# calculate the run length
end.synchronize()
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(np.array(a)[:3])))

CPU time and first three results:
45.384426s, [0.005477 0.005477 0.005477]


In [131]:
#############
# Tensorflow SECTION
# use numpy the calculate the result on the CPU for reference

atf = tf.placeholder(tf.float32, [None])
btf = atf
for i in range(n_iter):
    btf = tf.sin(btf)

In [132]:
a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()
with tf.Session() as sess:
    a = sess.run(btf, feed_dict={atf: a})
end.record() # end timing
end.synchronize()
# calculate the run length
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(np.array(a)[:3])))

CPU time and first three results:
13.156031s, [0.005477 0.005477 0.005477]


In [99]:
%%cython
from libc.math cimport pow
from cython.parallel cimport prange
from cython cimport boundscheck, wraparound
def mysin(float[:] a):
    cdef int i, n
    n = a.shape[0]
    with wraparound(False), boundscheck(False):
        for i in prange(n, nogil=True):
        #for i in range(n):
            a[i] = (a[i]-1)**2
    

In [100]:
a = np.ones(nbr_values).astype('float32')
%timeit mysin(a)

4.38 µs ± 15.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [101]:
a = np.ones(nbr_values).astype('float32')
%timeit (a-1)**2

7.32 µs ± 18.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [121]:
@nb.vectorize(['float32(float32, int32)'], target='cuda')
def numba_sin_gpu(a, n_iter):
    b = a
    for i in range(n_iter):
        b = math.sin(b)
    return b

In [130]:
a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()

a = numba_sin_gpu(a, n_iter)

end.record() # end timing
end.synchronize()
# calculate the run length
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(np.array(a)[:3])))

CPU time and first three results:
0.865828s, [0.005477 0.005477 0.005477]


In [125]:
@nb.vectorize(['float32(float32, int32)'], target='cpu')
def numba_sin_cpu(a, n_iter):
    b = a
    for i in range(n_iter):
        b = math.sin(b)
    return b

In [126]:
a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()

a = numba_sin_cpu(a, n_iter)

end.record() # end timing
end.synchronize()
# calculate the run length
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(np.array(a)[:3])))

CPU time and first three results:
55.331465s, [0.005477 0.005477 0.005477]


In [136]:
import theano as th
import theano.tensor as T

In [137]:
ath = T.vector()
bth = ath
for i in range(n_iter):
    bth = T.sin(bth)
f = th.function([ath], bth)

RecursionError: maximum recursion depth exceeded

In [126]:
a = numpy.ones(nbr_values).astype(numpy.float32)
start.record() # start timing
start.synchronize()

a = numba_sin_cpu(a, n_iter)

end.record() # end timing
end.synchronize()
# calculate the run length
secs = start.time_till(end)*1e-3
print("CPU time and first three results:")
print("%fs, %s" % (secs, str(np.array(a)[:3])))

CPU time and first three results:
55.331465s, [0.005477 0.005477 0.005477]
