In [1]:
import numpy as np
import cupy as cp

In [2]:
# It is trickier to time GPU kernels, because they behave asynchronously w.r.t the host
# With CuPy, the CUDA API can be accessed to time the kernels
def benchmark(func, args, n_repeat=10, n_warmup=0):
    """Time ``func(*args)`` with CUDA events and report the average per call.

    An event is recorded before and after the timed loop, and the elapsed
    time between the two events is read once the end event has completed.
    This is done instead of using a host-side timer because GPU kernels run
    asynchronously with respect to the host.

    Parameters
    ----------
    func : callable
        Function to time; it is invoked as ``func(*args)``.
    args : sequence
        Positional arguments unpacked into every call of ``func``.
    n_repeat : int, optional
        Number of timed calls (default 10). Must be at least 1.
    n_warmup : int, optional
        Number of untimed calls issued before the start event is recorded
        (default 0). Use a positive value to keep the cost of the very first
        calls out of the measurement.

    Returns
    -------
    float
        Average time per call in milliseconds. The same value is printed.

    Raises
    ------
    ValueError
        If ``n_repeat`` is smaller than 1 (the average would be undefined).
    """
    # Fail early with a clear message instead of a ZeroDivisionError at the end.
    if n_repeat < 1:
        raise ValueError("n_repeat must be at least 1")

    start_gpu = cp.cuda.Event()
    end_gpu = cp.cuda.Event()

    # Untimed calls: they happen before the start event is recorded.
    for _ in range(n_warmup):
        func(*args)

    start_gpu.record()
    for _ in range(n_repeat):
        func(*args)
    end_gpu.record()

    # Block the host until the end event has completed; only then is the
    # elapsed time between the two events meaningful.
    end_gpu.synchronize()
    t_avg = cp.cuda.get_elapsed_time(start_gpu, end_gpu) / n_repeat
    print('Average GPU time (ms): ', t_avg)
    return t_avg

In [3]:
# A simple function to calculate the sum of two arrays
def sum_arrays(a, b):
    """Return ``a + b``; for NumPy or CuPy arrays this is the element-wise sum."""
    total = a + b
    return total

In [4]:
# Generate two arrays on the CPU
# Two independent draws of 100 uniform samples each (x is drawn first, then y)
x, y = (np.random.uniform(size=100) for _ in range(2))

In [5]:
# Time the CPU (NumPy) version; -o makes %timeit return its result object,
# whose .average is used below to compute the speedup.
cpu_time = %timeit -o sum_arrays(x, y)

235 ns ± 0.113 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [6]:
# The host arrays are copied to the GPU while the argument tuple is built,
# i.e. before benchmark() records its start event.
gpu_args = (cp.asarray(x), cp.asarray(y))
gpu_time = benchmark(sum_arrays, gpu_args, n_repeat=10_000)

Average GPU time (ms):  0.0056649726867675785


In [7]:
# cpu_time.average is in seconds while gpu_time is in milliseconds,
# so bring the GPU time to seconds before taking the ratio.
gpu_seconds = gpu_time / 1000
speedup = cpu_time.average / gpu_seconds
print("Speedup from GPU to CPU:", 1 / speedup)
print("Speedup from CPU to GPU:", speedup)

Speedup from GPU to CPU: 24.0691924556409
Speedup from CPU to GPU: 0.041546886204968554


**The GPU operation is much slower. Why?**
 
* Input size: Is the input size large enough to keep the GPU cores busy?
* Arithmetic intensity: Is the computation heavy enough, and does it involve math operations?
* Datatype length: GPU hardware is usually far less efficient (2x to 25x slower) at float64 operations than at float32 operations.
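* Memory transfer: Copying data from the host to the GPU is expensive. Here `cp.asarray` is evaluated before `benchmark` starts timing, so the copy itself is not part of the measured time, but in real workloads it is important to minimize data transfers by keeping data on the GPU.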

**Let's try again!**

In [8]:
# Let's define a more complex function using trigonometric functions
def foo_cpu(x, y, a):
    """Evaluate ``exp(a * sin(x) + cos(y))`` with NumPy."""
    exponent = a * np.sin(x) + np.cos(y)
    return np.exp(exponent)

# And the same for the GPU with CuPy
def foo_gpu(x, y, a):
    """Evaluate ``exp(a * sin(x) + cos(y))`` with CuPy."""
    exponent = a * cp.sin(x) + cp.cos(y)
    return cp.exp(exponent)

In [9]:
# Let's create new arrays with a bigger size and float32 instead of float64.
# Each host array is drawn as float64 and cast to float32; its GPU copy is
# created right away, ahead of the benchmark call, so that it already exists
# when the function is timed.
x = np.random.uniform(size=10_000_000).astype(np.float32)
x_gpu = cp.asarray(x)

y = np.random.uniform(size=10_000_000).astype(np.float32)
y_gpu = cp.asarray(y)

# Scalar factor applied to the sine term
a = 0.5

In [10]:
# Time the CPU (NumPy) version; -o makes %timeit return its result object,
# whose .average is used below to compute the speedup.
cpu_time = %timeit -o foo_cpu(x, y, a)

41.8 ms ± 58.9 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Time the CuPy version on the arrays that already live on the GPU
gpu_time = benchmark(foo_gpu, (x_gpu, y_gpu, a), n_repeat=1_000)

Average GPU time (ms):  1.2199140625


In [12]:
# cpu_time.average is in seconds while gpu_time is in milliseconds,
# so bring the GPU time to seconds before taking the ratio.
gpu_seconds = gpu_time / 1000
speedup = cpu_time.average / gpu_seconds
print("Speedup from CPU to GPU:", speedup)

Speedup from CPU to GPU: 34.297884288175226


**Much better!**
How did we address the points?
* Input size: Arrays with 10 million elements instead of 100
* Arithmetic intensity: Computing exponentials, sines and cosines is far more intensive than a simple sum
* Datatype length: float32 numbers instead of float64 make a big difference on GPUs
* Memory transfer: `x_gpu` and `y_gpu` are created once, before the benchmark, and stay on the GPU for all timed calls