In [1]:
from pycuda import gpuarray
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
%load_ext autoreload
%autoreload 1

In [2]:
# Compile the CUDA kernel used by the timing experiments in this notebook.
#
# mult_ker(array, array_len) multiplies and then divides every element of
# `array` by 2, fifty times over. The arithmetic is a no-op by design: it only
# exists to give the GPU enough work for the event timings to be measurable.
#
# The elements are walked with a bounds-checked grid-stride loop, so the kernel
# is correct for any launch configuration:
#   * with a single block (as launched in this notebook) each thread visits
#     thd, thd + blockDim.x, thd + 2*blockDim.x, ... exactly as before;
#   * with several blocks no thread indexes past `array_len` and no element is
#     visited by more than one thread;
#   * when `array_len` is not a multiple of the thread count, the trailing
#     elements are still processed instead of being skipped.
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
     // Global index of this thread and total number of threads in the launch.
     int thd = blockIdx.x*blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;

     for(int i = thd; i < array_len; i += stride)
     {
         // Intentionally redundant work (x * 2 / 2) to lengthen the kernel.
         for(int k = 0; k < 50; k++)
         {
              array[i] *= 2.0;
              array[i] /= 2.0;
         }
     }
}
""")

# Python-callable handle to the compiled kernel.
mult_ker = ker.get_function("mult_ker")

In [3]:
# Build the input on the host: 100 * 2**20 standard-normal samples stored as
# float32 (the kernel takes a `float *`), i.e. 400 MiB of data, then hand the
# array to PyCUDA to place it on the GPU.
array_len = 100 * 1024 ** 2
data = np.random.randn(array_len).astype(np.float32)
data_gpu = gpuarray.to_gpu(data)

In [4]:
# Two CUDA events: one recorded just before the kernel launch, one just after.
start_event, end_event = drv.Event(), drv.Event()

In [8]:
# Bracket the kernel launch with the two events.
start_event.record()
mult_ker(
    data_gpu,
    np.int32(array_len),   # the kernel expects a C `int`
    block=(1024, 1, 1),
    grid=(1, 1, 1),
)
end_event.record()

# Block the host until `end_event` has been reached, so both queries below
# are made only after the kernel has finished.
end_event.synchronize()

kernel_started = start_event.query()
print('Has the kernel started yet? {}'.format(kernel_started))
kernel_ended = end_event.query()
print('Has the kernel ended yet? {}'.format(kernel_ended))

Has the kernel started yet? True
Has the kernel ended yet? True


In [9]:
# Poll both events again from a later cell; each query reports whether the
# corresponding event has already been reached.
print(
    'Has the kernel started yet? {}'.format(start_event.query()),
    'Has the kernel ended yet? {}'.format(end_event.query()),
    sep='\n',
)

Has the kernel started yet? True
Has the kernel ended yet? True


In [10]:
# Elapsed time between the two recorded events, reported in milliseconds.
print('Kernel execution time in milliseconds: %f ' % (start_event.time_till(end_event),))

Kernel execution time in milliseconds: 178.464569 


In [6]:
# Scratch arithmetic: 32 squared is 1024.
# NOTE(review): presumably a sanity check of the 1024-thread block size used in
# the kernel launch — confirm, or delete this cell if it is leftover scratch work.
32 ** 2

1024

## Events and streams

We will now see how to use event objects together with streams. This gives us fine-grained control over the flow of our GPU operations: we can find out exactly how far each individual stream has progressed via the `query` function, and we can even synchronize particular streams with the host while ignoring the others.

First, though, we have to realize that each stream must have its own dedicated collection of event objects; multiple streams cannot share an event object. Let's see what this means exactly by modifying the prior example, `multi_kernel_streams.py`. After the kernel definition, let's add two additional empty lists—`start_events` and `end_events`. We will fill these lists with event objects, one per stream. This will allow us to time one GPU operation in each stream, since timing a GPU operation requires two events (one recorded before it and one recorded after it):

## Stream test

In [12]:
# Small input for the stream test: 1024 float32 samples on the GPU, plus an
# uninitialised device buffer of the same shape and dtype for the output.
# Note that this rebinds `array_len`, `data` and `data_gpu` from the earlier cells.
array_len = 1024
data = np.random.randn(array_len).astype(np.float32)
data_gpu = gpuarray.to_gpu(data)
out_gpu = gpuarray.empty_like(data_gpu)