In [10]:
#multiplico por 2 un array usando kernel
import numpy as np
#picks a GPU to run on, based on availability and the number, if any
import pycuda.autoinit
from pycuda import gpuarray
from time import time

# The functionality in the module pycuda.elementwise contains tools to help generate kernels 
#that evaluate multi-stage expressions on one or several operands in a single pass.
#All these instances are required to have the same length.
from pycuda.elementwise import ElementwiseKernel

#maximo 2e8 in 6GB, depends on GPU memory
host_data = np.float32( np.random.random(200000000) )

#//input and output variables in the first line, with C pointers
# //PyCUDA sets up the index i for us,paralleize among the GPU cores
gpu_2x_ker = ElementwiseKernel(
"float *in, float *out", 
"out[i] = 2*in[i];", 
"gpu_2x_ker")#kernel name

def speedcomparison():
    t1 = time()
    host_data_2x =  host_data * np.float32(2)
    t2 = time()
    print ('total time to compute on CPU: %f' % (t2 - t1))
    device_data = gpuarray.to_gpu(host_data)
    
    # allocate memory for output
    #This acts as a plain malloc in C, allocating an array of the same size and
    #data type as device_data, but without copying anything.
    device_data_2x = gpuarray.empty_like(device_data)
    t1 = time()
    
    #the variables we set correspond directly to the first line we defined with
    gpu_2x_ker(device_data, device_data_2x)
    t2 = time()
    h_from_device = device_data_2x.get()
    print ('total time to compute on GPU: %f' % (t2 - t1))


if __name__ == '__main__':
    speedcomparison()

total time to compute on CPU: 0.118444
total time to compute on GPU: 0.053909
