In [16]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule

#when we declare a kernel function in CUDA C proper, we precede it with the
# __global__ keyword.This will distinguish the function as a kernel to the compiler.

#We'll always just declare this as a void function, because we'll always get our
# output values by passing a pointer to some empty chunk of memory that we pass 
#in as a parameter.

#the identification of each individual thread is given by the threadIdx value,
# which we retrieve as follows: int i = threadIdx.x;.
ker = SourceModule("""
__global__ void scalar_multiplication_kernel (float *output_vector, float scalar,
float *vec)
{
 int i = threadIdx.x;
 output_vector[i] = scalar * vec[i];
}                   
""")

#pull out a reference to our compiled kernel function from the CUDA module
scalar_multiply_gpu = ker.get_function("scalar_multiplication_kernel")

#Return a sample (or samples) from the “standard normal” distribution.
#cast to a specified type.
#no puede exceder el tamaño maximo que es 1024
vector =    np.random.randn(2048).astype(np.float32)
vector_gpu  = gpuarray.to_gpu(vector)
output_vector = gpuarray.empty_like(vector_gpu)

scalar_multiply_gpu(output_vector,np.float32(2),vector_gpu,block=(1024,1,1),grid=(2,1,1))
#Returns True if two arrays are element-wise equal within a tolerance.
print ("La multiplicación se realizó correctamente? : {}".format(np.allclose(output_vector.get() , 2*vector) ))

La multiplicación se realizó correctamente? : False
