# GPU Programming with Python - PyCUDA

In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import gpuarray
import numpy as np
import time

## Test it out

In [None]:
# List every CUDA device PyCUDA can see, with its name, compute capability
# and total memory.
device_count = cuda.Device.count()  # query once, reuse for the header and the loop
print(f"{device_count} device(s) found")
for i in range(device_count):
    dev = cuda.Device(i)
    print(f"Device {i}: {dev.name()}")
    # Unpack into major/minor rather than a/b: `a` is the name of the host
    # matrix used by the rest of this notebook, and re-running this cell
    # would otherwise overwrite it with an integer.
    major, minor = dev.compute_capability()
    print(f"  Compute capability: {major}.{minor}")
    # The raw value is divided by 1024 to match the "KB" label
    # (PyCUDA documents total_memory() as a byte count).
    print(f"  Total memory: {dev.total_memory() / 1024} KB")

## Matrix * 2
1. Set up your data (array/vector, matrix) on the host, setting its type to `np.float32`.
1. Allocate space in the GPU's memory and copy the data to it (host to device).
1. Write the key computational kernel for the GPU.
1. Get the compiled function and call it, passing the pointer to your data on the GPU and the block size as parameters.
1. Create a new host array to receive the result and copy the data back from the GPU (device to host).

In [None]:
"""Create a random matrix of 5x5 and convert to float32 for GPU architecture"""
# Host-side input: a 5x5 matrix of standard-normal samples, stored as
# float32 so it matches the `float *` parameter of the kernel below.
a = np.random.standard_normal(size=(5, 5)).astype(np.float32)
# Trailing bare expression so the notebook displays the matrix.
a

In [None]:
"""Allocate space on the GPU's memory and copy the data to it"""
# Reserve a device buffer of exactly a.nbytes bytes (the size of the host matrix).
a_gpu = cuda.mem_alloc(a.nbytes)
# Upload the host matrix into that buffer (host -> device); a_gpu is the
# handle later passed to the kernel.
cuda.memcpy_htod(a_gpu, a)

In [None]:
"""Write the key computational kernel for the GPU: double every element of the given matrix"""
# CUDA C source for the kernel. Each thread flattens its 2-D position inside
# the block (threadIdx.x, threadIdx.y) into a linear index and doubles that
# one element in place. The kernel uses only threadIdx (no blockIdx) and has
# no bounds check, so it must be launched as a single block with no more
# threads than there are elements.
kernel_source = """
    __global__ void doubleMatrix(float *a) {
        int idx = threadIdx.x + threadIdx.y * blockDim.x;
        a[idx] *= 2;
    }
"""
# Compile the source; `mod` is what the kernel function is looked up from.
mod = SourceModule(kernel_source)

In [None]:
# Look up the compiled kernel before starting the clock so the lookup is not
# counted as part of the kernel's run time.
func = mod.get_function("doubleMatrix")

# perf_counter() is the high-resolution clock intended for measuring short
# intervals; time.time() can be too coarse for a kernel this small.
starttime = time.perf_counter()
# One block of 5x5 threads: one thread per element of the 5x5 matrix.
func(a_gpu, block=(5,5,1))
# Kernel launches return to the host before the GPU has finished. Wait for
# the device to go idle so the measurement covers the actual execution and
# not just the launch.
cuda.Context.synchronize()
print(f"Time: {time.perf_counter() - starttime}")

In [None]:
# Uninitialised host array with the same shape and dtype as the input; every
# element is overwritten by the copy below.
a_doubled = np.empty(a.shape, dtype=a.dtype)
# Download the kernel's result from the device buffer (device -> host).
cuda.memcpy_dtoh(a_doubled, a_gpu)

In [None]:
# Show the input and the GPU result one after the other for a visual check.
# A single print with a newline separator emits the same four chunks, each
# on its own line(s), as four separate print calls would.
print(
    "------------- ORIGINAL MATRIX -------------",
    a,
    "---- DOUBLED MATRIX AFTER COMPUTATION -----",
    a_doubled,
    sep="\n",
)

## dot product (matrix * matrix)