In [1]:
# Pin the release so a rerun builds the same PyCUDA version, and use %pip so
# the package is installed into the environment the running kernel uses.
%pip install pycuda==2024.1

Collecting pycuda
  Downloading pycuda-2024.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2024.1.3-py2.py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.4/87.4 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting appdirs>=1.4.0 (from pycuda)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml) ... [?25l[?25hdone
  

In [4]:
%%writefile array_addition_pycuda.py
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# CUDA kernel code for array addition.
# Each thread handles one element; the `tid < size` guard lets the launch use
# more threads than there are elements without writing out of bounds.
cuda_code = """
__global__ void add_arrays(int *a, int *b, int *c, int size) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < size) {
        c[tid] = a[tid] + b[tid];
    }
}
"""

# Fixed number of threads per block. Using the array length as the block size
# (as a single-block launch would) stops working once the array is longer than
# the per-block thread limit of the device.
THREADS_PER_BLOCK = 256

# Host data
a_host = np.array([1, 2, 3], dtype=np.int32)
b_host = np.array([4, 5, 6], dtype=np.int32)
if a_host.shape != b_host.shape:
    # The kernel indexes both inputs with the same tid, so they must match.
    raise ValueError("a_host and b_host must have the same shape")
size = len(a_host)

# Host buffer that receives the result (same shape and dtype as the inputs).
c_host = np.empty_like(a_host)

# Device data
a_device = cuda.mem_alloc(a_host.nbytes)
b_device = cuda.mem_alloc(b_host.nbytes)
c_device = cuda.mem_alloc(c_host.nbytes)

# Copy data to device
cuda.memcpy_htod(a_device, a_host)
cuda.memcpy_htod(b_device, b_host)

# Load the CUDA module
cuda_module = SourceModule(cuda_code)
add_arrays_kernel = cuda_module.get_function("add_arrays")

# Set up block and grid dimensions: enough blocks (rounded up) to give every
# element its own thread.
block_dim = (THREADS_PER_BLOCK, 1, 1)
grid_dim = ((size + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK, 1)

# Launch the CUDA kernel
add_arrays_kernel(
    a_device,
    b_device,
    c_device,
    np.int32(size),  # the kernel parameter is a 32-bit int
    block=block_dim,
    grid=grid_dim,
)

# Copy the result back to the host
cuda.memcpy_dtoh(c_host, c_device)

# Release the device buffers now that the result is on the host.
for device_buffer in (a_device, b_device, c_device):
    device_buffer.free()

# Display the result
print("Array A: ", a_host)
print("Array B: ", b_host)
print("Result Array C: ", c_host)



Overwriting array_addition_pycuda.py


In [5]:
# Run the script written by the %%writefile cell above as a shell command.
!python array_addition_pycuda.py

Array A:  [1 2 3]
Array B:  [4 5 6]
Result Array C:  [5 7 9]
