# Exercise 1: Compute pi on the GPU

0. CPU version: about 18 seconds for 512 000 000 points
1. GPU version: at most 1024 points, because the kernel is launched as a single block of up to 1024 threads (one thread per point)

In [2]:
%%time
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2024.1.2.tar.gz (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/1.7 MB[0m [31m14.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading pytools-2025.1.1-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Mak

In [3]:
import pycuda
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
import numpy as np
import time

In [4]:
rng = np.random.default_rng()

def compute_pi_cpu(n_points):
    #First, generate random points
    rng = np.random.RandomState(42)
    x_rand = rng.random(n_points)
    y_rand = rng.random(n_points)

    #Compute radius from origin
    inside = np.sqrt(x_rand**2+y_rand**2) <= 1.0
    #Count number of points inside
    n_inside = np.sum(inside)

    #n_inside = 0
    #for i in range(n_points):
    #    n_inside += np.sqrt(x_rand[i]**2+y_rand[i]**2) <= 1.0

    #We can estimate pi by the following formula:
    #pi = 4 * n_inside / n_total
    pi = 4*n_inside/n_points

    return pi

In [5]:
# Time one CPU estimate with 1024 points (the printed value is the estimate).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
tic = time.perf_counter()
print(compute_pi_cpu(1024))
toc = time.perf_counter()

print("Time to execute cpu version: {:f} seconds".format(toc-tic))

3.109375
Time to execute cpu version: 0.005301 seconds


In [6]:
# CUDA C source of the Monte Carlo pi kernel; compiled at runtime below.
pi_kernel_src = """
//Based on Stroustrup, adapted for CUDA
//pseudorandom numbers (linear congruential generator)
//Advances the caller's state in place and returns the low 31 bits of the new
//state scaled by 2^-31, i.e. a value in [0, 1].
//The state is unsigned so the multiply/add wraps modulo 2^32; with a signed
//type the same arithmetic overflows, which is undefined behaviour.
__device__ float generateRandomNumber(unsigned int& last_draw) {
    last_draw = last_draw*1103515245u + 12345u;
    unsigned int low_bits = last_draw & 0x7fffffffu;
    return low_bits / 2147483648.0f;
}

//Each thread draws one point (x, y) and stores in inside[tid] whether the
//point lies inside the unit circle (1) or not (0).
//inside must hold one unsigned int per launched thread.
__global__ void computePi(unsigned int* inside, unsigned int seed) {
    //Global thread index; equal to threadIdx.x for a single-block launch
    unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

    //1 generate random numbers (per-thread state derived from seed and index)
    unsigned int rand_state = seed + tid;
    float x = generateRandomNumber(rand_state);
    float y = generateRandomNumber(rand_state);

    //2 compute the radius from the origin
    float r = sqrtf(x*x + y*y);

    //3 check if it is inside the circle
    //Always write the flag: the output buffer is not guaranteed to be zeroed,
    //so points outside the circle must be recorded explicitly as 0.
    inside[tid] = (r <= 1.0f) ? 1u : 0u;
}
"""

# Compile the kernel source and look up the entry point used by the host code.
mod = SourceModule(pi_kernel_src)
func = mod.get_function("computePi")

In [7]:
def compute_pi_gpu(n_points, seed=None):
    """Estimate pi on the GPU using one thread per sample point.

    Launches the compiled ``computePi`` kernel (``func``) as a single block of
    ``n_points`` threads, copies the per-thread inside/outside flags back to
    the host and returns ``4 * n_inside / n_points``.

    Args:
        n_points: Number of sample points. Because a single block is launched,
            it must be between 1 and the device's maximum number of threads
            per block.
        seed: Optional integer base seed handed to the kernel (reduced modulo
            2**32). When ``None`` a seed is derived from the current time.

    Returns:
        The estimate of pi as a NumPy floating point scalar.

    Raises:
        ValueError: If ``n_points`` is outside the range a single block allows.
    """
    max_threads = cuda.Context.get_device().get_attribute(
        cuda.device_attribute.MAX_THREADS_PER_BLOCK)
    if not 1 <= n_points <= max_threads:
        raise ValueError(
            "n_points must be between 1 and {} (single-block launch), got {!r}"
            .format(max_threads, n_points))

    if seed is None:
        # Nanosecond clock, so back-to-back calls do not share a seed the way
        # they would with a clock truncated to whole seconds.
        seed = time.time_ns()
    # The kernel takes a 32-bit unsigned seed; reduce explicitly to that range.
    seed = np.uint32(int(seed) % 2**32)

    bytes_per_uint = 4

    inside_gpu = cuda.mem_alloc(bytes_per_uint * n_points)
    try:
        # Freshly allocated device memory is not initialised. Clear the flags
        # before the launch so the count is correct regardless of whether the
        # kernel writes an entry for points outside the circle.
        cuda.memset_d32(inside_gpu, 0, n_points)

        func(inside_gpu, seed, block=(n_points,1,1), grid=(1,1,1))

        inside_cpu = np.empty(n_points, dtype=np.uint32)
        cuda.memcpy_dtoh(inside_cpu, inside_gpu)
    finally:
        # Release the device buffer right away instead of waiting for garbage
        # collection.
        inside_gpu.free()

    n_inside = np.sum(inside_cpu)

    pi = 4 * n_inside/n_points

    return pi

In [10]:
# Time one GPU estimate with 1024 points (the printed value is the estimate).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted.
tic = time.perf_counter()
print(compute_pi_gpu(1024))
toc = time.perf_counter()

print("Time to execute gpu version: {:f} seconds".format(toc-tic))

3.15625
Time to execute gpu version: 0.001398 seconds
