In [6]:
print("Hello world")

Hello world


In [7]:
import numpy as np
import matplotlib as plt

import pycuda.driver as cuda_driver
import pycuda.compiler as cuda_compiler
from pycuda.gpuarray import GPUArray

import IPythonMagic
from Timer import Timer

In [8]:
%setup_logging

Global logger already initialized!


In [9]:
%cuda_context_handler context

Registering context in user workspace
Context already registered! Ignoring


In [23]:
kernel_src = """

__global__ void shmemReduction(float* output, float* input, int size) {
    // First stride through global memory and compute
    // the maximum for every thread
    int gid = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x is always zero because we use just one block
    
    float max_value = -999999999.999; // FIX ME!
    for (int i = threadIdx.x; i < size; i = i + blockDim.x) {
        max_value = fmaxf(max_value, input[i]);
    }
    
    // Temporary write to memory to check if things work so far
    output[threadIdx.x] = max_value;
    
    // Store the per-thread maximum in shared memory
    __shared__ float max_shared[128];
    max_shared[threadIdx.x] = max_value;
    
    // Synchronize so that all thread see the same shared memory
    __syncthreads();
        
    // Find the maximum of shared memory
    
    // Reduce from 128 to 64 elements
    if (threadIdx.x < 64) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 64]);
    }
    
    // Since we have here more than one active warp (threadIdx.x > 32), we need to make
    // sure all threads have finished before continuing
    __syncthreads();
    
    // Reduce from 64 to 32 elements
    if (threadIdx.x < 32) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 32]);
    }
    
    // Reduce from 32 to 16 elements
    if (threadIdx.x < 16) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 16]);
    }
    
    // Reduce from 16 to 8 elements
    if (threadIdx.x < 8) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 8]);
    }
    
    // Reduce from 8 to 4 elements
    if (threadIdx.x < 4) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 4]);
    }
    
    // Reduce from 4 to 2 elements
    if (threadIdx.x < 2) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 2]);
    }
    
    // Reduce from 2 to 1 elements
    if (threadIdx.x < 1) {
    max_shared[threadIdx.x] = fmaxf(max_shared[threadIdx.x], max_shared[threadIdx.x + 1]);
    }
    
    // Eventually write out to output
    if (threadIdx.x == 0) {
        output[0] = max_shared[0];
    }    
}
"""

kernel_module = cuda_compiler.SourceModule(kernel_src)
kernel_function = kernel_module.get_function("shmemReduction")





In [24]:
n = 256
a = np.random.random((1, n)).astype(np.float32)

a_g = GPUArray(a.shape, a.dtype)
a_g.set(a)

num_threads = 128
b = np.empty((1, num_threads), dtype=np.float32)

b_g = GPUArray(b.shape, b.dtype)

In [25]:
block_size = (num_threads, 1, 1)
grid_size = (1, 1, 1)

kernel_function(b_g, a_g, np.int32(n), np.int32(n), grid=grid_size, block=block_size)

#Download data
b_g.get(b)

#print(a)
print(b)
print(np.max(a))

[[0.98645794 0.11465081 0.7008942  0.71582574 0.19230945 0.6307102
  0.99297667 0.8555335  0.78378296 0.503948   0.5153691  0.9033084
  0.93761325 0.92897975 0.77439135 0.7345318  0.49218673 0.7092952
  0.4174651  0.79176825 0.9132511  0.8660131  0.54959965 0.7723747
  0.3197224  0.8160064  0.71256185 0.5057406  0.488644   0.39682478
  0.84822094 0.81397504 0.5303921  0.92334276 0.76257855 0.8678269
  0.87212366 0.9366278  0.63259107 0.7359021  0.9261337  0.23706184
  0.70179826 0.30871013 0.36185932 0.6378562  0.81562024 0.16962792
  0.948172   0.6919221  0.4522929  0.5903881  0.5093269  0.6629031
  0.8754085  0.8933242  0.7824821  0.6522927  0.82839596 0.8040152
  0.54342407 0.8473027  0.9494782  0.78500223 0.4642936  0.9062707
  0.4790082  0.5832815  0.35919693 0.32458237 0.66091955 0.8869996
  0.8783214  0.82406616 0.6984912  0.8067058  0.8294999  0.81367797
  0.4305992  0.52413195 0.8388642  0.73126245 0.8397078  0.98645794
  0.7415238  0.33788818 0.08715762 0.47475618 0.9221223  