In [1]:
%%writefile cudaarraysum.cu
#include <stdio.h>

/*
 * Block-level sum reduction.
 *
 * Adds up a[0 .. blockDim.x-1] in shared memory and writes the total to
 * *result.
 *
 * Launch requirements:
 *   - One 1D thread block (<<<1, n>>>): only threadIdx.x / blockDim.x are
 *     used, and every block would write the same *result location.
 *   - `a` must point to at least blockDim.x ints in device memory.
 *   - Any block size from 1 up to 1024 threads is supported, including
 *     sizes that are not a power of two.
 *
 * Shared memory: 4 KB static (1024 ints), independent of the launch size.
 */
__global__ void reduce(int *a, int *result) {
    // Sized for the largest block a launch can request (1024 threads), so
    // indexing by threadIdx.x can never run past the scratch buffer.
    constexpr unsigned kMaxThreads = 1024;
    __shared__ int sdata[kMaxThreads];

    const unsigned tid = threadIdx.x;
    const unsigned n = blockDim.x;

    // Stage one element per thread, then make all of them visible block-wide.
    sdata[tid] = a[tid];
    __syncthreads();

    // Interleaved tree reduction: at stride s, every thread whose index is a
    // multiple of 2*s absorbs the partial sum sitting s slots to its right.
    for (unsigned s = 1; s < n; s *= 2) {
        // s is always a power of two, so the mask test equals tid % (2*s) == 0.
        // The tid + s < n guard skips partners that do not exist when the
        // block size is not a power of two (those slots were never written).
        if ((tid & (2 * s - 1)) == 0 && tid + s < n) {
            sdata[tid] += sdata[tid + s];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
    }

    // Thread 0 holds the full sum once the loop finishes.
    if (tid == 0)
        *result = sdata[0];
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t err, const char *what) {
    if (err == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

/*
 * Copies an 8-element array to the device, sums it with the `reduce` kernel
 * using a single block of 8 threads, and prints the result as "Sum=<value>".
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails (in
 * which case the error is printed to stderr and no sum is printed).
 */
int main() {
    constexpr int N = 8;  // element count == threads in the single block
    int h_a[N] = {1, 2, 3, 4, 5, 6, 7, 8};
    int h_res = 0;

    // Null-initialised so the cudaFree calls below are safe no-ops if an
    // allocation never happened.
    int *d_a = nullptr;
    int *d_res = nullptr;

    // Short-circuit chain: the first failure stops all later steps.
    bool ok = checkCuda(cudaMalloc(&d_a, N * sizeof(int)), "cudaMalloc(d_a)") &&
              checkCuda(cudaMalloc(&d_res, sizeof(int)), "cudaMalloc(d_res)") &&
              checkCuda(cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> device)");

    if (ok) {
        reduce<<<1, N>>>(d_a, d_res);
        // A launch returns no status itself; bad configurations surface via
        // cudaGetLastError(), and in-kernel faults surface at the blocking
        // device-to-host copy that follows.
        ok = checkCuda(cudaGetLastError(), "reduce launch") &&
             checkCuda(cudaMemcpy(&h_res, d_res, sizeof(int), cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device -> host)");
    }

    if (ok)
        printf("Sum=%d\n", h_res);

    // Release device memory on every path, success or failure.
    cudaFree(d_a);
    cudaFree(d_res);

    return ok ? 0 : 1;
}


Writing cudaarraysum.cu


In [2]:
!nvcc cudaarraysum.cu -o cudaarraysum -arch=sm_70

In [3]:
!./cudaarraysum

Sum=36
