<a href="https://colab.research.google.com/github/siavashadpey/gpu_intro/blob/master/cs179_lec2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A simple kernel for vector addition (c = a + b)

From lecture 2 of CS179 (http://courses.cms.caltech.edu/cs179/2020_lectures/cs179_2020_lec02.pdf)



In [2]:
# Install the nvcc4jupyter plugin straight from its GitHub repository, then
# load the `nvcc_plugin` IPython extension; the `%%cu` cell further down
# presumably relies on this extension being loaded.
# NOTE(review): GitHub has retired the unauthenticated git:// protocol, so this
# URL may need to become git+https://... on a fresh runtime -- confirm, and
# check that the installed plugin version still exposes `nvcc_plugin` / `%%cu`.
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-a8axwdaz
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-a8axwdaz
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=1af6c0512d210778f534a458ea244afa60f8c3c6bfd6fddfbff29bbffc59b266
  Stored in directory: /tmp/pip-ephem-wheel-cache-bxpp8cv_/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [23]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h> 

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, size).
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: the
 * kernel walks the arrays with a grid-stride loop, so a launch with fewer
 * threads than elements still processes the whole array, and surplus threads
 * simply do no work.
 *
 * Parameters:
 *   a, b  - input arrays, each holding at least `size` floats
 *   c     - output array with room for at least `size` floats
 *   size  - number of elements to process; values <= 0 make the kernel a no-op
 *
 * No shared memory is used.
 */
__global__ void cudaAddVectorKernel(const float *a, const float *b,
                                    float *c, const int size)
{
    // The index math is done in 64 bits: blockIdx.x * blockDim.x is an
    // unsigned 32-bit product that could wrap (and become negative once
    // narrowed to int) on very large launches.
    const long long stride = (long long) gridDim.x * blockDim.x;

    for (long long idx = (long long) blockIdx.x * blockDim.x + threadIdx.x;
         idx < size;
         idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}

// Terminates the program with a readable message when a CUDA runtime call fails.
static void checkCuda(const cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver for the vector-addition example.
 *
 * Fills a[i] = i and b[i] = array_size - i, adds them on the device with
 * cudaAddVectorKernel, copies the result back and verifies that every element
 * of c equals array_size.
 *
 * Returns 0 and prints "All is good!" when every element matches; returns
 * EXIT_FAILURE when a mismatch is found. Any failing CUDA runtime call
 * terminates the program through checkCuda. argc/argv are unused.
 */
int main(int argc, char **argv) {
    const int per_block_thread_count = 1024;

    // host memory
    const int array_size = 1024;
    const size_t array_bytes = array_size * sizeof(float);
    float *a = new float[array_size];
    float *b = new float[array_size];
    float *c = new float[array_size];

    for (int i = 0; i < array_size; i++) {
        a[i] = i;
        b[i] = array_size - i;
    }

    // device memory
    float *d_a = NULL;
    float *d_b = NULL;
    float *d_c = NULL;
    checkCuda(cudaMalloc((void**) &d_a, array_bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**) &d_b, array_bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**) &d_c, array_bytes), "cudaMalloc(d_c)");

    // copy input variables from host to device
    checkCuda(cudaMemcpy(d_a, a, array_bytes, cudaMemcpyHostToDevice), "copy of a to device");
    checkCuda(cudaMemcpy(d_b, b, array_bytes, cudaMemcpyHostToDevice), "copy of b to device");

    // call kernel
    // Ceil-division so the grid still covers the array if array_size is
    // changed to something other than per_block_thread_count.
    const int block_count =
        (array_size + per_block_thread_count - 1) / per_block_thread_count;
    cudaAddVectorKernel<<<block_count, per_block_thread_count>>>(d_a, d_b, d_c, array_size);
    // A kernel launch returns nothing; an invalid launch configuration is
    // only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // copy output variable from device to host
    // (this blocking copy is also where a fault raised while the kernel was
    // executing would be reported)
    checkCuda(cudaMemcpy(c, d_c, array_bytes, cudaMemcpyDeviceToHost), "copy of c to host");

    // check result
    // Exact comparison is intentional: the inputs are small integers, so
    // i + (array_size - i) is represented exactly in float. An explicit check
    // is used instead of assert() so it still runs when NDEBUG is defined.
    int mismatch_count = 0;
    for (int i = 0; i < array_size; i++) {
        if (c[i] != float(array_size)) {
            if (mismatch_count == 0) {
                fprintf(stderr, "Mismatch at index %d: got %f, expected %f\n",
                        i, c[i], float(array_size));
            }
            mismatch_count++;
        }
    }

    // free device memory
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    // free host memory
    delete[] a;
    delete[] b;
    delete[] c;

    if (mismatch_count != 0) {
        fprintf(stderr, "%d of %d elements are wrong\n", mismatch_count, array_size);
        return EXIT_FAILURE;
    }

    printf("All is good!");

    return 0;
}



All is good!
