In [12]:
%%writefile add.cpp

#include <iostream>
#include <math.h>
using namespace std;

// Element-wise accumulation on the CPU: for each of the first n elements,
// y[i] is replaced by x[i] + y[i]. x is only read; y is updated in place.
void add(int n, float *x, float *y)
{
    int pos = 0;
    while (pos < n) {
        y[pos] += x[pos];
        ++pos;
    }
}

// CPU reference run: builds two arrays, adds them with add(), and prints the
// largest deviation of the result from the expected value 3.0.
int main(void)
{
    // 2^20 = 1,048,576 elements (roughly one million)
    const int N = 1 << 20;
    float *x = new float[N];
    float *y = new float[N];

    // every x is 1 and every y is 2, so every sum should come out as 3
    int k = 0;
    while (k < N) {
        x[k] = 1.0f;
        y[k] = 2.0f;
        ++k;
    }

    // run on CPU
    add(N, x, y);

    // scan the result and keep the largest absolute error seen
    float maxError = 0.0f;
    for (const float *p = y; p < y + N; ++p) {
        maxError = fmax(maxError, fabs(*p - 3.0f));
    }
    cout << "Max Error : " << maxError << endl;

    delete [] x;
    delete [] y;
    return 0;
}

Overwriting add.cpp


In [13]:
%%sh
# Compile the CPU-only source add.cpp into an executable named `add`
g++ add.cpp -o add

In [14]:
%%sh
# Run the CPU build; it prints the maximum error of the element-wise sum
./add

Max Error : 0


## Memory Allocation in CUDA
First, allocate the arrays in [Unified Memory](https://developer.nvidia.com/blog/unified-memory-in-cuda-6/), which CUDA provides as a single memory space accessible by all CPUs and GPUs in the system.


In [33]:
%%writefile add.cu

#include <iostream>
#include <math.h>
using namespace std;

// Kernel that adds the elements of x into y, one index after another.
// The loop uses no thread or block index, so each thread that executes this
// kernel walks all n elements by itself.
__global__
void add(int n, float *x, float *y)
{
    int pos = 0;
    while (pos < n) {
        y[pos] += x[pos];
        ++pos;
    }
}

// Kernel that splits the work among the threads of a block: thread t handles
// elements t, t + blockDim.x, t + 2*blockDim.x, ... below n.
// blockIdx is not consulted, so every launched block covers the whole array.
__global__
void add_2(int n, float *x, float *y)
{
  const int step = blockDim.x;
  int pos = threadIdx.x;
  while (pos < n) {
      y[pos] += x[pos];
      pos += step;
  }
}

// Grid-stride kernel: each thread starts at its global index
// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
// threads in the grid (blockDim.x * gridDim.x) until it passes n.
__global__
void add_3(int n, float *x, float *y)
{
  const int step = blockDim.x * gridDim.x;
  int pos = blockIdx.x * blockDim.x + threadIdx.x;
  while (pos < n) {
      y[pos] += x[pos];
      pos += step;
  }
}

// GPU run: allocates two arrays in Unified Memory, fills them on the host,
// adds them on the device with add_3, and prints the largest deviation of the
// result from the expected value 3.0.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel fails.
int main(void)
{
    int N = 1<<20;  // 2^20 = 1,048,576 elements
    float *x , *y;

    // allocate Unified Memory accessible from GPU or CPU; the returned status
    // is checked so a failed allocation is reported instead of crashing later
    cudaError_t status = cudaMallocManaged(&x, N*sizeof(float));
    if (status != cudaSuccess) {
        cerr << "cudaMallocManaged(x) failed: " << cudaGetErrorString(status) << endl;
        return 1;
    }
    status = cudaMallocManaged(&y, N*sizeof(float));
    if (status != cudaSuccess) {
        cerr << "cudaMallocManaged(y) failed: " << cudaGetErrorString(status) << endl;
        cudaFree(x);
        return 1;
    }

    // initialize x and y on the host (CPU)
    for (int i=0; i<N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
    }

    // single-thread alternative (note: triple angle brackets)
    // add<<<1,1>>>(N, x, y);

    // Run kernel on 1M elements on the GPU
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;  // round up so every element is covered
    add_3<<<numBlocks, blockSize>>>(N, x, y);

    // A kernel launch returns no status itself: launch errors are fetched with
    // cudaGetLastError(), and errors raised while the kernel runs are returned
    // by cudaDeviceSynchronize(), which also waits for the GPU to finish
    // before y is accessed on the host.
    status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        cerr << "add_3 kernel failed: " << cudaGetErrorString(status) << endl;
        cudaFree(x);
        cudaFree(y);
        return 1;
    }

    float maxError = 0.0f;
    for (int i=0; i<N; i++)
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
    cout << "Max Error : " << maxError << endl;

    cudaFree(x);
    cudaFree(y);
    return 0;
}


Overwriting add.cu


In [34]:
%%sh
# Compile the CUDA source and run it only if compilation succeeded, so a
# failed build never silently executes a stale add_cuda binary.
nvcc add.cu -o add_cuda && ./add_cuda

Max Error : 0


Profile the program to see how long the kernel takes to execute.

In [36]:
%%sh
# Run add_cuda under the `nsys nvprof` command; the profiler's summary tables
# are printed after the program's own output
nsys nvprof ./add_cuda




Max Error : 0
Generating '/tmp/nsys-report-61ba.qdstrm'


SKIPPED: /home/skwok/codelab2/report4.sqlite does not contain NV Tools Extension (NVTX) data.
SKIPPED: /home/skwok/codelab2/report4.sqlite does not contain CUDA kernel data.
SKIPPED: /home/skwok/codelab2/report4.sqlite does not contain GPU memory data.
SKIPPED: /home/skwok/codelab2/report4.sqlite does not contain GPU memory data.


[3/7] Executing 'nvtx_sum' stats report
[4/7] Executing 'cuda_api_sum' stats report

 Time (%)  Total Time (ns)  Num Calls   Avg (ns)    Med (ns)   Min (ns)  Max (ns)   StdDev (ns)           Name         
 --------  ---------------  ---------  ----------  ----------  --------  ---------  -----------  ----------------------
     97.9        127792213          2  63896106.5  63896106.5   2141254  125650959   87334549.9  cudaMallocManaged     
      1.2          1606623          2    803311.5    803311.5    751190     855433      73710.9  cudaFree              
      0.6           772337          1    772337.0    772337.0    772337     772337          0.0  cudaDeviceSynchronize 
      0.3           421656          1    421656.0    421656.0    421656     421656          0.0  cudaLaunchKernel      
      0.0             1057          1      1057.0      1057.0      1057       1057          0.0  cuModuleGetLoadingMode

[5/7] Executing 'cuda_gpu_kern_sum' stats report
[6/7] Executing 'cuda_gpu