<p style="text-align: right;">
In order to successfully complete this assignment you need to participate both individually and in groups during class on **Monday February 25th**.
</p>


# In-Class Assignment: CUDA

<img src="http://www.amax.com/blog/wp-content/uploads/2016/06/NVIDIA-GPU-Card-Comparison.jpg" width=75%>

<p style="text-align: right;">https://www.amax.com/blog/?p=907</p>

### Agenda for today's class (70 minutes)

1. (20 minutes) Finish up OpenMP
1. (20 minutes) Pre-class review
1. (20 minutes) BCCD CUDA Example
1. (?? minutes - As far as we get...) Vector Add Example


----

# 0. Finish up OpenMP

Here is my working OpenMP snippet from class on Friday: 
```c++
// Time-stepping loop with one thread team created once and reused for every
// step. dvdt, v and y are shared arrays; each step first fills dvdt from the
// second difference of y, then advances v and y.
//
// The scalars are firstprivate (not private): a private copy is uninitialized
// inside the region, so nt, nx, nxm1, dt and dx2inv would hold garbage.
#pragma omp parallel shared(dvdt, v, y) private(i) firstprivate(dx2inv, nx, nxm1, nt, dt)
{
    for(int it=0;it<nt-1;it++) {
        // dvdt[i] depends only on the y values from the previous step.
        #pragma omp for schedule(static, nx/500)
        for(i=1;i<nxm1;i++)
            dvdt[i]=(y[i+1]+y[i-1]-2.0*y[i])*(dx2inv);
        // Implicit barrier here: every dvdt[i] is written before it is read.

        #pragma omp for schedule(static, nx/500)
        for(i=1; i<nxm1; i++)  {
            v[i] = v[i] + dt*dvdt[i];
            y[i] = y[i] + dt*v[i];
        }
        // Implicit barrier here as well, so y is complete before the next
        // step reads its neighbours; no explicit "omp barrier" is required.
    }
}


```
Minimal speed-up version

```c++
// Same time-stepping loop with default scheduling.
// nt must be firstprivate: a plain private copy is uninitialized, which makes
// the loop bound undefined. The counter `it` is declared outside this snippet
// and would be shared by default, so it is made private to stop the threads
// from racing on it.
#pragma omp parallel firstprivate(nt) private(it)
{
    for(it=0;it<nt-1;it++) {
        #pragma omp for 
        for(i=1;i<nxm1;i++)
            dvdt[i]=(y[i+1]+y[i-1]-2.0*y[i])*(dx2inv);
        // Implicit barrier at the end of each "omp for" keeps the two phases
        // (and successive time steps) ordered; no explicit barrier is needed.
        #pragma omp for 
        for(i=1; i<nxm1; i++)  {
            v[i] = v[i] + dt*dvdt[i];
            y[i] = y[i] + dt*v[i];
        }
    }
}
```



---
# 1. Pre-class Review

[0224--CUDA_Intro-pre-class-assignment](0224--CUDA_Intro-pre-class-assignment.ipynb)

We learned the following in the video:
1. You must use a dev node with a CUDA card (Ex. dev-intel14-k20 or dev-intel16-k80).
2. You need to install the CUDA software (How do we do that on the HPC?).
3. Once you do that, build the software and run it.


```c++
// Evaluate the CUDA runtime call x exactly once; if it does not return
// cudaSuccess, print the failing expression and its error string and keep
// going. Wrapped in do { } while (0) so that "CUDA_CALL(x);" behaves as one
// statement, including inside an unbraced if/else.
#define CUDA_CALL(x)                                                    \
    do {                                                                \
        cudaError_t cuda_error__ = (x);                                 \
        if (cuda_error__ != cudaSuccess)                                \
            printf("CUDA error: " #x " returned \"%s\"\n",              \
                   cudaGetErrorString(cuda_error__));                   \
    } while (0)
```

Steps in a common CUDA program:
* STEP 1: ALLOCATE
* STEP 2: TRANSFER
* STEP 3: SET UP
* STEP 4: RUN
* STEP 5: TRANSFER

---

# 2. BCCD CUDA Example

&#9989; <font color=red>**DO THIS:**</font> In the class Git repository, go back to the BCCD directory and compile and run the CUDA example.  Read through the output and discuss it with your neighbors and the class. 


----
# 3. Vector Add Example

The following is a vector add example.  

In [None]:
%%writefile NCode/vecadd.cu

//Example modified from: https://gist.github.com/vo/3899348
//Timing code from: https://www.pluralsight.com/blog/software-development/how-to-measure-execution-time-intervals-in-c--

#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <cuda.h>
// Evaluate the CUDA runtime call x exactly once; if it does not return
// cudaSuccess, print the failing expression and its error string to stderr
// and exit with the error code. Wrapped in do { } while (0) so that
// "CUDA_CALL(x);" behaves as one statement, including inside an unbraced
// if/else.
#define CUDA_CALL(x)                                                    \
    do {                                                                \
        cudaError_t cuda_error__ = (x);                                 \
        if (cuda_error__ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error: " #x " returned \"%s\"\n",     \
                    cudaGetErrorString(cuda_error__));                  \
            fflush(stderr);                                             \
            exit(cuda_error__);                                         \
        }                                                               \
    } while (0)

// Device kernel: element-wise sum c_d[k] = a_d[k] + b_d[k].
// Each thread handles at most one element, selected by its flat 1D index
// (block offset plus thread offset). Threads whose index falls at or beyond
// N do nothing, so the launch may cover more threads than elements.
__global__ void vecAdd(int *a_d,int *b_d,int *c_d,int N)
{
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   if (idx >= N)
       return;   // past the end of the vectors
   c_d[idx] = a_d[idx] + b_d[idx];
}

// Host (CPU) reference: writes A1[k] + B1[k] into C1[k] for k = 0 .. N-1,
// visiting the elements in increasing order. Does nothing when N <= 0.
void vecAdd_h(int *A1,int *B1, int *C1, int N)
{
   int idx = 0;
   while (idx < N) {
      C1[idx] = A1[idx] + B1[idx];
      ++idx;
   }
}

// Allocate nBytes of host memory for an int buffer. On failure, print a
// message to stderr and terminate with a failing exit status.
static int *allocHostBuffer(size_t nBytes)
{
   int *buf = (int*) malloc(nBytes);
   if (!buf) {
        fprintf(stderr, "malloc() FAILED (thread)\n");
        exit(EXIT_FAILURE);   // failure must not be reported as success
   }
   return buf;
}

// Driver: adds two n-element int vectors on the GPU (vecAdd) and on the CPU
// (vecAdd_h), compares the two results element by element, and prints the
// wall-clock time of each path.
// Returns 0 on completion; exits early with a non-zero status if a host
// allocation or any CUDA call fails.
int main(int argc,char **argv)
{
   const int n=10000000;
   const size_t nBytes = (size_t)n*sizeof(int);   // size_t: what malloc/cudaMalloc expect
   int *a,*b,*c,*c2;
   int *a_d,*b_d,*c_d;

   const int num_threads = 1024;
   // Ceil-division: just enough blocks to cover n, with no spare block when
   // n is an exact multiple of num_threads.
   const int num_blocks = (n+num_threads-1)/num_threads;
   dim3 numThreads(num_threads,1,1);
   dim3 numBlocks(num_blocks,1,1); 
    
   //Check device
   struct cudaDeviceProp properties;
   CUDA_CALL(cudaGetDeviceProperties(&properties, 0));
   printf("using %d multiprocessors\n",properties.multiProcessorCount);
   printf("max threads per processor: %d \n\n",properties.maxThreadsPerMultiProcessor);
    
    
   printf("nBytes=%zu num_threads=%d, num_blocks=%d\n",nBytes,num_threads,num_blocks);

   // Host buffers: inputs a and b, GPU result c, CPU reference result c2.
   a  = allocHostBuffer(nBytes);
   b  = allocHostBuffer(nBytes);
   c  = allocHostBuffer(nBytes);
   c2 = allocHostBuffer(nBytes);
    
   for(int i=0;i<n;i++) {
      a[i]=i;
      b[i]=i;
   }
    
   printf("Allocating device memory on host..\n");
   CUDA_CALL(cudaMalloc((void **)&a_d,nBytes));
   CUDA_CALL(cudaMalloc((void **)&b_d,nBytes));
   CUDA_CALL(cudaMalloc((void **)&c_d,nBytes));
    
   // Device interval: host-to-device copies, kernel, device-to-host copy.
   auto start_d = std::chrono::high_resolution_clock::now();

   printf("Copying to device..\n");
   CUDA_CALL(cudaMemcpy(a_d,a,nBytes,cudaMemcpyHostToDevice));
   CUDA_CALL(cudaMemcpy(b_d,b,nBytes,cudaMemcpyHostToDevice));
   
   printf("Doing GPU Vector add\n");
   vecAdd<<<numBlocks, numThreads>>>(a_d,b_d,c_d,n);

   // A kernel launch returns no status; launch-configuration errors are
   // picked up here.
   CUDA_CALL(cudaGetLastError());
    
   printf("Copying results to host..\n");   
   // Blocking copy: it waits for the kernel to finish, so end_d below is
   // taken after the GPU work has completed.
   CUDA_CALL(cudaMemcpy(c,c_d,nBytes,cudaMemcpyDeviceToHost));
   
   auto end_d = std::chrono::high_resolution_clock::now();
   
   auto start_h = std::chrono::high_resolution_clock::now();
   printf("Doing CPU Vector add\n");
   vecAdd_h(a,b,c2,n);
   auto end_h = std::chrono::high_resolution_clock::now();
    
   //Test results
   // Accumulate in 64 bits so many large differences cannot overflow, and
   // report only the elements that actually differ (capped, so a completely
   // wrong result does not print millions of lines).
   const int max_reported = 10;
   int mismatches = 0;
   long long error = 0;
   for(int i=0;i<n;i++) {
      long long diff = (long long)c[i] - (long long)c2[i];
      if (diff < 0)
          diff = -diff;
      if (diff != 0) {
          error += diff;
          if (mismatches < max_reported)
              printf("%i, %d, %d\n", i, c[i], c2[i]);
          mismatches++;
      }
   }
   if (mismatches > max_reported)
      printf("... %d further mismatches not shown\n", mismatches - max_reported);

   //Print Timing
   std::chrono::duration<double> time_d = end_d - start_d;
   std::chrono::duration<double> time_h = end_h - start_h;
   printf("vectorsize=%d\n",n);
   printf("difference_error=%lld\n",error);
   printf("Device time: %f s\n", time_d.count());
   printf("Host time: %f s\n", time_h.count()); 
    
   // Release device and host buffers.
   CUDA_CALL(cudaFree(a_d));
   CUDA_CALL(cudaFree(b_d));
   CUDA_CALL(cudaFree(c_d));
   free(a);
   free(b);
   free(c);
   free(c2);
   return 0;
}

In [None]:
#Compile Cuda
!nvcc -std=c++11 -o vecadd NCode/vecadd.cu

In [None]:
#Run Example
!./vecadd

&#9989; <font color=red>**DO THIS:**</font> Copy and paste the above code to the HPCC and get it to compile and run.

&#9989; <font color=red>**DO THIS:**</font> Analyse the code and see if you can figure out what it is doing.  Where are the key steps? 

&#9989; <font color=red>**DO THIS:**</font> Think about why this code does not do a fair timing comparison between the CPU and the GPU.  Make modifications to make it a more fair comparison.

&#9989; <font color=red>**DO THIS:**</font> As in the pre-class video, the exit codes for the CUDA program are not being checked.  Add the CUDA_CALL command to your program. 

-----
### Congratulations, we're done!


**Course Resources:**
- [Syllabus](https://tinyurl.com/y75cnzam)
- [Preliminary Schedule](https://tinyurl.com/CMSE314-Schedule)
- [Git Repository](https://gitlab.msu.edu/colbrydi/cmse401-s19)
- [Jargon Jar and Command History](https://tinyurl.com/CMSE314-JargonJar) 



&#169; Copyright 2019,  Michigan State University Board of Trustees