In [4]:
# Check CUDA is installed: print the version banner of the nvcc compiler driver
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [5]:
# Enable NVCC for Jupyter
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-yeu15jjb
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-yeu15jjb
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4308 sha256=d0300e622f8dabf411bec64a54490352336b557de0f88c18f1564c30fb1afed0
  Stored in directory: /tmp/pip-ephem-wheel-cache-ovr162wy/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


Test some sample code

In [13]:
%%cu 
#include <cstdio> 
#include <iostream> 
  
    using namespace std; 
  
/*
 * maxi - per-block partial maximum.
 *
 * Block k scans the slice a[256*k .. min(256*k + 256, n) - 1] with a plain
 * serial loop and stores that slice's maximum in b[k].
 *
 *   a : input values (device pointer), at least n elements
 *   b : output (device pointer), one element per block that owns a slice
 *   n : number of valid elements in a
 *
 * Every thread of a block runs the same scan and writes the same value, so
 * the kernel gains nothing from more than one thread per block.
 * Blocks whose slice would start at or beyond n leave b untouched.
 */
__global__ void maxi(int* a, int* b, int n)
{
    int block = 256 * blockIdx.x;

    // Nothing to reduce for this block: do not read a[] out of range.
    if (block >= n) {
        return;
    }

    int last = min(256 + block, n);

    // Seed with the first element of the slice instead of 0 so that a slice
    // containing only negative values still yields its true maximum.
    int max = a[block];

    for (int i = block + 1; i < last; i++) {
        if (max < a[i]) {
            max = a[i];
        }
    }
    b[blockIdx.x] = max;
}
  
int main() 
{ 
  
    int n; 
    n = 3 >> 2; 
    int a[n]; 
  
    for (int i = 0; i < n; i++) { 
        a[i] = rand() % n; 
        cout << a[i] << "\t"; 
    } 
  
    cudaEvent_t start, end; 
    int *ad, *bd; 
    int size = n * sizeof(int); 
    cudaMalloc(&ad, size); 
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice); 
    int grids = ceil(n * 1.0f / 256.0f); 
    cudaMalloc(&bd, grids * sizeof(int)); 
  
    dim3 grid(grids, 1); 
    dim3 block(1, 1); 
  
    cudaEventCreate(&start); 
    cudaEventCreate(&end); 
    cudaEventRecord(start); 
  
    while (n > 1) { 
        maxi<<<grids, block>>>(ad, bd, n); 
        n = ceil(n * 1.0f / 256.0f); 
        cudaMemcpy(ad, bd, n * sizeof(int), cudaMemcpyDeviceToDevice); 
    } 
  
    cudaEventRecord(end); 
    cudaEventSynchronize(end); 
  
    float time = 0; 
    cudaEventElapsedTime(&time, start, end); 
  
    int ans[2]; 
    cudaMemcpy(ans, ad, 4, cudaMemcpyDeviceToHost); 
  
    cout << "The maximum element is : " << ans[0] << endl; 
  
    cout << "The time required : "; 
    cout << time << endl; 
} 

The maximum element is : 969696160
The time required : 0.003232



## Task 1: Writing a simple CUDA program
For a given integer a and two arrays X and Y (both of size N), write a CUDA program to compute an array Z such that Z = a*X + Y.

You can refer to the CUDA C++ Programming Guide for syntax.
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html


In [34]:
%%cu 
#include <stdio.h>

// Element-wise z = a*x + y over the first N entries.
// Each thread handles the single element selected by its global index.
__global__
void compute(float *x, float *y, float *z, int a, int N)
{
  const int idx = blockIdx.x*blockDim.x + threadIdx.x;

  // Threads whose index falls past the end of the arrays have no element.
  if (idx >= N)
    return;

  z[idx] = a*x[idx] + y[idx];
}

int main(void)
{
  int N = 1024*32, a=1;
  cudaEvent_t start, stop;
 
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

   // Allocate x, y and z in the host array.
    float *x = (float *)malloc(N*sizeof(float));
    float *y = (float *)malloc(N*sizeof(float));
    float *z = (float *)malloc(N*sizeof(float));

   // CUDA Malloc for d_x , d_y, d_z in the device array
    float *d_x, *d_y, *d_z;
    cudaMalloc(&d_x, (N*sizeof(float)));
    cudaMalloc(&d_y, (N*sizeof(float)));
    cudaMalloc(&d_z, (N*sizeof(float)));

  // Initialize X and Y arrays and a.
    for(int i=0;i<N;i++)
    {
        x[i]=i;
        y[i]=1;
    }

  //Copy X and Y from host to GPU Device  
    cudaMemcpy(d_x, x, (N*sizeof(float)), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, (N*sizeof(float)), cudaMemcpyHostToDevice);

  //Start the timer
  cudaEventRecord(start);

  //Compute Kernel 
  //Compute Z using X, Y, and a.
  compute<<<N/1024,1024>>>(d_x,d_y,d_z,a,N);
  cudaDeviceSynchronize();

  //Stop the timer
  cudaEventRecord(stop);

  //Copy output from  GPU Device to CPU.  
    cudaMemcpy(z, d_z, (N*sizeof(float)), cudaMemcpyDeviceToHost);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

  for (int i = 0; i < N; i++)  {
    //print Z
    printf("%.1f\n",z[i]);
  }
   printf("Execution Time %f \n", milliseconds);  
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
27771.0
27772.0
27773.0
27774.0
27775.0
27776.0
27777.0
27778.0
27779.0
27780.0
27781.0
27782.0
27783.0
27784.0
27785.0
27786.0
27787.0
27788.0
27789.0
27790.0
27791.0
27792.0
27793.0
27794.0
27795.0
27796.0
27797.0
27798.0
27799.0
27800.0
27801.0
27802.0
27803.0
27804.0
27805.0
27806.0
27807.0
27808.0
27809.0
27810.0
27811.0
27812.0
27813.0
27814.0
27815.0
27816.0
27817.0
27818.0
27819.0
27820.0
27821.0
27822.0
27823.0
27824.0
27825.0
27826.0
27827.0
27828.0
27829.0
27830.0
27831.0
27832.0
27833.0
27834.0
27835.0
27836.0
27837.0
27838.0
27839.0
27840.0
27841.0
27842.0
27843.0
27844.0
27845.0
27846.0
27847.0
27848.0
27849.0
27850.0
27851.0
27852.0
27853.0
27854.0
27855.0
27856.0
27857.0
27858.0
27859.0
27860.0
27861.0
27862.0
27863.0
27864.0
27865.0
27866.0
27867.0
27868.0
27869.0
27870.0
27871.0
27872.0
27873.0
27874.0
27875.0
27876.0
27877.0
27878.0
27879.0
27880.0
27881.0
27882.0
27883.0
27884.0
27885.0
27886.0
27887.0

## Task 2: Measure execution times

In [41]:
%%cu 
#include <stdio.h>

// Element-wise z = a*x + y over the first N entries.
// Each thread handles the single element selected by its global index.
__global__
void compute(float *x, float *y, float *z, int a, int N)
{
  const int idx = blockIdx.x*blockDim.x + threadIdx.x;

  // Threads whose index falls past the end of the arrays have no element.
  if (idx >= N)
    return;

  z[idx] = a*x[idx] + y[idx];
}

int main(void)
{
  int N = 1024*32, a=1;
  cudaEvent_t start, stop;
 
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

   // Allocate x, y and z in the host array.
    float *x = (float *)malloc(N*sizeof(float));
    float *y = (float *)malloc(N*sizeof(float));
    float *z = (float *)malloc(N*sizeof(float));

   // CUDA Malloc for d_x , d_y, d_z in the device array
    float *d_x, *d_y, *d_z;
    cudaMalloc(&d_x, (N*sizeof(float)));
    cudaMalloc(&d_y, (N*sizeof(float)));
    cudaMalloc(&d_z, (N*sizeof(float)));

  // Initialize X and Y arrays and a.
    for(int i=0;i<N;i++)
    {
        x[i]=i;
        y[i]=1;
    }

  //Copy X and Y from host to GPU Device  
    cudaMemcpy(d_x, x, (N*sizeof(float)), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, (N*sizeof(float)), cudaMemcpyHostToDevice);

  //Start the timer
  cudaEventRecord(start);

  //Compute Kernel 
  //Compute Z using X, Y, and a.
  compute<<<16,2048>>>(d_x,d_y,d_z,a,N);
  cudaDeviceSynchronize();

  //Stop the timer
  cudaEventRecord(stop);

  //Copy output from  GPU Device to CPU.  
    cudaMemcpy(z, d_z, (N*sizeof(float)), cudaMemcpyDeviceToHost);
 
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

  //for (int i = 0; i < N; i++)  {
    //print Z
  //  printf("%.1f\n",z[i]);
  //}
   printf("Execution Time %f \n", milliseconds);  
  cudaFree(d_x);
  cudaFree(d_y);
  free(x);
  free(y);
}

Execution Time 0.009888 



In [42]:
# Show the GPU, driver version and CUDA version visible to this runtime
!nvidia-smi

Wed Dec 23 09:21:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces