
Write a CUDA program for:
1. Addition of two large vectors
2. Matrix multiplication

In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


/usr/bin/sh: 1: pip: not found


In [4]:
%load_ext nvcc_plugin


ModuleNotFoundError: No module named 'nvcc_plugin'

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Wraps a CUDA runtime call so that a failure is reported together with the
// source file and line of the call site.
#define HANDLE_ERROR( err ) ( HandleError( err, __FILE__, __LINE__ ) )

// Does nothing when `err` is cudaSuccess. For any other status it prints the
// CUDA error string plus the given file/line and terminates the process
// with EXIT_FAILURE.
static void HandleError( cudaError_t err, const char *file, int line )
{
    if (err == cudaSuccess)
        return;

    printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
    exit( EXIT_FAILURE );
}



// Number of elements in each vector. main() also uses it as the number of
// threads in the single block it launches.
const long N = 100 ;

// CUDA kernel for vector addition: dev_c[i] = dev_a[i] + dev_b[i], 0 <= i < N.
//
// Each thread handles one element, selected by the thread's global index in a
// 1-D grid of 1-D blocks. The kernel therefore gives the same result for a
// single-block launch and for a multi-block launch; threads whose index lies
// past the end of the vectors do nothing.
//
//   dev_a, dev_b : device pointers to the N-element input vectors
//   dev_c        : device pointer to the N-element output vector
__global__ void Vector_Addition ( const int *dev_a , const int *dev_b , int *dev_c)
{
      // Global thread index. Held in a wide type so it neither drops the
      // block offset nor wraps around for large vectors.
      const long tid = (long)blockIdx.x * blockDim.x + threadIdx.x ;

      if ( tid < N ) // check the boundary condition for the threads
            dev_c [tid] = dev_a[tid] + dev_b[tid] ;

}


// Host driver for the vector addition demo.
//
// Fills two N-element host vectors with pseudo-random values in [0, 999],
// adds them on the GPU, prints every element-wise sum and finally prints the
// time spent launching and running the kernel.
//
// Returns 0 on success; any CUDA failure terminates the process through
// HANDLE_ERROR.
int main (void)
{
      clock_t start_t , end_t;
      double walltime;

      //Host array
      int Host_a[N], Host_b[N], Host_c[N];

      //Device array
      int *dev_a , *dev_b, *dev_c ;

      //Allocate the memory on the GPU
      HANDLE_ERROR ( cudaMalloc((void **)&dev_a , N*sizeof(int) ) );
      HANDLE_ERROR ( cudaMalloc((void **)&dev_b , N*sizeof(int) ) );
      HANDLE_ERROR ( cudaMalloc((void **)&dev_c , N*sizeof(int) ) );

      //fill the Host array with random elements on the CPU
      //(two separate loops: all of Host_a is drawn before any of Host_b)
      for(int i=0; i<N; i++)
      {
        Host_a[i]=rand()%1000;
      }

      for(int i=0; i<N; i++)
      {
        Host_b[i]=rand()%1000;
      }

      //Copy Host array to Device array
      HANDLE_ERROR (cudaMemcpy (dev_a , Host_a , N*sizeof(int) , cudaMemcpyHostToDevice));
      HANDLE_ERROR (cudaMemcpy (dev_b , Host_b , N*sizeof(int) , cudaMemcpyHostToDevice));

      //Make a call to GPU kernel: one block of N threads, one thread per
      //element. N must therefore not exceed the device's maximum number of
      //threads per block.
      start_t = clock();
      Vector_Addition <<< 1, N  >>> (dev_a , dev_b , dev_c ) ;

      //A launch does not return a status itself: pick up configuration
      //errors first, then wait for the kernel so that execution errors
      //surface here and the timed interval covers the whole run.
      HANDLE_ERROR ( cudaGetLastError() );
      HANDLE_ERROR ( cudaDeviceSynchronize() );
      end_t = clock();

      //Copy back to Host array from Device array
      HANDLE_ERROR (cudaMemcpy(Host_c , dev_c , N*sizeof(int) , cudaMemcpyDeviceToHost));

      //Display the result
      for ( int i = 0; i<N; i++ )
                  printf ("%d + %d = %d\n", Host_a[i] , Host_b[i] , Host_c[i] ) ;

      //Free the Device array memory
      HANDLE_ERROR ( cudaFree (dev_a) ) ;
      HANDLE_ERROR ( cudaFree (dev_b) ) ;
      HANDLE_ERROR ( cudaFree (dev_c) ) ;

      //clock() reports processor time used by this process, in clock ticks
      walltime =(end_t - start_t)/(double)CLOCKS_PER_SEC;

      printf("Time Taken: %lf\n",walltime);

      return 0 ;

}

**Matrix Multiplication**

In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>
/* Matrix dimensions. The kernel's inner loop runs col1 times and steps through
   the rows of the second matrix, so col1 must equal row2. */
#define row1 4 /* Number of rows of first matrix */
#define col1 4 /* Number of columns of first matrix */
#define row2 4 /* Number of rows of second matrix */
#define col2 4 /* Number of columns of second matrix */

/*
 * Matrix product n = l * m on row-major int matrices:
 *   l is row1 x col1, m is row2 x col2, n is row1 x col2.
 *
 * Launch layout: a 2-D grid in which blockIdx.x selects the output column and
 * blockIdx.y the output row. Only blockIdx is used, so every thread of a block
 * computes the same output element. Blocks outside the output matrix return
 * without touching memory.
 */
__global__ void matproduct(int *l,int *m, int *n)
{
    int x=blockIdx.x;   /* output column */
    int y=blockIdx.y;   /* output row */
    int k;
    int sum=0;

    /* Guard against a grid larger than the output matrix. */
    if(x>=col2 || y>=row1)
        return;

    /* Accumulate the dot product of row y of l and column x of m in a local
       variable and store it once, instead of updating n[] every iteration. */
    for(k=0;k<col1;k++)
    {
        sum+=l[col1*y+k]*m[col2*k+x];
    }
    n[col2*y+x]=sum;
}

/* The kernel walks col1 elements along a row of the first matrix and the same
   number of rows down the second, so the inner dimensions must agree. */
#if col1 != row2
#error "col1 must equal row2 for the matrix product to be defined"
#endif

/* Reports a failed CUDA runtime call on stderr.
   Returns 0 when err is cudaSuccess and 1 otherwise; `what` names the call. */
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 0;

    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Host driver for the matrix multiplication demo.
 *
 * Multiplies the two hard-coded matrices a (row1 x col1) and b (row2 x col2)
 * on the GPU and prints the row1 x col2 product, one row per line.
 *
 * Returns 0 on success and 1 if any CUDA call failed. Device buffers are
 * released on both paths.
 */
int main()
{
    int a[row1][col1] = { { 4, 2, 7, 1 },
                       { 3, 8, 2, 3 },
                       { 6, 9, 12, 4 },
                       { 10, 7, 2, 4 } };
    int b[row2][col2] = { { 3, 5, 4, 9 },
                       { 7, 1, 6, 13 },
                       { 11, 5, 9, 2 },
                       { 4, 7, 12, 3 } };
    int c[row1][col2];
    /* Device copies of a, b and c. Start as NULL so the cleanup below is
       valid even when an allocation fails part-way through. */
    int *d = NULL, *e = NULL, *f = NULL;
    int i,j;
    int failed;

    /*
    Optional interactive input instead of the hard-coded matrices:

    printf("\n Enter elements of first matrix of size %d*%d\n", row1, col1);
    for(i=0;i<row1;i++)
    {
        for(j=0;j<col1;j++)
            {
                scanf("%d",&a[i][j]);
            }
    }
    printf("\n Enter elements of second matrix of size %d*%d\n", row2, col2);
        for(i=0;i<row2;i++)
        {
            for(j=0;j<col2;j++)
                {
                    scanf("%d",&b[i][j]);
                }
        }
    */

    /* Allocate the device buffers and upload the inputs. The || chain stops
       at the first failing call, so later calls never see an invalid buffer. */
    failed =
        cudaFailed(cudaMalloc((void **)&d,row1*col1*sizeof(int)), "cudaMalloc(d)") ||
        cudaFailed(cudaMalloc((void **)&e,row2*col2*sizeof(int)), "cudaMalloc(e)") ||
        cudaFailed(cudaMalloc((void **)&f,row1*col2*sizeof(int)), "cudaMalloc(f)") ||
        cudaFailed(cudaMemcpy(d,a,row1*col1*sizeof(int),cudaMemcpyHostToDevice), "cudaMemcpy(a -> d)") ||
        cudaFailed(cudaMemcpy(e,b,row2*col2*sizeof(int),cudaMemcpyHostToDevice), "cudaMemcpy(b -> e)");

    if (!failed)
    {
        /* Two-dimensional grid (collection of blocks), one block per output
           element. Syntax is dim3 grid(no. of columns,no. of rows). */
        dim3 grid(col2,row1);

        matproduct<<<grid,1>>>(d,e,f);

        /* A launch returns no status itself: check the launch configuration,
           then the blocking copy waits for the kernel and reports any
           execution error. */
        failed =
            cudaFailed(cudaGetLastError(), "matproduct launch") ||
            cudaFailed(cudaMemcpy(c,f,row1*col2*sizeof(int),cudaMemcpyDeviceToHost), "cudaMemcpy(f -> c)");
    }

    if (!failed)
    {
        printf("\nProduct of two matrices:\n ");
        for(i=0;i<row1;i++)
        {
            for(j=0;j<col2;j++)
            {
                  printf("%d\t",c[i][j]);
            }
            printf("\n");
        }
    }

    /* cudaFree accepts NULL, so this is safe after a partial allocation. */
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return failed;
}