In [None]:
%%writefile block.cu
#include<stdio.h>
// Element-wise integer addition, one element per block.
// Each block takes its own blockIdx.x as the element index, reads a and b at
// that index and stores their sum at the same index of c.
// The kernel receives no length, so every block index produced by the launch
// must be a valid index into all three arrays.
__global__ void add( int* a, int* b, int* c ) {
    const unsigned int slot = blockIdx.x;
    const int lhs = a[slot];
    const int rhs = b[slot];
    c[slot] = lhs + rhs;
}
#define N 512
// Prints a diagnostic for a failed CUDA runtime call.
// Returns 1 when status is an error code and 0 when it is cudaSuccess.
static int cudaFailed( cudaError_t status, const char* what ) {
    if ( status == cudaSuccess ) {
        return 0;
    }
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
    return 1;
}

// Adds two N-element integer vectors on the GPU, launching N blocks of one
// thread each, and prints the first element of the result.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main( void ) {
    int *a = NULL, *b = NULL, *c = NULL; // host copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL; // device copies of a, b, c
    const size_t size = N * sizeof( int ); // bytes needed for N integers
    int failed;

    // allocate device copies of a, b, c
    failed = cudaFailed( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" )
          || cudaFailed( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" )
          || cudaFailed( cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)" );

    // allocate host copies of a, b, c
    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( size );
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation failed\n" );
        failed = 1;
    }

    if ( !failed ) {
        // Fill every element, not just the first: the complete buffers are
        // copied to the device and every element is summed by the kernel.
        for ( int i = 0; i < N; i++ ) {
            a[i] = 2;
            b[i] = 7;
        }
        // copy inputs to device
        failed = cudaFailed( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" )
              || cudaFailed( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );
    }

    if ( !failed ) {
        // launch add() kernel with N parallel blocks
        add<<< N, 1 >>>( dev_a, dev_b, dev_c );
        // a kernel launch returns nothing; launch errors are read back here
        failed = cudaFailed( cudaGetLastError(), "add kernel launch" );
    }

    if ( !failed ) {
        // copy device result back to host copy of c
        failed = cudaFailed( cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );
    }

    if ( !failed ) {
        printf( "Sum=%d\n", *c );
    }

    // Single cleanup path: free() and cudaFree() both accept null pointers.
    free( a ); free( b ); free( c );
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    return failed;
}

Writing block.cu


In [None]:
!nvcc block.cu -o block

In [None]:
!./block

Sum=9


In [None]:
%%writefile thread.cu
#include<stdio.h>
// Element-wise integer addition, one element per thread.
// Each thread takes its own threadIdx.x as the element index, reads a and b
// at that index and stores their sum at the same index of c.
// The kernel receives no length, so every thread index produced by the launch
// must be a valid index into all three arrays.
__global__ void add( int* a, int* b, int* c ) {
    const unsigned int lane = threadIdx.x;
    const int lhs = a[lane];
    const int rhs = b[lane];
    c[lane] = lhs + rhs;
}
#define N 512
// Prints a diagnostic for a failed CUDA runtime call.
// Returns 1 when status is an error code and 0 when it is cudaSuccess.
static int cudaFailed( cudaError_t status, const char* what ) {
    if ( status == cudaSuccess ) {
        return 0;
    }
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
    return 1;
}

// Adds two N-element integer vectors on the GPU, launching a single block of
// N threads, and prints the first element of the result.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main( void ) {
    int *a = NULL, *b = NULL, *c = NULL; // host copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL; // device copies of a, b, c
    const size_t size = N * sizeof( int ); // bytes needed for N integers
    int failed;

    // allocate device copies of a, b, c
    failed = cudaFailed( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" )
          || cudaFailed( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" )
          || cudaFailed( cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)" );

    // allocate host copies of a, b, c
    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( size );
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation failed\n" );
        failed = 1;
    }

    if ( !failed ) {
        // Fill every element, not just the first: the complete buffers are
        // copied to the device and every element is summed by the kernel.
        for ( int i = 0; i < N; i++ ) {
            a[i] = 2;
            b[i] = 7;
        }
        // copy inputs to device
        failed = cudaFailed( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" )
              || cudaFailed( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );
    }

    if ( !failed ) {
        // launch add() kernel with one block of N threads
        add<<< 1, N >>>( dev_a, dev_b, dev_c );
        // a kernel launch returns nothing; launch errors are read back here
        failed = cudaFailed( cudaGetLastError(), "add kernel launch" );
    }

    if ( !failed ) {
        // copy device result back to host copy of c
        failed = cudaFailed( cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );
    }

    if ( !failed ) {
        printf( "Sum=%d\n", *c );
    }

    // Single cleanup path: free() and cudaFree() both accept null pointers.
    free( a ); free( b ); free( c );
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    return failed;
}


Overwriting thread.cu


In [None]:
!nvcc thread.cu -o thread

In [None]:
!./thread

Sum=9


In [None]:
%%writefile threadblock.cu
#include<stdio.h>
// Element-wise integer addition over a 1D grid of 1D blocks.
// Each thread derives one flat element index from its block index, the block
// size and its thread index, then stores a[index] + b[index] into c[index].
// The kernel receives no length, so the launch must not produce an index
// outside the three arrays.
__global__ void add( int* a, int* b, int* c ) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int* lhs = a + gid;
    const int* rhs = b + gid;
    int* out = c + gid;
    *out = *lhs + *rhs;
}

#define N (2048*2048)
#define THREADS_PER_BLOCK 512
// Prints a diagnostic for a failed CUDA runtime call.
// Returns 1 when status is an error code and 0 when it is cudaSuccess.
static int cudaFailed( cudaError_t status, const char* what ) {
    if ( status == cudaSuccess ) {
        return 0;
    }
    fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) );
    return 1;
}

// Adds two N-element integer vectors on the GPU using N/THREADS_PER_BLOCK
// blocks of THREADS_PER_BLOCK threads and prints the first result element.
// Returns 0 on success and 1 if a host allocation or any CUDA call fails.
int main( void ) {
    int *a = NULL, *b = NULL, *c = NULL; // host copies of a, b, c
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL; // device copies of a, b, c
    const size_t size = N * sizeof( int ); // bytes needed for N integers
    int failed;

    // allocate device copies of a, b, c
    failed = cudaFailed( cudaMalloc( (void**)&dev_a, size ), "cudaMalloc(dev_a)" )
          || cudaFailed( cudaMalloc( (void**)&dev_b, size ), "cudaMalloc(dev_b)" )
          || cudaFailed( cudaMalloc( (void**)&dev_c, size ), "cudaMalloc(dev_c)" );

    // allocate host copies of a, b, c
    a = (int*)malloc( size );
    b = (int*)malloc( size );
    c = (int*)malloc( size );
    if ( a == NULL || b == NULL || c == NULL ) {
        fprintf( stderr, "host allocation failed\n" );
        failed = 1;
    }

    if ( !failed ) {
        // Fill every element, not just the first: the complete buffers are
        // copied to the device and every element is summed by the kernel.
        for ( int i = 0; i < N; i++ ) {
            a[i] = 2;
            b[i] = 7;
        }
        // copy inputs to device
        failed = cudaFailed( cudaMemcpy( dev_a, a, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_a)" )
              || cudaFailed( cudaMemcpy( dev_b, b, size, cudaMemcpyHostToDevice ), "cudaMemcpy(dev_b)" );
    }

    if ( !failed ) {
        // launch add() kernel with blocks and threads
        add<<< N/THREADS_PER_BLOCK, THREADS_PER_BLOCK >>>( dev_a, dev_b, dev_c );
        // a kernel launch returns nothing; launch errors are read back here
        failed = cudaFailed( cudaGetLastError(), "add kernel launch" );
    }

    if ( !failed ) {
        // copy device result back to host copy of c
        failed = cudaFailed( cudaMemcpy( c, dev_c, size, cudaMemcpyDeviceToHost ), "cudaMemcpy(c)" );
    }

    if ( !failed ) {
        printf( "Sum is =%d", *c );
    }

    // Single cleanup path: free() and cudaFree() both accept null pointers.
    free( a ); free( b ); free( c );
    cudaFree( dev_a );
    cudaFree( dev_b );
    cudaFree( dev_c );
    return failed;
}

Overwriting threadblock.cu


In [None]:
!nvcc threadblock.cu -o threadblock

In [None]:
!./threadblock

Sum is =9

In [None]:
%%writefile mat.cu
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define TILE_WIDTH 2
/* Matrix multiplication kernels */
// Naive version: operands are read directly from global memory (no shared-memory tiles)
// Computes Pd = Md * Nd for square, row-major WIDTH x WIDTH float matrices.
// One thread produces one output element. Row and column are derived from
// TILE_WIDTH * blockIdx + threadIdx, so the launch is expected to use
// TILE_WIDTH x TILE_WIDTH thread blocks.
//   Md, Nd : input matrices (device pointers)
//   Pd     : output matrix (device pointer); fully overwritten, so it does
//            not need to be zeroed beforehand
//   WIDTH  : matrix dimension; threads that fall outside it do nothing
__global__ void
MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
    // a non-positive WIDTH leaves nothing to compute
    const unsigned int width = WIDTH > 0 ? (unsigned int)WIDTH : 0u ;

    // calculate thread id
    const unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
    const unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

    // guard threads of a partially filled edge block
    if ( row >= width || col >= width )
        return ;

    // Accumulate in a local variable instead of doing "+=" on Pd: the result
    // then no longer depends on the previous contents of the output buffer,
    // and global memory is written once rather than once per iteration.
    float sum = 0.0f ;
    for ( unsigned int k = 0 ; k < width ; k++ )
    {
        sum += Md[row * width + k] * Nd[k * width + col] ;
    }
    Pd[row * width + col] = sum ;
}
// Tiled version: operand tiles are staged in shared memory
// Computes Pd = Md * Nd for square, row-major WIDTH x WIDTH float matrices
// using TILE_WIDTH x TILE_WIDTH tiles staged in shared memory.
// The tile indexing assumes a launch with TILE_WIDTH x TILE_WIDTH thread
// blocks; each thread produces one output element.
//   Md, Nd : input matrices (device pointers)
//   Pd     : output matrix (device pointer); fully overwritten, so it does
//            not need to be zeroed beforehand
//   WIDTH  : matrix dimension; it need not be a multiple of TILE_WIDTH as
//            long as the grid covers the whole matrix
__global__ void MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{
    // per-block staging buffers: one tile of Md and one tile of Nd per phase
    __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
    __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

    // a non-positive WIDTH leaves nothing to compute
    const unsigned int width = WIDTH > 0 ? (unsigned int)WIDTH : 0u ;

    // calculate thread id
    const unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
    const unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

    // round up so a trailing partial tile is processed too
    const unsigned int phases = ( width + TILE_WIDTH - 1 ) / TILE_WIDTH ;

    float sum = 0.0f ;
    for ( unsigned int m = 0 ; m < phases ; m++ ) // m indicates the phase number
    {
        const unsigned int mCol = m*TILE_WIDTH + threadIdx.x ; // column read from Md
        const unsigned int nRow = m*TILE_WIDTH + threadIdx.y ; // row read from Nd

        // Out-of-range positions are padded with zero so they add nothing.
        // No thread returns early: every thread must reach both barriers.
        Mds[threadIdx.y][threadIdx.x] =
            ( row < width && mCol < width ) ? Md[row*width + mCol] : 0.0f ;
        Nds[threadIdx.y][threadIdx.x] =
            ( nRow < width && col < width ) ? Nd[nRow*width + col] : 0.0f ;
        __syncthreads() ; // both tiles are complete before anyone reads them

        // Row of the Md tile is selected by threadIdx.y and column of the Nd
        // tile by threadIdx.x, matching how row and col are derived above.
        for ( int k = 0 ; k < TILE_WIDTH ; k++ )
            sum += Mds[threadIdx.y][k] * Nds[k][threadIdx.x] ;
        __syncthreads() ; // everyone is done reading before the next overwrite
    }

    if ( row < width && col < width )
        Pd[row*width + col] = sum ;
}
// main routine
// Prints a diagnostic for a failed CUDA runtime call.
// Returns 1 when status is an error code and 0 when it is cudaSuccess.
static int cudaFailed( cudaError_t status, const char* what )
{
   if ( status == cudaSuccess )
      return 0 ;
   fprintf( stderr, "%s failed: %s\n", what, cudaGetErrorString( status ) ) ;
   return 1 ;
}

// Multiplies two WIDTH x WIDTH float matrices (filled with 1 and 2) on the
// GPU, prints the product and the wall-clock time spent on device allocation,
// transfers and the kernel.
// Returns 0 on success and 1 if the configuration is unsupported or any CUDA
// call fails.
int main ()
{
   const int WIDTH = 100 ;
   // the buffers hold floats, so they are sized with sizeof (float)
   const size_t bytes = WIDTH*WIDTH*sizeof (float) ;
   struct timeval tv1, tv2;
   double elapsed;
   int failed ;

   float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH],
                     M_result_array_h[WIDTH][WIDTH]  ;
   float *array1_d = NULL , *array2_d = NULL , *M_result_array_d = NULL ; // device array
   int i , j ;

   // The grid below is WIDTH/TILE_WIDTH blocks per side, which covers the
   // whole matrix only when the division is exact.
   if ( WIDTH % TILE_WIDTH != 0 )
   {
      fprintf( stderr, "WIDTH must be a multiple of TILE_WIDTH\n" ) ;
      return 1 ;
   }

   //input in host array
   for ( i = 0 ; i<WIDTH ; i++ )
   {
      for (j = 0 ; j<WIDTH ; j++ )
      {
         array1_h[i][j] = 1 ;
         array2_h[i][j] = 2 ;
      }
   }
   gettimeofday(&tv1, NULL);

   //create device arrays for both inputs and the result
   failed = cudaFailed( cudaMalloc((void **) &array1_d , bytes ), "cudaMalloc(array1_d)" )
         || cudaFailed( cudaMalloc((void **) &array2_d , bytes ), "cudaMalloc(array2_d)" )
         || cudaFailed( cudaMalloc((void **) &M_result_array_d , bytes ), "cudaMalloc(M_result_array_d)" ) ;

   if ( !failed )
   {
      //copy host array to device array; cudaMemcpy ( dest , source , bytes , direction )
      failed = cudaFailed( cudaMemcpy ( array1_d , array1_h , bytes , cudaMemcpyHostToDevice ), "cudaMemcpy(array1_d)" )
            || cudaFailed( cudaMemcpy ( array2_d , array2_h , bytes , cudaMemcpyHostToDevice ), "cudaMemcpy(array2_d)" )
            // cudaMalloc does not clear memory; start the result from all-zero
            // bytes (0.0f) so kernels that accumulate into it are well defined
            || cudaFailed( cudaMemset ( M_result_array_d , 0 , bytes ), "cudaMemset(M_result_array_d)" ) ;
   }

   if ( !failed )
   {
      //calling kernel
      dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;
      dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;
// Change if 0 to if 1 for running non shared code and make if 0 for shared memory code
#if 0
      MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
#endif

#if 1
      MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
#endif
      // a kernel launch returns nothing; launch errors are read back here
      failed = cudaFailed( cudaGetLastError(), "kernel launch" ) ;
   }

   if ( !failed )
   {
      // copy the product back to the host
      failed = cudaFailed( cudaMemcpy(M_result_array_h , M_result_array_d , bytes ,
                                      cudaMemcpyDeviceToHost), "cudaMemcpy(M_result_array_h)" ) ;
   }

   if ( !failed )
   {
      gettimeofday(&tv2, NULL);
      elapsed = (double) (tv2.tv_sec-tv1.tv_sec) + (double) (tv2.tv_usec-tv1.tv_usec) * 1.e-6;
      for ( i = 0 ; i<WIDTH ; i++ )
      {
         for ( j = 0 ; j < WIDTH ; j++ )
         {
            printf ("%f   ",M_result_array_h[i][j] ) ;
         }
         printf ("\n") ;
      }
      printf("Elapsed time = %f seconds", elapsed);
   }

   // Single cleanup path: cudaFree() accepts null pointers.
   cudaFree( array1_d ) ;
   cudaFree( array2_d ) ;
   cudaFree( M_result_array_d ) ;
   return failed ;
}


Writing mat.cu


In [None]:
!nvcc mat.cu -o mat

                       result_array_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH] ;
                       ^




In [None]:
!./mat

200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000   200.000000  