In [1]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin 

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-n3hd2ur9
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-n3hd2ur9
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=40acb5fcf40f96722ee19cb9c8dbdcb52330d76eb01d6e4e22e50854e5382eb4
  Stored in directory: /tmp/pip-ephem-wheel-cache-pwcrkgon/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


In [3]:
%%cu
#include<stdio.h>
#include<stdlib.h>
#define Mask_Width 5
 
// 1D convolution reading everything straight from the pointers passed in.
//
// Launch layout: one-dimensional grid of one-dimensional blocks, one thread per
// output element. The kernel has no striding loop, so the launch must supply at
// least src_length threads in total; surplus threads exit immediately.
//
//   src        - input samples, src_length elements
//   res        - output, src_length elements; res[k] receives the weighted sum
//                of the Mask_Width-wide window centred on src[k]
//   d_mask     - Mask_Width weights, applied in storage order (not flipped)
//   src_length - number of elements in src and res
//
// Window positions that fall outside [0, src_length) contribute nothing, which
// is equivalent to zero-padding the input at both ends.
__global__ void Convolution_global(int *src, int *res, int *d_mask, int src_length){
    const int out_idx = threadIdx.x + blockDim.x * blockIdx.x;
    if(out_idx >= src_length){
        return;   // thread lies past the end of the data
    }

    // Leftmost source index covered by the mask when centred on out_idx.
    const int window_begin = out_idx - (Mask_Width / 2);

    int acc = 0;
    for(int tap = 0; tap < Mask_Width; ++tap){
        const int src_idx = window_begin + tap;
        // Skip taps that hang off either end of the input.
        if(src_idx < 0 || src_idx >= src_length){
            continue;
        }
        acc += src[src_idx] * d_mask[tap];
    }

    res[out_idx] = acc;
}
 
// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the source location together with the runtime's error string and terminates.
// Without this a failed allocation, copy or launch would go unnoticed and the
// program would print whatever happened to be in the output buffer.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_status_ = (call);                                    \
        if (cuda_status_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_status_));                        \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host driver: uploads a small fixed input and mask, runs Convolution_global
// once while timing it with CUDA events, copies the result back and prints the
// input, the mask and the result. Returns 0 on success; any CUDA failure
// terminates the process with EXIT_FAILURE via CUDA_CHECK.
int main(){
    //Initializing the input array and the mask array
    int input [] = {8, 9, 3, 4, 5, 6, 11, 67};
    int mask[Mask_Width] = {7, 8, 9, 10, 11};
    // Derive the element count from the initializer so the two cannot drift
    // apart; being a compile-time constant it also makes h_output an ordinary
    // fixed-size array rather than a non-standard variable-length array.
    const int n = sizeof(input) / sizeof(input[0]);
    const size_t size_input = sizeof(int) * n;
    const size_t size_mask = sizeof(int) * Mask_Width;
    int h_output[n];

    //Allocating space in the device
    int *d_input, *d_output, *d_mask_s;
    CUDA_CHECK(cudaMalloc((void **)&d_input, size_input));
    CUDA_CHECK(cudaMalloc((void **)&d_output, size_input));
    CUDA_CHECK(cudaMalloc((void **)&d_mask_s, size_mask));

    //Copying to the device memory
    CUDA_CHECK(cudaMemcpy(d_input, input, size_input, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_mask_s, mask, size_mask, cudaMemcpyHostToDevice));

    //Creating event to calculate the time elapsed
    float et;
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    //Calling the kernel along with time calculation
    int threads = 4;
    int blocks = (n + threads - 1) / threads;   // ceiling division: cover all n elements
    CUDA_CHECK(cudaEventRecord(start));
    Convolution_global<<<blocks, threads>>>(d_input, d_output, d_mask_s, n);
    // A launch does not return a status itself; query it explicitly.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Wait for the device to finish so the stop event has completed before its
    // timestamp is read; this also reports errors raised while the kernel ran.
    CUDA_CHECK(cudaDeviceSynchronize());
    //Calculating the elapsed time of the kernel
    CUDA_CHECK(cudaEventElapsedTime(&et, start, stop));
    printf("\nThe time taken by global memory kernel to execute is: %f milliseconds\n", et);

    //Copying the kernel result from device memory back to the host
    CUDA_CHECK(cudaMemcpy(h_output, d_output, size_input, cudaMemcpyDeviceToHost));

    //printing the result
    printf("\nPrinting the input array:\n");
    for(int i = 0; i < n; i++){
        printf("%d\t", input[i]);
    }
    printf("\n");
    printf("\nPrinting the mask array:\n");
    for(int i = 0; i < Mask_Width; i++){
        printf("%d\t", mask[i]);
    }
    printf("\n");
    printf("\nPrinting the resultant array:\n");
    for(int i = 0; i < n; i++){
        printf("%d\t", h_output[i]);
    }

    //Freeing the cuda resources (the timing events as well as the buffers)
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_input));
    CUDA_CHECK(cudaFree(d_output));
    CUDA_CHECK(cudaFree(d_mask_s));

    return 0;
}
 



The time taken by global memory kernel to execute is: 0.018752 milliseconds

Printing the input array:
8	9	3	4	5	6	11	67	

Printing the mask array:
7	8	9	10	11	

Printing the resultant array:
195	219	250	239	279	969	852	733	
