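Install the nvcc4jupyter Jupyter extension

Note: the `/content/nvcc4jupyter` path used below exists only on Google Colab. On a local machine (as in the captured output, which was produced on Windows) that path does not resolve and the install fails; installing straight from the repository URL, e.g. `pip install git+https://github.com/andreinechaev/nvcc4jupyter.git`, avoids the dependency on `/content`.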

In [1]:
!git clone https://github.com/andreinechaev/nvcc4jupyter
!pip install git+file:/content/nvcc4jupyter

Cloning into 'nvcc4jupyter'...


Collecting git+file:/content/nvcc4jupyter


  Running command git clone -q file:///content/nvcc4jupyter 'C:\Users\DELL\AppData\Local\Temp\pip-req-build-_dam7gva'
  fatal: 'C:/Program Files/Git/content/nvcc4jupyter' does not appear to be a git repository
  fatal: Could not read from remote repository.

  Please make sure you have the correct access rights
  and the repository exists.
ERROR: Command errored out with exit status 128: git clone -q file:///content/nvcc4jupyter 'C:\Users\DELL\AppData\Local\Temp\pip-req-build-_dam7gva' Check the logs for full command output.
You should consider upgrading via the 'C:\Users\DELL\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


  Cloning file:///content/nvcc4jupyter to c:\users\dell\appdata\local\temp\pip-req-build-_dam7gva


Load the plugin

In [2]:
%load_ext nvcc_plugin

ModuleNotFoundError: No module named 'nvcc_plugin'

In [None]:
%%cu 
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"


// Host-side reference implementation of element-wise addition.
// For every index k in [0, N) stores A[k] + B[k] into C[k], in ascending
// index order. Does nothing when N <= 0.
void cpuSum(int* A, int* B, int* C, int N){
    int idx = 0;
    while (idx < N){
        const int total = A[idx] + B[idx];
        C[idx] = total;
        ++idx;
    }
}

// Device kernel for element-wise addition: one thread handles at most one
// element. The element index is built from the block and thread coordinates
// along x only. Threads whose index is not below N exit without touching
// memory, so the launch may provide more threads than there are elements.
__global__ void kernel(int* A, int* B, int* C, int N){
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;
    if (gid >= N){
        return;  // surplus thread at the tail of the grid
    }
    C[gid] = A[gid] + B[gid];
}

// Launches `kernel` to compute C[i] = A[i] + B[i] for i in [0, N).
//
// A, B and C are passed straight through to the kernel, so they must be
// pointers the device can dereference. The launch is issued without an
// explicit stream and this function does not synchronize: the caller has to
// synchronize (or perform a blocking copy) before reading C.
//
// N <= 0 is a no-op. If the launch itself is rejected, the error is reported
// on stderr (reading it via cudaGetLastError() also clears it).
void gpuSum(int* A, int* B, int* C, int N){
    if (N <= 0){
        // A zero-sized block or grid is an invalid launch configuration, and
        // the block-count division below would divide by zero.
        return;
    }

    // Use at most 1024 threads per block; smaller inputs fit in one block.
    const int maxThreadsPerBlock = 1024;
    const int threadsPerBlock = (N < maxThreadsPerBlock) ? N : maxThreadsPerBlock;

    // Integer ceiling division, written so that N close to INT_MAX cannot
    // overflow the way (N + threadsPerBlock - 1) would.
    const int blocksPerGrid = N / threadsPerBlock + ((N % threadsPerBlock != 0) ? 1 : 0);

    kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // A kernel launch has no return value; configuration errors are only
    // visible through the runtime's last-error state.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess){
        fprintf(stderr, "gpuSum: kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
    }
}

// Compares the first N entries of A and B pairwise.
// Returns false as soon as a differing pair is found and true otherwise;
// with N <= 0 nothing is compared and the result is true.
bool isVectorEqual(int* A, int* B, int N){
    int pos = 0;
    // Advance while the entries agree; stops early at the first mismatch.
    while (pos < N && A[pos] == B[pos]){
        ++pos;
    }
    return pos >= N;
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two random integer vectors once on the CPU and once on the GPU,
// times both with clock(), checks that the results match and prints the
// timings, the speed-up and the verification outcome.
//
// The GPU timing covers the kernel launch plus the device synchronization;
// host<->device copies and device allocation are outside the timed region.
// Exits with a failure status if host allocation or any CUDA call fails.
int main(){
    const int N = 12000;
    int *A, *B, *C, *D, *d_A, *d_B, *d_C;
    const size_t size = (size_t)N * sizeof(int);

    // Host buffers: inputs A and B, CPU result C, GPU result D.
    A = (int*)malloc(size);
    B = (int*)malloc(size);
    C = (int*)malloc(size);
    D = (int*)malloc(size);
    if (A == NULL || B == NULL || C == NULL || D == NULL){
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(A);
        free(B);
        free(C);
        free(D);
        return EXIT_FAILURE;
    }

    for (int i=0; i<N; ++i){
        A[i] = rand() % 1000;
        B[i] = rand() % 1000;
    }


    // CPU
    clock_t tic, toc;

    tic = clock();
    cpuSum(A, B, C, N);
    toc = clock();

    float timeTakenCPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;


    // GPU
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy of B to device");

    tic = clock();
    gpuSum(d_A, d_B, d_C, N);
    // The launch is asynchronous: wait for the kernel before stopping the
    // clock, and surface any error raised while it was executing.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");
    toc = clock();

    float timeTakenGPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;

    checkCuda(cudaMemcpy(D, d_C, size, cudaMemcpyDeviceToHost), "copy of result to host");
    // free device memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Verify result
    bool success = isVectorEqual(C, D, N);

    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    if (timeTakenGPU > 0.0f){
        printf("Speed Up: %f \n", timeTakenCPU/timeTakenGPU);
    } else {
        // clock() can report zero elapsed time for such a short run; a ratio
        // would then be inf or NaN.
        printf("Speed Up: n/a (GPU time below timer resolution) \n");
    }
    printf("Verification: %s \n", success ? "true" : "false");

    // free host memory
    free(A);
    free(B);
    free(C);
    free(D);

    return 0;
}