In [None]:
!git clone https://github.com/andreinechaev/nvcc4jupyter
!pip install git+file:/content/nvcc4jupyter

Cloning into 'nvcc4jupyter'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 48 (delta 0), reused 0 (delta 0), pack-reused 45[K
Unpacking objects: 100% (48/48), 8.29 KiB | 606.00 KiB/s, done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+file:/content/nvcc4jupyter
  Cloning file:///content/nvcc4jupyter to /tmp/pip-req-build-c913u9n5
  Running command git clone --filter=blob:none --quiet file:///content/nvcc4jupyter /tmp/pip-req-build-c913u9n5
  Resolved file:///content/nvcc4jupyter to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=ffa33c84e0918069d666122b94

In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu 
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"


// Host reference implementation of the vector addition.
// Writes A[i] + B[i] into C[i] for every i in [0, N); nothing is written
// when N <= 0. All three pointers are dereferenced on the host.
void cpuSum(int* A, int* B, int* C, int N){
    int idx = 0;
    while (idx < N){
        const int lhs = A[idx];
        const int rhs = B[idx];
        C[idx] = lhs + rhs;
        ++idx;
    }
}

// Element-wise vector addition kernel for a 1D launch.
// Each thread derives one flat index from its block and thread coordinates
// and writes C[index] = A[index] + B[index]. Threads whose index is at or
// beyond N return without touching memory, so the grid may cover more than
// N threads.
__global__ void kernel(int* A, int* B, int* C, int N){
    const int gid = threadIdx.x + blockIdx.x * blockDim.x;
    if (gid >= N){
        return;
    }
    C[gid] = A[gid] + B[gid];
}

// Launches `kernel` to compute C[i] = A[i] + B[i] for i in [0, N).
//
// A, B and C are handed to the kernel unchanged, so they must be pointers
// the device can dereference. The launch is asynchronous and this function
// does not synchronize: the caller must synchronize (or issue a blocking
// copy) before reading C.
//
// N <= 0 is a no-op. A failed launch is reported on stderr; the error is
// only peeked at, not cleared, so the caller can still observe it through
// cudaGetLastError().
void gpuSum(int* A, int* B, int* C, int N){
    // A zero-sized block or grid is an invalid launch configuration, and the
    // block-count division below would divide by zero.
    if (N <= 0){
        return;
    }

    // One thread per element, at most 1024 threads per block.
    const int threadsPerBlock = N < 1024 ? N : 1024;
    // Integer ceil-div, written as (N - 1) / t + 1 so the numerator cannot
    // overflow for N close to INT_MAX.
    const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;

    kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // Kernel launches return no status; configuration errors surface here.
    const cudaError_t launchStatus = cudaPeekAtLastError();
    if (launchStatus != cudaSuccess){
        fprintf(stderr, "gpuSum: kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
    }
}

// Compares the first N elements of two host arrays.
// Returns false as soon as a differing pair is found and true when every
// pair matches (which includes N <= 0, where nothing is compared).
bool isVectorEqual(int* A, int* B, int N){
    int pos = 0;
    while (pos < N && A[pos] == B[pos]){
        ++pos;
    }
    // The scan only reaches N when no mismatch stopped it early.
    return pos >= N;
}
// Terminates the program with a message on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the failed operation.
static void checkCuda(cudaError_t status, const char* what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `bytes` of host memory as an int buffer, terminating the program
// with a message on stderr if the allocation fails. `label` names the buffer.
static int* allocHostBuffer(size_t bytes, const char* label){
    int* buffer = (int*)malloc(bytes);
    if (buffer == NULL){
        fprintf(stderr, "Host allocation of %zu bytes failed for %s\n", bytes, label);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

// Benchmark driver: fills two vectors with pseudo-random values, adds them on
// the CPU and on the GPU, times both, and verifies that the results agree.
//
// The GPU interval covers the kernel launch, the synchronization and the
// device-to-host copy of the result; the host-to-device copies of the inputs
// are outside the timed region. Both intervals are measured with clock().
int main(){
    const int N = 200000000;
    // Byte count kept in size_t so a larger N cannot overflow a signed int.
    const size_t size = (size_t)N * sizeof(int);

    // A and B are inputs, C receives the CPU result, D the GPU result.
    int* A = allocHostBuffer(size, "A");
    int* B = allocHostBuffer(size, "B");
    int* C = allocHostBuffer(size, "C");
    int* D = allocHostBuffer(size, "D");
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;

    for (int i=0; i<N; ++i){
        A[i] = rand() % 1000;
        B[i] = rand() % 1000;
    }


    // CPU
    clock_t start, end;

    start = clock();
    cpuSum(A, B, C, N);
    end = clock();

    float timeTakenCPU = ((float)(end - start)) / CLOCKS_PER_SEC;


    // GPU
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice), "copy of B to device");

    start = clock();
    gpuSum(d_A, d_B, d_C, N);
    // A bad launch configuration is only visible through the last-error state.
    checkCuda(cudaGetLastError(), "kernel launch");
    // Errors raised while the kernel executes are reported by the sync.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");
    checkCuda(cudaMemcpy(D, d_C, size, cudaMemcpyDeviceToHost), "copy of result to host");

    end = clock();
    float timeTakenGPU = ((float)(end - start)) / CLOCKS_PER_SEC;

    // free device memory
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    // Verify result
    bool success = isVectorEqual(C, D, N);

    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    printf("Speed Up: %f \n", timeTakenCPU/timeTakenGPU);
    printf("Verification: %s \n", success ? "true" : "false");

    // free host memory
    free(A);
    free(B);
    free(C);
    free(D);

    return 0;
}

CPU Time: 0.981076 
GPU Time: 0.555874 
Speed Up: 1.764925 
Verification: true 

