Install the nvcc4jupyter Jupyter extension

In [1]:
!git clone https://github.com/andreinechaev/nvcc4jupyter
!pip install git+file:/content/nvcc4jupyter

Cloning into 'nvcc4jupyter'...
remote: Enumerating objects: 48, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 48 (delta 0), reused 0 (delta 0), pack-reused 45[K
Unpacking objects: 100% (48/48), 8.29 KiB | 565.00 KiB/s, done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+file:/content/nvcc4jupyter
  Cloning file:///content/nvcc4jupyter to /tmp/pip-req-build-5jnn8o8g
  Running command git clone --filter=blob:none --quiet file:///content/nvcc4jupyter /tmp/pip-req-build-5jnn8o8g
  Resolved file:///content/nvcc4jupyter to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=0e39422b84462e3a7276ec79b1

Load the plugin

In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [3]:
%%cu 
#include <math.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"


// Host-side element-wise vector addition: stores A[k] + B[k] into C[k] for
// every k in [0, N). Nothing is written when N <= 0.
void cpuSum(int* A, int* B, int* C, int N){
    int idx = 0;
    while (idx < N){
        const int total = A[idx] + B[idx];
        C[idx] = total;
        ++idx;
    }
}

// CUDA kernel for element-wise vector addition. Each thread derives a global
// index from its block and thread indices and handles at most one element;
// threads whose index is >= N return without reading or writing the arrays.
__global__ void kernel(int* A, int* B, int* C, int N){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N){
        return;
    }
    C[gid] = A[gid] + B[gid];
}

// Launches `kernel` to compute C[i] = A[i] + B[i] for i in [0, N).
// A, B and C are forwarded unchanged to the kernel, so they must be pointers
// the device can dereference. The function does not synchronize after the
// launch; callers that need the result must wait for it themselves.
// A failed launch is reported on stderr.
void gpuSum(int* A, int* B, int* C, int N){
    // An empty vector needs no work. Returning early also avoids a block of
    // zero threads (an invalid launch configuration) and a division by zero
    // in the block-count computation below.
    if (N <= 0){
        return;
    }

    const int threadsPerBlock = (N < 1024) ? N : 1024;
    // Integer ceiling division, written so that it cannot overflow for large N.
    const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;

    kernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // The launch expression yields no status; query it explicitly.
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess){
        std::cerr << "gpuSum: kernel launch failed: "
                  << cudaGetErrorString(launchStatus) << std::endl;
    }
}


// Returns true when the first N entries of A and B match pairwise; stops at
// the first differing pair and returns false. Returns true for N <= 0.
bool isVectorEqual(int* A, int* B, int N){
    int pos = 0;
    while (pos < N && A[pos] == B[pos]){
        ++pos;
    }
    // pos falls short of N only when a mismatch ended the scan early.
    return pos >= N;
}
// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char* what){
    if (status == cudaSuccess){
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two random integer vectors on the CPU and on the GPU, compares the two
// results, and prints both timings plus the verification outcome.
// Returns 0 on success and 1 when an allocation or a CUDA call fails.
int main(){
    const int N = 12000;
    const size_t size = (size_t)N * sizeof(int);

    // Host buffers: A and B are inputs, C is the CPU result, D the GPU result.
    int *A = (int*)malloc(size);
    int *B = (int*)malloc(size);
    int *C = (int*)malloc(size);
    int *D = (int*)malloc(size);
    if (A == NULL || B == NULL || C == NULL || D == NULL){
        std::cerr << "host allocation failed" << std::endl;
        free(A);
        free(B);
        free(C);
        free(D);
        return 1;
    }

    for (int i=0; i<N; ++i){
        A[i] = rand() % 1000;
        B[i] = rand() % 1000;
    }


    // CPU
    clock_t tic, toc;

    tic = clock();
    cpuSum(A, B, C, N);
    toc = clock();

    float timeTakenCPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;


    // GPU
    // Device pointers start out NULL so the cleanup below is safe even when
    // an allocation fails part-way through.
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    bool ok = cudaSucceeded(cudaMalloc(&d_A, size), "cudaMalloc(d_A)")
           && cudaSucceeded(cudaMalloc(&d_B, size), "cudaMalloc(d_B)")
           && cudaSucceeded(cudaMalloc(&d_C, size), "cudaMalloc(d_C)")
           && cudaSucceeded(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice),
                            "cudaMemcpy(A -> d_A)")
           && cudaSucceeded(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice),
                            "cudaMemcpy(B -> d_B)");

    float timeTakenGPU = 0.0f;
    if (ok){
        tic = clock();
        gpuSum(d_A, d_B, d_C, N);
        const bool launched = cudaSucceeded(cudaGetLastError(), "kernel launch");
        // Wait for the kernel so the measured interval covers its execution
        // and so that execution errors are surfaced here.
        const bool finished = cudaSucceeded(cudaDeviceSynchronize(),
                                            "cudaDeviceSynchronize");
        toc = clock();

        timeTakenGPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;

        ok = launched && finished
          && cudaSucceeded(cudaMemcpy(D, d_C, size, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(d_C -> D)");
    }

    // free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    if (!ok){
        free(A);
        free(B);
        free(C);
        free(D);
        return 1;
    }

    // Verify result
    bool success = isVectorEqual(C, D, N);

    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    printf("Speed Up: %f \n", timeTakenCPU/timeTakenGPU);
    printf("Verification: %s \n", success ? "true" : "false");

    // free host memory
    free(A);
    free(B);
    free(C);
    free(D);

    return 0;
}

CPU Time: 0.000054 
GPU Time: 0.000035 
Speed Up: 1.542857 
Verification: true 



In [48]:
%%cu 
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"

// Define matrix size
#define N 16

// CUDA kernel: each thread computes one element of c = a * b, where a, b and
// c are N x N matrices stored row-major (element (r, k) is at index r * N + k).
// Expects a 2D launch in which x selects the column and y selects the row.
// Threads that fall outside the N x N matrix return without accessing the
// arrays, so the grid may cover more than N threads per dimension.
__global__ void matrix_multiply(float *a, float *b, float *c) {
    // Calculate thread index
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against out-of-bounds accesses when the grid over-covers the matrix.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += a[row * N + i] * b[i * N + col];
    }
    c[row * N + col] = sum;
}

// Host reference implementation: fills c with the product of the N x N
// row-major matrices a and b, then prints c to stdout (one row per line)
// followed by a dashed separator.
void matrixMultiplication(float *a ,float *b ,float *c) {
    // Pass 1: every output element is the dot product of a row of a with a
    // column of b.
    for (int r = 0; r < N; ++r) {
        const float *rowA = a + r * N;
        float *rowC = c + r * N;
        for (int col = 0; col < N; ++col) {
            float acc = 0;
            for (int k = 0; k < N; ++k) {
                acc += rowA[k] * b[k * N + col];
            }
            rowC[col] = acc;
        }
    }

    // Pass 2: print the finished matrix, breaking the line after each row.
    printf("\nMatrix result using normal function : \n");
    for (int idx = 0; idx < N * N; ++idx) {
        printf("%f ", c[idx]);
        if (idx % N == N - 1) {
            printf("\n");
        }
    }
    printf("\n-----------------------------------------------------------------------");
}



// The launch below uses N / 16 blocks of 16 x 16 threads per side, which only
// covers the whole matrix when N is a multiple of 16.
#if (N % 16) != 0
#error "N must be a multiple of 16: the grid is N / 16 blocks of 16 x 16 threads per side"
#endif

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Prints an N x N row-major matrix to stdout, one row per line.
static void printSquareMatrix(const float *m) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", m[i * N + j]);
        }
        printf("\n");
    }
}

// Multiplies two random N x N matrices on the GPU and on the CPU, prints the
// inputs and both results, and reports the two timings.
// Returns 0 on success and 1 when an allocation or a CUDA call fails.
int main() {
    float *a, *b, *c,*d;  // Pointers to matrices in host memory
    // Pointers to matrices in device memory; NULL so cleanup is always safe
    float *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t size = (size_t)N * N * sizeof(float);

    // Allocate memory for matrices in host memory
    a = (float *)malloc(size);
    b = (float *)malloc(size);
    c = (float *)malloc(size);
    d = (float *)malloc(size);
    if (a == NULL || b == NULL || c == NULL || d == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(a);
        free(b);
        free(c);
        free(d);
        return 1;
    }

    // Initialize matrices with random values
    for (int i = 0; i < N * N; i++) {
        a[i] = rand() % 100;
        b[i] = rand() % 100;
    }

    // Allocate device memory and copy the inputs over; stop at the first failure
    bool ok = cudaCallOk(cudaMalloc((void **)&dev_a, size), "cudaMalloc(dev_a)")
           && cudaCallOk(cudaMalloc((void **)&dev_b, size), "cudaMalloc(dev_b)")
           && cudaCallOk(cudaMalloc((void **)&dev_c, size), "cudaMalloc(dev_c)")
           && cudaCallOk(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice),
                         "cudaMemcpy(a -> dev_a)")
           && cudaCallOk(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice),
                         "cudaMemcpy(b -> dev_b)");

    // Define the grid and block dimensions
    dim3 dimGrid(N / 16, N / 16);
    dim3 dimBlock(16, 16);

    clock_t tic, toc;
    float timeTakenGPU = 0.0f;

    if (ok) {
        // Call the kernel function
        tic = clock();
        matrix_multiply<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
        const bool launched = cudaCallOk(cudaGetLastError(), "matrix_multiply launch");
        // Wait for the kernel before stopping the clock so the interval
        // covers its execution rather than only the launch call.
        const bool finished = cudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
        toc = clock();

        timeTakenGPU = ((float)(toc - tic)) / CLOCKS_PER_SEC;

        // Copy the result matrix from device memory to host memory
        ok = launched && finished
          && cudaCallOk(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost),
                        "cudaMemcpy(dev_c -> c)");
    }

    if (!ok) {
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        free(a);
        free(b);
        free(c);
        free(d);
        return 1;
    }

    // Print the A matrix
    printf("Matrix A : \n");
    printSquareMatrix(a);
    printf("\n---------------------------------------------------------------------------------\n");

    // Print the B matrix
    printf("Matrix B : \n");
    printSquareMatrix(b);
    printf("\n---------------------------------------------------------------------------------");

    // CPU reference.
    // NOTE(review): if matrixMultiplication also prints its result, this
    // interval includes console output as well as the arithmetic - confirm
    // before comparing it against the GPU time.
    tic = clock();
    matrixMultiplication(a,b,d);
    toc = clock();

    float timeTakenCPU =(float) ((toc - tic)) / CLOCKS_PER_SEC;

    // Print the result matrix parallel
    printf("\nMatrix Result using cuda : \n");
    printSquareMatrix(c);
    printf("----------------------------------------------------------------------------------\n");

    // Free memory
    free(a);
    free(b);
    free(c);
    free(d);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    printf("\n ");
    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    printf("Speed Up: %f \n", timeTakenCPU/timeTakenGPU);

    return 0;
}


Matrix A : 
83.000000 77.000000 93.000000 86.000000 49.000000 62.000000 90.000000 63.000000 40.000000 72.000000 11.000000 67.000000 82.000000 62.000000 67.000000 29.000000 
22.000000 69.000000 93.000000 11.000000 29.000000 21.000000 84.000000 98.000000 15.000000 13.000000 91.000000 56.000000 62.000000 96.000000 5.000000 84.000000 
36.000000 46.000000 13.000000 24.000000 82.000000 14.000000 34.000000 43.000000 87.000000 76.000000 88.000000 3.000000 54.000000 32.000000 76.000000 39.000000 
26.000000 94.000000 95.000000 34.000000 67.000000 97.000000 17.000000 52.000000 1.000000 86.000000 65.000000 44.000000 40.000000 31.000000 97.000000 81.000000 
9.000000 67.000000 97.000000 86.000000 6.000000 19.000000 28.000000 32.000000 3.000000 70.000000 8.000000 40.000000 96.000000 18.000000 46.000000 21.000000 
79.000000 64.000000 41.000000 93.000000 34.000000 24.000000 87.000000 43.000000 27.000000 59.000000 32.000000 37.000000 75.000000 74.000000 58.000000 29.000000 
35.000000 18.000000 43.000000

In [1]:
#link: https://colab.research.google.com/drive/1h3uw8_SySdtCXauBJd216cakMVyxjy2T#scrollTo=GTpypW0VeiV6