In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-i_86q09z
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-i_86q09z
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4305 sha256=ddb2114e837ec79c1cedeac39d4e1f4f60f2fc48cee647f7f54166c8d72791a8
  Stored in directory: /tmp/pip-ephem-wheel-cache-wbgebwo8/wheels/db/c1/1f/a2bb07bbb4a1ce3c43921252aeafaa6205f08637e292496f04
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin

In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu 
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#include "cuda_runtime.h"

// Define matrix size
#define N 3

/*
 * Kernel: c = a * b for N x N matrices stored row-major in flat float arrays.
 *
 * Launch layout: a 2D grid of 2D blocks. Each thread produces one element of
 * c; the y dimension selects the row and the x dimension selects the column.
 * The grid may overshoot the matrix: threads whose (row, col) fall outside
 * the N x N range return without touching memory.
 *
 * a, b : input matrices (read only by this kernel)
 * c    : output matrix; element (row, col) is written at c[row * N + col]
 */
__global__ void matrix_multiply(float *a, float *b, float *c) {
    // Global 2D coordinates of this thread.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard: a grid that does not divide N evenly spawns threads past the
    // matrix edge; without this check they would read/write out of bounds.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    float sum = 0.0f;
    for (int i = 0; i < N; i++) {
        sum += a[row * N + i] * b[i * N + col];
    }
    c[row * N + col] = sum;
}

/*
 * Host-side reference multiply: stores a * b into c, treating all three
 * buffers as N x N row-major float matrices, and then prints c to stdout
 * followed by a separator line.
 */
void matrixMultiplication(float *a ,float *b ,float *c) {
    int r, col, k;

    for (r = 0; r < N; ++r) {
        // Walk one output row at a time through row pointers.
        const float *lhsRow = a + r * N;
        float *outRow = c + r * N;

        for (col = 0; col < N; ++col) {
            float acc = 0;
            for (k = 0; k < N; ++k) {
                acc += lhsRow[k] * b[k * N + col];
            }
            outRow[col] = acc;
        }
    }

    // Dump the product, one matrix row per output line.
    printf("\nMatrix result using normal function : \n");
    for (int idx = 0; idx < N * N; ++idx) {
        printf("%f ", c[idx]);
        if (idx % N == N - 1) {
            printf("\n");
        }
    }
    printf("\n-----------------------------------------------------------------------\n");
}



/* Abort with a readable message if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Print an N x N row-major matrix, one row per line. */
static void printMatrix(const float *m) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%f ", m[i * N + j]);
        }
        printf("\n");
    }
}

/*
 * Multiplies two random N x N matrices on the GPU and on the CPU, prints the
 * inputs and both results, and reports the time taken by each path.
 *
 * Returns 0 on success, 1 if host allocation fails; exits with EXIT_FAILURE
 * if any CUDA call fails.
 */
int main() {
    float *a, *b, *c, *d;          // Host matrices: inputs a, b; GPU result c; CPU result d
    float *dev_a, *dev_b, *dev_c;  // Device copies of a, b and the device result
    size_t size = (size_t)N * N * sizeof(float);

    // The launch below uses (N / blockEdge) blocks of blockEdge threads per
    // axis, so it covers the whole matrix only when N is a positive multiple
    // of blockEdge. Fail at compile time instead of computing a partial result.
    const int blockEdge = 3;
    static_assert(N > 0 && N % blockEdge == 0,
                  "N must be a positive multiple of the block edge (3)");

    // Allocate memory for matrices in host memory
    a = (float *)malloc(size);
    b = (float *)malloc(size);
    c = (float *)malloc(size);
    d = (float *)malloc(size);
    if (a == NULL || b == NULL || c == NULL || d == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(a);
        free(b);
        free(c);
        free(d);
        return 1;
    }

    // Initialize matrices with random values in [0, 19]
    for (int i = 0; i < N * N; i++) {
        a[i] = (float)(rand() % 20);
        b[i] = (float)(rand() % 20);
    }

    // Allocate memory for matrices in device memory
    checkCuda(cudaMalloc((void **)&dev_a, size), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void **)&dev_b, size), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void **)&dev_c, size), "cudaMalloc(dev_c)");

    // Copy matrices from host memory to device memory
    checkCuda(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "copy b to device");

    // Define the grid and block dimensions
    dim3 dimGrid(N / blockEdge, N / blockEdge);
    dim3 dimBlock(blockEdge, blockEdge);

    // Time the kernel with CUDA events. A kernel launch returns to the host
    // before the kernel has run, so host timestamps taken around the launch
    // alone would measure only the launch overhead.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
    matrix_multiply<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c);
    checkCuda(cudaGetLastError(), "matrix_multiply launch");
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "matrix_multiply execution");

    float gpuMillis = 0.0f;
    checkCuda(cudaEventElapsedTime(&gpuMillis, start, stop), "cudaEventElapsedTime");
    float timeTakenGPU = gpuMillis / 1000.0f;  // events report milliseconds

    // Copy the result matrix from device memory to host memory
    checkCuda(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "copy c to host");

    // Print the A matrix
    printf("Matrix A : \n");
    printMatrix(a);
    printf("\n---------------------------------------------------------------------------------\n");

    // Print the B matrix
    printf("Matrix B : \n");
    printMatrix(b);
    printf("\n---------------------------------------------------------------------------------\n");

    // CPU reference run.
    // NOTE: matrixMultiplication also prints its result, so this interval
    // includes console output as well as the arithmetic.
    clock_t tic, toc;
    tic = clock();
    matrixMultiplication(a, b, d);
    toc = clock();

    float timeTakenCPU = (float)(toc - tic) / CLOCKS_PER_SEC;

    // Print the result matrix computed on the GPU
    printf("\nMatrix Result using cuda : \n");
    printMatrix(c);

    // Free memory
    free(a);
    free(b);
    free(c);
    free(d);
    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    printf("\n ");
    printf("CPU Time: %f \n", timeTakenCPU);
    printf("GPU Time: %f \n", timeTakenGPU);
    if (timeTakenGPU > 0.0f) {
        printf("Speed Up: %f \n", timeTakenCPU / timeTakenGPU);
    } else {
        // Avoid dividing by zero when the kernel time is below timer resolution.
        printf("Speed Up: n/a (GPU time below timer resolution) \n");
    }

    return 0;
}

Matrix A : 
3.000000 17.000000 13.000000 
6.000000 9.000000 2.000000 
10.000000 3.000000 0.000000 

---------------------------------------------------------------------------------
Matrix B : 
6.000000 15.000000 15.000000 
12.000000 1.000000 7.000000 
19.000000 6.000000 6.000000 

---------------------------------------------------------------------------------

Matrix result using normal function : 
469.000000 140.000000 242.000000 
182.000000 111.000000 165.000000 
96.000000 153.000000 171.000000 

-----------------------------------------------------------------------469.000000 140.000000 242.000000 
182.000000 111.000000 165.000000 
96.000000 153.000000 171.000000 

 CPU Time: 0.000009 
GPU Time: 0.000024 
Speed Up: 0.375000 

