<a href="https://colab.research.google.com/github/sejalxz/react/blob/main/multiplication_of_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3xyease8
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-3xyease8
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=27a4dc5e964dcb01c17fb133c9b644f172ea219f57596e0efd5bcf0ecb8ee76a
  Stored in directory: /tmp/pip-ephem-wheel-cache-xu_5j332/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [11]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#define N 3

// Computes c = a * b for N x N integer matrices stored row-major in
// device memory. Each thread produces one element of c.
//
// Launch layout: a 2D grid/block arrangement in which the x dimension
// indexes columns and the y dimension indexes rows. The launch must supply
// at least N threads along each axis; surplus threads exit immediately.
//
//   a, b : input matrices, N * N ints each
//   c    : output matrix, N * N ints
__global__ void matrixMul(int *a, int *b, int *c)
{
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard against launches that provide more threads than matrix elements;
  // without it such threads would read and write past the end of the buffers.
  if (row >= N || col >= N)
  {
    return;
  }

  int sum = 0;
  for (int i = 0; i < N; ++i)
  {
    sum += a[row * N + i] * b[i * N + col];
  }

  c[row * N + col] = sum;
}

// Host reference implementation: writes the product of the N x N row-major
// matrices a and b into c using the straightforward triple loop.
void matrixMulCPU(int *a, int *b, int *c)
{
  for (int r = 0; r < N; ++r)
  {
    // Start of row r in the left operand and in the output.
    int *lhsRow = a + r * N;
    int *outRow = c + r * N;

    for (int k = 0; k < N; ++k)
    {
      // Dot product of row r of a with column k of b.
      int acc = 0;
      for (int t = 0; t < N; ++t)
      {
        acc += lhsRow[t] * b[t * N + k];
      }
      outRow[k] = acc;
    }
  }
}

// Returns the current time reported by gettimeofday() as seconds, with the
// microsecond field folded in as the fractional part.
double getSeconds()
{
  struct timeval now;
  gettimeofday(&now, NULL);

  const double wholeSeconds = (double)now.tv_sec;
  const double fraction = (double)now.tv_usec * 1e-6;
  return wholeSeconds + fraction;
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime
// call reports anything other than cudaSuccess.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Prints an N x N row-major matrix, one row per line, each value followed
// by a single space.
static void printMatrix(const int *m)
{
  for (int i = 0; i < N; ++i)
  {
    for (int j = 0; j < N; ++j)
    {
      printf("%d ", m[i * N + j]);
    }
    printf("\n");
  }
}

// Multiplies two fixed 3x3 matrices on the GPU and on the CPU, prints the
// inputs and both results, and reports the two wall-clock times and their
// ratio. Returns 0 on success and EXIT_FAILURE if the GPU and CPU results
// differ; any CUDA runtime failure aborts via checkCuda().
int main()
{
  int a[N][N] = { {1, 2, 3}, {4, 5, 6}, {7, 8, 9} };
  int b[N][N] = { {9, 8, 7}, {6, 5, 4}, {3, 2, 1} };
  int c_CPU[N][N] = { 0 };
  int c[N][N] = { 0 };

  const size_t bytes = N * N * sizeof(int);
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

  // Allocate memory on the device
  checkCuda(cudaMalloc((void **)&dev_a, bytes), "cudaMalloc(dev_a)");
  checkCuda(cudaMalloc((void **)&dev_b, bytes), "cudaMalloc(dev_b)");
  checkCuda(cudaMalloc((void **)&dev_c, bytes), "cudaMalloc(dev_c)");

  // Copy matrices a and b from host to device
  checkCuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy of a to device");
  checkCuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy of b to device");

  // One block of N x N threads, one thread per output element. This only
  // works while N * N stays within the device's per-block thread limit.
  dim3 threadsPerBlock(N, N);
  dim3 blocksPerGrid(1, 1);

  // Measure GPU execution time. The timed region covers the kernel launch
  // and the copy of the result back to the host.
  double startGPU = getSeconds();

  matrixMul<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c);
  // A launch does not return a status itself; invalid configurations are
  // reported through cudaGetLastError().
  checkCuda(cudaGetLastError(), "matrixMul launch");

  // Blocking copy: waits for the kernel and surfaces any execution error.
  checkCuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy of c to host");

  double endGPU = getSeconds();
  double timeGPU = endGPU - startGPU;

  printf("\n Matrix 1:\n");
  printMatrix((const int *)a);

  // The original literal used the invalid escape "\M"; the text below is
  // what that literal actually printed.
  printf("Matrix 2:\n");
  printMatrix((const int *)b);

  printf("GPU Result Matrix:\n");
  printMatrix((const int *)c);

  // Measure CPU execution time
  double startCPU = getSeconds();
  matrixMulCPU((int *)a, (int *)b, (int *)c_CPU);
  double endCPU = getSeconds();
  double timeCPU = endCPU - startCPU;

  printf("\nCPU Result Matrix:\n");
  printMatrix((const int *)c_CPU);

  // Cross-check the GPU output against the CPU reference.
  int mismatches = 0;
  for (int i = 0; i < N; ++i)
  {
    for (int j = 0; j < N; ++j)
    {
      if (c[i][j] != c_CPU[i][j])
      {
        ++mismatches;
      }
    }
  }
  if (mismatches != 0)
  {
    fprintf(stderr, "GPU and CPU results differ in %d element(s)\n", mismatches);
  }

  // Print execution times and speedup
  printf("\nExecution Time (GPU): %.6f seconds\n", timeGPU);
  printf("Execution Time (CPU): %.6f seconds\n", timeCPU);
  if (timeGPU > 0.0)
  {
    printf("Speedup: %.2f\n", timeCPU / timeGPU);
  }
  else
  {
    // The timer has microsecond resolution, so the interval can round to
    // zero; avoid dividing by it.
    printf("Speedup: n/a\n");
  }

  // Free device memory
  checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
  checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
  checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");

  return mismatches == 0 ? 0 : EXIT_FAILURE;
}




 Matrix 1:
1 2 3 
4 5 6 
7 8 9 
Matrix 2:
9 8 7 
6 5 4 
3 2 1 
GPU Result Matrix:
30 24 18 
84 69 54 
138 114 90 

CPU Result Matrix:
30 24 18 
84 69 54 
138 114 90 

Execution Time (GPU): 0.000033 seconds
Execution Time (CPU): 0.000001 seconds
Speedup: 0.03

