<a href="https://colab.research.google.com/github/prayaspatnaik21/HPC/blob/main/Matrix_Multiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get remove libopencv-dev
!apt-get update
!apt-get install libopencv-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following packages were automatically installed and are no longer required:
  adwaita-icon-theme dbus-user-session dconf-gsettings-backend dconf-service gtk-update-icon-cache
  hicolor-icon-theme humanity-icon-theme libatk-bridge2.0-0 libatk1.0-0 libatk1.0-data
  libatspi2.0-0 libavcodec-dev libavformat-dev libavutil-dev libcharls2 libcolord2 libdc1394-dev
  libdconf1 libdouble-conversion3 libepoxy0 libexif-dev libexif12 libgdcm-dev libgdcm3.0
  libgl2ps1.4 libglew2.2 libgphoto2-6 libgphoto2-dev libgphoto2-port12
  libgstreamer-plugins-base1.0-0 libgtk-3-0 libgtk-3-common libilmbase-dev libilmbase25 liblept5
  libopencv-calib3d-dev libopencv-calib3d4.5d libopencv-contrib-dev libopencv-contrib4.5d
  libopencv-core-dev libopencv-core4.5d libopencv-dnn-dev libopencv-dnn4.5d
  libopencv-features2d-dev libopencv-features2d4.5d libopencv-flann-dev libopencv-flann4.5d
  libopencv-highgui-dev l

In [15]:
%%writefile matrixMultiplication.cu
#include <cuda_runtime.h>

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// CUDA kernel for matrix multiplication: pOut = pIn1 * pIn2.
//
// All three matrices are indexed as flat row-major arrays:
//   pIn1 is m x k, pIn2 is k x n, pOut is m x n.
//
// Launch layout: a 2D grid of 2D blocks with one thread per output element.
// The x dimension maps to output columns and the y dimension to output rows;
// threads that fall outside the m x n output do nothing, so the grid only has
// to cover the output (it does not need to divide it evenly).
//
// Arithmetic is done in unsigned int, so results wrap modulo 2^32 exactly like
// the host-side reference in verifyResult. (Accumulating in float would lose
// low-order bits once a partial sum exceeds 2^24.)
__global__ void matrixMultiplicationKernel(unsigned int* pIn1, unsigned int* pIn2, unsigned int* pOut, int m, int n, int k)
{
    // m : rows in pIn1
    // n : columns in pIn2
    // k : columns in pIn1 and rows in pIn2

    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: surplus threads must not touch memory.
    if (row >= m || col >= n)
    {
        return;
    }

    // Widen before multiplying so flat offsets cannot overflow int.
    const size_t rowOffset = static_cast<size_t>(row) * static_cast<size_t>(k);
    const size_t stride = static_cast<size_t>(n);

    unsigned int sum = 0u;
    for (int index = 0; index < k; index++)
    {
        sum += pIn1[rowOffset + index] * pIn2[static_cast<size_t>(index) * stride + col];
    }

    pOut[static_cast<size_t>(row) * stride + col] = sum;
}

// Fills the m x n matrix stored in pIn with the ascending sequence
// 0, 1, 2, ... so that each element equals its own flat index.
void initializeMatrix(unsigned int* pIn, int m, int n)
{
    const int total = m * n;
    int pos = 0;
    while (pos < total)
    {
        pIn[pos] = static_cast<unsigned int>(pos);
        ++pos;
    }
}

// Recomputes A * B on the CPU and compares it element by element with C.
//   A : m x k, B : k x n, C : m x n, all flat row-major arrays.
// Reports the first mismatching (row, column) on stderr and stops; prints a
// success message on stdout when every element agrees.
void verifyResult(unsigned int* A, unsigned int* B, unsigned int* C, int m, int n, int k)
{
    for (int row = 0; row < m; ++row)
    {
        const int aBase = row * k;
        const int cBase = row * n;

        for (int col = 0; col < n; ++col)
        {
            // Dot product of row `row` of A with column `col` of B.
            unsigned int expected = 0;
            for (int t = 0; t < k; ++t)
            {
                expected += A[aBase + t] * B[t * n + col];
            }

            if (C[cBase + col] == expected)
            {
                continue;
            }

            std::cerr << "Verification failed at (" << row << ", " << col << ")\n";
            return;
        }
    }

    std::cout << "Verification passed!\n";
}

// Writes the m x n row-major matrix pIn to stdout, one row per line, with a
// single space after every element.
void print(unsigned int* pIn, int m, int n)
{
    const unsigned int* cursor = pIn;
    for (int row = 0; row < m; ++row)
    {
        int remaining = n;
        while (remaining-- > 0)
        {
            std::cout << *cursor << " ";
            ++cursor;
        }
        std::cout << std::endl;
    }
}

// Terminates the program with a diagnostic when a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString(status) << "\n";
        std::exit(EXIT_FAILURE);
    }
}

// Demo driver: builds two small matrices on the host, multiplies them on the
// GPU, prints the operands and the product, and checks the product against a
// CPU reference. Returns 0 on success; exits with EXIT_FAILURE if a host
// allocation or any CUDA call fails.
int main()
{
    // Size of the two matrices:
    // m : rows in pIn1
    // n : columns in pIn2
    // k : columns in pIn1 and rows in pIn2
    const int m = 2;
    const int n = 4;
    const int k = 3;

    // Byte counts are computed in size_t so larger shapes cannot overflow int.
    const size_t size_a = static_cast<size_t>(m) * k * sizeof(unsigned int);
    const size_t size_b = static_cast<size_t>(k) * n * sizeof(unsigned int);
    const size_t size_c = static_cast<size_t>(m) * n * sizeof(unsigned int);

    // Allocate memory on the host
    unsigned int* pIn1 = static_cast<unsigned int*>(std::malloc(size_a));
    unsigned int* pIn2 = static_cast<unsigned int*>(std::malloc(size_b));
    unsigned int* pOut = static_cast<unsigned int*>(std::malloc(size_c));
    if (pIn1 == nullptr || pIn2 == nullptr || pOut == nullptr)
    {
        std::cerr << "Host allocation failed\n";
        std::free(pIn1);
        std::free(pIn2);
        std::free(pOut);
        return EXIT_FAILURE;
    }

    initializeMatrix(pIn1, m, k);
    initializeMatrix(pIn2, k, n);

    print(pIn1 , m , k);
    std::cout << "*******************\n";
    print(pIn2 , k , n);
    std::cout << "*******************\n";

    // Allocate memory on the device
    unsigned int* d_pIn1 = nullptr;
    unsigned int* d_pIn2 = nullptr;
    unsigned int* d_pOut = nullptr;

    checkCuda(cudaMalloc(&d_pIn1, size_a), "cudaMalloc(d_pIn1)");
    checkCuda(cudaMalloc(&d_pIn2, size_b), "cudaMalloc(d_pIn2)");
    checkCuda(cudaMalloc(&d_pOut, size_c), "cudaMalloc(d_pOut)");

    // Copy matrices to the device
    checkCuda(cudaMemcpy(d_pIn1, pIn1, size_a, cudaMemcpyHostToDevice), "copy of pIn1 to device");
    checkCuda(cudaMemcpy(d_pIn2, pIn2, size_b, cudaMemcpyHostToDevice), "copy of pIn2 to device");

    // Define block and grid sizes: ceil-divide so the grid covers every
    // output element (x spans columns, y spans rows).
    dim3 blockDim(16, 16);
    dim3 gridDim((n + blockDim.x - 1) / blockDim.x, (m + blockDim.y - 1) / blockDim.y);

    // Launch the kernel
    matrixMultiplicationKernel<<<gridDim, blockDim>>>(d_pIn1, d_pIn2, d_pOut, m, n, k);
    // A launch does not return a status; query it explicitly, then wait so
    // that faults raised while the kernel runs are reported here.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    // Copy the result back to the host
    checkCuda(cudaMemcpy(pOut, d_pOut, size_c, cudaMemcpyDeviceToHost), "copy of result to host");
    print(pOut , m , n);
    // Verify the result
    verifyResult(pIn1, pIn2, pOut, m, n, k);

    // Free memory
    checkCuda(cudaFree(d_pIn1), "cudaFree(d_pIn1)");
    checkCuda(cudaFree(d_pIn2), "cudaFree(d_pIn2)");
    checkCuda(cudaFree(d_pOut), "cudaFree(d_pOut)");
    std::free(pIn1);
    std::free(pIn2);
    std::free(pOut);

    return 0;
}


Overwriting matrixMultiplication.cu


In [16]:
!nvcc matrixMultiplication.cu -o matrixMultiplication

In [17]:
!./matrixMultiplication

0 1 2 
3 4 5 
*******************
0 1 2 3 
4 5 6 7 
8 9 10 11 
*******************
20 23 26 29 
56 68 80 92 
Verification passed!
