# **Check Cuda Version**

In [None]:
# Print the version of the nvcc compiler available in this environment.
!nvcc --version

# **CUDA Matrix Multiplication Code**
Make sure you are using a GPU runtime in Colab, or running locally with the CUDA Toolkit installed.

The code only handles square matrices (`SIZE` x `SIZE`, with `SIZE` defined in `matmult_cu.h`).

In [None]:
%%writefile matmult_cu.h
#ifndef MATMULT_H
#define MATMULT_H

#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>

// -------- Constants --------
#define SIZE 10ULL // Matrix dimension (N x N)

// -------- Function Prototypes --------

// CUDA kernel for square (SIZE x SIZE) matrix multiplication: c = a * b.
// All three pointers refer to arrays of SIZE * SIZE int elements, indexed as
// [row * SIZE + col]. The element type matches the definition in matmult.cu,
// so this prototype declares the kernel that is actually defined and launched.
__global__ void gpu_matrix_mult(int *a, int *b, int *c);

#endif // MATMULT_H

In [None]:
%%writefile matmult.cu
#include "matmult_cu.h"
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cmath>
#include <cuda_runtime.h>

// Multiplies two SIZE x SIZE int matrices: c = a * b.
//
// Each thread derives one (row, col) position from its 2D block/thread
// indices and writes at most one element of c. Matrices are indexed as
// [row * SIZE + col].
__global__ void gpu_matrix_mult(int *a, int *b, int *c) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads whose position falls outside the matrix have nothing to do.
    if (row >= SIZE || col >= SIZE) {
        return;
    }

    // Dot product of row `row` of a with column `col` of b.
    int acc = 0;
    for (int k = 0; k < SIZE; ++k) {
        acc += a[row * SIZE + k] * b[k * SIZE + col];
    }
    c[row * SIZE + col] = acc;
}

// Fills data[0 .. size-1] with the sequence 1, 2, 3, ... stored as bytes.
// Because each element is a uint8_t, the sequence wraps modulo 256
// (..., 254, 255, 0, 1, ...).
static void initialize(uint8_t *data, unsigned size) {
    uint8_t next = 1;
    uint8_t *const stop = data + size;
    for (uint8_t *slot = data; slot != stop; ++slot) {
        *slot = next++;  // uint8_t increment wraps from 255 back to 0
    }
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation so the message identifies the failing step.
static void check_cuda(cudaError_t status, const char *what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    std::exit(EXIT_FAILURE);
  }
}

// Multiplies two SIZE x SIZE matrices on the GPU with gpu_matrix_mult and
// checks the result against a CPU reference.
//
// Returns 0 when the GPU result matches the reference, EXIT_FAILURE when it
// does not. Any failing CUDA runtime call terminates the program.
int main() {
  const size_t n = (size_t)SIZE;
  const size_t elems = n * n;
  // The kernel operates on int elements, so every device buffer and every
  // host<->device copy must be sized in ints, not bytes.
  const size_t bytes = elems * sizeof(int);

  // Generate the input patterns as bytes with the existing helper.
  uint8_t *hostA = new uint8_t[elems];
  uint8_t *hostB = new uint8_t[elems];
  initialize(hostA, elems);
  initialize(hostB, elems);

  // Widen the inputs to int so their layout matches what the kernel reads.
  int *wideA = new int[elems];
  int *wideB = new int[elems];
  int *hostC = new int[elems];
  for (size_t i = 0; i < elems; ++i) {
    wideA[i] = hostA[i];
    wideB[i] = hostB[i];
  }

  int *deviceA = nullptr;
  int *deviceB = nullptr;
  int *deviceC = nullptr;
  check_cuda(cudaMalloc((void**)&deviceA, bytes), "cudaMalloc A");
  check_cuda(cudaMalloc((void**)&deviceB, bytes), "cudaMalloc B");
  check_cuda(cudaMalloc((void**)&deviceC, bytes), "cudaMalloc C");

  check_cuda(cudaMemcpy(deviceA, wideA, bytes, cudaMemcpyHostToDevice),
             "copy A to device");
  check_cuda(cudaMemcpy(deviceB, wideB, bytes, cudaMemcpyHostToDevice),
             "copy B to device");

  // Ceil-divide so the grid covers the whole matrix even when SIZE is not a
  // multiple of the block dimensions; the kernel bounds-checks the excess.
  dim3 threads(16, 16);
  dim3 blocks((unsigned)((n + threads.x - 1) / threads.x),
              (unsigned)((n + threads.y - 1) / threads.y));

  gpu_matrix_mult<<<blocks, threads>>>(deviceA, deviceB, deviceC);
  // A launch reports configuration errors only through cudaGetLastError();
  // faults during execution surface at the following synchronization.
  check_cuda(cudaGetLastError(), "kernel launch");
  check_cuda(cudaDeviceSynchronize(), "kernel execution");

  check_cuda(cudaMemcpy(hostC, deviceC, bytes, cudaMemcpyDeviceToHost),
             "copy C to host");

  // Compare the GPU result with a straightforward CPU computation.
  int status = 0;
  for (size_t r = 0; r < n && status == 0; ++r) {
    for (size_t c = 0; c < n; ++c) {
      int expected = 0;
      for (size_t k = 0; k < n; ++k) {
        expected += wideA[r * n + k] * wideB[k * n + c];
      }
      if (hostC[r * n + c] != expected) {
        fprintf(stderr, "Mismatch at (%zu, %zu): got %d, expected %d\n",
                r, c, hostC[r * n + c], expected);
        status = EXIT_FAILURE;
        break;
      }
    }
  }

  check_cuda(cudaFree(deviceA), "cudaFree A");
  check_cuda(cudaFree(deviceB), "cudaFree B");
  check_cuda(cudaFree(deviceC), "cudaFree C");

  delete[] hostA;
  delete[] hostB;
  delete[] wideA;
  delete[] wideB;
  delete[] hostC;
  return status;
}

# **Compile CUDA code with NVCC**

## **PTX Code**

In [None]:
# Compile matmult.cu to PTX (written to matmult.ptx) and print the result.
!nvcc -ptx matmult.cu -o matmult.ptx
!cat ./matmult.ptx

## **SASS Code**

In [None]:
# Regenerate the PTX, then build a device-only cubin for sm_75 and dump its SASS.
# -cubin makes nvcc emit an actual cubin (device code only); without it the
# output file would be a host executable that merely carries a .cubin name.
!nvcc -ptx matmult.cu -o matmult.ptx
!nvcc -cubin -arch=sm_75 matmult.cu -o matmult.cubin
!cuobjdump -sass matmult.cubin