In [4]:
!nvidia-smi

Fri Feb 20 10:16:57 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [5]:
%%writefile matrix_mul_basic.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define TILE 16

// CPU reference multiplication: C = A * B for square N x N matrices stored
// row-major in flat arrays (element (r, c) lives at index r*N + c).
//
//   A, B : input matrices, N*N floats each (read only)
//   C    : output matrix, N*N floats, every element is overwritten
//   N    : matrix dimension; N <= 0 leaves C untouched
//
// Offsets are computed in size_t because r*N + c overflows int once N
// exceeds 46340.
void cpuMatMul(float *A, float *B, float *C, int N) {
    if (N <= 0)
        return;

    const size_t n = (size_t)N;
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            // Accumulate locally and store once instead of read-modify-writing
            // C[i][j] through memory on every k iteration.
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++)
                sum += A[i*n + k] * B[k*n + j];
            C[i*n + j] = sum;
        }
    }
}

// GPU kernel (no shared memory): each thread computes one element of
// C = A * B for square N x N matrices stored row-major in flat arrays,
// reading A and B straight through the pointers it is given.
//
// Launch layout: 2D grid of 2D blocks; the x dimension indexes columns and
// the y dimension indexes rows. Threads whose (row, col) lies outside the
// matrix exit immediately, so the grid may overshoot N in either dimension.
__global__ void gpuMatMul(float *A, float *B, float *C, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // No barriers in this kernel, so out-of-range threads can simply leave.
    if (row >= N || col >= N)
        return;

    // 64-bit offsets: row * N overflows int once N exceeds 46340.
    const size_t n = (size_t)N;
    const size_t rowBase = (size_t)row * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; k++)
        sum += A[rowBase + k] * B[k * n + col];
    C[rowBase + col] = sum;
}

// Abort with file, line and the CUDA error string when a runtime call fails.
// Without this, a failed allocation/copy/launch is silent and the program
// still prints a (meaningless) GPU time.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Benchmark driver: multiplies two random N x N matrices on the CPU and with
// gpuMatMul, prints both timings, and checks the GPU result against the CPU
// reference.
//
// Usage: ./matrix_mul_basic <matrix_size>
// Returns 0 on success; 1 on bad arguments, host allocation failure, or a
// CPU/GPU result mismatch. Any CUDA runtime error terminates via CUDA_CHECK.
int main(int argc, char *argv[]) {

    if (argc != 2) {
        printf("Usage: ./matrix_mul_basic <matrix_size>\n");
        return 1;
    }

    int N = atoi(argv[1]);
    if (N <= 0) {
        // atoi yields 0 for non-numeric input, so this also rejects garbage.
        fprintf(stderr, "matrix_size must be a positive integer, got '%s'\n", argv[1]);
        return 1;
    }

    // Widen before multiplying: N * N in int overflows for N > 46340.
    const size_t count = (size_t)N * (size_t)N;
    const size_t size = count * sizeof(float);

    float *A = (float*)malloc(size);
    float *B = (float*)malloc(size);
    float *C_cpu = (float*)malloc(size);
    float *C_gpu = (float*)malloc(size);
    if (!A || !B || !C_cpu || !C_gpu) {
        fprintf(stderr, "Host allocation failed (%zu bytes per matrix)\n", size);
        free(A); free(B); free(C_cpu); free(C_gpu);
        return 1;
    }

    // Small integer-valued entries (0..9); rand() is left unseeded, so every
    // run uses the same inputs.
    for (size_t i = 0; i < count; i++) {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    // CPU Timing (clock() measures processor time used by this process)
    clock_t start = clock();
    cpuMatMul(A, B, C_cpu, N);
    clock_t end = clock();
    double cpu_time = (double)(end - start) / CLOCKS_PER_SEC;

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // One thread per output element; ceil-div so the grid covers N when it is
    // not a multiple of TILE.
    dim3 threads(TILE, TILE);
    dim3 blocks((N + TILE - 1)/TILE, (N + TILE - 1)/TILE);

    cudaEvent_t gstart, gend;
    CUDA_CHECK(cudaEventCreate(&gstart));
    CUDA_CHECK(cudaEventCreate(&gend));

    // GPU timing covers only the kernel, not the host<->device copies.
    // NOTE(review): this is the first launch in the process, so the measured
    // time may include one-time startup cost; consider an untimed warm-up
    // launch if steady-state numbers are wanted.
    CUDA_CHECK(cudaEventRecord(gstart));
    gpuMatMul<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());            // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(gend));
    CUDA_CHECK(cudaEventSynchronize(gend));    // surfaces in-kernel faults

    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, gstart, gend));

    printf("\nMatrix Size: %d x %d\n", N, N);
    printf("CPU Time: %f seconds\n", cpu_time);
    printf("GPU Time: %f ms\n", gpu_time);

    // Bring the result back and compare with the CPU reference so a kernel
    // that produced wrong data cannot pass as a valid timing.
    CUDA_CHECK(cudaMemcpy(C_gpu, d_C, size, cudaMemcpyDeviceToHost));

    size_t mismatches = 0;
    for (size_t i = 0; i < count; i++) {
        const float ref = C_cpu[i];
        float diff = C_gpu[i] - ref;
        if (diff < 0.0f) diff = -diff;
        const float mag = ref < 0.0f ? -ref : ref;
        const float tol = 1e-3f * (mag > 1.0f ? mag : 1.0f);
        // Negated comparison so a NaN difference also counts as a mismatch.
        if (!(diff <= tol))
            mismatches++;
    }
    if (mismatches != 0)
        fprintf(stderr, "Verification FAILED: %zu of %zu elements differ from the CPU result\n",
                mismatches, count);

    CUDA_CHECK(cudaEventDestroy(gstart));
    CUDA_CHECK(cudaEventDestroy(gend));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(A); free(B); free(C_cpu); free(C_gpu);

    return mismatches == 0 ? 0 : 1;
}

Overwriting matrix_mul_basic.cu


In [6]:
!nvcc matrix_mul_basic.cu -o matrix_mul_basic
!./matrix_mul_basic 512


Matrix Size: 512 x 512
CPU Time: 1.119478 seconds
GPU Time: 102.656548 ms


In [7]:
%%writefile matrix_mul_shared.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

#define TILE 16

// CPU reference multiplication: C = A * B for square N x N matrices stored
// row-major in flat arrays (element (r, c) lives at index r*N + c).
//
//   A, B : input matrices, N*N floats each (read only)
//   C    : output matrix, N*N floats, every element is overwritten
//   N    : matrix dimension; N <= 0 leaves C untouched
//
// Offsets are computed in size_t because r*N + c overflows int once N
// exceeds 46340.
void cpuMatMul(float *A, float *B, float *C, int N) {
    if (N <= 0)
        return;

    const size_t n = (size_t)N;
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            // Accumulate locally and store once instead of read-modify-writing
            // C[i][j] through memory on every k iteration.
            float sum = 0.0f;
            for (size_t k = 0; k < n; k++)
                sum += A[i*n + k] * B[k*n + j];
            C[i*n + j] = sum;
        }
    }
}

// Tiled GPU multiplication: C = A * B for square N x N matrices stored
// row-major in flat arrays. Each block computes one TILE x TILE patch of C,
// walking along the shared dimension one tile at a time and staging the
// matching TILE x TILE pieces of A and B in shared memory.
//
// Launch requirements:
//   - blockDim must be (TILE, TILE): the row/col math and the shared arrays
//     are sized by TILE, not by blockDim.
//   - the grid needs ceil(N / TILE) blocks in both x (columns) and y (rows).
//   - uses 2 * TILE * TILE floats of static shared memory per block.
__global__ void gpuMatMulShared(float *A, float *B, float *C, int N) {

    __shared__ float sA[TILE][TILE];
    __shared__ float sB[TILE][TILE];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE + ty;
    const int col = blockIdx.x * TILE + tx;

    // 64-bit offsets: row * N overflows int once N exceeds 46340.
    const size_t n = (size_t)N;
    const int numTiles = (N + TILE - 1) / TILE;

    float sum = 0.0f;

    for (int t = 0; t < numTiles; t++) {
        const int aCol = t * TILE + tx;   // column of A this thread stages
        const int bRow = t * TILE + ty;   // row of B this thread stages

        // Elements past the matrix edge are staged as zero so partial tiles
        // contribute nothing to the dot product.
        if (row < N && aCol < N)
            sA[ty][tx] = A[(size_t)row * n + aCol];
        else
            sA[ty][tx] = 0.0f;

        if (col < N && bRow < N)
            sB[ty][tx] = B[(size_t)bRow * n + col];
        else
            sB[ty][tx] = 0.0f;

        // Every thread reaches this barrier (no early return above): the whole
        // tile must be written before anyone reads it.
        __syncthreads();

        for (int k = 0; k < TILE; k++)
            sum += sA[ty][k] * sB[k][tx];

        // Keep faster threads from overwriting the tiles for the next
        // iteration while slower ones are still reading this one.
        __syncthreads();
    }

    if (row < N && col < N)
        C[(size_t)row * n + col] = sum;
}

// Abort with file, line and the CUDA error string when a runtime call fails.
// Without this, a failed allocation/copy/launch is silent and the program
// still prints a (meaningless) GPU time.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Benchmark driver: multiplies two random N x N matrices on the CPU and with
// gpuMatMulShared, prints both timings, and checks the GPU result against the
// CPU reference.
//
// Usage: ./matrix_mul_shared <matrix_size>
// Returns 0 on success; 1 on bad arguments, host allocation failure, or a
// CPU/GPU result mismatch. Any CUDA runtime error terminates via CUDA_CHECK.
int main(int argc, char *argv[]) {

    if (argc != 2) {
        printf("Usage: ./matrix_mul_shared <matrix_size>\n");
        return 1;
    }

    int N = atoi(argv[1]);
    if (N <= 0) {
        // atoi yields 0 for non-numeric input, so this also rejects garbage.
        fprintf(stderr, "matrix_size must be a positive integer, got '%s'\n", argv[1]);
        return 1;
    }

    // Widen before multiplying: N * N in int overflows for N > 46340.
    const size_t count = (size_t)N * (size_t)N;
    const size_t size = count * sizeof(float);

    float *A = (float*)malloc(size);
    float *B = (float*)malloc(size);
    float *C_cpu = (float*)malloc(size);
    float *C_gpu = (float*)malloc(size);
    if (!A || !B || !C_cpu || !C_gpu) {
        fprintf(stderr, "Host allocation failed (%zu bytes per matrix)\n", size);
        free(A); free(B); free(C_cpu); free(C_gpu);
        return 1;
    }

    // Small integer-valued entries (0..9); rand() is left unseeded, so every
    // run uses the same inputs.
    for (size_t i = 0; i < count; i++) {
        A[i] = rand() % 10;
        B[i] = rand() % 10;
    }

    // CPU timing (clock() measures processor time used by this process)
    clock_t start = clock();
    cpuMatMul(A, B, C_cpu, N);
    clock_t end = clock();
    double cpu_time = (double)(end - start) / CLOCKS_PER_SEC;

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, size));
    CUDA_CHECK(cudaMalloc(&d_B, size));
    CUDA_CHECK(cudaMalloc(&d_C, size));

    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // The kernel indexes by TILE, so the block shape must be exactly
    // TILE x TILE; ceil-div so the grid covers N when it is not a multiple
    // of TILE.
    dim3 threads(TILE, TILE);
    dim3 blocks((N + TILE - 1)/TILE, (N + TILE - 1)/TILE);

    cudaEvent_t gstart, gend;
    CUDA_CHECK(cudaEventCreate(&gstart));
    CUDA_CHECK(cudaEventCreate(&gend));

    // GPU timing covers only the kernel, not the host<->device copies.
    // NOTE(review): this is the first launch in the process, so the measured
    // time may include one-time startup cost; consider an untimed warm-up
    // launch if steady-state numbers are wanted.
    CUDA_CHECK(cudaEventRecord(gstart));
    gpuMatMulShared<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());            // launch-configuration errors
    CUDA_CHECK(cudaEventRecord(gend));
    CUDA_CHECK(cudaEventSynchronize(gend));    // surfaces in-kernel faults

    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, gstart, gend));

    printf("\nMatrix Size: %d x %d\n", N, N);
    printf("CPU Time: %f seconds\n", cpu_time);
    printf("GPU Time (Shared Memory): %f ms\n", gpu_time);

    // Bring the result back and compare with the CPU reference so a kernel
    // that produced wrong data cannot pass as a valid timing.
    CUDA_CHECK(cudaMemcpy(C_gpu, d_C, size, cudaMemcpyDeviceToHost));

    size_t mismatches = 0;
    for (size_t i = 0; i < count; i++) {
        const float ref = C_cpu[i];
        float diff = C_gpu[i] - ref;
        if (diff < 0.0f) diff = -diff;
        const float mag = ref < 0.0f ? -ref : ref;
        const float tol = 1e-3f * (mag > 1.0f ? mag : 1.0f);
        // Negated comparison so a NaN difference also counts as a mismatch.
        if (!(diff <= tol))
            mismatches++;
    }
    if (mismatches != 0)
        fprintf(stderr, "Verification FAILED: %zu of %zu elements differ from the CPU result\n",
                mismatches, count);

    CUDA_CHECK(cudaEventDestroy(gstart));
    CUDA_CHECK(cudaEventDestroy(gend));
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(A); free(B); free(C_cpu); free(C_gpu);

    return mismatches == 0 ? 0 : 1;
}

Writing matrix_mul_shared.cu


In [10]:
!nvcc matrix_mul_shared.cu -o matrix_mul_shared
!./matrix_mul_shared 512


Matrix Size: 512 x 512
CPU Time: 1.162507 seconds
GPU Time (Shared Memory): 0.746944 ms
