In [None]:
!nvcc --version
%pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [6]:
%%cuda
#include<stdio.h>
#include<stdlib.h>

#define N 5

/*
 * Naive dense matrix product C = A * B for square, row-major matrices.
 *
 * Launch layout: a 2D grid of 2D blocks in which the x dimension indexes
 * output columns and the y dimension indexes output rows. Each thread
 * computes at most one element of C; threads that fall outside the
 * width x width output do nothing, so the grid may over-cover the matrix.
 *
 * A, B  : input matrices, width * width floats each, element (r, c) stored
 *         at index r * width + c. Must be dereferenceable from device code.
 * C     : output matrix with the same size and layout as A and B.
 * width : number of rows (and columns) of every matrix.
 *
 * No shared memory is used and no synchronisation is required.
 */
__global__
void matrixMultiplication(float* A,float* B, float* C, int width){
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard first: the launch grid is rounded up, so some threads have no
    // output element to produce.
    if (row >= width || col >= width) {
        return;
    }

    // Do the flat-index arithmetic in size_t so that row * width cannot
    // overflow a 32-bit int for large matrices.
    const size_t w       = (size_t)width;
    const size_t rowBase = (size_t)row * w;

    float sum = 0.0f;
    for (size_t k = 0; k < w; ++k) {
        // Dot product of row `row` of A with column `col` of B.
        sum += A[rowBase + k] * B[k * w + (size_t)col];
    }
    C[rowBase + (size_t)col] = sum;
}

/*
 * Evaluate a CUDA runtime call and terminate the program with a message on
 * stderr if it does not return cudaSuccess. Without this, a failed
 * allocation, copy or kernel would go unnoticed and garbage would be printed.
 */
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

/*
 * Print `title` verbatim, then the row-major n x n matrix `m`, one row per
 * line, each value formatted with "%0.1f " (trailing space included).
 */
static void printMatrix(const char* title, const float* m, int n){
    printf("%s", title);
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            printf("%0.1f ", m[row * n + col]);
        }
        printf("\n");
    }
}

/*
 * Demo driver: fills two N x N host matrices, multiplies them on the GPU
 * with matrixMultiplication, and prints A, B and the product C.
 *
 * Returns 0 on success and 1 if a host allocation fails; any CUDA runtime
 * failure terminates the process through CUDA_CHECK.
 */
int main(){
    const size_t sz = (size_t)N * N * sizeof(float);

    // Host buffers.
    float *h_A = (float*)malloc(sz);
    float *h_B = (float*)malloc(sz);
    float *h_C = (float*)malloc(sz);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n",
                (unsigned long)sz);
        free(h_A);   // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return 1;
    }

    // Deterministic test data: A counts up from 0, B cycles through 1..7,0.
    for (int i = 0; i < N * N; i++) {
        h_A[i] = i;
        h_B[i] = (i+1)%8;
    }

    // Device buffers and input upload.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, sz));
    CUDA_CHECK(cudaMalloc((void**)&d_B, sz));
    CUDA_CHECK(cudaMalloc((void**)&d_C, sz));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice));

    // THREADS x THREADS threads per block; the block count is rounded up so
    // the grid covers every output element even when N is not a multiple of
    // THREADS.
    const int THREADS = 16;
    dim3 blocksPerGrid((N+THREADS-1)/THREADS,(N+THREADS-1)/THREADS);
    dim3 threadsPerBlock(THREADS,THREADS);

    matrixMultiplication<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_B,d_C,N);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost));

    printMatrix("A:\n", h_A, N);
    printMatrix("\nB:\n", h_B, N);
    printMatrix("\nC:\n", h_C, N);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

A:
0.0 1.0 2.0 3.0 4.0 
5.0 6.0 7.0 8.0 9.0 
10.0 11.0 12.0 13.0 14.0 
15.0 16.0 17.0 18.0 19.0 
20.0 21.0 22.0 23.0 24.0 

B:
1.0 2.0 3.0 4.0 5.0 
6.0 7.0 0.0 1.0 2.0 
3.0 4.0 5.0 6.0 7.0 
0.0 1.0 2.0 3.0 4.0 
5.0 6.0 7.0 0.0 1.0 

C:
32.0 42.0 44.0 22.0 32.0 
107.0 142.0 129.0 92.0 127.0 
182.0 242.0 214.0 162.0 222.0 
257.0 342.0 299.0 232.0 317.0 
332.0 442.0 384.0 302.0 412.0 

