In [None]:
!nvcc --version
%pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [7]:
%%cuda
#include <stdio.h>
#include <stdlib.h>

#define N 16

// Element-wise sum of two square matrices stored row-major: C = A + B.
//
// Each thread handles at most one element, selected by its global 2D thread
// coordinates (x picks the column, y picks the row). Threads whose
// coordinates land outside the width x width matrix exit without touching
// memory, so the launch grid is allowed to over-cover the matrix.
//
// A, B  : inputs, indexed as [row * width + col] (read only)
// C     : output, indexed the same way
// width : number of rows and number of columns
__global__
void matrixAdd(const float *A, const float *B, float *C, int width) {
    const int row = threadIdx.y + blockDim.y * blockIdx.y;
    const int col = threadIdx.x + blockDim.x * blockIdx.x;

    // Guard clause: surplus threads in the last block of a row/column leave.
    if (row >= width || col >= width) {
        return;
    }

    // Flat offset of element (row, col) in row-major storage.
    const int offset = col + row * width;
    C[offset] = A[offset] + B[offset];
}

// Abort with a readable message when a CUDA runtime call reports an error.
// `what` names the step that failed so the message points at the cause.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Print the top-left `shown` x `shown` corner of a row-major matrix with
// `width` columns, preceded by a "<label>:" heading line.
static void printCorner(const char *label, const float *m, int width, int shown) {
    printf("%s:\n", label);
    for (int row = 0; row < shown; row++) {
        for (int col = 0; col < shown; col++) {
            printf("%0.1f ", m[row * width + col]);
        }
        printf("\n");
    }
}

// Adds two N x N matrices on the GPU with matrixAdd and prints the top-left
// corner of both inputs and of the result.
//
// Returns 0 on success. Exits with EXIT_FAILURE (after printing a message to
// stderr) if a host allocation or any CUDA runtime call fails.
int main() {
    const int size = N * N;
    const size_t sz = size * sizeof(float);

    float *h_A = (float*)malloc(sz);
    float *h_B = (float*)malloc(sz);
    float *h_C = (float*)malloc(sz);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", sz);
        free(h_A);  // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Both inputs hold 0, 1, 2, ... in row-major order, so C[i] should be 2*i.
    for (int i = 0; i < size; i++) {
        h_A[i] = (float)i;
        h_B[i] = (float)i;
    }

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, sz), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, sz), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, sz), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice), "copy B to device");

    // THREADS x THREADS threads per block; ceil-divide so the grid covers all
    // N x N elements even when N is not a multiple of THREADS.
    const int THREADS = 16;
    dim3 threadsPerBlock(THREADS, THREADS);
    dim3 blocksPerGrid((N + THREADS - 1) / THREADS, (N + THREADS - 1) / THREADS);

    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status itself: launch failures are read back
    // with cudaGetLastError, execution failures surface at the synchronize.
    checkCuda(cudaGetLastError(), "matrixAdd launch");
    checkCuda(cudaDeviceSynchronize(), "matrixAdd execution");

    checkCuda(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost), "copy C to host");

    // Show at most a 4 x 4 corner; clamp so a smaller N never reads past a row.
    const int shown = (N < 4) ? N : 4;
    printCorner("A", h_A, N, shown);
    printf("\n");
    printCorner("B", h_B, N, shown);
    printf("\n");
    printCorner("C", h_C, N, shown);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


A:
0.0 1.0 2.0 3.0 
16.0 17.0 18.0 19.0 
32.0 33.0 34.0 35.0 
48.0 49.0 50.0 51.0 

B:
0.0 1.0 2.0 3.0 
16.0 17.0 18.0 19.0 
32.0 33.0 34.0 35.0 
48.0 49.0 50.0 51.0 

C:
0.0 2.0 4.0 6.0 
32.0 34.0 36.0 38.0 
64.0 66.0 68.0 70.0 
96.0 98.0 100.0 102.0 

