In [None]:
!nvcc --version
%pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [10]:
%%cuda
#include <stdio.h>
#include <stdlib.h>

#define N 8

/**
 * Element-wise sum of two square, row-major matrices: C = A + B.
 *
 * Each thread handles at most one element. The thread's x coordinate selects
 * the column and its y coordinate selects the row, so the kernel expects a
 * 2D launch whose grid covers at least width x width threads. Threads that
 * land outside the matrix exit without touching memory.
 *
 * @param A     first input matrix, width * width floats
 * @param B     second input matrix, width * width floats
 * @param C     output matrix, width * width floats
 * @param width number of rows (and columns) of each matrix
 */
__global__ 
void matrixAdd(const float *A, const float *B, float *C, int width) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Grid tail: the launch is rounded up to whole blocks, so some threads
    // fall outside the matrix and must not read or write anything.
    if (row >= width || col >= width) {
        return;
    }

    // Build the flat index in size_t: row * width in 32-bit int overflows
    // once width exceeds 46340.
    const size_t idx = (size_t)row * (size_t)width + (size_t)col;
    C[idx] = A[idx] + B[idx];
}

// Abort with a readable message when a CUDA runtime call fails. Without this
// a failed allocation, copy or launch goes unnoticed and the program prints
// whatever happens to be in the result buffer.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cudaCheckErr_));                     \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)
#endif

// Print `header`, then a row-major dim x dim matrix, one row per line.
static void printMatrix(const char *header, const float *m, int dim) {
    printf("%s", header);
    for (int row = 0; row < dim; row++) {
        for (int col = 0; col < dim; col++) {
            printf("%0.1f ", m[row * dim + col]);
        }
        printf("\n");
    }
}

/**
 * Fills two N x N host matrices with 0..N*N-1, adds them on the GPU with
 * matrixAdd, and prints A, B and the result C to stdout.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main() {
    const int size = N * N;
    const size_t sz = (size_t)size * sizeof(float);

    float *h_A = (float*)malloc(sz);
    float *h_B = (float*)malloc(sz);
    float *h_C = (float*)malloc(sz);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", sz);
        free(h_A);  // free(NULL) is a no-op
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < size; i++) {
        h_A[i] = (float)i;
        h_B[i] = (float)i;
    }

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_A, sz));
    CUDA_CHECK(cudaMalloc((void**)&d_B, sz));
    CUDA_CHECK(cudaMalloc((void**)&d_C, sz));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, sz, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sz, cudaMemcpyHostToDevice));

    // 16 x 16 threads per block; the grid is rounded up so every element is
    // covered even when N is not a multiple of the block edge.
    const int THREADS = 16;
    dim3 threadsPerBlock(THREADS, THREADS);
    dim3 blocksPerGrid((N + THREADS - 1) / THREADS, (N + THREADS - 1) / THREADS);

    matrixAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());        // invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());   // faults raised while the kernel ran

    CUDA_CHECK(cudaMemcpy(h_C, d_C, sz, cudaMemcpyDeviceToHost));

    printMatrix("A:\n", h_A, N);
    printMatrix("\nB:\n", h_B, N);
    printMatrix("\nC:\n", h_C, N);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}


A:
0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 
8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 
16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 
24.0 25.0 26.0 27.0 28.0 29.0 30.0 31.0 
32.0 33.0 34.0 35.0 36.0 37.0 38.0 39.0 
40.0 41.0 42.0 43.0 44.0 45.0 46.0 47.0 
48.0 49.0 50.0 51.0 52.0 53.0 54.0 55.0 
56.0 57.0 58.0 59.0 60.0 61.0 62.0 63.0 

B:
0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 
8.0 9.0 10.0 11.0 12.0 13.0 14.0 15.0 
16.0 17.0 18.0 19.0 20.0 21.0 22.0 23.0 
24.0 25.0 26.0 27.0 28.0 29.0 30.0 31.0 
32.0 33.0 34.0 35.0 36.0 37.0 38.0 39.0 
40.0 41.0 42.0 43.0 44.0 45.0 46.0 47.0 
48.0 49.0 50.0 51.0 52.0 53.0 54.0 55.0 
56.0 57.0 58.0 59.0 60.0 61.0 62.0 63.0 

C:
0.0 2.0 4.0 6.0 8.0 10.0 12.0 14.0 
16.0 18.0 20.0 22.0 24.0 26.0 28.0 30.0 
32.0 34.0 36.0 38.0 40.0 42.0 44.0 46.0 
48.0 50.0 52.0 54.0 56.0 58.0 60.0 62.0 
64.0 66.0 68.0 70.0 72.0 74.0 76.0 78.0 
80.0 82.0 84.0 86.0 88.0 90.0 92.0 94.0 
96.0 98.0 100.0 102.0 104.0 106.0 108.0 110.0 
112.0 114.0 116.0 118.0 120.0 122.0 124.0 126.0 

