In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-_h52okim
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-_h52okim
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4293 sha256=38bc43772ba1abb8476f5fbbc8341f7011c43225dc00d21e7998e7af2cf19884
  Stored in directory: /tmp/pip-ephem-wheel-cache-ly23ew1j/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

// Side length of the square shared-memory tiles used by matrixMultiplication.
// The kernel indexes those tiles directly with threadIdx.x / threadIdx.y, so it
// must be launched with TILE_SIZE x TILE_SIZE thread blocks.
const int TILE_SIZE = 2;

// Tiled integer matrix product: matrixResult = matrixA * matrixB, where all three
// are N x N matrices stored row-major in device memory.
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) produces element (row = y, col = x).
// The shared tiles are TILE_SIZE x TILE_SIZE and are indexed by threadIdx, so each
// block must be TILE_SIZE x TILE_SIZE threads. Uses 2 * TILE_SIZE * TILE_SIZE ints
// of static shared memory per block.
//
// N does not have to be a multiple of TILE_SIZE: tile slots that fall outside the
// matrices are filled with 0, and threads outside the matrix write no result.
__global__ void matrixMultiplication(const int *matrixA, const int *matrixB, int *matrixResult, int N) {
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    const int row = blockIdx.y * blockDim.y + ty;
    const int col = blockIdx.x * blockDim.x + tx;

    __shared__ int tileA[TILE_SIZE][TILE_SIZE];
    __shared__ int tileB[TILE_SIZE][TILE_SIZE];

    const int tileCount = (N + TILE_SIZE - 1) / TILE_SIZE;
    int acc = 0;

    for (int tile = 0; tile < tileCount; ++tile) {
        // Column of A / row of B that this thread stages for the current tile.
        const unsigned int aCol = tile * TILE_SIZE + tx;
        const unsigned int bRow = tile * TILE_SIZE + ty;

        // Out-of-range slots are zero-padded so they add nothing to the dot product.
        tileA[ty][tx] = ((row < N) && (aCol < N)) ? matrixA[row * N + aCol] : 0;
        tileB[ty][tx] = ((col < N) && (bRow < N)) ? matrixB[bRow * N + col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += tileA[ty][k] * tileB[k][tx];
        }

        // Keep the tiles intact until every thread has finished reading them.
        __syncthreads();
    }

    // All barriers are behind us, so out-of-range threads can simply leave.
    if (row >= N || col >= N) {
        return;
    }
    matrixResult[row * N + col] = acc;
}

// Prints a rows x cols integer matrix, stored row-major in `matrix`, to std::cout.
// Each element is followed by a single space, each row ends with a newline, and
// one extra blank line is emitted after the last row.
void printMatrix(const int *matrix, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        const int rowOffset = r * cols;  // flat index of the first element in row r
        int c = 0;
        while (c < cols) {
            std::cout << matrix[rowOffset + c] << ' ';
            ++c;
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Reports a failed CUDA runtime call on stderr and terminates the process.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
        // Device allocations are reclaimed by the driver when the process exits.
        std::exit(EXIT_FAILURE);
    }
}

// Multiplies two fixed 3x3 integer matrices on the GPU with matrixMultiplication,
// recomputes the product on the CPU, prints all matrices and reports whether the
// two results agree. Returns 0 on completion; exits with EXIT_FAILURE if any CUDA
// runtime call or the kernel launch fails.
int main() {
    const int N = 3;
    const size_t bytes = N * N * sizeof(int);

    const int matrixA[N][N] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
    const int matrixB[N][N] = {{9, 8, 7}, {6, 5, 4}, {3, 2, 1}};

    int matrixC_cpu[N][N]; // Result matrix from CPU
    int matrixC_gpu[N][N]; // Result matrix from GPU

    int *d_matrixA = nullptr;
    int *d_matrixB = nullptr;
    int *d_matrixC = nullptr;
    checkCuda(cudaMalloc((void **)&d_matrixA, bytes), "cudaMalloc A");
    checkCuda(cudaMalloc((void **)&d_matrixB, bytes), "cudaMalloc B");
    checkCuda(cudaMalloc((void **)&d_matrixC, bytes), "cudaMalloc C");

    checkCuda(cudaMemcpy(d_matrixA, &matrixA[0][0], bytes, cudaMemcpyHostToDevice), "copy A to device");
    checkCuda(cudaMemcpy(d_matrixB, &matrixB[0][0], bytes, cudaMemcpyHostToDevice), "copy B to device");

    // The kernel indexes its TILE_SIZE x TILE_SIZE shared tiles by threadIdx, so the
    // block shape is derived from TILE_SIZE instead of being hard-coded.
    const dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    const dim3 blocksPerGrid((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMultiplication<<<blocksPerGrid, threadsPerBlock>>>(d_matrixA, d_matrixB, d_matrixC, N);
    // A launch does not return a status; configuration errors surface here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // Blocking copy: waits for the kernel and reports any error raised while it ran.
    checkCuda(cudaMemcpy(&matrixC_gpu[0][0], d_matrixC, bytes, cudaMemcpyDeviceToHost), "copy C to host");

    // CPU reference product.
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) {
            int sum = 0;
            for (int k = 0; k < N; ++k) {
                sum += matrixA[i][k] * matrixB[k][j];
            }
            matrixC_cpu[i][j] = sum;
        }
    }

    std::cout << "Matrix A:" << std::endl;
    printMatrix(&matrixA[0][0], N, N);
    std::cout << "Matrix B:" << std::endl;
    printMatrix(&matrixB[0][0], N, N);
    std::cout << "Result from CPU (matrixC_cpu):" << std::endl;
    printMatrix(&matrixC_cpu[0][0], N, N);

    std::cout << "Result from GPU (matrixC_gpu):" << std::endl;
    printMatrix(&matrixC_gpu[0][0], N, N);

    // Stop scanning at the first mismatch (both loops test the flag).
    bool resultMatch = true;
    for (int i = 0; i < N && resultMatch; ++i) {
        for (int j = 0; j < N && resultMatch; ++j) {
            if (matrixC_cpu[i][j] != matrixC_gpu[i][j]) {
                resultMatch = false;
            }
        }
    }

    if (resultMatch) {
        std::cout << "Results match between CPU and GPU implementations." << std::endl;
    } else {
        std::cout << "Results do not match between CPU and GPU implementations." << std::endl;
    }

    checkCuda(cudaFree(d_matrixA), "cudaFree A");
    checkCuda(cudaFree(d_matrixB), "cudaFree B");
    checkCuda(cudaFree(d_matrixC), "cudaFree C");

    return 0;
}


Matrix A:
1 2 3 
4 5 6 
7 8 9 

Matrix B:
9 8 7 
6 5 4 
3 2 1 

Result from CPU (matrixC_cpu):
30 24 18 
84 69 54 
138 114 90 

Result from GPU (matrixC_gpu):
30 24 18 
84 69 54 
138 114 90 

Results match between CPU and GPU implementations.

