In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-m0d0ppvw
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-m0d0ppvw
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=4b2e289c369f7e9e363f8ac1599a94e215419d48807e0f7e24c137eb1d15df81
  Stored in directory: /tmp/pip-ephem-wheel-cache-guviy_eu/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e7bc7
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [3]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


This line calculates the thread ID (tid) for each thread. blockIdx.x is the index of the current block, blockDim.x is the number of threads per block, and threadIdx.x is the index of the current thread within its block. The tid is used to determine which element of the arrays A, B, and C each thread will work on.

In [1]:
%%writefile matrix.cu
#include<iostream>
using namespace std;

// CUDA code to multiply matrices
__global__ void multiply(int* A, int* B, int* C, int size) {
    // Uses thread indices and block indices to compute each element
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;


    if (row < size && col < size) {
        int sum = 0;
        for (int i = 0; i < size; i++) {
            sum += A[row * size + i] * B[i * size + col];
        }
        C[row * size + col] = sum;

        // Print thread index and operation
        printf("Thread (%d, %d) performed multiplication for C[%d][%d]\n", threadIdx.x, threadIdx.y, row, col);
    }
}

// Initialize matrix C with zeros
void initializeZero(int* matrix, int size) {
    for (int i = 0; i < size * size; i++) {
        matrix[i] = 0;
    }
}

void print(int* matrix, int size) {
    for (int row = 0; row < size; row++) {
        for (int col = 0; col < size; col++) {
            cout << matrix[row * size + col] << " ";
        }
        cout << '\n';
    }
    cout << '\n';
}

int main() {
    int N;
    cout << "Enter the size of the matrices: ";
    cin >> N;

    int* A, * B, * C;

    int matrixSize = N * N;
    size_t matrixBytes = matrixSize * sizeof(int);

    A = new int[matrixSize];
    B = new int[matrixSize];
    C = new int[matrixSize];

    cout << "Enter the elements of matrix A: \n";
    for (int i = 0; i < matrixSize; ++i) {
        cin >> A[i];
    }

    cout << "Enter the elements of matrix B: \n";
    for (int i = 0; i < matrixSize; ++i) {
        cin >> B[i];
    }

    cout << "Matrix A: \n";
    print(A, N);

    cout << "Matrix B: \n";
    print(B, N);


    int* X, * Y, * Z;
    // Allocate space
    cudaMalloc(&X, matrixBytes);
    cudaMalloc(&Y, matrixBytes);
    cudaMalloc(&Z, matrixBytes);

    // Copy values from A to X
    cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice);

    // Copy values from A to X and B to Y
    cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice);

    // Threads per CTA dimension
    int THREADS = 16;

    // Blocks per grid dimension
    int BLOCKS = (N + THREADS - 1) / THREADS;

    // Use dim3 structs for block and grid dimensions
    dim3 threads(THREADS, THREADS);
    dim3 blocks(BLOCKS, BLOCKS);

    // Launch kernel
    multiply<<<THREADS,BLOCKS>>>(X, Y, Z, N);

    cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost);
    cout << "Multiplication of matrix A and B: \n";
    print(C, N);

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}



Writing matrix.cu


In [2]:
!nvcc matrix.cu -o matrix

In [4]:
!./matrix

Enter the size of the matrices: 2
Enter the elements of matrix A: 
0 1 2 3
Enter the elements of matrix B: 
2 3 5 7
Matrix A: 
0 1 
2 3 

Matrix B: 
2 3 
5 7 

Thread (0, 0) performed multiplication for C[0][1]
Thread (0, 0) performed multiplication for C[0][0]
Multiplication of matrix A and B: 
5 7 
0 0 

