In [1]:
code = r'''
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Computes C = A * B for N x N integer matrices stored row-major
// (element (r, c) lives at index r * N + c).
//
// Each thread derives one (row, col) position from its 2D block and thread
// indices and writes that single element of C. Threads whose position falls
// outside the N x N range exit without touching memory.
__global__ void multiplyMatrices(int* A, int* B, int* C, int N) {
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to compute for positions past the matrix edge.
    if (row >= N || col >= N) {
        return;
    }

    // Offset of the first element of this thread's row in A and C.
    const int rowStart = row * N;

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    for (int k = 0; k < N; ++k) {
        acc += A[rowStart + k] * B[k * N + col];
    }

    C[rowStart + col] = acc;
}

// Fills the first N * N entries of `matrix` with pseudo-random digits in the
// range 0..9, drawing one value from rand() per entry in index order.
void initializeMatrix(int* matrix, int N) {
    const int total = N * N;
    int* const end = matrix + total;

    for (int* cell = matrix; cell < end; ++cell) {
        *cell = rand() % 10;
    }
}

// Writes an N x N row-major matrix to stdout: one line per row, each value
// followed by a single space, and a blank line after the last row.
void printMatrix(int* matrix, int N) {
    for (int r = 0; r < N; ++r) {
        // Start of row r within the flat array.
        const int* rowData = matrix + r * N;
        for (int c = 0; c < N; ++c) {
            printf("%d ", rowData[c]);
        }
        printf("\n");
    }
    printf("\n");
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads a matrix size N from stdin, fills two N x N matrices with random
// digits, multiplies them on the GPU and prints the operands and the product.
//
// Returns 0 on success and EXIT_FAILURE when the input is invalid or a host
// allocation fails; CUDA runtime failures terminate the process via checkCuda.
int main() {
    int N;
    printf("Enter size of square matrix (N x N): ");

    // scanf reports how many fields it converted; anything other than 1 means
    // N was never written and must not be used.
    if (scanf("%d", &N) != 1 || N <= 0) {
        fprintf(stderr, "\nInvalid matrix size: expected a positive integer.\n");
        return EXIT_FAILURE;
    }

    // The helpers and the kernel index elements with int arithmetic (N * N),
    // so N * N itself has to fit in an int.
    if (N > INT_MAX / N) {
        fprintf(stderr, "\nMatrix size %d is too large.\n", N);
        return EXIT_FAILURE;
    }

    // Widen before multiplying so the byte count is computed in size_t.
    const size_t bytes = static_cast<size_t>(N) * N * sizeof(int);

    int* A = (int*)malloc(bytes);
    int* B = (int*)malloc(bytes);
    int* C = (int*)malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes per matrix failed.\n", bytes);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    initializeMatrix(A, N);
    initializeMatrix(B, N);

    printf("Matrix A:\n");
    printMatrix(A, N);

    printf("Matrix B:\n");
    printMatrix(B, N);

    int *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice), "copy of B to device");

    // 16 x 16 threads per block; round the block count up so the grid covers
    // every element when N is not a multiple of the block edge.
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    multiplyMatrices<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    // A launch does not return a status; launch failures are read back here.
    checkCuda(cudaGetLastError(), "kernel launch");

    // The blocking copy waits for the kernel, so errors raised while it ran
    // are reported by this call.
    checkCuda(cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost), "copy of C to host");

    printf("Resultant Matrix (Multiplication):\n");
    printMatrix(C, N);

    free(A);
    free(B);
    free(C);

    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}

'''
# Save the CUDA program text held in `code` to disk, replacing any existing
# file of the same name.
with open("matrix_multiplication.cu", mode="w") as cu_file:
    cu_file.write(code)

In [3]:
# Compile the generated source with nvcc, targeting the sm_75 architecture.
!nvcc -arch=sm_75 matrix_multiplication.cu -o matrix_multiplication
# Direct invocation is left commented out; the run below pipes the size in.
#!./matrix_multiplication
# Run the binary with "4" on stdin so the size prompt is answered with N = 4.
!echo 4 | ./matrix_multiplication

Enter size of square matrix (N x N): Matrix A:
3 6 7 5 
3 5 6 2 
9 1 2 7 
0 9 3 6 

Matrix B:
0 6 2 6 
1 8 7 9 
2 0 2 3 
7 5 9 2 

Resultant Matrix (Multiplication):
55 91 107 103 
31 68 71 85 
54 97 92 83 
57 102 123 102 

