In [11]:
# Step 1: Write your CUDA code to a file
code = """
#include <iostream>
#include <cuda_runtime.h>
using namespace std;

// Kernel: each thread produces one element of C = A * B, where A, B and C are
// N x N int matrices stored row-major in flat arrays.
// The thread's (x, y) position in the launch grid selects (col, row).
__global__ void matrixMulCUDA(int* A, int* B, int* C, int N) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads whose coordinates fall outside the matrix do nothing.
    if (row >= N || col >= N) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int dot = 0;
    for (int k = 0; k < N; ++k) {
        dot += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = dot;
}

// Writes an N x N row-major matrix to stdout: one line per row, with every
// element followed by a single space.
void printMatrix(int* matrix, int N) {
    for (int r = 0; r < N; ++r) {
        const int* rowStart = matrix + r * N;  // first element of row r
        for (int c = 0; c < N; ++c) {
            cout << rowStart[c] << " ";
        }
        cout << endl;
    }
}

// Reports a failed CUDA runtime call on stdout.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status) {
    if (status == cudaSuccess) {
        return true;
    }
    cout << "CUDA error: " << cudaGetErrorString(status) << endl;
    return false;
}

// Multiplies two fixed 2x2 int matrices on the GPU and prints the product.
// Returns 0 on success, or 1 if any CUDA call fails (the error is printed).
int main() {
    const int N = 2;
    const int BLOCK_EDGE = 16;  // threads per block along each axis
    const size_t bytes = N * N * sizeof(int);

    int h_A[N * N] = {1, 2, 3, 4}; // Matrix A
    int h_B[N * N] = {5, 6, 7, 8}; // Matrix B
    int h_C[N * N] = {0};

    // Start as null so the cleanup at the end is safe on every path:
    // cudaFree on a null pointer performs no operation.
    int* d_A = nullptr;
    int* d_B = nullptr;
    int* d_C = nullptr;

    // Short-circuit evaluation stops at the first failing call.
    bool ok = checkCuda(cudaMalloc(&d_A, bytes))
           && checkCuda(cudaMalloc(&d_B, bytes))
           && checkCuda(cudaMalloc(&d_C, bytes))
           && checkCuda(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice))
           && checkCuda(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    if (ok) {
        dim3 threadsPerBlock(BLOCK_EDGE, BLOCK_EDGE);
        // Round up so the grid covers all N rows and columns.
        const int blocksPerAxis = (N + BLOCK_EDGE - 1) / BLOCK_EDGE;
        dim3 numBlocks(blocksPerAxis, blocksPerAxis);

        matrixMulCUDA<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

        // cudaGetLastError reports launch failures; checking the blocking
        // device-to-host copy also catches errors raised while the kernel ran.
        ok = checkCuda(cudaGetLastError())
          && checkCuda(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));
    }

    // Only print when h_C actually holds the computed product.
    if (ok) {
        cout << "Result matrix C (A * B):" << endl;
        printMatrix(h_C, N);
    }

    // Single cleanup path: device buffers are released on success and failure.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return ok ? 0 : 1;
}
"""

# Save the CUDA source held in `code` as matrix_mul.cu (path is relative to the
# notebook's working directory) so nvcc can compile it in the next cell.
with open("matrix_mul.cu", mode="w") as cuda_source_file:
    cuda_source_file.write(code)


In [12]:
# Compile matrix_mul.cu with native code for sm_60, sm_70 and sm_75, and also
# embed compute_75 PTX so GPUs newer than sm_75 can JIT-compile the kernel
# instead of failing at launch with "no kernel image is available".
!nvcc -gencode=arch=compute_60,code=sm_60 \
      -gencode=arch=compute_70,code=sm_70 \
      -gencode=arch=compute_75,code=sm_75 \
      -gencode=arch=compute_75,code=compute_75 \
      -o matrix_mul matrix_mul.cu


In [13]:
# Run the compiled binary; it prints the product matrix, or a "CUDA error: ..."
# line if the kernel launch fails.
!./matrix_mul


Result matrix C (A * B):
19 22 
43 50 
