In [1]:
# Write CUDA C++ code as a string
cuda_code = r'''
#include <iostream>
#include <iomanip>
#include <cuda_runtime.h>
#include <sys/time.h>
#define N 4  // Matrix size N x N

// Transposes a width x width matrix: the element read from input[y * width + x]
// is written to output[x * width + y].
//
// Expects a 2D launch. Each thread handles the single element addressed by its
// global (x, y) index; threads whose index falls outside the matrix do nothing,
// so the grid may be larger than the matrix.
//
// input  - device pointer to width * width floats (read only by this kernel)
// output - device pointer to width * width floats (written by this kernel)
// width  - number of rows and columns
__global__ void transposeGPU(float* input, float* output, int width) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < width) {
        // Widen before multiplying: in plain int arithmetic y * width overflows
        // as soon as width exceeds 46340.
        size_t idx_in = (size_t)y * width + x;
        size_t idx_out = (size_t)x * width + y;
        output[idx_out] = input[idx_in];
    }
}

// Host reference transpose of a width x width matrix: the element read from
// input[i * width + j] is written to output[j * width + i].
//
// input  - width * width floats (read only)
// output - width * width floats (overwritten); must not alias input
// width  - number of rows and columns; a non-positive width copies nothing
void transposeCPU(float* input, float* output, int width) {
    for (int i = 0; i < width; ++i) {
        for (int j = 0; j < width; ++j) {
            // Widen before multiplying: in plain int arithmetic i * width
            // overflows as soon as width exceeds 46340.
            size_t src = (size_t)i * width + j;
            size_t dst = (size_t)j * width + i;
            output[dst] = input[src];
        }
    }
}

// Returns the time elapsed from `start` to `end`, in seconds.
// The whole-second and microsecond fields are differenced separately, so the
// microsecond part may be negative when `end` has the smaller tv_usec.
double getTimeDiff(timeval start, timeval end) {
    const double wholeSeconds = (double)(end.tv_sec - start.tv_sec);
    const double microseconds = (double)(end.tv_usec - start.tv_usec);
    return wholeSeconds + microseconds / 1e6;
}

// Prints `title` on its own line, then the width * width values of `matrix`
// to std::cout: each value right-aligned in a 6-character field, a line break
// after every `width` values, and one blank line after the matrix.
void printMatrix(const char* title, float* matrix, int width) {
    std::cout << title << std::endl;

    const int total = width * width;
    int printed = 0;
    while (printed < total) {
        std::cout << std::setw(6) << matrix[printed];
        ++printed;
        // A row is complete once the running count is a multiple of width.
        const bool rowDone = (printed % width == 0);
        if (rowDone) {
            std::cout << std::endl;
        }
    }

    std::cout << std::endl;
}

// Reports a failed CUDA runtime call on stderr, tagged with `what`.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) return true;
    std::cerr << "CUDA error: " << cudaGetErrorString(status)
              << " (" << what << ")" << std::endl;
    return false;
}

// Transposes an N x N matrix once on the CPU and once on the GPU, prints the
// input and both results, and reports the wall-clock time of each transpose.
// Returns 0 on success, -1 if any CUDA call fails or the GPU result differs
// from the CPU reference. Device buffers are released on every path.
int main() {
    const size_t bytes = N * N * sizeof(float);
    float hostInput[N*N], hostOutputCPU[N*N], hostOutputGPU[N*N];
    // Null-initialised so the unconditional cudaFree calls at the end are
    // harmless when an allocation never happened.
    float *devInput = nullptr, *devOutput = nullptr;
    timeval start, end;
    double gpuTime = 0.0;

    // Initialize input matrix
    for (int i = 0; i < N*N; ++i)
        hostInput[i] = static_cast<float>(i + 1);

    // CPU transpose
    gettimeofday(&start, nullptr);
    transposeCPU(hostInput, hostOutputCPU, N);
    gettimeofday(&end, nullptr);
    double cpuTime = getTimeDiff(start, end);

    // Allocate device memory and upload the input; stop at the first failure.
    bool ok = cudaOk(cudaMalloc((void**)&devInput, bytes), "cudaMalloc devInput")
           && cudaOk(cudaMalloc((void**)&devOutput, bytes), "cudaMalloc devOutput")
           && cudaOk(cudaMemcpy(devInput, hostInput, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy host to device");

    if (ok) {
        // Kernel configuration: 2x2 threads per block, grid rounded up to cover N.
        dim3 threadsPerBlock(2, 2);
        dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

        // GPU transpose. The timed region covers launch plus completion, so it
        // includes host-side launch overhead, not only kernel execution.
        gettimeofday(&start, nullptr);
        transposeGPU<<<numBlocks, threadsPerBlock>>>(devInput, devOutput, N);
        const cudaError_t syncStatus = cudaDeviceSynchronize();  // Wait for GPU to finish
        gettimeofday(&end, nullptr);
        gpuTime = getTimeDiff(start, end);

        // Execution faults surface through the synchronize call; launch
        // configuration errors surface through cudaGetLastError.
        ok = cudaOk(syncStatus, "kernel execution")
          && cudaOk(cudaGetLastError(), "kernel launch");
    }

    // Copy result back to host
    if (ok) {
        ok = cudaOk(cudaMemcpy(hostOutputGPU, devOutput, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy device to host");
    }

    if (ok) {
        // Display matrices
        printMatrix("Original Matrix:", hostInput, N);
        printMatrix("Transposed Matrix (CPU):", hostOutputCPU, N);
        printMatrix("Transposed Matrix (GPU):", hostOutputGPU, N);

        std::cout << "CPU Time: " << cpuTime << " seconds" << std::endl;
        std::cout << "GPU Time: " << gpuTime << " seconds" << std::endl;

        // A transpose only moves values, so the GPU result must match the CPU
        // reference exactly. Reported on stderr to leave stdout unchanged.
        for (int i = 0; i < N*N; ++i) {
            if (hostOutputGPU[i] != hostOutputCPU[i]) {
                std::cerr << "Mismatch at index " << i << ": CPU " << hostOutputCPU[i]
                          << ", GPU " << hostOutputGPU[i] << std::endl;
                ok = false;
                break;
            }
        }
    }

    // Free memory (also reached after a failure; freeing a null pointer is a no-op).
    cudaFree(devInput);
    cudaFree(devOutput);
    return ok ? 0 : -1;
}
'''

# Save the CUDA code to a file
with open("/content/matrix_transpose.cu", "w") as f:
    f.write(cuda_code)

# Compile the CUDA code using nvcc
!nvcc -arch=sm_75 -std=c++11 /content/matrix_transpose.cu -o /content/matrix_transpose

# Run the compiled CUDA code
!./matrix_transpose

Original Matrix:
     1     2     3     4
     5     6     7     8
     9    10    11    12
    13    14    15    16

Transposed Matrix (CPU):
     1     5     9    13
     2     6    10    14
     3     7    11    15
     4     8    12    16

Transposed Matrix (GPU):
     1     5     9    13
     2     6    10    14
     3     7    11    15
     4     8    12    16

CPU Time: 0 seconds
GPU Time: 0.000162 seconds
