In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [None]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpjtplckhs".


In [None]:
%%cuda
#include <iostream>
// Smoke test for the CUDA toolchain: prints a greeting, then tries to select
// device 0. Returns 0 when the device can be selected, 1 otherwise (after
// printing the CUDA error string to stderr).
int main() {
    std::cout << "Welcome To CUDA\n";

    const cudaError_t status = cudaSetDevice(0);
    if (status == cudaSuccess) {
        return 0;
    }

    std::cerr << "CUDA device not available: " << cudaGetErrorString(status) << std::endl;
    return 1;
}

Welcome To CUDA



In [None]:
%%cuda
#include <iostream>
#include <vector>
#include <chrono>
#include <fstream>
#include <sstream>
#include <cuda_runtime.h>

// Reads a matrix of integers from a text file and appends its rows to `matrix`.
//
// Each non-blank line of the file becomes one row; values on a line are
// separated by whitespace. Parsing of a line stops at the first token that is
// not an integer, keeping the values read so far.
//
// Lines that yield no integers at all (empty lines, whitespace-only lines,
// e.g. a trailing newline at the end of the file) are skipped so that the
// result never contains empty rows.
//
// Parameters:
//   fileName - path of the file to read.
//   matrix   - output; parsed rows are appended (existing content is kept).
//
// If the file cannot be opened, a message is printed to stdout and `matrix`
// is left untouched.
void readMatrixFromFile(const std::string& fileName, std::vector<std::vector<int>>& matrix) {
    std::ifstream file(fileName);
    if (!file.is_open()) {
        std::cout << "Error opening file: " << fileName << std::endl;
        return;
    }

    std::string line;
    while (std::getline(file, line)) {
        std::stringstream ss(line);
        std::vector<int> row;
        int value;
        while (ss >> value) {
            row.push_back(value);
        }
        // An empty row would make the matrix ragged and break every consumer
        // that indexes rows by the width of the first one.
        if (!row.empty()) {
            matrix.push_back(row);
        }
    }
    // `file` is closed by its destructor.
}

// Compares "Result/<fileName>_sequential.txt" with "Result/<fileName>_result.txt"
// line by line, ignoring trailing whitespace (spaces, tabs, CR, LF) on each line.
//
// Returns true only when every line matches. When one file ends before the
// other, the missing lines are treated as empty: extra blank lines at the end
// of one file are tolerated, but any extra line with content makes the files
// differ (a plain "stop at the shorter file" comparison would wrongly report a
// truncated result as identical).
//
// Returns false (after printing a message to stdout) if either file cannot be
// opened.
bool areResultFilesIdentical(const std::string& fileName) {
    const std::string resultFileName = "Result/" + fileName + "_result.txt";
    const std::string sequentialResultFile = "Result/" + fileName + "_sequential.txt";

    std::ifstream file1(sequentialResultFile);
    std::ifstream file2(resultFileName);

    if (!file1.is_open() || !file2.is_open()) {
        std::cout << "Error opening files." << std::endl;
        return false;
    }

    // find_last_not_of returns npos for an all-whitespace string; npos + 1
    // wraps to 0, so such a line is erased completely.
    auto trimRight = [](std::string& s) {
        s.erase(s.find_last_not_of(" \t\r\n") + 1);
    };

    std::string line1, line2;
    while (true) {
        // Read from both files unconditionally so neither side is skipped by
        // short-circuit evaluation.
        const bool has1 = static_cast<bool>(std::getline(file1, line1));
        const bool has2 = static_cast<bool>(std::getline(file2, line2));

        if (!has1 && !has2) {
            return true;  // both exhausted without finding a difference
        }

        // A failed getline may leave the previous content in the string.
        if (!has1) line1.clear();
        if (!has2) line2.clear();

        trimRight(line1);
        trimRight(line2);

        if (line1 != line2) {
            return false;
        }
    }
}

// Writes `matrix` to `fileName` as text: one row per line, every value
// followed by a single space, each row terminated by '\n'.
// If the file cannot be opened, a message is printed to stdout and nothing
// is written.
void writeMatrixToFile(const std::string& fileName, const std::vector<std::vector<int>>& matrix) {
    std::ofstream out(fileName);
    if (!out.is_open()) {
        std::cout << "Error opening file: " << fileName << std::endl;
        return;
    }

    for (std::size_t r = 0; r < matrix.size(); ++r) {
        const std::vector<int>& cells = matrix[r];
        for (std::size_t c = 0; c < cells.size(); ++c) {
            out << cells[c] << " ";
        }
        out << '\n';
    }

    out.close();
}

// Applies a K x K integer filter to an N x M integer matrix.
//
// Layout: 1D launch; the thread with global index `row`
// (blockIdx.x * blockDim.x + threadIdx.x) computes every column of output row
// `row`. Threads whose index is >= N do nothing.
//
// matrix - input, N * M ints, row-major.
// result - output, N * M ints, row-major (must not alias `matrix`, since
//          neighbouring rows are read while rows are being written).
// kernel - filter weights, K * K ints, row-major.
//
// Neighbour coordinates that fall outside the matrix are clamped to the
// nearest valid row/column, i.e. border values are repeated.
__global__ void convolveKernel(int* matrix, int* result, int* kernel, int N, int M, int K) {
  const int row = blockIdx.x * blockDim.x + threadIdx.x;

  if (row < N) {
      const int half = K / 2;
      int* outRow = result + row * M;

      for (int col = 0; col < M; ++col) {
          int acc = 0;
          for (int ki = 0; ki < K; ++ki) {
              // The source row depends only on ki, so clamp it once per filter row.
              const int srcRow = min(max(row + ki - half, 0), N - 1);
              const int* src = matrix + srcRow * M;
              const int* weights = kernel + ki * K;
              for (int kj = 0; kj < K; ++kj) {
                  const int srcCol = min(max(col + kj - half, 0), M - 1);
                  acc += src[srcCol] * weights[kj];
              }
          }
          outRow[col] = acc;
      }
  }
}

// Convolves `matrix` in place with `kernel` on the GPU, using convolveKernel
// (one thread per matrix row).
//
// Parameters:
//   matrix - N x M input; on success its first M values of every row are
//            overwritten with the filtered result. Every row must have at
//            least as many elements as the first row.
//   kernel - filter weights; K = kernel.size(), and every row must have at
//            least K elements (the leading K x K part is used).
//
// On any failure (empty input, malformed rows, CUDA error) a message is
// written to stderr, all device memory is released, and `matrix` is left
// unchanged.
void CUDARowsMethod(std::vector<std::vector<int>>& matrix, const std::vector<std::vector<int>>& kernel) {
  // matrix[0] below would be undefined on an empty matrix (e.g. when the
  // input file could not be read).
  if (matrix.empty() || matrix[0].empty() || kernel.empty()) {
      std::cerr << "CUDARowsMethod: empty matrix or kernel, nothing to do." << std::endl;
      return;
  }

  const int N = static_cast<int>(matrix.size());
  const int M = static_cast<int>(matrix[0].size());
  const int K = static_cast<int>(kernel.size());

  // Reject rows that are too short: flattening them would read out of bounds.
  for (const auto& row : matrix) {
      if (row.size() < static_cast<std::size_t>(M)) {
          std::cerr << "CUDARowsMethod: matrix rows have inconsistent lengths." << std::endl;
          return;
      }
  }
  for (const auto& row : kernel) {
      if (row.size() < static_cast<std::size_t>(K)) {
          std::cerr << "CUDARowsMethod: kernel must be square (K x K)." << std::endl;
          return;
      }
  }

  const std::size_t matrixCount = static_cast<std::size_t>(N) * M;
  const std::size_t kernelCount = static_cast<std::size_t>(K) * K;
  const std::size_t matrixBytes = matrixCount * sizeof(int);
  const std::size_t kernelBytes = kernelCount * sizeof(int);

  // Flatten host data to contiguous row-major buffers for the transfers.
  std::vector<int> h_matrix(matrixCount), h_kernel(kernelCount), h_result(matrixCount, 0);
  for (int i = 0; i < N; i++) {
      for (int j = 0; j < M; j++) {
          h_matrix[static_cast<std::size_t>(i) * M + j] = matrix[i][j];
      }
  }
  for (int i = 0; i < K; i++) {
      for (int j = 0; j < K; j++) {
          h_kernel[static_cast<std::size_t>(i) * K + j] = kernel[i][j];
      }
  }

  // Reports a failed CUDA call; returns true when the call succeeded.
  auto ok = [](cudaError_t err, const char* what) {
      if (err == cudaSuccess) {
          return true;
      }
      std::cerr << what << ": " << cudaGetErrorString(err) << std::endl;
      return false;
  };

  // Initialised to nullptr so the unconditional cudaFree calls below are safe
  // even when an allocation failed part-way through.
  int *d_matrix = nullptr, *d_kernel = nullptr, *d_result = nullptr;

  // Short-circuit chain: the first failing step stops everything after it.
  bool success =
      ok(cudaMalloc(&d_matrix, matrixBytes), "CUDA Error (cudaMalloc matrix)") &&
      ok(cudaMalloc(&d_kernel, kernelBytes), "CUDA Error (cudaMalloc kernel)") &&
      ok(cudaMalloc(&d_result, matrixBytes), "CUDA Error (cudaMalloc result)") &&
      ok(cudaMemcpy(d_matrix, h_matrix.data(), matrixBytes, cudaMemcpyHostToDevice),
         "CUDA Error (copy matrix to device)") &&
      ok(cudaMemcpy(d_kernel, h_kernel.data(), kernelBytes, cudaMemcpyHostToDevice),
         "CUDA Error (copy kernel to device)");

  if (success) {
      const int threadsPerBlock = 256;
      const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

      convolveKernel<<<blocksPerGrid, threadsPerBlock>>>(d_matrix, d_result, d_kernel, N, M, K);

      // cudaGetLastError catches launch-configuration errors; the synchronize
      // surfaces faults that happen while the kernel is running.
      success =
          ok(cudaGetLastError(), "CUDA Kernel Error") &&
          ok(cudaDeviceSynchronize(), "CUDA Error (kernel execution)") &&
          ok(cudaMemcpy(h_result.data(), d_result, matrixBytes, cudaMemcpyDeviceToHost),
             "CUDA Error (copy result to host)");
  }

  // Release device memory on every path, success or failure.
  cudaFree(d_matrix);
  cudaFree(d_result);
  cudaFree(d_kernel);

  if (!success) {
      return;
  }

  for (int i = 0; i < N; i++) {
      for (int j = 0; j < M; j++) {
          matrix[i][j] = h_result[static_cast<std::size_t>(i) * M + j];
      }
  }
}

// Runs the row-parallel CUDA convolution on a test matrix and checks the
// output against the stored sequential result.
//
// Usage: program [datasetName]
//   datasetName - base name of the data set (default "matrix_100_100").
//                 Input is read from  TestData/<datasetName>.txt,
//                 output is written to Result/<datasetName>_result.txt and
//                 compared with       Result/<datasetName>_sequential.txt.
//
// Returns 0 after a completed run (whether or not the results match) and 1
// when no input data could be loaded.
int main(int argc, char* argv[]) {
    std::string fileName = "matrix_100_100";
    if (argc > 1) {
        fileName = argv[1];
    }

    std::vector<std::vector<int>> matrix;
    readMatrixFromFile("TestData/" + fileName + ".txt", matrix);

    // Without input there is nothing to convolve; continuing would only
    // produce an empty result file and a meaningless comparison.
    if (matrix.empty()) {
        std::cerr << "No matrix data loaded from TestData/" << fileName << ".txt" << std::endl;
        return 1;
    }

    std::vector<std::vector<int>> kernel = {
      {1, 0, 1},
      {0, 2, 0},
      {-1, 0, -1}
    };

    // The measured interval covers the whole CUDARowsMethod call (device
    // allocation, host<->device copies and the kernel), not the kernel alone.
    auto startTime = std::chrono::high_resolution_clock::now();

    CUDARowsMethod(matrix, kernel);

    auto endTime = std::chrono::high_resolution_clock::now();
    auto elapsed_time_ms = std::chrono::duration<double, std::milli>(endTime - startTime).count();
    std::cout << "Execution time: " << elapsed_time_ms << " ms" << std::endl;

    std::string resultFileName = "Result/" + fileName + "_result.txt";
    writeMatrixToFile(resultFileName, matrix);

    bool result = areResultFilesIdentical(fileName);
    if (result) {
      std::cout << "Identical" << std::endl;
    } else {
      std::cout << "Not identical." << std::endl;
    }

    return 0;
}

Execution time: 185.963 ms
Identical

