In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp /content/drive/MyDrive/Public/PPD_7/input_10000.txt /input_10000.txt
!cp /content/drive/MyDrive/Public/PPD_7/output_10000.txt /output_10000.txt
!cp /content/drive/MyDrive/Public/PPD_7/output_1000.txt /output_1000.txt
!cp /content/drive/MyDrive/Public/PPD_7/output_100.txt /output_100.txt
!cp /content/drive/MyDrive/Public/PPD_7/output_10.txt /output_10.txt

In [4]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [5]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-3mryqlhz
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-3mryqlhz
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0d2ab99cccbbc682722e708515fe9c4cfc50185a
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4716 sha256=012db2cd26b66b69e49168cb343a4e46fbab363d0ab48d81e5756357f5aaa978
  Stored in directory: /tmp/pip-ephem-wheel-cache-nmov199i/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [6]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [25]:
%%cu
#include <cstdio>
#include <iostream>
#include <fstream>
#include <chrono>

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cooperative_groups.h>

using namespace std;

// Device-only helper (invoked from kernels): restricts x to the range [a, b].
// Values at or below a yield a; values at or above b yield b.
__device__ int clamp(int x, int a, int b)
{
    if (x <= a) return a;
    if (x >= b) return b;
    return x;
}

/*
 * In-place 3x3 convolution of an n x n row-major integer matrix.
 *
 * Launch requirements: the kernel calls cooperative_groups::this_grid().sync(),
 * so it must be started with cudaLaunchCooperativeKernel. The launch is expected
 * to be 1-D with `blocks_count` blocks of `threads_per_block` threads; those two
 * arguments are what the kernel uses to split the work.
 *
 * Work split: each thread owns a contiguous run of n*n/(blocks_count*threads_per_block)+1
 * flat element indices. Phase 1 copies the 3x3 neighbourhood of every owned
 * element (indices clamped at the matrix edges) into a per-thread buffer.
 * After the grid-wide barrier, phase 2 overwrites the owned elements with the
 * weighted sums, so no thread ever reads a value that was already replaced.
 *
 * matrix  - device pointer, n*n ints, read and then overwritten
 * kernel  - device pointer, 9 weights in row-major order, read by every thread
 * n       - matrix side length
 * k       - unused here; the neighbourhood size is fixed at 3x3
 */
__global__ void convolution(int* matrix, int* kernel, int n, int k, int blocks_count, int threads_per_block)
{
    const int max_elems_per_thread = 3000;
    int border[max_elems_per_thread][9];

    // 64-bit arithmetic: n*n and thread_id*elems_count may not fit in an int.
    const long long total = (long long)n * n;
    const long long workers = (long long)blocks_count * threads_per_block;
    if(workers <= 0) return; // same value in every thread, so nobody is left waiting at the barrier

    const long long elems_count = total / workers + 1;
    const long long thread_id = (long long)blockIdx.x * threads_per_block + threadIdx.x;

    // The buffer has a fixed capacity. The test depends only on kernel arguments,
    // so every thread of the grid takes the same branch and none reaches grid.sync().
    if(elems_count > max_elems_per_thread)
    {
        if(thread_id == 0)
            printf("convolution: %lld elements per thread exceed the buffer capacity of %d\n",
                   elems_count, max_elems_per_thread);
        return;
    }

    const long long elem_start = thread_id * elems_count;

    // Number of elements this thread really owns (trailing threads may own fewer or none).
    long long owned = total - elem_start;
    if(owned > elems_count) owned = elems_count;
    if(owned < 0) owned = 0;

    // Phase 1: snapshot the neighbourhood of every owned element.
    for(long long e=0;e<owned;e++)
    {
        const int row = (int)((elem_start+e)/n);
        const int col = (int)((elem_start+e)%n);
        for(int ii=0;ii<3;ii++)
          for(int jj=0;jj<3;jj++)
            border[e][3*ii+jj] = matrix[(long long)n*clamp(row+ii-1, 0, n-1) + clamp(col+jj-1, 0, n-1)];
    }

    // Every thread must reach this barrier, including threads that own nothing.
    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
    grid.sync();

    // Phase 2: all reads are done, the matrix can be overwritten.
    for(long long e=0;e<owned;e++)
    {
        int sum = 0;
        for(int w=0;w<9;w++)
          sum += border[e][w] * kernel[w];

        matrix[elem_start+e] = sum;
    }
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if(status == cudaSuccess) return true;
    cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<"\n";
    return false;
}

/*
 * Host driver for the cooperative in-place convolution.
 *
 * Reads an n x n integer matrix from /input_10000.txt, runs `convolution`
 * with one 1024-thread block per multiprocessor through
 * cudaLaunchCooperativeKernel, writes the result to /output.txt and prints
 * the read / CUDA / write timings.
 *
 * Returns 0 on success and 1 when a file cannot be used or a CUDA call fails.
 */
int main()
{
    int n = 10000;
    int k = 3;

    // Sizes are computed in size_t so they cannot overflow int.
    const size_t elem_count = (size_t)n * n;
    const size_t matrix_bytes = elem_count * sizeof(int);
    const size_t kernel_bytes = (size_t)k * k * sizeof(int);

    int* matrix = new int[elem_count];
    int kernel[9] = { 2, 1, 1, 1, 1, 0, 0, 1, 2};

    auto t_read_start = std::chrono::high_resolution_clock::now();

    ifstream f("/input_10000.txt");
    if(!f)
    {
        cerr<<"Cannot open /input_10000.txt\n";
        delete[] matrix;
        return 1;
    }

    // The file is read in row-major order, so a flat loop fills the matrix.
    for(size_t i=0;i<elem_count;i++) f>>matrix[i];

    if(!f)
    {
        cerr<<"/input_10000.txt does not contain "<<elem_count<<" integers\n";
        delete[] matrix;
        return 1;
    }

    f.close();

    auto t_read_end = std::chrono::high_resolution_clock::now();

    for(int i=0;i<k;i++)
    {
        for(int j=0;j<k;j++)
        {
            cout<<kernel[i*k+j]<<" ";
        }
        cout<<"\n";
    }

    double read_time = std::chrono::duration<double, std::milli>(t_read_end - t_read_start).count();

    int* cuda_matrix = nullptr;
    int* cuda_kernel = nullptr;

    // Events used to measure the GPU time.
    cudaEvent_t start = nullptr, end = nullptr;
    float time = 0;

    // Query the device before timing starts so the query is not measured.
    int dev = 0;
    cudaDeviceProp deviceProp;
    bool ok = cuda_ok(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties");
    if(ok && !deviceProp.cooperativeLaunch)
    {
        cerr<<"Device does not support cooperative kernel launches\n";
        ok = false;
    }

    // One block per multiprocessor; the kernel receives the same numbers to split the work.
    int blocks_count = ok ? deviceProp.multiProcessorCount : 0;
    int threads_per_block = 1024;

    // Allocate device memory, upload the inputs and start the timer.
    ok = ok
      && cuda_ok(cudaMalloc(&cuda_matrix, matrix_bytes), "cudaMalloc(matrix)")
      && cuda_ok(cudaMalloc(&cuda_kernel, kernel_bytes), "cudaMalloc(kernel)")
      && cuda_ok(cudaMemcpy(cuda_matrix, matrix, matrix_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(matrix -> device)")
      && cuda_ok(cudaMemcpy(cuda_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(kernel -> device)")
      && cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")
      && cuda_ok(cudaEventCreate(&end), "cudaEventCreate(end)")
      && cuda_ok(cudaEventRecord(start), "cudaEventRecord(start)");

    if(ok)
    {
        void* args[] = { &cuda_matrix, &cuda_kernel, &n, &k, &blocks_count, &threads_per_block};

        cout<<"MP count ="<<deviceProp.multiProcessorCount<<"\n";

        // The timed region covers the kernel and the copies back to the host.
        // The launch status is checked: if the kernel never ran, the copy would
        // just return the unmodified input.
        ok = cuda_ok(cudaLaunchCooperativeKernel((void*)convolution, blocks_count, threads_per_block, (void**)args), "cudaLaunchCooperativeKernel")
          && cuda_ok(cudaMemcpy(matrix, cuda_matrix, matrix_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(matrix -> host)")
          && cuda_ok(cudaMemcpy(kernel, cuda_kernel, kernel_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(kernel -> host)")
          && cuda_ok(cudaEventRecord(end), "cudaEventRecord(end)")
          && cuda_ok(cudaEventSynchronize(end), "cudaEventSynchronize(end)")
          && cuda_ok(cudaEventElapsedTime(&time, start, end), "cudaEventElapsedTime");
    }

    double write_time = 0;

    if(ok)
    {
        cout<<"Count = "<<kernel[0]<<"\n";

        auto t_write_start = std::chrono::high_resolution_clock::now();

        ofstream g("/output.txt");
        if(!g)
        {
            cerr<<"Cannot open /output.txt for writing\n";
            ok = false;
        }
        else
        {
            for(int i=0;i<n;i++)
            {
                for(int j=0;j<n;j++)
                {
                    g<<matrix[(size_t)i*n+j]<<" ";
                }
                g<<"\n";
            }
            g.close();
        }

        auto t_write_end = std::chrono::high_resolution_clock::now();

        write_time = std::chrono::duration<double, std::milli>(t_write_end - t_write_start).count();
    }

    // Release device resources; this runs on the failure paths as well.
    if(start) cudaEventDestroy(start);
    if(end) cudaEventDestroy(end);
    cudaFree(cuda_matrix);
    cudaFree(cuda_kernel);

    // Release host memory
    delete[] matrix;

    if(!ok) return 1;

    cout << "Read time : "<<read_time<<"ms \n";
    cout << "CUDA time : "<<time<<"ms \n";
    cout << "Write time : "<<write_time<<"ms \n";
    cout << "Total time : "<<read_time+time+write_time<<"ms \n";

    return 0;
}



2 1 1 
1 1 0 
0 1 2 
MP count =40
Count = 2
Read time : 7381.07ms 
CUDA time : 475.08ms 
Write time : 6498.25ms 
Total time : 14354.4ms 



In [12]:
!echo `sha1sum /output.txt`
!echo `sha1sum /output_10.txt`
!echo `sha1sum /output_100.txt`
!echo `sha1sum /output_1000.txt`
!echo `sha1sum /output_10000.txt`

2c9ee71bd1651bbade8fa828bd4b3f2c4a987197 /output.txt
3c5348942ead39f0494fe25b33780ffb039d7de8 /output_10.txt
a505c42cb05c31291f0dc38efd759d7c50ccb3c4 /output_100.txt
2c9ee71bd1651bbade8fa828bd4b3f2c4a987197 /output_1000.txt
dd7c11c700a4d1db4fe9d526d3501c518d15b7ab /output_10000.txt


In [None]:
cat /output.txt | head -n 2

21 33 47 50 32 17 20 37 59 27 34 39 35 48 53 42 46 26 29 43 50 57 48 46 41 48 29 10 32 48 51 28 29 35 31 29 49 37 52 31 46 23 27 37 27 41 41 19 37 46 43 43 43 69 56 35 36 30 45 67 53 38 30 58 42 54 40 25 23 35 46 21 15 15 34 34 34 55 54 58 37 31 23 37 27 22 30 53 37 33 37 30 33 53 32 38 41 57 50 53 34 36 39 53 50 43 38 48 44 33 41 46 35 46 59 50 42 56 55 43 30 34 46 51 45 52 40 42 53 39 39 58 56 38 35 31 39 49 36 36 33 23 40 48 69 51 53 50 34 46 69 66 70 61 56 48 36 15 24 41 17 15 23 32 38 47 19 37 34 32 29 36 38 58 42 57 31 36 35 39 39 35 47 48 44 57 61 50 40 14 26 41 29 21 34 48 52 72 51 31 24 46 41 40 37 44 25 41 46 21 41 33 45 42 60 43 49 46 45 24 27 44 58 44 16 21 29 22 37 58 46 61 46 42 45 45 56 63 50 28 30 38 29 40 45 45 52 53 32 39 27 51 42 44 44 42 27 42 50 27 40 52 63 61 28 22 34 33 55 30 31 22 14 23 30 31 48 54 40 39 51 41 45 56 52 37 53 38 39 41 41 56 66 60 46 42 66 54 48 53 54 45 45 37 42 48 33 30 32 32 39 46 46 37 24 46 55 43 40 45 49 35 49 67 62 47 31 39 29 16 43 39 36 1

In [26]:
%%cu
#include <cstdio>
#include <iostream>
#include <fstream>
#include <chrono>

#include <cuda_runtime_api.h>
#include <cuda.h>
#include <cooperative_groups.h>

using namespace std;

// GPU-side helper, hence __device__: keeps x inside [a, b] by returning the
// nearer bound when x lies at or beyond it.
__device__ int clamp(int x, int a, int b)
{
    int limited = x;
    if (x <= a) {
        limited = a;
    } else if (x >= b) {
        limited = b;
    }
    return limited;
}

/*
 * 3x3 convolution of an n x n row-major integer matrix, one output element per thread.
 *
 * Expects a 1-D launch whose total thread count is at least n*n; surplus
 * threads in the last block exit at the bounds check.
 *
 * matrix   - device pointer, n*n input values (only read)
 * kernel   - device pointer, 9 weights in row-major order, read by every thread
 * n        - matrix side length
 * k        - unused here; the neighbourhood size is fixed at 3x3
 * rezultat - device pointer, n*n output values; must be a different buffer
 *            than `matrix`, because neighbours are read while other threads write
 */
__global__ void convolution(int* matrix, int* kernel, int n, int k, int* rezultat)
{
    // 64-bit flat index: blockIdx.x*blockDim.x and n*n can exceed the 32-bit range.
    const long long cell = (long long)blockIdx.x*blockDim.x + threadIdx.x;
    if(cell >= (long long)n*n) return;

    const int row = (int)(cell / n);
    const int col = (int)(cell % n);

    // Weighted sum over the neighbourhood; coordinates outside the matrix are
    // clamped to the nearest edge element.
    int s = 0;
    for(int ii=0;ii<3;ii++)
      for(int jj=0;jj<3;jj++)
        s += kernel[3*ii+jj] * matrix[(long long)n*clamp(row+ii-1, 0, n-1) + clamp(col+jj-1, 0, n-1)];

    rezultat[cell] = s;
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if(status == cudaSuccess) return true;
    cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<"\n";
    return false;
}

/*
 * Host driver for the one-thread-per-element convolution.
 *
 * Reads an n x n integer matrix from /input_10000.txt, runs `convolution`
 * into a separate result buffer, writes the result to /output.txt and prints
 * the read / CUDA / write timings.
 *
 * Returns 0 on success and 1 when a file cannot be used or a CUDA call fails.
 */
int main()
{
    int n = 10000;
    int k = 3;

    // Sizes are computed in size_t so they cannot overflow int.
    const size_t elem_count = (size_t)n * n;
    const size_t matrix_bytes = elem_count * sizeof(int);
    const size_t kernel_bytes = (size_t)k * k * sizeof(int);

    int* matrix = new int[elem_count];
    int* rezultat = new int[elem_count];
    int kernel[9] = { 2, 1, 1, 1, 1, 0, 0, 1, 2};

    auto t_read_start = std::chrono::high_resolution_clock::now();

    ifstream f("/input_10000.txt");
    if(!f)
    {
        cerr<<"Cannot open /input_10000.txt\n";
        delete[] matrix;
        delete[] rezultat;
        return 1;
    }

    // The file is read in row-major order, so a flat loop fills the matrix.
    for(size_t i=0;i<elem_count;i++) f>>matrix[i];

    if(!f)
    {
        cerr<<"/input_10000.txt does not contain "<<elem_count<<" integers\n";
        delete[] matrix;
        delete[] rezultat;
        return 1;
    }

    f.close();

    auto t_read_end = std::chrono::high_resolution_clock::now();
    double read_time = std::chrono::duration<double, std::milli>(t_read_end - t_read_start).count();

    int* cuda_matrix = nullptr;
    int* cuda_kernel = nullptr;
    int* cuda_rezultat = nullptr;

    // Events used to measure the GPU time.
    cudaEvent_t start = nullptr, end = nullptr;
    float time = 0;

    // One thread per matrix element; ceil-division covers the partial last block.
    const int blockSize = 1024;
    const int gridSize = (int)((elem_count + blockSize - 1) / blockSize);

    // Allocate device memory, upload the inputs and start the timer.
    bool ok = cuda_ok(cudaMalloc(&cuda_matrix, matrix_bytes), "cudaMalloc(matrix)")
           && cuda_ok(cudaMalloc(&cuda_kernel, kernel_bytes), "cudaMalloc(kernel)")
           && cuda_ok(cudaMalloc(&cuda_rezultat, matrix_bytes), "cudaMalloc(rezultat)")
           && cuda_ok(cudaMemcpy(cuda_matrix, matrix, matrix_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(matrix -> device)")
           && cuda_ok(cudaMemcpy(cuda_kernel, kernel, kernel_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(kernel -> device)")
           && cuda_ok(cudaEventCreate(&start), "cudaEventCreate(start)")
           && cuda_ok(cudaEventCreate(&end), "cudaEventCreate(end)")
           && cuda_ok(cudaEventRecord(start), "cudaEventRecord(start)");

    if(ok)
    {
        convolution<<<gridSize, blockSize>>>(cuda_matrix, cuda_kernel, n, k, cuda_rezultat);

        // A kernel launch returns no status itself; configuration errors are read
        // with cudaGetLastError. The timed region covers the kernel and the copies
        // back to the host.
        ok = cuda_ok(cudaGetLastError(), "convolution launch")
          && cuda_ok(cudaMemcpy(rezultat, cuda_rezultat, matrix_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(rezultat -> host)")
          && cuda_ok(cudaMemcpy(kernel, cuda_kernel, kernel_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(kernel -> host)")
          && cuda_ok(cudaEventRecord(end), "cudaEventRecord(end)")
          && cuda_ok(cudaEventSynchronize(end), "cudaEventSynchronize(end)")
          && cuda_ok(cudaEventElapsedTime(&time, start, end), "cudaEventElapsedTime");
    }

    double write_time = 0;

    if(ok)
    {
        auto t_write_start = std::chrono::high_resolution_clock::now();

        ofstream g("/output.txt");
        if(!g)
        {
            cerr<<"Cannot open /output.txt for writing\n";
            ok = false;
        }
        else
        {
            for(int i=0;i<n;i++)
            {
                for(int j=0;j<n;j++)
                {
                    g<<rezultat[(size_t)i*n+j]<<" ";
                }
                g<<"\n";
            }
            g.close();
        }

        auto t_write_end = std::chrono::high_resolution_clock::now();

        write_time = std::chrono::duration<double, std::milli>(t_write_end - t_write_start).count();
    }

    // Release device resources, including the result buffer; this runs on the
    // failure paths as well.
    if(start) cudaEventDestroy(start);
    if(end) cudaEventDestroy(end);
    cudaFree(cuda_matrix);
    cudaFree(cuda_kernel);
    cudaFree(cuda_rezultat);

    // Release host memory
    delete[] matrix;
    delete[] rezultat;

    if(!ok) return 1;

    cout << "Read time : "<<read_time<<"ms \n";
    cout << "CUDA time : "<<time<<"ms \n";
    cout << "Write time : "<<write_time<<"ms \n";
    cout << "Total time : "<<read_time+time+write_time<<"ms \n";

    return 0;
}



Read time : 5313.73ms 
CUDA time : 292.395ms 
Write time : 9165.47ms 
Total time : 14771.6ms 

