Instalacja środowiska

In [None]:
# Environment setup cell: removes any preinstalled CUDA/NVIDIA packages,
# installs CUDA 9.2 from NVIDIA's local repository package, then installs and
# loads the nvcc4jupyter plugin.
# -y keeps apt non-interactive (a notebook cell has no terminal to answer the
# confirmation prompt); the package globs are quoted so the shell passes them
# to apt unchanged instead of expanding them against files in the working dir.
!apt-get -y --purge remove cuda 'nvidia*' 'libnvidia-*'
# xargs -r: do not invoke dpkg at all when no cuda-* package is installed.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
!apt-get -y remove 'cuda-*'
!apt -y autoremove
!apt-get update

# Download and register the local CUDA 9.2 repository, then install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get -y install cuda-9.2

!nvcc --version
# GitHub no longer serves the unauthenticated git:// protocol; clone over HTTPS.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

%load_ext nvcc_plugin

In [7]:
# Install the nvcc4jupyter plugin and load its IPython extension.
# GitHub no longer serves the unauthenticated git:// protocol; clone over HTTPS.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-j3zn07gj
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-j3zn07gj
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [8]:
%%cu
#include <chrono>  // obsługa mierzenia czasu w C++
#include <iostream> 
#include <vector>

/**
 * Computes Mandelbrot escape-iteration counts on the CPU.
 *
 * The rectangle [X0, X1] x [Y0, Y1] of the complex plane is sampled on a
 * POZ x PION pixel grid. Rows start at Y1 and step downwards by
 * (Y1 - Y0) / PION; columns start at X0 and step right by (X1 - X0) / POZ.
 *
 * @param X0, Y0  lower-left corner of the sampled region
 * @param X1, Y1  upper-right corner of the sampled region
 * @param POZ     number of pixels horizontally (row length)
 * @param PION    number of pixels vertically (row count)
 * @param ITER    maximum number of iterations per point
 * @param Mandel  output buffer of at least POZ * PION ints, written row-major:
 *                Mandel[row * POZ + col] receives the iteration count
 * @return always 1
 */
int cpuMandelbrot(double X0, double Y0, double X1, double Y1, int POZ, int PION, int ITER,int *Mandel ){
    const double stepRe = (X1-X0)/POZ;
    const double stepIm = (Y1-Y0)/PION;

    for (int row = 0; row < PION; ++row) {
        // Imaginary part of c for this pixel row
        const double cIm = Y1 - row * stepIm;
        int *rowOut = Mandel + row*POZ;

        for (int col = 0; col < POZ; ++col) {
            // Real part of c for this pixel column
            const double cRe = X0 + col * stepRe;

            double zRe = 0.0;
            double zIm = 0.0;
            int count = 0;

            // Iterate z <- z^2 + c until |z|^2 reaches 4 or ITER steps are done
            for (; count < ITER; ++count) {
                if (!(zRe*zRe + zIm*zIm < 2*2)) {
                    break;
                }
                const double nextRe = zRe*zRe - zIm*zIm + cRe;
                zIm = 2*zRe*zIm + cIm;
                zRe = nextRe;
            }

            rowOut[col] = count;
        }
    }
    return 1;
}

/**
 * CPU benchmark driver.
 *
 * Runs cpuMandelbrot 25 times on a 3000 x 3000 grid covering [-1, 1] x [-1, 1]
 * with at most 256 iterations per point, measures each run with
 * std::chrono::steady_clock, and prints "CPU" followed by the run times in
 * seconds as a comma-separated list.
 *
 * @return 0
 */
int main() {
  // Computation region: {X0,Y0} is the lower-left corner
  const double X0=atof("-1.");
  const double Y0=atof("-1.");

  // {X1,Y1} is the upper-right corner
  const double X1=atof("1.");
  const double Y1=atof("1.");

  // Image size in pixels: {POZ,PION} = {horizontal, vertical}
  const int POZ=atoi("3000");
  const int PION=atoi("3000");

  // Maximum number of iterations per sampled point
  const int ITER=atoi("256");

  // Result buffer on the CPU. std::vector releases the memory automatically;
  // the previous malloc'd buffer was never freed nor checked for failure.
  std::vector<int> host_mandel(static_cast<size_t>(POZ) * PION);

  const int num_of_executions = 25;

  std::vector<double> cpu_durations;
  cpu_durations.reserve(num_of_executions);

  for (int i = 0; i < num_of_executions; i++) {
    // start the measurement
    const auto begin = std::chrono::steady_clock::now();

    cpuMandelbrot(X0,Y0,X1,Y1,POZ,PION,ITER,host_mandel.data());

    // stop the measurement
    const auto finish = std::chrono::steady_clock::now();

    // duration<double> with the default period counts seconds directly
    const std::chrono::duration<double> elapsed = finish - begin;
    cpu_durations.push_back(elapsed.count());
  }

  std::cout << "CPU" << std::endl;
  std::string cpu_string_result = "";
  // Iterate as double: narrowing to float first would drop digits before formatting
  for (double result : cpu_durations) {
    cpu_string_result.append(std::to_string(result) + ", ");
  }
  std::cout << cpu_string_result << std::endl;

  return 0;
}

CPU
7.401653, 7.245220, 7.214851, 7.231353, 7.191373, 7.259069, 7.272729, 7.251545, 7.222718, 7.232304, 7.239226, 7.202899, 7.214046, 7.190893, 7.247749, 7.265268, 7.348147, 7.361453, 7.366894, 7.342649, 7.271054, 7.339664, 7.281950, 7.373431, 7.425019, 



In [9]:
%%cu
#include <iostream> 
#include <vector>

/**
 * CUDA kernel: computes the Mandelbrot escape-iteration count for one pixel per thread.
 *
 * Launch layout: 1D grid of 1D blocks. The flat thread index
 * threadIdx.x + blockIdx.x * blockDim.x selects the pixel; threads whose index
 * is >= POZ * PION do nothing, so the grid may be larger than the image.
 * The kernel uses no shared memory.
 *
 * The flat index is mapped row-major with row length POZ (the same layout
 * cpuMandelbrot writes): column = idx % POZ, row = idx / POZ. Rows start at Y1
 * and step downwards, columns start at X0 and step right.
 *
 * @param X0, Y0  lower-left corner of the sampled region
 * @param X1, Y1  upper-right corner of the sampled region
 * @param POZ     number of pixels horizontally (row length)
 * @param PION    number of pixels vertically (row count)
 * @param ITER    maximum number of iterations per point
 * @param Mandel  device buffer of at least POZ * PION ints; Mandel[idx] receives the count
 */
__global__ void cudaMandelbrot(double X0, double Y0, double X1, double Y1, int POZ, int PION, int ITER,int *Mandel){
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;

  // Only threads mapped onto the sampled area do any work
  if (idx >= PION*POZ) {
    return;
  }

  const double dX=(X1-X0)/(POZ);
  const double dY=(Y1-Y0)/(PION);

  // Map the linear thread index onto the 2D grid. The row must be derived from
  // the row length POZ; dividing by PION is only correct for square images.
  const int col = idx % POZ;
  const int row = idx / POZ;

  const double c_x0 = X0 + col * dX;
  const double c_y0 = Y1 - row * dY;

  double x = 0.0;
  double y = 0.0;
  int i = 0;

  // Iterate z <- z^2 + c until |z|^2 reaches 4 or ITER steps are done
  while (x*x + y*y < 2*2 && i < ITER) {
    const double xtemp = x*x - y*y + c_x0;
    y = 2*x*y + c_y0;
    x = xtemp;
    i += 1;
  }

  Mandel[idx] = i;
}


// Returns true when `status` is cudaSuccess. Otherwise prints `message`
// (expected to end with a newline) followed by the CUDA error description
// and returns false.
static bool cudaOk(cudaError_t status, const char *message) {
  if (status == cudaSuccess) {
    return true;
  }
  std::cout << message;
  std::cout << cudaGetErrorString(status) << "\n";
  return false;
}

/**
 * GPU benchmark driver.
 *
 * Launches the cudaMandelbrot kernel 25 times on a 3000 x 3000 grid covering
 * [-1, 1] x [-1, 1] with at most 256 iterations per point, times each launch
 * with CUDA events, copies the final image back to the host, and prints "GPU"
 * followed by the run times in seconds as a comma-separated list.
 *
 * @return 0 on success, 1 when any CUDA call fails
 */
int main() {
  // Computation region: {X0,Y0} is the lower-left corner
  const double X0=atof("-1.");
  const double Y0=atof("-1.");

  // {X1,Y1} is the upper-right corner
  const double X1=atof("1.");
  const double Y1=atof("1.");

  // Image size in pixels: {POZ,PION} = {horizontal, vertical}
  const int POZ=atoi("3000");
  const int PION=atoi("3000");

  // Maximum number of iterations per sampled point
  const int ITER=atoi("256");

  const size_t pixel_count = static_cast<size_t>(POZ) * PION;
  const size_t buffer_bytes = sizeof(int) * pixel_count;

  // Host result buffer. std::vector owns the memory, which removes the
  // malloc/delete[] mismatch and the leak on the success path.
  std::vector<int> host_mandel(pixel_count);

  // Device result buffer. No host-to-device copy is needed: the kernel
  // overwrites every element it is responsible for.
  int *device_mandel = nullptr;
  if (!cudaOk(cudaMalloc(&device_mandel, buffer_bytes), "Blad alokacji w pamieci zunifikowanej\n")) {
    return 1;
  }

  // Timing events are created once and reused for every run.
  // Based on: https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
  cudaEvent_t start, stop;
  if (!cudaOk(cudaEventCreate(&start), "Blad tworzenia zdarzenia CUDA\n")) {
    cudaFree(device_mandel);
    return 1;
  }
  if (!cudaOk(cudaEventCreate(&stop), "Blad tworzenia zdarzenia CUDA\n")) {
    cudaEventDestroy(start);
    cudaFree(device_mandel);
    return 1;
  }

  // Launch configuration for cudaMandelbrot: one thread per pixel. The extra
  // block guarantees coverage when the pixel count is not a multiple of the
  // block size; the kernel's bounds check makes surplus threads idle.
  const dim3 threadsPerBlock(32,1,1);
  const dim3 numBlocks(static_cast<unsigned int>(pixel_count / threadsPerBlock.x + 1),1,1);

  const int num_of_executions = 25;

  std::vector<double> gpu_durations;
  gpu_durations.reserve(num_of_executions);

  bool ok = true;
  for (int i = 0; i < num_of_executions; i++) {
    cudaEventRecord(start);
    // The kernel uses no shared memory, so no dynamic shared memory is requested.
    cudaMandelbrot<<<numBlocks,threadsPerBlock>>>(X0,Y0,X1,Y1,POZ,PION,ITER,device_mandel);
    cudaEventRecord(stop);

    // A kernel launch reports configuration errors only through cudaGetLastError;
    // waiting on the "stop" event blocks the CPU and surfaces execution errors.
    ok = cudaOk(cudaGetLastError(), "Blad uruchomienia kernela\n")
      && cudaOk(cudaEventSynchronize(stop), "Blad wykonania kernela\n");

    float milliseconds = 0;
    ok = ok && cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "Blad pomiaru czasu\n");
    if (!ok) {
      break;
    }

    const float seconds = milliseconds / 1000;
    gpu_durations.push_back(seconds);
  }

  // Bring the computed image back to the host buffer
  if (ok) {
    ok = cudaOk(cudaMemcpy(host_mandel.data(), device_mandel, buffer_bytes, cudaMemcpyDeviceToHost),
                "Blad kopiowania\n");
  }

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(device_mandel);

  if (!ok) {
    return 1;
  }

  std::cout << "GPU" << std::endl;
  std::string gpu_string_result = "";
  for (double result : gpu_durations) {
    gpu_string_result.append(std::to_string(result) + ", ");
  }
  std::cout << gpu_string_result << std::endl;

  return 0;
}

GPU
0.028260, 0.028144, 0.028142, 0.028145, 0.028145, 0.028144, 0.028144, 0.028147, 0.028162, 0.025317, 0.025269, 0.025284, 0.025268, 0.025274, 0.025281, 0.024496, 0.022862, 0.022869, 0.022857, 0.022866, 0.022858, 0.022872, 0.022878, 0.020583, 0.020584, 

