Instalacja środowiska

In [None]:
# Remove any CUDA / NVIDIA packages already present so the toolkit below installs cleanly.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

# Download the CUDA 9.2 local repository package, register it and its key, then install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

# Confirm the compiler is available, then install the notebook plugin.
# The plugin is fetched over https: GitHub no longer serves the unauthenticated git:// protocol.
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

%load_ext nvcc_plugin

In [4]:
%%cuda --name Matrix.cu

#include <cuda_runtime.h>

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
using namespace std;
// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the file, line and CUDA error string to stderr and
// terminates the program with EXIT_FAILURE.
// The argument is parenthesized and the local carries a trailing underscore so
// the macro cannot clash with expressions or variable names at the call site.
#define checkCudaErrors(call) \
	do { \
		cudaError_t cudaStatus_ = (call); \
		if (cudaStatus_ != cudaSuccess) { \
			fprintf(stderr, "CUDA error at %s %d: %s\n", __FILE__, __LINE__, \
			        cudaGetErrorString(cudaStatus_)); \
			exit(EXIT_FAILURE); \
		} \
	} while (0)

/*
 * Kernel computing C = A * B for N x N int matrices indexed row-major.
 *
 * Launch layout: a 2D grid of 2D blocks; the x dimension selects the output
 * column and the y dimension the output row. Each thread that maps inside the
 * N x N output writes exactly one element of C; threads outside do nothing.
 *
 * A, B : input matrices (read only by this kernel)
 * C    : output matrix
 * N    : number of rows and of columns
 */
__global__ void matmulDevice(int* A, int* B, int* C, int N)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads mapped past the matrix edge have no element to produce.
	if (row >= N || col >= N) {
		return;
	}

	// Dot product of row `row` of A with column `col` of B.
	int acc = 0;
	for (int k = 0; k < N; ++k) {
		acc += A[N * row + k] * B[N * k + col];
	}
	C[row * N + col] = acc;
}

/*
 * CPU reference: computes C = A * B for N x N int matrices indexed row-major,
 * using the plain triple loop.
 *
 * A, B : input matrices
 * C    : output matrix, every element is overwritten
 * N    : number of rows and of columns
 */
void matmulHost(int* A, int* B, int* C, int N)
{
	for (int row = 0; row < N; ++row) {
		int* aRow = A + N * row;   // start of the current row of A
		int* cRow = C + N * row;   // start of the matching row of C

		for (int col = 0; col < N; ++col) {
			int acc = 0;
			for (int k = 0; k < N; ++k) {
				acc += aRow[k] * B[k * N + col];
			}
			cRow[col] = acc;
		}
	}
}


/*
 * Benchmarks N x N integer matrix multiplication on the CPU (matmulHost) and
 * on the GPU (matmulDevice). For each of five rounds it fills A and B with
 * fresh pseudo-random values, times both implementations, prints each time in
 * seconds, and checks that the GPU result matches the CPU result.
 *
 * Returns EXIT_SUCCESS when every round matches, EXIT_FAILURE on the first
 * mismatch. A failing CUDA call terminates the program via checkCudaErrors.
 */
int main(void)
{
	const int N = 1024;
	const int block_size = 16;
	const int Iter = 1;     // multiplications averaged inside one timing measurement
	const int rounds = 5;   // independent timing rounds

	const size_t rozmiar = static_cast<size_t>(N) * N;
	const size_t mem_size = sizeof(int) * rozmiar;

	// Host buffers are allocated once and released automatically on return.
	std::vector<int> A(rozmiar);
	std::vector<int> B(rozmiar);
	std::vector<int> C(rozmiar);
	std::vector<int> hCuda_C(rozmiar);

	// Device buffers and timing events are likewise created once and reused by every round.
	int* wA = NULL;
	int* wB = NULL;
	int* wC = NULL;
	checkCudaErrors(cudaMalloc((void**)&wA, mem_size));
	checkCudaErrors(cudaMalloc((void**)&wB, mem_size));
	checkCudaErrors(cudaMalloc((void**)&wC, mem_size));

	cudaEvent_t start;
	cudaEvent_t stop;
	checkCudaErrors(cudaEventCreate(&start));
	checkCudaErrors(cudaEventCreate(&stop));

	// Round the grid up so the whole matrix is covered even when N is not a
	// multiple of block_size; the kernel ignores threads past the edge.
	const unsigned int blocksPerDim = (N + block_size - 1) / block_size;
	const dim3 threadsPerBlock(block_size, block_size);
	const dim3 blocksPerGrid(blocksPerDim, blocksPerDim);

	int status = EXIT_SUCCESS;

	for (int qwerty = 0; qwerty < rounds && status == EXIT_SUCCESS; qwerty++) {
		for (size_t i = 0; i < rozmiar; i++) {
			A[i] = rand() % 501;
			B[i] = rand() % 501;
		}

		// CPU timing: clock() ticks converted with CLOCKS_PER_SEC, averaged over Iter.
		const clock_t czasStart = clock();
		for (int j = 0; j < Iter; j++)
			matmulHost(&A[0], &B[0], &C[0], N);
		const double czasCPU =
			static_cast<double>(clock() - czasStart) / CLOCKS_PER_SEC / Iter;

		cout << "CPU czas dla iter = " << qwerty << "  = " << czasCPU << endl;

		checkCudaErrors(cudaMemcpy(wA, &A[0], mem_size, cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemcpy(wB, &B[0], mem_size, cudaMemcpyHostToDevice));

		// GPU timing: events bracket the kernel launches only (copies excluded).
		checkCudaErrors(cudaEventRecord(start, 0));
		for (int j = 0; j < Iter; j++) {
			matmulDevice<<<blocksPerGrid, threadsPerBlock>>>(wA, wB, wC, N);
			// A launch reports configuration errors only through the last-error state.
			checkCudaErrors(cudaGetLastError());
		}
		checkCudaErrors(cudaEventRecord(stop, 0));
		checkCudaErrors(cudaEventSynchronize(stop));

		float milisec = 0.0f;
		checkCudaErrors(cudaEventElapsedTime(&milisec, start, stop));
		const float czas = milisec / Iter;

		cout << "GPU czas dla iter = " << qwerty << "  = " << czas / 1000 << endl;

		// cudaMemcpy blocks until the copy has finished, so no extra device sync is needed.
		checkCudaErrors(cudaMemcpy(&hCuda_C[0], wC, mem_size, cudaMemcpyDeviceToHost));

		// Validate the GPU output against the CPU reference.
		for (size_t i = 0; i < rozmiar; i++) {
			if (C[i] != hCuda_C[i]) {
				cerr << "Result mismatch at element " << i << ": CPU = " << C[i]
				     << ", GPU = " << hCuda_C[i] << endl;
				status = EXIT_FAILURE;
				break;
			}
		}
	}

	checkCudaErrors(cudaEventDestroy(start));
	checkCudaErrors(cudaEventDestroy(stop));
	checkCudaErrors(cudaFree(wA));
	checkCudaErrors(cudaFree(wB));
	checkCudaErrors(cudaFree(wC));

	return status;
}

'File written in /content/src/Matrix.cu'

In [5]:
# Compile the saved CUDA source file into the executable /content/src/matrix.
!nvcc -o /content/src/matrix /content/src/Matrix.cu

In [6]:
# Run the compiled executable; its CPU and GPU timings are printed below.
!/content/src/matrix

CPU czas dla iter = 0  = 7.454
GPU czas dla iter = 0  = 0.0329805
CPU czas dla iter = 1  = 7.217
GPU czas dla iter = 1  = 0.0329017
CPU czas dla iter = 2  = 7.251
GPU czas dla iter = 2  = 0.0329032
CPU czas dla iter = 3  = 7.255
GPU czas dla iter = 3  = 0.0329911
CPU czas dla iter = 4  = 7.184
GPU czas dla iter = 4  = 0.0328832
