<a href="https://colab.research.google.com/github/sssteeefaaan/VIII-Semestar/blob/main/Paralelni%20Sistemi/Ra%C4%8Dunske%20Ve%C5%BEbe/Cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalacija (samo jednom)

In [None]:
# Remove any CUDA / NVIDIA packages that are already installed so that a
# specific toolkit version can be installed afterwards.
# NOTE(review): none of these commands pass -y, so they may stop and ask for
# confirmation - confirm this works in the target environment.

# Purge (remove together with configuration files) the cuda, nvidia* and
# libnvidia-* packages.
!apt-get --purge remove cuda nvidia* libnvidia-*
# List installed packages, keep the lines containing "cuda-", take the package
# name (second column) and purge the packages one at a time.
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# Remove whatever cuda-* packages are still left.
!apt-get remove cuda-*
# Drop dependencies that are no longer needed by any installed package.
!apt autoremove
# Refresh the package index.
!apt-get update

In [None]:
# Install CUDA 9.2 from NVIDIA's local repository package.
# NOTE(review): the installer file name targets Ubuntu 16.04 - confirm it is
# compatible with the distribution this notebook actually runs on.

# Download the local-repository installer and save it with a .deb extension.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Register the downloaded local repository with dpkg.
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
# Trust the signing key that ships inside the local repository.
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
# Refresh the package index so the new repository is visible.
!apt-get update
# Install the cuda-9.2 package.
!apt-get install cuda-9.2

# Instalacija (uvek)

In [None]:
# Install the nvcc4jupyter plugin straight from its GitHub repository.
# The repository is fetched over HTTPS: GitHub no longer serves the
# unauthenticated git:// protocol, so a git+git:// URL fails to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

In [None]:
# Load the IPython extension named nvcc_plugin.
# NOTE(review): presumably this is what registers the %%cu cell magic used by
# the CUDA cells in this notebook - confirm against the plugin's documentation.
%load_ext nvcc_plugin

# Računske Vežbe 1

In [None]:
#@title Sabiranje vektora
%%cu
#include <stdio.h>
#define N 10000000
#define BLOCK_SIZE 32
#define GRID_SIZE 128

/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks. Each thread processes at most `count`
 * elements, so the launch must satisfy
 *     gridDim.x * blockDim.x * count >= N
 * for the whole vector to be covered. Indices >= N are never touched.
 *
 * a, b  - device pointers to the N input elements
 * c     - device pointer to the N output elements
 * count - maximum number of elements handled by one thread
 */
__global__ void add(int* a, int* b, int *c, int count)
{
  const long long tid = (long long) blockIdx.x * blockDim.x + threadIdx.x;
  const long long total_threads = (long long) gridDim.x * blockDim.x;

  // Work is interleaved instead of chunked: in iteration k neighbouring
  // threads touch neighbouring elements, so the loads and stores of a warp
  // fall into contiguous memory. The visited index set,
  // { tid + k * total_threads }, is exactly [0, total_threads * count) - the
  // same range a per-thread contiguous chunk of `count` elements would cover.
  for (int k = 0; k < count; k++)
  {
    const long long i = tid + k * total_threads;
    if (i >= N)
      break;  // indices only grow with k, nothing left for this thread
    c[i] = a[i] + b[i];
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver for the vector addition example.
 *
 * Fills two N-element vectors, adds them on the GPU with the `add` kernel,
 * copies the result back and checks it against a CPU computation.
 * Prints "Start!" before the launch and "Done!" once the result has been
 * verified. Returns 0 on success and 1 on any allocation, CUDA or
 * verification failure (details go to stderr).
 */
int main(void)
{
    int *a = NULL, *b = NULL, *c = NULL;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    const size_t bytes = (size_t) N * sizeof(int);
    const int total_threads = GRID_SIZE * BLOCK_SIZE;
    // Elements per thread, rounded up so total_threads * amount >= N.
    const int amount = (N + total_threads - 1) / total_threads;
    // Squares are reduced modulo this value: (i + 1) * (i + 1) does not fit in
    // an int once i >= 46340, and keeping b below ~1e9 also guarantees that
    // a[i] + b[i] cannot overflow.
    const long long square_mod = 1000000007LL;
    bool ok;

    // The inputs are kept until the end so the GPU result can be verified.
    a = (int*) malloc(bytes);
    b = (int*) malloc(bytes);
    c = (int*) malloc(bytes);
    ok = a != NULL && b != NULL && c != NULL;
    if (!ok)
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);

    if (ok)
    {
        for(int i = 0; i < N; i++)
        {
            a[i] = i + 1;
            b[i] = (int) (((long long) (i + 1) * (i + 1)) % square_mod);
        }
    }

    ok = ok
      && cuda_ok(cudaMalloc((void**) &dev_a, bytes), "cudaMalloc dev_a")
      && cuda_ok(cudaMalloc((void**) &dev_b, bytes), "cudaMalloc dev_b")
      && cuda_ok(cudaMalloc((void**) &dev_c, bytes), "cudaMalloc dev_c")
      && cuda_ok(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device")
      && cuda_ok(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    if (ok)
    {
        printf("Start!\n");
        add<<<GRID_SIZE, BLOCK_SIZE>>>(dev_a, dev_b, dev_c, amount);

        // A launch reports configuration errors only through
        // cudaGetLastError(); errors raised while the kernel runs surface in
        // the blocking copy that follows.
        ok = cuda_ok(cudaGetLastError(), "add launch")
          && cuda_ok(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");
    }

    if (ok)
    {
        for(int i = 0; i < N; i++)
        {
            if (c[i] != a[i] + b[i])
            {
                fprintf(stderr, "Mismatch at index %d: got %d, expected %d\n",
                        i, c[i], a[i] + b[i]);
                ok = false;
                break;
            }
        }
    }

    if (ok)
        printf("Done!\n");

    // free(NULL) and cudaFree(NULL) are no-ops, so cleanup is unconditional.
    free(a);
    free(b);
    free(c);

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return ok ? 0 : 1;
}

# Računske Vežbe 2

In [None]:
#@title Challenge: Funkcija koja može da se pozove i na CPU i na GPU
%%cu
#include <stdio.h>

/*
 * Stores the value `b` into the int that `a` points to.
 *
 * Declared __host__ __device__, so the same function can be called from CPU
 * code and from GPU code. `a` must be dereferenceable on the side that makes
 * the call.
 */
__host__ __device__ void hello(int* a, int b)
{
    a[0] = b;
}

/*
 * Kernel that exercises hello() on the GPU: every launched thread stores the
 * value 10 into the int that `a` points to.
 *
 * a - pointer the kernel writes through; it has to be valid in device code
 */
__global__ void wrapper(int *a)
{
    const int device_value = 10;
    hello(a, device_value);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Calls hello() once directly on the CPU and once on the GPU (through the
 * `wrapper` kernel) and prints the value produced by each call.
 *
 * The second line is printed only when the kernel launch and the copy back
 * succeeded, so a stale host value is never presented as the GPU's answer.
 * Returns 0 on success, 1 when any CUDA call fails (details go to stderr).
 */
int main(int argc, char** argv)
{
    int a = 0, *dev_a = NULL;
    int exit_code = 1;

    // Command-line arguments are not used by this example.
    (void) argc;
    (void) argv;

    // Host-side call.
    hello(&a, 5);
    printf("Hello, %d\n", a);

    // Device-side call.
    if (cuda_ok(cudaMalloc((void**) &dev_a, sizeof(int)), "cudaMalloc dev_a"))
    {
        wrapper<<<1,1>>>(dev_a);

        // Launch errors are only visible through cudaGetLastError(); errors
        // raised while the kernel runs surface in the blocking copy below.
        if (cuda_ok(cudaGetLastError(), "wrapper launch")
            && cuda_ok(cudaMemcpy(&a, dev_a, sizeof(int), cudaMemcpyDeviceToHost),
                       "copy result to host"))
        {
            printf("Hello, %d\n", a);
            exit_code = 0;
        }

        cudaFree(dev_a);
    }

    return exit_code;
}

In [None]:
#@title Koliko šestica ima u nizu
%%cu
#include <stdio.h>
#define N 10000000
#define BLOCK_SIZE 32
#define GRID_SIZE 16
#define THREAD_NUMB 512

/*
 * Device-side equality test.
 *
 * Returns 1 when `a` equals `b` and 0 otherwise, so the result can be added
 * directly to a counter.
 */
__device__ int compare(int a, int b)
{
    if (a == b)
        return 1;

    return 0;
}

/*
 * Counts how many of the N elements of `array` are equal to 6.
 *
 * Every block writes its own partial count to result[blockIdx.x]; the caller
 * adds the partial counts together.
 *
 * Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE (the
 * shared scratch array has BLOCK_SIZE slots, one per thread).
 *
 * array  - device pointer to the N input elements
 * result - device pointer to at least gridDim.x ints
 */
__global__ void count(int* array, int* result)
{
    __shared__ int partial_result[BLOCK_SIZE];

    // The stride is derived from the actual launch configuration. With a
    // fixed constant, launching any other number of threads would skip
    // elements or count them more than once.
    const int stride = gridDim.x * blockDim.x;

    // Accumulate in a register and touch shared memory only once per thread.
    int local_count = 0;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < N; index += stride)
        local_count += compare(array[index], 6);

    partial_result[threadIdx.x] = local_count;

    // All per-thread counts must be visible before thread 0 sums them.
    __syncthreads();

    if(threadIdx.x == 0)
    {
        int block_count = 0;
        for(int i = 0; i < blockDim.x; i++)
          block_count += partial_result[i];

        // Single global-memory write per block.
        result[blockIdx.x] = block_count;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Counts the sixes in `array` on the GPU and adds the count to *result.
 *
 * array  - host pointer to N ints
 * result - host pointer to the counter; the number of sixes is ADDED to the
 *          value already stored there, so the caller initialises it
 *
 * If any CUDA call fails, the error is printed to stderr and *result is left
 * unchanged.
 */
__host__ void compute(int* array, int *result)
{
    int *d_array = NULL, *d_partial = NULL;
    // One partial count per block; small enough to live on the stack.
    int h_partial[GRID_SIZE];
    bool ok;

    ok = cuda_ok(cudaMalloc((void**) &d_array, N * sizeof(int)), "cudaMalloc d_array")
      && cuda_ok(cudaMalloc((void**) &d_partial, GRID_SIZE * sizeof(int)), "cudaMalloc d_partial")
      && cuda_ok(cudaMemcpy(d_array, array, N * sizeof(int), cudaMemcpyHostToDevice),
                 "copy array to device");

    if (ok)
    {
        count<<<GRID_SIZE, BLOCK_SIZE>>>(d_array, d_partial);

        // Launch errors are only visible through cudaGetLastError(); errors
        // raised while the kernel runs surface in the blocking copy below.
        ok = cuda_ok(cudaGetLastError(), "count launch")
          && cuda_ok(cudaMemcpy(h_partial, d_partial, GRID_SIZE * sizeof(int),
                                cudaMemcpyDeviceToHost),
                     "copy partial counts to host");
    }

    // cudaFree(NULL) is a no-op, so cleanup does not depend on how far we got.
    cudaFree(d_array);
    cudaFree(d_partial);

    if (!ok)
        return;

    int total = 0;
    for(int i = 0; i < GRID_SIZE; i++)
      total += h_partial[i];

    *result += total;
}

/*
 * Builds an N-element array that alternates between 6 and 7 (6 at the even
 * indices), counts the sixes with compute() and prints the result.
 *
 * Returns 0 on success and 1 when the host array cannot be allocated.
 */
int main(void)
{
    int result = 0;
    int *array = (int*) malloc(N * sizeof(int));

    // Without this check a failed allocation is dereferenced in the loop below.
    if (array == NULL)
    {
        fprintf(stderr, "Failed to allocate %d ints on the host\n", N);
        return 1;
    }

    for(int i = 0; i < N; i++)
      array[i] = i % 2 + 6;

    compute(array, &result);
    free(array);

    printf("U nizu ima %d sestica.\n", result);

    return 0;
}

In [None]:
#@title Transponovanje matrice
%%cu
#include <stdio.h>
#define N 64
#define M 96
#define BLOCK_SIZE 32

/*
 * Transposes a row-major N x M matrix into a row-major M x N matrix:
 *     matrix_out[j * N + i] = matrix_in[i * M + j]
 *
 * The matrix is processed in BLOCK_SIZE x BLOCK_SIZE tiles staged through
 * shared memory. blockIdx.x walks the tiles along the N (input row)
 * dimension, blockIdx.y walks the tiles along the M (input column) dimension.
 * Both walks are stride loops, so any non-empty grid and any block shape
 * produce the full result; N and M need not be multiples of BLOCK_SIZE.
 *
 * Shared memory: BLOCK_SIZE * (BLOCK_SIZE + 1) ints, statically allocated.
 *
 * matrix_in  - device pointer to N * M ints
 * matrix_out - device pointer to M * N ints
 */
__global__ void transpose_matrix(int* matrix_in, int* matrix_out)
{
    // The extra column shifts consecutive rows onto different banks, so the
    // column-wise read in the store phase does not serialise.
    __shared__ int temp[BLOCK_SIZE][BLOCK_SIZE + 1];

    // The tile loops depend only on blockIdx / gridDim, so every thread of a
    // block runs the same iterations and reaches the same barriers.
    for(int tile_row = blockIdx.x * BLOCK_SIZE; tile_row < N; tile_row += gridDim.x * BLOCK_SIZE)
    {
        for(int tile_col = blockIdx.y * BLOCK_SIZE; tile_col < M; tile_col += gridDim.y * BLOCK_SIZE)
        {
            // Load phase: threadIdx.x runs along an input row, so neighbouring
            // threads read neighbouring addresses.
            for(int r = threadIdx.y; r < BLOCK_SIZE; r += blockDim.y)
                for(int c = threadIdx.x; c < BLOCK_SIZE; c += blockDim.x)
                {
                    int i = tile_row + r, j = tile_col + c;
                    if(i < N && j < M)
                        temp[r][c] = matrix_in[i * M + j];
                }

            // The whole tile must be loaded before anyone reads it transposed.
            __syncthreads();

            // Store phase: threadIdx.x now runs along an output row, so the
            // writes are contiguous as well. Only tile cells that were loaded
            // (i < N && j < M) are ever read back.
            for(int r = threadIdx.y; r < BLOCK_SIZE; r += blockDim.y)
                for(int c = threadIdx.x; c < BLOCK_SIZE; c += blockDim.x)
                {
                    int j = tile_col + r, i = tile_row + c;
                    if(i < N && j < M)
                        matrix_out[j * N + i] = temp[c][r];
                }

            // Do not let the next tile overwrite temp while it is still read.
            __syncthreads();
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;

    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Transposes the row-major N x M matrix `matrix_in` into the row-major
 * M x N matrix `matrix_out` using the transpose_matrix kernel.
 *
 * matrix_in  - host pointer to N * M ints
 * matrix_out - host pointer to M * N ints that receives the result
 *
 * If any CUDA call fails, the error is printed to stderr and the contents of
 * matrix_out are not modified by the failing step.
 */
__host__ void transpose(int* matrix_in, int* matrix_out)
{
    int *d_matrix_in = NULL, *d_matrix_out = NULL;
    const size_t bytes = (size_t) N * M * sizeof(int);
    bool ok;

    ok = cuda_ok(cudaMalloc((void**) &d_matrix_in, bytes), "cudaMalloc d_matrix_in")
      && cuda_ok(cudaMalloc((void**) &d_matrix_out, bytes), "cudaMalloc d_matrix_out")
      && cuda_ok(cudaMemcpy(d_matrix_in, matrix_in, bytes, cudaMemcpyHostToDevice),
                 "copy matrix to device");

    if (ok)
    {
        dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
        // Round the grid up: flooring would leave the trailing rows/columns
        // without a block whenever a dimension is not a multiple of
        // BLOCK_SIZE, and would request an empty grid for dimensions smaller
        // than BLOCK_SIZE.
        dim3 gridSize((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);

        transpose_matrix<<<gridSize, blockSize>>>(d_matrix_in, d_matrix_out);

        // Launch errors are only visible through cudaGetLastError(); errors
        // raised while the kernel runs surface in the blocking copy below.
        ok = cuda_ok(cudaGetLastError(), "transpose_matrix launch")
          && cuda_ok(cudaMemcpy(matrix_out, d_matrix_out, bytes, cudaMemcpyDeviceToHost),
                     "copy result to host");
    }

    // cudaFree(NULL) is a no-op, so cleanup does not depend on how far we got.
    cudaFree(d_matrix_in);
    cudaFree(d_matrix_out);
}

/*
 * Fills an N x M matrix with the values 0 .. N*M-1 in row-major order,
 * transposes it with transpose() and prints the resulting M x N matrix, one
 * row per line between '|' characters.
 *
 * Returns 0 on success and 1 when a host buffer cannot be allocated.
 */
int main(void)
{
    int *matrix_in, *matrix_out;

    matrix_in = (int*) malloc(sizeof(int) * N * M);
    // Zero-initialised: transpose() returns void, so if it could not produce a
    // result the print loop below still reads defined values.
    matrix_out = (int*) calloc((size_t) M * N, sizeof(int));

    if (matrix_in == NULL || matrix_out == NULL)
    {
        fprintf(stderr, "Failed to allocate the %d x %d host matrices\n", N, M);
        free(matrix_in);
        free(matrix_out);
        return 1;
    }

    for(int i = 0; i < N; i++)
      for(int j = 0; j < M; j++)
            matrix_in[i * M + j] = i * M + j;

    transpose(matrix_in, matrix_out);
    free(matrix_in);

    for(int i = 0; i < M; i++)
    {
        printf("|\t");
        for(int j = 0; j < N; j++)
          printf("%d\t", matrix_out[i * N + j]);
        printf("|\n");
    }
    free(matrix_out);

    return 0;
}

# Računske Vežbe 3