# Suma elemenata niza

In [None]:
%%writefile array_sum.cu

#include <stdio.h>
#define BLOCK_SIZE 512

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* v, int* ps, int n);
__global__ void reduce(int* v, int* p, int n);

int main(int argc, char** argv)
{
    int n = 1 << 16;
    int *vector = nullptr,
        result = 0;
 
    init_vector(&vector, n, 1);
    operate_on_GPU(vector, &result, n);
    printf("sum = %d\n", result);
    printf("should be = %d\n", n);
 
    free(vector);

    return 0;
}

__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++]=val);
}

__host__ void operate_on_GPU(int* v, int* result, int n)
{
    int *copy, *partial, grid_size = n / BLOCK_SIZE / 2;
 
    size_t full = sizeof(int) * n,
            part = sizeof(int) * grid_size;
 
    cudaError_t err;
 
    err = cudaMalloc(&copy, full);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
    
    err = cudaMalloc(&partial, part);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(copy, v, full, cudaMemcpyHostToDevice);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    reduce<<<grid_size, BLOCK_SIZE>>>(copy, partial, n);
    
    err = cudaDeviceSynchronize();
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    reduce<<<1, grid_size>>>(partial, partial, grid_size);
 
    err = cudaMemcpy(result, partial, sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    cudaFree(partial);
    cudaFree(copy);
}

__global__ void reduce(int* v, int* p, int n)
{
    __shared__ int partials[BLOCK_SIZE];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    partials[threadIdx.x] = v[tid] + v[tid + blockDim.x];

    for(int stride = (blockDim.x >> 1); stride > threadIdx.x; stride >>= 1)
    {
        __syncthreads();
        partials[threadIdx.x] += partials[threadIdx.x + stride];
    }

    if(!threadIdx.x)
      p[blockIdx.x] = partials[0];
}

In [None]:
filepath = "array_sum.cu"  #@param { type: "string" }
compiled_filepath = "array_sum"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Minimalni element niza

In [131]:
%%writefile array_min_element.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_SIZE 512

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* v, int* ps, int n);
__global__ void reduce(int* v, int* p, int n);
__host__ int findMin(int* v, int n);

int main(int argc, char** argv)
{
    srand(time(NULL));
 
    int n = 1 << 16;
    int *vector = nullptr,
        result = 0;
 
    init_vector(&vector, n, 1);
    operate_on_GPU(vector, &result, n);
    printf("minimal = %d\n", result);
    printf("should be = %d\n", findMin(vector, n));
 
    free(vector);

    return 0;
}

__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++] = rand());
}

__host__ void operate_on_GPU(int* v, int* result, int n)
{
    int *copy, *partial, grid_size = n / BLOCK_SIZE / 2;
 
    size_t full = sizeof(int) * n,
            part = sizeof(int) * grid_size;
 
    cudaError_t err;
 
    err = cudaMalloc(&copy, full);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
    
    err = cudaMalloc(&partial, part);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(copy, v, full, cudaMemcpyHostToDevice);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    reduce<<<grid_size, BLOCK_SIZE>>>(copy, partial, n);
    
    err = cudaDeviceSynchronize();
    if(err)
      printf("1 %s\n", cudaGetErrorString(err));
 
    reduce<<<1, grid_size>>>(partial, partial, grid_size);
 
    err = cudaMemcpy(result, partial, sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));
 
    cudaFree(partial);
    cudaFree(copy);
}

__host__ int findMin(int* v, int n)
{
    int min = v[0];
    for(int i = 0; i < n; i++)
      min = v[i] < min ? v[i] : min; 
    return min;
}

__global__ void reduce(int* v, int* p, int n)
{
    __shared__ int partials[BLOCK_SIZE];
    int tid = 2 * blockIdx.x * blockDim.x + threadIdx.x;
 
    if(tid < n)
    {
      int temp = v[tid];
      
      partials[threadIdx.x] = v[tid + blockDim.x];
      if(partials[threadIdx.x] > temp)
        partials[threadIdx.x] = temp;

      for(int stride = (blockDim.x >> 1); stride > threadIdx.x; stride >>= 1)
      {
          __syncthreads();
          if(partials[threadIdx.x] > partials[threadIdx.x + stride])
            partials[threadIdx.x] = partials[threadIdx.x + stride];
      }

      if(!threadIdx.x)
        p[blockIdx.x] = partials[0];
    }
}

Overwriting array_min_element.cu


In [136]:
filepath = "array_min_element.cu"  #@param { type: "string" }
compiled_filepath = "array_min_element"  #@param { type: "string" }

!nvcc -arch=sm_53 -gencode=arch=compute_53,code=sm_53 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

minimal = 0
should be = 95249


In [135]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


# Maksimalni element niza

In [None]:
%%writefile array_max_element.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BLOCK_SIZE 512

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* v, int* ps, int n);
__global__ void reduce(int* v, int* p, int n);
__host__ int findMax(int * v, int n);

int main(int argc, char** argv)
{
    srand(time(NULL));
 
    int n = 1 << 16;
    int *vector = nullptr,
        result = 0;
 
    init_vector(&vector, n, 1);
    operate_on_GPU(vector, &result, n);
    printf("maximal = %d\n", result);
    printf("shoudl be = %d\n", findMax(vector, n));
 
    free(vector);

    return 0;
}

__host__ int findMax(int* v, int n)
{
    int max = v[0];
    for(int i = 0; i < n; (max = v[++i] > max ? v[i] : max));
    return max;
}

__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++] = rand());
}

__host__ void operate_on_GPU(int* v, int* result, int n)
{
    int *copy, *partial, grid_size = n / BLOCK_SIZE;
 
    size_t full = sizeof(int) * n,
            part = sizeof(int) * grid_size;
 
    cudaError_t err;
 
    err = cudaMalloc(&copy, full);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
    
    err = cudaMalloc(&partial, part);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(copy, v, full, cudaMemcpyHostToDevice);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    reduce<<<grid_size, BLOCK_SIZE>>>(copy, partial, n);
    
    err = cudaDeviceSynchronize();
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    reduce<<<1, grid_size>>>(partial, partial, grid_size);
 
    err = cudaMemcpy(result, partial, sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("%s\n", cudaGetErrorString(err));
 
    cudaFree(partial);
    cudaFree(copy);
}

__global__ void reduce(int* v, int* p, int n)
{
    __shared__ int partials[BLOCK_SIZE];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    
    partials[threadIdx.x] = v[tid];

    for(int stride = (blockDim.x >> 1); stride > threadIdx.x; stride >>= 1)
    {
        __syncthreads();
        if(partials[threadIdx.x + stride] > partials[threadIdx.x])
          partials[threadIdx.x] = partials[threadIdx.x + stride];
    }

    if(!threadIdx.x)
      p[blockIdx.x] = partials[0];
}

In [None]:
filepath = "array_max_element.cu"  #@param { type: "string" }
compiled_filepath = "array_max_element"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Suma kolona matrice

In [None]:
%%writefile matrix_column_sum.cu

#include <stdio.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* v, int* ps, int n);
__global__ void reduceMat(int* v, int* p, int n);
__host__ bool checkResult(int* mat, int* res, int n);

int main(int argc, char** argv)
{
    int n = 1 << 8;
    int *matrix = nullptr,
        *result = nullptr;
 
    init_vector(&matrix, n * n, 1);
    init_vector(&result, n, 0);
    operate_on_GPU(matrix, result, n);
 
    printf("column_sum = |\t");
    for(int i = 0; i < n; printf("%d\t", result[i++]));
    printf("|\n");
 
    if(checkResult(matrix, result, n))
      printf("Result is correct!\n");
    else
      printf("Result is false!\n");
 
    free(result);
    free(matrix);

    return 0;
}

__host__ bool checkResult(int* mat, int* res, int n)
{
    bool c = true;
    for(int j = 0; c && j < n; j++)
    {
        int sum = 0;
        for(int i = 0; i < n; (sum += mat[i * n + j]), i++);
        c = sum == res[j];
    }
    return c;
}

__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++]=val);
}

__host__ void operate_on_GPU(int* m, int* result, int n)
{
    int *copy, *partial, grid_size = n / BLOCK_SIZE;
 
    size_t full = sizeof(int) * n * n,
            part = sizeof(int) * n * grid_size;
 
    cudaError_t err;
 
    err = cudaMalloc(&copy, full);
    if(err)
      printf("1 %s\n", cudaGetErrorString(err));
    
    err = cudaMalloc(&partial, part);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(copy, m, full, cudaMemcpyHostToDevice);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));
 
    dim3 grd(grid_size, grid_size),
        grd_1_y(grid_size, 1),
        blck(BLOCK_SIZE, BLOCK_SIZE),
        blck_grd_y(BLOCK_SIZE, grid_size);
 
    reduceMat<<<grd, blck>>>(copy, partial, n);
    
    err = cudaDeviceSynchronize();
    if(err)
      printf("4 %s\n", cudaGetErrorString(err));
 
    reduceMat<<<grd_1_y, blck_grd_y>>>(partial, partial, n);
 
    err = cudaDeviceSynchronize();
        if(err)
          printf("5 %s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(result, partial, n * sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("6 %s\n", cudaGetErrorString(err));
 
    cudaFree(partial);
    cudaFree(copy);
}

__global__ void reduceMat(int* v, int* p, int n)
{
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = blockIdx.y * blockDim.y + threadIdx.y;
 
    if(tid_x < n && tid_y < n)
    {
      __shared__ int partials[BLOCK_SIZE][BLOCK_SIZE];
      partials[threadIdx.y][threadIdx.x] = v[tid_y * n + tid_x];
  
      for(int stride = (blockDim.y >> 1); stride > threadIdx.y; stride >>= 1)
      {
          __syncthreads();
          partials[threadIdx.y][threadIdx.x] += partials[threadIdx.y + stride][threadIdx.x];
      }
  
      if(!threadIdx.y)
        p[blockIdx.y * n + tid_x] = partials[0][threadIdx.x];
    }
}

In [None]:
filepath = "matrix_column_sum.cu"  #@param { type: "string" }
compiled_filepath = "matrix_column_sum"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Suma vrsta matrice

In [None]:
%%writefile matrix_row_sum.cu

#include <stdio.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* v, int* ps, int n);
__global__ void reduceMat(int* v, int* p, int n);
__host__ bool checkResult(int* mat, int* res, int n);

int main(int argc, char** argv)
{
    int n = 1 << 8;
    int *matrix = nullptr,
        *result = nullptr;
 
    init_vector(&matrix, n * n, 1);
    init_vector(&result, n, 0);
    operate_on_GPU(matrix, result, n);
 
    printf("row_sum\n");
    for(int i = 0; i < n; printf("|\t%d\t|\n", result[i++]));
 
    if(checkResult(matrix, result, n))
      printf("Result is correct!\n");
    else
      printf("Result is false!\n");
 
    free(result);
    free(matrix);

    return 0;
}

__host__ bool checkResult(int* mat, int* res, int n)
{
    bool c = true;
    for(int i = 0; c && i < n; i++)
    {
        int sum = 0;
        for(int j = 0; j < n; (sum += mat[i * n + j]), j++);
        c = sum == res[i];
    }
    return c;
}

__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++]=val);
}

__host__ void operate_on_GPU(int* m, int* result, int n)
{
    int *copy, *partial, grid_size = n / BLOCK_SIZE;
 
    size_t full = sizeof(int) * n * n,
            part = sizeof(int) * n * grid_size;
 
    cudaError_t err;
 
    err = cudaMalloc(&copy, full);
    if(err)
      printf("1 %s\n", cudaGetErrorString(err));
    
    err = cudaMalloc(&partial, part);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(copy, m, full, cudaMemcpyHostToDevice);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));
 
    dim3 grd(grid_size, grid_size),
        grd_1_x(1, grid_size),
        blck(BLOCK_SIZE, BLOCK_SIZE),
        blck_grd_x(grid_size, BLOCK_SIZE);
 
    reduceMat<<<grd, blck>>>(copy, partial, n);
    
    err = cudaDeviceSynchronize();
    if(err)
      printf("4 %s\n", cudaGetErrorString(err));
 
    reduceMat<<<grd_1_x, blck_grd_x>>>(partial, partial, n);
 
    err = cudaDeviceSynchronize();
        if(err)
          printf("5 %s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(result, partial, n * sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("6 %s\n", cudaGetErrorString(err));
 
    cudaFree(partial);
    cudaFree(copy);
}

__global__ void reduceMat(int* v, int* p, int n)
{
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = blockIdx.y * blockDim.y + threadIdx.y;
 
    if(tid_x < n && tid_y < n)
    {
      __shared__ int partials[BLOCK_SIZE][BLOCK_SIZE];
      partials[threadIdx.y][threadIdx.x] = v[tid_y * n + tid_x];
  
      for(int stride = (blockDim.x >> 1); stride > threadIdx.x; stride >>= 1)
      {
          __syncthreads();
          partials[threadIdx.y][threadIdx.x] += partials[threadIdx.y][threadIdx.x + stride];
      }
  
      if(!threadIdx.x)
      {
        if(gridDim.x > 1)
          p[tid_y * n + blockIdx.x] = partials[threadIdx.y][0];
        else
          p[tid_y] = partials[threadIdx.y][0];
      }
    }
}

In [None]:
filepath = "matrix_row_sum.cu"  #@param { type: "string" }
compiled_filepath = "matrix_row_sum"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Matrices sum

In [51]:
%%writefile matrices_sum.cu

#include <stdio.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate_on_GPU(int* A, int* B, int* result, int n);
__global__ void matrixSum(int* A, int* B, int *C, int n);
__host__ const char* checkSum(int *A, int* B, int *check, int n);

int main(int argc, char** argv)
{
    int n = 1 << 7;
    int *A = nullptr,
        *B = nullptr,
        *result = nullptr;
 
    init_vector(&A, n * n, 1);
    init_vector(&B, n * n, 5);
    init_vector(&result, n * n, 0);
    operate_on_GPU(A, B, result, n);
 

    printf("\n\nMatrix:\n");
    for(int i = 0; i < n; i++)
    {
        printf("|\t");
        for(int j = 0; j < n; printf("%d\t", result[i * n + j]), j++);
        printf("|\n");
    }
 
    printf("\nSum is %s!", checkSum(A, B, result, n));
 
    free(result);
    free(B);
    free(A);

    return 0;
}

__host__ const char* checkSum(int *A, int* B, int *check, int n)
{
    bool c = true;
    for(int i = 0; c && i < n; i++)
      for(int j = 0; c && j < n; j++)
        c = ((A[i * n + j] + B[i * n + j]) == check[i * n + j]);
    if(c)
      return "correct";
    return "false";
}


__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; (*v)[i++]=val);
}

__host__ void operate_on_GPU(int* A, int* B, int* result, int n)
{
    int *dev_A, *dev_B, *dev_C, grid_size = n / BLOCK_SIZE;
    printf("\nGrid size: %d\n", grid_size);
 
    size_t full = sizeof(int) * n * n;
 
    cudaError_t err;
 
    err = cudaMalloc(&dev_A, full);
    if(err)
      printf("1A %s\n", cudaGetErrorString(err));
 
    err = cudaMalloc(&dev_B, full);
    if(err)
      printf("1B %s\n", cudaGetErrorString(err));
 
    err = cudaMalloc(&dev_C, full);
        if(err)
          printf("1C %s\n", cudaGetErrorString(err));
 
    err = cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2A %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_B, B, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2B %s\n", cudaGetErrorString(err));
 
    dim3 grd(grid_size, grid_size),
        blck(BLOCK_SIZE, BLOCK_SIZE);
 
    matrixSum<<<grd, blck>>>(dev_A, dev_B, dev_C, n);
 
    err = cudaMemcpy(result, dev_C, full, cudaMemcpyDeviceToHost);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));
 
    cudaFree(dev_C);
    cudaFree(dev_B);
    cudaFree(dev_A);
}

__global__ void matrixSum(int* A, int* B, int* C, int n)
{
    int tid_x = blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = blockIdx.y * blockDim.y + threadIdx.y,
        index;
 
    if(tid_x < n && tid_y < n)
    {
      index = tid_y * n + tid_x;
      C[index] = A[index] + B[index];
    }
}

Overwriting matrices_sum.cu


In [52]:
filepath = "matrices_sum.cu"  #@param { type: "string" }
compiled_filepath = "matrices_sum"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv


Grid size: 4


Matrix:
|	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	|
|	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	|
|	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	|
|	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	6	