# Zadatak 1

In [None]:
%%writefile prvi.cu

#include <stdio.h>
#include <time.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* A, int* B, int n);
__host__ bool check_result(int* A, int* B, int n);
__global__ void reduce(int* A, int* B, int n);

/*
 * Entry point of the first exercise: fills two host vectors with random
 * values, runs the GPU computation through operate() and reports whether the
 * device result matches the CPU reference in check_result().
 */
int main(int argc, char** argv)
{
    const int n = (1 << 7);
    int* A = nullptr;
    int* B = nullptr;

    srand(time(NULL));

    // A is two elements longer than B: output i reads A[i], A[i + 1], A[i + 2].
    init_vector(&A, n + 2, 1);
    init_vector(&B, n, -1);

    operate(A, B, n);

    const bool ok = check_result(A, B, n);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(B);
    free(A);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills every
 * element with rand() % 100. The val parameter is accepted but not used.
 * The caller releases the array with free().
 */
__host__ void init_vector(int** v, int n, int val)
{
    int* data = (int*)malloc(sizeof(int) * n);
    *v = data;

    for(int k = 0; k < n; ++k)
        data[k] = rand() % 100;
}

/*
 * Host driver for the weighted 3-point stencil.
 *
 * Copies A (n + 2 ints) to the device, launches reduce() over n outputs and
 * copies the n results back into B. CUDA errors are printed to stdout with a
 * short step tag; execution then continues (best effort) so device memory is
 * always released.
 */
__host__ void operate(int* A, int* B, int n)
{
    // nullptr keeps the cudaFree calls below harmless if an allocation fails.
    int* dev_A = nullptr, *dev_B = nullptr;

    size_t count_A = sizeof(int) * (n + 2), count_B = sizeof(int) * n;
    cudaError_t err;

    err = cudaMalloc(&dev_A, count_A);
    if(err)
      printf("1A %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_B, count_B);
    if(err)
      printf("1B %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_A, A, count_A, cudaMemcpyHostToDevice);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));

    // Ceil-div: exactly as many blocks as needed to cover n outputs.
    int blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    if(blocks > 0)
    {
        reduce<<<blocks, BLOCK_SIZE>>>(dev_A, dev_B, n);

        // A launch returns no status; configuration errors surface only here.
        err = cudaGetLastError();
        if(err)
          printf("K %s\n", cudaGetErrorString(err));
    }

    // Blocking copy on the default stream: also waits for the kernel and
    // reports any fault raised while it was executing.
    err = cudaMemcpy(B, dev_B, count_B, cudaMemcpyDeviceToHost);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    cudaFree(dev_B);
    cudaFree(dev_A);
}

/*
 * CPU reference check: returns true when, for every i in [0, n),
 * B[i] equals (3 * A[i] + 10 * A[i + 1] + 7 * A[i + 2]) / 20 using integer
 * division. A must therefore hold at least n + 2 elements.
 */
__host__ bool check_result(int* A, int* B, int n)
{
    for(int k = 0; k < n; ++k)
    {
        const int expected = (3 * A[k] + 10 * A[k + 1] + 7 * A[k + 2]) / 20;
        if(expected != B[k])
            return false;   // stop at the first mismatch
    }
    return true;
}

/*
 * Weighted 3-point stencil: B[i] = (3 * A[i] + 10 * A[i + 1] + 7 * A[i + 2]) / 20
 * for i in [0, n), using integer division so the result agrees with the host
 * reference for every input magnitude.
 *
 * Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE, covering
 * at least n threads. A must hold n + 2 elements, B must hold n elements.
 * Uses (BLOCK_SIZE + 2) ints of static shared memory.
 */
__global__ void reduce(int* A, int* B, int n)
{
    // Tile of A for this block plus a two-element halo on the right.
    __shared__ int sh[BLOCK_SIZE + 2];

    int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
    bool active = tid_x < n;

    if(active)
    {
        sh[threadIdx.x] = A[tid_x];

        // The last active thread of the block (the last lane, or the thread
        // handling the final output of a partial block) also loads the halo.
        if(threadIdx.x == blockDim.x - 1 || tid_x == n - 1)
        {
            sh[threadIdx.x + 1] = A[tid_x + 1];
            sh[threadIdx.x + 2] = A[tid_x + 2];
        }
    }

    // Reached by every thread of the block, including the inactive ones.
    __syncthreads();

    if(active)
      B[tid_x] = (3 * sh[threadIdx.x] + 10 * sh[threadIdx.x + 1] + 7 * sh[threadIdx.x + 2]) / 20;
}

In [None]:
filepath = "prvi.cu"  #@param { type: "string" }
compiled_filepath = "prvi"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Zadatak 2

In [None]:
%%writefile drugi.cu

#include <stdio.h>
#include <time.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* M, int* res, int n, int m);
__host__ bool check_result(int* M, int* res, int n, int m);
__global__ void reduce(int* M, int n, int m, int* res_min, int* res_max, int offset);

/*
 * Entry point of the second exercise: builds a random n x m matrix, asks
 * operate() for per-column results (2 * m ints) and validates them with
 * check_result().
 */
int main(int argc, char** argv)
{
    const int n = 1 << 7;
    const int m = 1 << 8;
    int* M = nullptr;
    int* result = nullptr;

    srand(time(NULL));

    init_vector(&M, n * m, 1);
    init_vector(&result, 2 * m, -1);

    operate(M, result, n, m);

    const bool ok = check_result(M, result, n, m);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(M);
    free(result);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills every
 * element with rand() % n (the modulus is the element count itself).
 * The val parameter is accepted but not used. The caller frees the array.
 */
__host__ void init_vector(int** v, int n, int val)
{
    int* data = (int*)malloc(sizeof(int) * n);
    *v = data;

    for(int k = 0; k < n; ++k)
        data[k] = rand() % n;
}

/*
 * Host driver for the per-column minimum / maximum of an n x m row-major
 * matrix M. On return res[0 .. m) holds the column minima and res[m .. 2m)
 * the column maxima.
 *
 * Pass 1 launches reduce() with each block covering 2 * BLOCK_SIZE rows and
 * writes one partial min row and one partial max row per block row.
 * Pass 2 launches reduce() again with a single block row whose blockDim.y is
 * the number of partial rows, folding them into the final result in place.
 *
 * Limitation: pass 2 uses grd.y as blockDim.y, so the number of partial rows
 * must not exceed BLOCK_SIZE; a larger n makes the second launch fail, which
 * is reported through the "K2" check. The final result also relies on
 * reduce() combining all blck.y rows of that block.
 *
 * CUDA errors are printed to stdout with a step tag and execution continues
 * so that device memory is always released.
 */
__host__ void operate(int* M, int* res, int n, int m)
{
    if(n <= 0 || m <= 0)
      return;   // nothing to reduce; a zero-sized grid would be an invalid launch

    int* dev_M = nullptr, *dev_res = nullptr;

    // Ceil-div in both directions: no block row/column lies fully outside the
    // matrix, so every partial row consumed by pass 2 is really written.
    dim3 grd((m + BLOCK_SIZE - 1) / BLOCK_SIZE, (n + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE)),
        blck(BLOCK_SIZE, BLOCK_SIZE);

    size_t full = sizeof(int) * n * m, part = sizeof(int) * 2 * m * grd.y;
    cudaError_t err;

    err = cudaMalloc(&dev_M, full);
    if(err)
      printf("1M %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_res, part);
    if(err)
      printf("1res %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_M, M, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));

    // Pass 1: partial minima go to dev_res[0 .. m * grd.y), partial maxima follow.
    reduce<<<grd, blck>>>(dev_M, n, m, dev_res, &dev_res[m * grd.y], 0);
    err = cudaGetLastError();
    if(err)
      printf("K1 %s\n", cudaGetErrorString(err));

    err = cudaDeviceSynchronize();
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    // Pass 2: one thread row per partial row; offset selects the max partials.
    blck.y = grd.y;
    grd.y = 1;
    reduce<<<grd, blck>>>(dev_res, blck.y, m, dev_res, &dev_res[m], m * blck.y);
    err = cudaGetLastError();
    if(err)
      printf("K2 %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(res, dev_res, 2 * m * sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("4 %s\n", cudaGetErrorString(err));

    cudaFree(dev_res);
    cudaFree(dev_M);
}

/*
 * CPU reference check for the column reduction of the n x m row-major matrix
 * M: returns true when res[j] is the minimum and res[m + j] the maximum of
 * column j, for every j in [0, m).
 */
__host__ bool check_result(int* M, int* res, int n, int m)
{
    for(int col = 0; col < m; ++col)
    {
        int lowest = M[col];
        int highest = M[col];

        for(int row = 1; row < n; ++row)
        {
            const int value = M[row * m + col];
            if(lowest > value)
                lowest = value;
            if(highest < value)
                highest = value;
        }

        if(res[col] != lowest || res[m + col] != highest)
            return false;   // stop at the first wrong column
    }
    return true;
}

/*
 * Column-wise min/max reduction over an n x m row-major matrix.
 *
 * Minima are taken from M[row * m + col], maxima from M[offset + row * m + col]
 * (offset == 0 means both come from the same data). Each block covers
 * 2 * blockDim.y rows and blockDim.x columns and writes, for each of its
 * columns, one partial minimum to res_min[blockIdx.y * m + col] and one
 * partial maximum to res_max[blockIdx.y * m + col]. A block that lies entirely
 * below the matrix writes the neutral values (largest int / smallest int), so
 * every slot of the partial buffers is defined.
 *
 * Launch layout: 2D blocks with blockDim.x, blockDim.y <= BLOCK_SIZE; any
 * blockDim.y is supported, not only powers of two.
 * Shared memory: 2 * BLOCK_SIZE * BLOCK_SIZE ints (static).
 * In-place use (res_min / res_max aliasing M) is safe within one block because
 * all global reads precede the first barrier and all writes follow the last
 * one; it is not safe if another block row reads the rows being written.
 */
__global__ void reduce(int* M, int n, int m, int* res_min, int* res_max, int offset)
{
    __shared__ int sh[2][BLOCK_SIZE][BLOCK_SIZE];

    // Neutral elements of min and max, spelled without <limits.h>.
    const int NO_MIN = 0x7fffffff, NO_MAX = -NO_MIN - 1;

    int tid_x = blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = 2 * blockIdx.y * blockDim.y + threadIdx.y;

    int lo = NO_MIN, hi = NO_MAX;
    if(tid_x < m && tid_y < n)
    {
        lo = M[tid_y * m + tid_x];
        hi = M[offset + tid_y * m + tid_x];

        // Fold in the row blockDim.y further down while loading.
        if(tid_y + blockDim.y < n)
        {
            int temp1 = M[(tid_y + blockDim.y) * m + tid_x],
                temp2 = M[offset + (tid_y + blockDim.y) * m + tid_x];

            if(temp1 < lo)
              lo = temp1;
            if(temp2 > hi)
              hi = temp2;
        }
    }

    // Every thread stores something, so no shared slot is read uninitialised.
    sh[0][threadIdx.y][threadIdx.x] = lo;
    sh[1][threadIdx.y][threadIdx.x] = hi;

    // Tree reduction along y. The trip count depends only on blockDim.y, so
    // every thread of the block reaches every barrier. Rounding half up keeps
    // the odd leftover row when the live count is not even.
    for(int live = blockDim.y; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.y + half < live)
        {
            if(sh[0][threadIdx.y + half][threadIdx.x] < sh[0][threadIdx.y][threadIdx.x])
              sh[0][threadIdx.y][threadIdx.x] = sh[0][threadIdx.y + half][threadIdx.x];

            if(sh[1][threadIdx.y + half][threadIdx.x] > sh[1][threadIdx.y][threadIdx.x])
              sh[1][threadIdx.y][threadIdx.x] = sh[1][threadIdx.y + half][threadIdx.x];
        }
        live = half;
    }

    // Row 0 of each column holds the block result; it was last written by this
    // very thread, so no further barrier is needed.
    if(!threadIdx.y && tid_x < m)
    {
        res_min[blockIdx.y * m + tid_x] = sh[0][0][threadIdx.x];
        res_max[blockIdx.y * m + tid_x] = sh[1][0][threadIdx.x];
    }
}

In [None]:
filepath = "drugi.cu"  #@param { type: "string" }
compiled_filepath = "drugi"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Zadatak 3

In [None]:
%%writefile treci.cu

#include <stdio.h>
#include <time.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* M, int* res, int n, int m);
__host__ bool check_result(int* M, int* res, int n, int m);
__global__ void reduce(int* M, int* res, int n, int m);

/*
 * Entry point of the third exercise: builds a random n x n matrix, computes
 * one value per row through operate(), prints the n results and validates
 * them with check_result().
 */
int main(int argc, char** argv)
{
    const int n = (1 << 8);
    int* M = nullptr;
    int* result = nullptr;

    srand(time(NULL));

    init_vector(&M, n * n, 1);
    init_vector(&result, n, -1);

    operate(M, result, n, n);

    // Dump the result vector as one tab-separated line between bars.
    printf("res = |\t");
    for(int i = 0; i < n; ++i)
        printf("%d\t", result[i]);
    printf("|\n");

    const bool ok = check_result(M, result, n, n);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(M);
    free(result);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills every
 * element with rand() % 100. The val parameter is accepted but not used.
 * The caller releases the array with free().
 */
__host__ void init_vector(int** v, int n, int val)
{
    int* data = (int*)malloc(sizeof(int) * n);
    *v = data;

    for(int k = 0; k < n; ++k)
        data[k] = rand() % 100;
}

/*
 * Host driver for the per-row sum of an n x m row-major matrix M; on return
 * res[i] holds the sum of row i.
 *
 * Pass 1 launches reduce() with each block covering 2 * BLOCK_SIZE columns
 * and stores grd.x partial sums per row in dev_res. Pass 2 launches reduce()
 * with one block column whose blockDim.x is the number of partials and writes
 * the final sums to a separate buffer. Reading and writing the same buffer in
 * pass 2 would let one block overwrite partials that another block has not
 * read yet (output row r lands at index r, which is also partial r % k of
 * row r / k).
 *
 * Limitation: pass 2 uses grd.x as blockDim.x, so the number of partials per
 * row must not exceed BLOCK_SIZE; otherwise the second launch fails and is
 * reported through the "K2" check. The final result also relies on reduce()
 * summing all blck.x partials of a row.
 *
 * CUDA errors are printed to stdout with a step tag and execution continues
 * so that device memory is always released.
 */
__host__ void operate(int* M, int* res, int n, int m)
{
    if(n <= 0 || m <= 0)
      return;   // nothing to sum; a zero-sized grid would be an invalid launch

    int* dev_M = nullptr, *dev_res = nullptr, *dev_out = nullptr;

    // Ceil-div: no block column lies fully outside the matrix, so every
    // partial consumed by pass 2 is really written by pass 1.
    dim3 grd((m + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE), (n + BLOCK_SIZE - 1) / BLOCK_SIZE),
        blck(BLOCK_SIZE, BLOCK_SIZE);

    size_t full = sizeof(int) * n * m, part = sizeof(int) * grd.x * n, out = sizeof(int) * n;
    cudaError_t err;

    err = cudaMalloc(&dev_M, full);
    if(err)
      printf("1M %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_res, part);
    if(err)
      printf("1res %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_out, out);
    if(err)
      printf("1out %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_M, M, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2 %s\n", cudaGetErrorString(err));

    reduce<<<grd, blck>>>(dev_M, dev_res, n, m);
    err = cudaGetLastError();
    if(err)
      printf("K1 %s\n", cudaGetErrorString(err));

    err = cudaDeviceSynchronize();
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    // Pass 2: treat the partials as an n x blck.x matrix and sum each row.
    blck.x = grd.x;
    grd.x = 1;
    reduce<<<grd, blck>>>(dev_res, dev_out, n, blck.x);
    err = cudaGetLastError();
    if(err)
      printf("K2 %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(res, dev_out, out, cudaMemcpyDeviceToHost);
    if(err)
      printf("4 %s\n", cudaGetErrorString(err));

    cudaFree(dev_out);
    cudaFree(dev_res);
    cudaFree(dev_M);
}

/*
 * CPU reference check for the row sums of the n x m row-major matrix M:
 * returns true when res[i] equals the sum of row i for every i in [0, n).
 */
__host__ bool check_result(int* M, int* res, int n, int m)
{
    for(int row = 0; row < n; ++row)
    {
        const int* line = M + row * m;
        int total = 0;
        for(int col = 0; col < m; ++col)
            total += line[col];

        if(res[row] != total)
            return false;   // stop at the first wrong row
    }
    return true;
}

/*
 * Row-wise partial sums of an n x m row-major matrix.
 *
 * Each block covers blockDim.y rows and 2 * blockDim.x columns and writes one
 * partial sum per row to res[row * gridDim.x + blockIdx.x]. A block that lies
 * entirely to the right of the matrix writes 0, so every partial is defined.
 *
 * Launch layout: 2D blocks with blockDim.x, blockDim.y <= BLOCK_SIZE; any
 * blockDim.x is supported, not only powers of two.
 * Shared memory: BLOCK_SIZE * BLOCK_SIZE ints (static).
 * M and res should not alias when more than one block is launched: another
 * block may still be reading the locations this block writes.
 */
__global__ void reduce(int* M, int* res, int n, int m)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    int tid_x = 2 * blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Out-of-range threads contribute the neutral element of addition.
    int acc = 0;
    if(tid_x < m && tid_y < n)
    {
        acc = M[tid_y * m + tid_x];

        // Fold in the element blockDim.x further right while loading.
        if(tid_x + blockDim.x < m)
          acc += M[tid_y * m + tid_x + blockDim.x];
    }
    sh[threadIdx.y][threadIdx.x] = acc;

    // Tree reduction along x. The trip count depends only on blockDim.x, so
    // every thread of the block reaches every barrier. Rounding half up keeps
    // the odd leftover column when the live count is not even.
    for(int live = blockDim.x; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.x + half < live)
          sh[threadIdx.y][threadIdx.x] += sh[threadIdx.y][threadIdx.x + half];

        live = half;
    }

    // Column 0 was last written by this very thread; no extra barrier needed.
    if(!threadIdx.x && tid_y < n)
      res[tid_y * gridDim.x + blockIdx.x] = sh[threadIdx.y][0];
}

In [None]:
filepath = "treci.cu"  #@param { type: "string" }
compiled_filepath = "treci"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Zadatak 4

In [None]:
%%writefile cetvrti.cu

#include <stdio.h>
#include <time.h>
#define BLOCK_SIZE 1024

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* A, int* B, int* res, int n);
__host__ bool check_result(int* A, int* B, int res, int n);
__global__ void reduce(int* A, int* B, int* res, int n);
__global__ void sum_up(int* dst, int* src, int n);

/*
 * Entry point of the fourth exercise: builds two random vectors of n ints,
 * obtains a single int from operate() and validates it with check_result().
 */
int main(int argc, char** argv)
{
    const int n = (1 << 16);
    int* A = nullptr;
    int* B = nullptr;
    int res = 0;

    srand(time(NULL));

    init_vector(&A, n, 1);
    init_vector(&B, n, 1);

    operate(A, B, &res, n);

    const bool ok = check_result(A, B, res, n);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(B);
    free(A);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills every
 * element with rand() % 100. The val parameter is accepted but not used.
 * The caller releases the array with free().
 */
__host__ void init_vector(int** v, int n, int val)
{
    int* data = (int*)malloc(sizeof(int) * n);
    *v = data;

    for(int k = 0; k < n; ++k)
        data[k] = rand() % 100;
}

/*
 * Host driver for the dot product of A and B (n ints each); the result is
 * stored in *res.
 *
 * reduce() produces one partial sum per block, each block covering
 * 2 * BLOCK_SIZE elements; sum_up() then folds the partials with one block.
 * The block count is rounded up so a tail shorter than 2 * BLOCK_SIZE is not
 * dropped, and n < 2 * BLOCK_SIZE no longer yields a zero-block launch.
 *
 * Limitation: the single sum_up() pass handles at most 2 * BLOCK_SIZE
 * partials; larger inputs are rejected with a message.
 *
 * CUDA errors are printed to stdout with a step tag and execution continues
 * so that device memory is always released.
 */
__host__ void operate(int* A, int* B, int* res, int n)
{
    if(n <= 0)
    {
        *res = 0;   // dot product of empty vectors
        return;
    }

    int* dev_A = nullptr, *dev_B = nullptr, *dev_res = nullptr,
        grid_size = (n + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE);

    if(grid_size > 2 * BLOCK_SIZE)
    {
        printf("0 input too large for a single sum_up pass\n");
        return;
    }

    // sum_up() is launched with the largest power of two not above grid_size:
    // every thread then owns a valid partial, and each thread also folds in
    // the partial one block width further on when it exists.
    int sum_threads = 1;
    while(sum_threads * 2 <= grid_size)
      sum_threads *= 2;
    if(sum_threads > BLOCK_SIZE)
      sum_threads = BLOCK_SIZE;

    size_t full = sizeof(int) * n, part = sizeof(int) * grid_size;
    cudaError_t err;

    err = cudaMalloc(&dev_A, full);
    if(err)
      printf("1A %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_B, full);
    if(err)
      printf("1B %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_res, part);
    if(err)
      printf("1res %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2A %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_B, B, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2B %s\n", cudaGetErrorString(err));

    reduce<<<grid_size, BLOCK_SIZE>>>(dev_A, dev_B, dev_res, n);
    err = cudaGetLastError();
    if(err)
      printf("K1 %s\n", cudaGetErrorString(err));

    // Same stream, so this launch runs after reduce() has finished.
    sum_up<<<1, sum_threads>>>(dev_res, dev_res, grid_size);
    err = cudaGetLastError();
    if(err)
      printf("K2 %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(res, dev_res, sizeof(int), cudaMemcpyDeviceToHost);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    cudaFree(dev_res);
    cudaFree(dev_B);
    cudaFree(dev_A);
}

/*
 * CPU reference check: returns true when res equals the dot product of the
 * first n elements of A and B, accumulated in an int.
 */
__host__ bool check_result(int* A, int* B, int res, int n)
{
    int expected = 0;
    int k = 0;
    while(k < n)
    {
        expected += A[k] * B[k];
        ++k;
    }
    return res == expected;
}

/*
 * Per-block partial dot product: block b sums A[i] * B[i] over the
 * 2 * blockDim.x elements starting at 2 * b * blockDim.x (clipped to n) and
 * writes the sum to partial[b]. A block that lies entirely past n writes 0.
 *
 * Launch layout: 1D blocks with blockDim.x <= BLOCK_SIZE; any blockDim.x is
 * supported, not only powers of two. partial must hold gridDim.x ints.
 * Shared memory: BLOCK_SIZE ints (static).
 */
__global__ void reduce(int* A, int* B, int* partial, int n)
{
    __shared__ int sh[BLOCK_SIZE];

    int tid_x = 2 * blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute the neutral element of addition.
    int acc = 0;
    if(tid_x < n)
    {
        acc = A[tid_x] * B[tid_x];
        if(tid_x + blockDim.x < n)
          acc += A[tid_x + blockDim.x] * B[tid_x + blockDim.x];
    }
    sh[threadIdx.x] = acc;

    // Tree reduction. The trip count depends only on blockDim.x, so every
    // thread of the block reaches every barrier; rounding half up keeps the
    // odd leftover element when the live count is not even.
    for(int live = blockDim.x; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.x + half < live)
          sh[threadIdx.x] += sh[threadIdx.x + half];

        live = half;
    }

    // sh[0] was last written by thread 0 itself; no extra barrier needed.
    if(!threadIdx.x)
      partial[blockIdx.x] = sh[0];
}

/*
 * Per-block sum: block b adds the 2 * blockDim.x elements of src starting at
 * 2 * b * blockDim.x (clipped to n) and writes the sum to dst[b]. A block
 * that lies entirely past n writes 0.
 *
 * Launch layout: 1D blocks with blockDim.x <= BLOCK_SIZE; any blockDim.x is
 * supported, not only powers of two. dst must hold gridDim.x ints.
 * Shared memory: BLOCK_SIZE ints (static).
 * dst may alias src when a single block is launched: all reads of src happen
 * before the first barrier (or in the same thread when blockDim.x is 1) and
 * the only write follows the last one. With several blocks, aliasing is a race.
 */
__global__ void sum_up(int* dst, int* src, int n)
{
    __shared__ int sh[BLOCK_SIZE];

    int tid_x = 2 * blockIdx.x * blockDim.x + threadIdx.x;

    // Out-of-range threads contribute the neutral element of addition.
    int acc = 0;
    if(tid_x < n)
    {
        acc = src[tid_x];
        if(tid_x + blockDim.x < n)
          acc += src[tid_x + blockDim.x];
    }
    sh[threadIdx.x] = acc;

    // Tree reduction with block-uniform barriers; rounding half up keeps the
    // odd leftover element when the live count is not even.
    for(int live = blockDim.x; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.x + half < live)
          sh[threadIdx.x] += sh[threadIdx.x + half];

        live = half;
    }

    if(!threadIdx.x)
      dst[blockIdx.x] = sh[0];
}

In [None]:
filepath = "cetvrti.cu"  #@param { type: "string" }
compiled_filepath = "cetvrti"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Zadatak 5

In [None]:
%%writefile peti.cu

#include <stdio.h>
#include <time.h>
#define BLOCK_SIZE 32

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* A, int* res, int n, int m);
__host__ bool check_result(int* A, int* res, int n, int m);
__global__ void reduce(int* A, int* res, int n, int m);
__global__ void minimize(int* M, int n, int m, int* avg);

/*
 * Entry point of the fifth exercise: builds a random n x m matrix, lets
 * operate() produce the transformed copy in res and validates it with
 * check_result().
 */
int main(int argc, char** argv)
{
    const int n = 1 << 8;
    const int m = 1 << 6;
    int* A = nullptr;
    int* res = nullptr;

    srand(time(NULL));

    init_vector(&A, n * m, 1);
    init_vector(&res, n * m, 0);

    operate(A, res, n, m);

    const bool ok = check_result(A, res, n, m);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(res);
    free(A);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills every
 * element with rand() % 100. The val parameter is accepted but not used.
 * The caller releases the array with free().
 */
__host__ void init_vector(int** v, int n, int val)
{
    int* data = (int*)malloc(sizeof(int) * n);
    *v = data;

    for(int k = 0; k < n; ++k)
        data[k] = rand() % 100;
}

/*
 * Host driver: replaces with -1 every element of the n x m row-major matrix A
 * that is greater than the integer average of all elements, and returns the
 * transformed matrix in res (n * m ints).
 *
 * Step 1: reduce() produces one partial sum per 2*BLOCK_SIZE x 2*BLOCK_SIZE
 *         tile. Tile counts are rounded up so edge tiles are not dropped and
 *         small matrices do not produce a zero-sized grid.
 * Step 2: reduce() is launched again with one block whose shape equals the
 *         tile grid, folding the partials into dev_res[0].
 * Step 3: minimize() compares every element with dev_res[0] / (n * m).
 *
 * Limitation: step 2 uses the tile grid as block shape, so it needs at most
 * BLOCK_SIZE tiles per direction and a block shape the device accepts;
 * otherwise the launch fails and is reported through the "K2" check. The
 * result also relies on reduce() summing every element of that block.
 *
 * CUDA errors are printed to stdout with a step tag and execution continues
 * so that device memory is always released.
 */
__host__ void operate(int* A, int* res, int n, int m)
{
    if(n <= 0 || m <= 0)
      return;   // nothing to process; a zero-sized grid would be an invalid launch

    int* dev_A = nullptr, *dev_res = nullptr;
    dim3 grd((m + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE), (n + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE)),
        blck(BLOCK_SIZE, BLOCK_SIZE);

    size_t full = sizeof(int) * n * m, part = sizeof(int) * grd.x * grd.y;
    cudaError_t err;

    err = cudaMalloc(&dev_A, full);
    if(err)
      printf("1A %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_res, part);
    if(err)
      printf("1res %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2A %s\n", cudaGetErrorString(err));

    reduce<<<grd, blck>>>(dev_A, dev_res, n, m);
    err = cudaGetLastError();
    if(err)
      printf("K1 %s\n", cudaGetErrorString(err));

    // The partials form a grd.y x grd.x matrix; one block of that shape sums it.
    blck.x = grd.x;
    blck.y = grd.y;
    grd.x = grd.y = 1;
    reduce<<<grd, blck>>>(dev_res, dev_res, blck.y, blck.x);
    err = cudaGetLastError();
    if(err)
      printf("K2 %s\n", cudaGetErrorString(err));

    // One thread per matrix element for the element-wise pass.
    grd.x = (m + BLOCK_SIZE - 1) / BLOCK_SIZE;
    grd.y = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    blck.x = blck.y = BLOCK_SIZE;
    minimize<<<grd, blck>>>(dev_A, n, m, dev_res);
    err = cudaGetLastError();
    if(err)
      printf("K3 %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(res, dev_A, full, cudaMemcpyDeviceToHost);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    cudaFree(dev_res);
    cudaFree(dev_A);
}

/*
 * CPU reference check: computes the integer average of the n * m elements of
 * A and returns true when every B element equals -1 where the corresponding
 * A element is greater than that average, and the A element itself otherwise.
 */
__host__ bool check_result(int* A, int* B, int n, int m)
{
    // Row-major storage lets both passes walk the matrix as one flat array.
    const int count = n * m;

    int avg = 0;
    for(int k = 0; k < count; ++k)
        avg += A[k];
    avg /= n * m;

    for(int k = 0; k < count; ++k)
    {
        const int expected = (A[k] > avg) ? -1 : A[k];
        if(B[k] != expected)
            return false;   // stop at the first mismatch
    }
    return true;
}

/*
 * Per-tile partial sum of an n x m row-major matrix.
 *
 * Each block covers a tile of 2 * blockDim.y rows by 2 * blockDim.x columns
 * (clipped to the matrix) and writes the tile sum to
 * res[blockIdx.y * gridDim.x + blockIdx.x]. A block whose tile lies entirely
 * outside the matrix writes 0.
 *
 * Launch layout: 2D blocks with blockDim.x, blockDim.y <= BLOCK_SIZE; any
 * block shape is supported, not only powers of two.
 * Shared memory: BLOCK_SIZE * BLOCK_SIZE ints (static).
 * res may alias A when a single block is launched: all global reads happen
 * before the first barrier (or in the same thread for a 1 x 1 block) and the
 * only write follows the last one. With several blocks, aliasing is a race.
 */
__global__ void reduce(int* A, int* res, int n, int m)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    int tid_x = 2 * blockIdx.x * blockDim.x + threadIdx.x,
        tid_y = 2 * blockIdx.y * blockDim.y + threadIdx.y;

    // Out-of-range threads contribute the neutral element of addition.
    int acc = 0;
    if(tid_x < m && tid_y < n)
    {
        bool right = tid_x + blockDim.x < m,
             below = tid_y + blockDim.y < n;

        // Each thread gathers up to four elements of the tile while loading.
        acc = A[tid_y * m + tid_x];
        if(right)
          acc += A[tid_y * m + tid_x + blockDim.x];
        if(below)
        {
            acc += A[(tid_y + blockDim.y) * m + tid_x];
            if(right)
              acc += A[(tid_y + blockDim.y) * m + tid_x + blockDim.x];
        }
    }
    sh[threadIdx.y][threadIdx.x] = acc;

    // Phase 1: fold every shared row into its column 0. Trip counts depend
    // only on the block shape, so all threads reach every barrier; rounding
    // half up keeps the odd leftover element.
    for(int live = blockDim.x; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.x + half < live)
          sh[threadIdx.y][threadIdx.x] += sh[threadIdx.y][threadIdx.x + half];

        live = half;
    }

    // Phase 2: fold column 0 along y. The first barrier here also publishes
    // the row sums finished in phase 1.
    for(int live = blockDim.y; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if(!threadIdx.x && (int)threadIdx.y + half < live)
          sh[threadIdx.y][0] += sh[threadIdx.y + half][0];

        live = half;
    }

    // sh[0][0] was last written by thread (0, 0) itself.
    if(!threadIdx.x && !threadIdx.y)
      res[blockIdx.y * gridDim.x + blockIdx.x] = sh[0][0];
}

/*
 * Element-wise pass over the n x m row-major matrix M: every element greater
 * than avg[0] / (n * m) (integer division) is overwritten with -1.
 * One thread per element on a 2D grid; threads outside the matrix do nothing.
 */
__global__ void minimize(int* M, int n, int m, int* avg)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if(col >= m || row >= n)
        return;

    // avg[0] holds a total; dividing by the element count yields the threshold.
    const int threshold = avg[0] / (n * m);

    int* cell = &M[row * m + col];
    if(*cell > threshold)
        *cell = -1;
}

In [None]:
filepath = "peti.cu"  #@param { type: "string" }
compiled_filepath = "peti"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv

# Zadatak 6

In [None]:
%%writefile sesti.cu

#include <stdio.h>
#include <time.h>
#include <math.h>
#define BLOCK_SIZE 1024

__host__ void init_vector(int** v, int n, int val);
__host__ void operate(int* v, int* res, int n);
__host__ bool check_result(int* v, int* res, int n);
__global__ void reduce(int* v, int* res, int n);
__global__ void maximize(int* v, int n, int* avg);

/*
 * Entry point of the sixth exercise: builds a random vector of n ints, lets
 * operate() produce the transformed copy in res and validates it with
 * check_result().
 */
int main(int argc, char** argv)
{
    const int n = 1 << 16;
    int* v = nullptr;
    int* res = nullptr;

    srand(time(NULL));

    init_vector(&v, n, 1);
    init_vector(&res, n, 0);

    operate(v, res, n);

    const bool ok = check_result(v, res, n);
    printf("%s", ok ? "Correct!\n" : "False!\n");

    free(res);
    free(v);

    return 0;
}

/*
 * Allocates an array of n ints with malloc, stores it in *v and fills it with
 * values of alternating sign: even indices get rand() % 100, odd indices get
 * the negated value. The val parameter is accepted but not used.
 * The caller releases the array with free().
 *
 * The sign is chosen with an explicit index test. Writing through v[i++]
 * while also reading i on the right-hand side of the same assignment is
 * unsequenced before C++17, so the parity that received the minus sign was
 * not well defined; the even-positive layout is what C++17 ordering yields.
 */
__host__ void init_vector(int** v, int n, int val)
{
    *v = (int*)malloc(sizeof(int) * n);
    for(int i = 0; i < n; i++)
    {
        int magnitude = rand() % 100;
        (*v)[i] = (i % 2 == 0) ? magnitude : -magnitude;
    }
}

/*
 * Host driver: replaces every negative element of v (n ints) with the
 * smallest non-negative element of v and returns the transformed vector in
 * res.
 *
 * Step 1: reduce() writes one partial minimum per block; each block covers
 *         2 * BLOCK_SIZE elements, so the block count is n / (2 * BLOCK_SIZE)
 *         rounded up. Launching more blocks would leave partials unwritten.
 * Step 2: reduce() folds the partials in one block, launched with the largest
 *         power of two not above the partial count so every thread owns a
 *         valid partial.
 * Step 3: maximize() patches the negative elements, one thread per element.
 *
 * Limitation: the single step-2 pass handles at most 2 * BLOCK_SIZE partials;
 * larger inputs are rejected with a message.
 *
 * CUDA errors are printed to stdout with a step tag and execution continues
 * so that device memory is always released.
 */
__host__ void operate(int* v, int* res, int n)
{
    if(n <= 0)
      return;   // nothing to process; a zero-sized grid would be an invalid launch

    int* dev_v = nullptr, *dev_res = nullptr,
        grid_size = (n + 2 * BLOCK_SIZE - 1) / (2 * BLOCK_SIZE),   // reduction blocks
        map_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;            // element-wise blocks

    if(grid_size > 2 * BLOCK_SIZE)
    {
        printf("0 input too large for a single second reduction pass\n");
        return;
    }

    int fold_threads = 1;
    while(fold_threads * 2 <= grid_size)
      fold_threads *= 2;
    if(fold_threads > BLOCK_SIZE)
      fold_threads = BLOCK_SIZE;

    size_t full = sizeof(int) * n, part = sizeof(int) * grid_size;
    cudaError_t err;

    err = cudaMalloc(&dev_v, full);
    if(err)
      printf("1A %s\n", cudaGetErrorString(err));

    err = cudaMalloc(&dev_res, part);
    if(err)
      printf("1res %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(dev_v, v, full, cudaMemcpyHostToDevice);
    if(err)
      printf("2A %s\n", cudaGetErrorString(err));

    reduce<<<grid_size, BLOCK_SIZE>>>(dev_v, dev_res, n);
    err = cudaGetLastError();
    if(err)
      printf("K1 %s\n", cudaGetErrorString(err));

    reduce<<<1, fold_threads>>>(dev_res, dev_res, grid_size);
    err = cudaGetLastError();
    if(err)
      printf("K2 %s\n", cudaGetErrorString(err));

    maximize<<<map_blocks, BLOCK_SIZE>>>(dev_v, n, dev_res);
    err = cudaGetLastError();
    if(err)
      printf("K3 %s\n", cudaGetErrorString(err));

    err = cudaMemcpy(res, dev_v, full, cudaMemcpyDeviceToHost);
    if(err)
      printf("3 %s\n", cudaGetErrorString(err));

    cudaFree(dev_res);
    cudaFree(dev_v);
}

/*
 * CPU reference check: finds the smallest non-negative element of v and
 * returns true when every res[i] equals that minimum where v[i] is negative
 * and v[i] itself otherwise.
 *
 * The minimum starts as "none found" (-1) instead of v[0]: seeding it with a
 * negative v[0] would make it impossible for any non-negative value to
 * replace it. If v has no non-negative element the expected replacement
 * value is -1.
 */
__host__ bool check_result(int* v, int* res, int n)
{
    int min_pos = -1;   // negative means no non-negative element seen yet
    for(int i = 0; i < n; i++)
      if(v[i] >= 0 && (min_pos < 0 || v[i] < min_pos))
        min_pos = v[i];

    for(int i = 0; i < n; i++)
      if(res[i] != (v[i] < 0 ? min_pos : v[i]))
        return false;   // stop at the first mismatch

    return true;
}

/*
 * Per-block minimum of the non-negative elements of v.
 *
 * Block b examines the 2 * blockDim.x elements starting at 2 * b * blockDim.x
 * (clipped to n) and writes the smallest non-negative one to res[b]. If the
 * block sees no non-negative element (including a block entirely past n) it
 * writes -1, which later passes treat as "none found".
 *
 * Launch layout: 1D blocks with blockDim.x <= BLOCK_SIZE; any blockDim.x is
 * supported, not only powers of two. res must hold gridDim.x ints.
 * Shared memory: BLOCK_SIZE ints (static).
 * res may alias v when a single block is launched: all global reads happen
 * before the first barrier (or in the same thread when blockDim.x is 1) and
 * the only write follows the last one. With several blocks, aliasing is a race.
 */
__global__ void reduce(int* v, int* res, int n)
{
    __shared__ int sh[BLOCK_SIZE];

    int tid_x = 2 * blockIdx.x * blockDim.x + threadIdx.x;

    // -1 marks "no non-negative value yet"; negative inputs are normalised to
    // it so every later comparison only has to handle one sentinel.
    int best = -1;
    if(tid_x < n)
    {
        int first = v[tid_x];
        if(first >= 0)
          best = first;

        if(tid_x + blockDim.x < n)
        {
            int temp = v[tid_x + blockDim.x];
            if(temp >= 0 && (best < 0 || temp < best))
              best = temp;
        }
    }
    sh[threadIdx.x] = best;

    // Tree reduction with block-uniform barriers; rounding half up keeps the
    // odd leftover element when the live count is not even.
    for(int live = blockDim.x; live > 1; )
    {
        int half = (live + 1) >> 1;
        __syncthreads();

        if((int)threadIdx.x + half < live)
        {
            int other = sh[threadIdx.x + half];
            // A valid candidate replaces the sentinel as well as a larger value.
            if(other >= 0 && (sh[threadIdx.x] < 0 || other < sh[threadIdx.x]))
              sh[threadIdx.x] = other;
        }
        live = half;
    }

    // sh[0] was last written by thread 0 itself; no extra barrier needed.
    if(!threadIdx.x)
      res[blockIdx.x] = sh[0];
}

/*
 * Element-wise pass over v (n ints): every negative element is overwritten
 * with min_pos[0]. One thread per element on a 1D grid; threads past the end
 * of the vector do nothing.
 */
__global__ void maximize(int* v, int n, int* min_pos)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx >= n)
        return;

    if(v[idx] < 0)
        v[idx] = min_pos[0];
}

In [None]:
filepath = "sesti.cu"  #@param { type: "string" }
compiled_filepath = "sesti"  #@param { type: "string" }

!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

argv = "" #@param [] { allow-input: true }

!./$compiled_filepath $argv