# Januar 2022

In [None]:
%%writefile jan_2022.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;

#define BLOCK_SIZE 256

// Allocates *A with room for n ints and fills it with the values 1..n.
__host__ void initialize_vector(int** A, int n);
// Copies A to the device, launches kernel, and copies the n - 2 results back into B.
__host__ void operate_on_GPU(int* A, int* B, int n);
// Device kernel: B[i] = (A[i] + A[i + 1] + A[i + 2]) / 3 for every i < n - 2.
__global__ void kernel(int* A, int* B, int n);
// Recomputes the three-element averages on the host and compares them with B.
__host__ bool check_result(int*A, int*B, int n);
// Prints lbl followed by the n elements of A, separated by tabs.
__host__ void print_vector(const char* lbl, int* A, int n);

/*
 * Entry point: builds the input vector A = 1..n, computes the three-element
 * integer averages on the GPU into B (n - 2 values), prints B and verifies it
 * against a host-side recomputation.
 *
 * Returns 0 after a normal run and EXIT_FAILURE if a host buffer could not be
 * allocated.
 */
int main(int argc, char** argv)
{
    int *A = NULL, *B = NULL;
    const int n = (1 << 15) + 25;   // deliberately not a multiple of BLOCK_SIZE

    initialize_vector(&A, n);

    // calloc so that B holds defined values even if the GPU path reports an
    // error and never writes the result back.
    B = (int*) calloc(n - 2, sizeof(int));
    if(A == NULL || B == NULL)
    {
      cout << "Host allocation failed" << endl;
      free(B);
      free(A);
      return EXIT_FAILURE;
    }

    operate_on_GPU(A, B, n);

    print_vector("B", B, n - 2);

    if(check_result(A, B, n))
      cout << "Correct!" << endl;
    else
      cout << "False!" << endl;

    // Release the host buffers (previously leaked).
    free(B);
    free(A);
    return 0;
}


/*
 * Allocates a host vector of n ints and fills it with 1, 2, ..., n.
 *
 * A  - out parameter; receives the malloc'ed buffer (caller frees it).
 *      Set to NULL when n <= 0, because there is nothing to store.
 * n  - number of elements.
 *
 * Terminates the program with a message on stderr if the allocation fails,
 * instead of writing through a NULL pointer.
 */
__host__ void initialize_vector(int** A, int n)
{
    if(n <= 0)
    {
      // A negative n would wrap to a huge size_t in the size computation.
      *A = NULL;
      return;
    }

    *A = (int*)malloc(sizeof(int) * (size_t)n);
    if(*A == NULL)
    {
      fprintf(stderr, "initialize_vector: cannot allocate %d ints\n", n);
      exit(EXIT_FAILURE);
    }

    for(int i = 0; i < n; i++)
      (*A)[i] = i + 1;
}

/* Prints "<tag> <CUDA error string>" and returns true when err signals a failure. */
static bool cuda_failed(const char* tag, cudaError_t err)
{
    if(err == cudaSuccess)
      return false;
    cout << tag << " " << cudaGetErrorString(err) << endl;
    return true;
}

/*
 * Runs the three-element average on the GPU.
 *
 * A - host input vector with n elements.
 * B - host output vector with room for n - 2 elements.
 * n - length of A; nothing is done when n < 3 (no complete window exists).
 *
 * Every CUDA call is checked. After the first failure the remaining steps are
 * skipped (B is left untouched) and the device buffers are still released.
 */
__host__ void operate_on_GPU(int* A, int* B, int n)
{
    if(n < 3)
      return;   // n - 2 would be <= 0 and the byte count below would underflow

    int *dev_A = NULL, *dev_B = NULL;
    const size_t full = sizeof(int) * n,
                 part = sizeof(int) * (n - 2);

    bool ok = !cuda_failed("1A", cudaMalloc(&dev_A, full))
           && !cuda_failed("1B", cudaMalloc(&dev_B, part))
           && !cuda_failed("2", cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice));

    if(ok)
    {
      // At most 256 blocks; the kernel strides over whatever the grid does not cover.
      kernel<<<min(256, n / BLOCK_SIZE + 1), BLOCK_SIZE>>>(dev_A, dev_B, n);
      // A launch does not return a status; a bad configuration only shows up here.
      ok = !cuda_failed("K", cudaGetLastError());
    }

    // The blocking copy also reports errors raised while the kernel was running.
    if(ok)
      cuda_failed("3", cudaMemcpy(B, dev_B, part, cudaMemcpyDeviceToHost));

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(dev_B);
    cudaFree(dev_A);
}

/*
 * Three-element integer average: B[i] = (A[i] + A[i + 1] + A[i + 2]) / 3
 * for 0 <= i < n - 2.
 *
 * Launch layout: 1-D grid of 1-D blocks with blockDim.x <= BLOCK_SIZE (the
 * static shared buffer has BLOCK_SIZE ints). Any grid size works because each
 * block strides over the input by blockDim.x * gridDim.x.
 *
 * Shared memory: sh[k] caches A[base + k + 1], the middle element of thread
 * k's window. The left neighbour of thread 0 and the right neighbour of the
 * last thread are not cached by this block and are read from global memory.
 *
 * The loop runs on the block's base index, which is identical for all threads
 * of a block, so every thread executes the same number of iterations and
 * reaches both __syncthreads() calls. Looping on the per-thread index would
 * let threads past the end of the data skip the barrier while others wait.
 */
__global__ void kernel(int* A, int* B, int n)
{
    __shared__ int sh[BLOCK_SIZE];

    const int stride = blockDim.x * gridDim.x;

    for(int base = blockIdx.x * blockDim.x; base < n - 1; base += stride)
    {
        const int tid = base + threadIdx.x;

        // A[tid + 1] exists only for tid < n - 1.
        if(tid < n - 1)
          sh[threadIdx.x] = A[tid + 1];
        __syncthreads();

        if(tid < n - 2)
        {
          // tid < n - 2 guarantees that thread threadIdx.x + 1 (if it exists
          // in this block) loaded sh[threadIdx.x + 1] above.
          const int left  = (threadIdx.x == 0)
                              ? A[tid]
                              : sh[threadIdx.x - 1];
          const int right = (threadIdx.x == blockDim.x - 1)
                              ? A[tid + 2]
                              : sh[threadIdx.x + 1];
          B[tid] = (left + sh[threadIdx.x] + right) / 3;
        }

        // Nobody may overwrite sh for the next tile while it is still being read.
        __syncthreads();
    }
}

/*
 * Host-side verification: returns true when every B[k], 0 <= k < n - 2,
 * equals the integer average (A[k] + A[k + 1] + A[k + 2]) / 3.
 * Stops at the first mismatch; an empty range counts as correct.
 */
__host__ bool check_result(int*A, int*B, int n)
{
    const int windows = n - 2;

    for(int k = 0; k < windows; ++k)
    {
        const int expected = (A[k] + A[k + 1] + A[k + 2]) / 3;
        if(B[k] != expected)
          return false;
    }

    return true;
}

/*
 * Writes one line to standard output: the label, " = |", the n elements of A
 * each followed by a tab, and a closing "|".
 */
__host__ void print_vector(const char* lbl, int* A, int n)
{
    cout << lbl << " = |\t";

    int idx = 0;
    while(idx < n)
    {
        cout << A[idx] << "\t";
        ++idx;
    }

    cout << "|\n";
}

In [None]:
# Source file written by the previous cell and the name of the executable to build.
filepath = "jan_2022.cu"  #@param { type: "string" }
compiled_filepath = "jan_2022"  #@param { type: "string" }

# Compile with nvcc, generating code for the compute_37 / sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments forwarded to the executable (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the freshly built program.
!./$compiled_filepath $argv

# Jun 2 2021
# Ovo je takav overkill, ali ja sam Stefan :D

In [None]:
%%writefile jun2_2021.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;

#define BLOCK_SIZE 32

// Allocates *A with room for n ints and fills it with the values 1..n.
__host__ void initialize_vector(int** A, int n);
// Copies the n x m matrices A and B to the device, runs both kernels, and copies n results into v.
__host__ void operate_on_GPU(int* A, int* B, int* v, int n, int m);
// Stage 1: for each row, writes one partial sum of A + B per block column into v.
__global__ void kernel_sum(int* A, int* B, int* v, int n, int m);
// Stage 2: sums m values per row of v; when launched with one block column it also divides by el_numb.
__global__ void kernel_reduce(int* v, int n, int m, int el_numb);
// Recomputes (sum of row i of A + B) / m on the host and compares it with v[i].
__host__ bool check_result(int*A, int*B, int *v, int n, int m);
// Prints lbl followed by the n elements of A, separated by tabs.
__host__ void print_vector(const char* lbl, int* A, int n);

/*
 * Entry point: builds two n x m matrices (both filled with 1..n*m), computes
 * for every row the integer average of the element-wise sum A + B on the GPU,
 * prints the n results and verifies them against a host-side recomputation.
 *
 * Returns 0 after a normal run and EXIT_FAILURE if a host buffer could not be
 * allocated.
 */
int main(int argc, char** argv)
{
    int *A = NULL, *B = NULL, *v = NULL;
    const int n = 221;   // rows
    const int m = 123;   // columns

    initialize_vector(&A, n * m);
    initialize_vector(&B, n * m);

    // calloc so that v holds defined values even if the GPU path reports an
    // error and never writes the result back.
    v = (int*) calloc(n, sizeof(int));
    if(A == NULL || B == NULL || v == NULL)
    {
      cout << "Host allocation failed" << endl;
      free(v);
      free(B);
      free(A);
      return EXIT_FAILURE;
    }

    operate_on_GPU(A, B, v, n, m);

    print_vector("v", v, n);

    if(check_result(A, B, v, n, m))
      cout << "Correct!" << endl;
    else
      cout << "False!" << endl;

    // Release the host buffers (previously leaked).
    free(v);
    free(B);
    free(A);
    return 0;
}


/*
 * Allocates a host buffer of n ints and fills it with 1, 2, ..., n.
 *
 * A  - out parameter; receives the malloc'ed buffer (caller frees it).
 *      Set to NULL when n <= 0, because there is nothing to store.
 * n  - number of elements (callers pass rows * columns for a matrix).
 *
 * Terminates the program with a message on stderr if the allocation fails,
 * instead of writing through a NULL pointer.
 */
__host__ void initialize_vector(int** A, int n)
{
    if(n <= 0)
    {
      // A negative n would wrap to a huge size_t in the size computation.
      *A = NULL;
      return;
    }

    int* buffer = (int*)malloc(sizeof(int) * (size_t)n);
    if(buffer == NULL)
    {
      fprintf(stderr, "initialize_vector: cannot allocate %d ints\n", n);
      exit(EXIT_FAILURE);
    }

    for(int i = 0; i < n; i++)
      buffer[i] = i + 1;

    *A = buffer;
}

/* Prints "<tag> <CUDA error string>" and returns true when err signals a failure. */
static bool cuda_failed(const char* tag, cudaError_t err)
{
    if(err == cudaSuccess)
      return false;
    cout << tag << " " << cudaGetErrorString(err) << endl;
    return true;
}

/*
 * Computes v[i] = (sum over row i of A + B) / m on the GPU.
 *
 * A, B - host n x m matrices in row-major order.
 * v    - host output with room for n ints.
 * n, m - number of rows / columns; nothing is done unless both are positive.
 *
 * Stage 1 (kernel_sum) leaves grid_size.x partial sums per row in dev_v;
 * stage 2 (kernel_reduce) folds them and divides by m.
 *
 * The grid uses exactly ONE block column. kernel_reduce works in place on
 * dev_v: with k partial sums per row it reads v[row * k + j] and writes
 * v[row]. For k > 1 the output of one row lands inside the still-unread input
 * of another row that may be handled by a different warp or block, which is a
 * data race. The previous code also used k as blockDim.x for stage 2, which
 * drops partial sums when k is not a power of two and exceeds the shared
 * buffer / block-size limit when k > BLOCK_SIZE. With k == 1 each thread of
 * stage 2 touches only its own v[row], and kernel_sum's own column loop still
 * covers all m columns.
 *
 * Every CUDA call is checked. After the first failure the remaining steps are
 * skipped (v is left untouched) and the device buffers are still released.
 */
__host__ void operate_on_GPU(int* A, int* B, int* v, int n, int m)
{
    if(n <= 0 || m <= 0)
      return;

    int *dev_A = NULL, *dev_B = NULL, *dev_v = NULL;

    dim3 grid_size(1, min(256, n / BLOCK_SIZE + 1)),
        block_size(BLOCK_SIZE / 2, BLOCK_SIZE);

    const size_t full = sizeof(int) * n * m,
                 part = sizeof(int) * n * grid_size.x;

    bool ok = !cuda_failed("1A", cudaMalloc(&dev_A, full))
           && !cuda_failed("1B", cudaMalloc(&dev_B, full))
           && !cuda_failed("1v", cudaMalloc(&dev_v, part))
           && !cuda_failed("2A", cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice))
           && !cuda_failed("2B", cudaMemcpy(dev_B, B, full, cudaMemcpyHostToDevice));

    if(ok)
    {
      kernel_sum<<<grid_size, block_size>>>(dev_A, dev_B, dev_v, n, m);
      // A launch does not return a status; a bad configuration only shows up here.
      ok = !cuda_failed("K1", cudaGetLastError());
    }

    if(ok)
    {
      // Stage 2: one thread per partial sum of a row, a single block column.
      block_size.x = grid_size.x;
      grid_size.x = 1;
      kernel_reduce<<<grid_size, block_size>>>(dev_v, n, block_size.x, m);
      ok = !cuda_failed("K2", cudaGetLastError());
    }

    // The blocking copy also reports errors raised while the kernels were running.
    if(ok)
      cuda_failed("3", cudaMemcpy(v, dev_v, sizeof(int) * n, cudaMemcpyDeviceToHost));

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    cudaFree(dev_v);
    cudaFree(dev_B);
    cudaFree(dev_A);
}

/*
 * Stage 1 of the row average: for every row of the n x m row-major matrices
 * A and B, each block column produces one partial sum of A + B and stores it
 * at v[row * gridDim.x + blockIdx.x].
 *
 * Launch layout: 2-D grid of 2-D blocks; threadIdx.y selects the row inside
 * the block, threadIdx.x the column lane. Requires blockDim.x <= BLOCK_SIZE
 * and blockDim.y <= BLOCK_SIZE (static shared buffer sh[BLOCK_SIZE][BLOCK_SIZE]).
 * Each lane handles column col and col + blockDim.x, then advances by
 * 2 * blockDim.x * gridDim.x; rows advance by blockDim.y * gridDim.y.
 * v must hold n * gridDim.x ints.
 *
 * The row loop runs on the block's base row, which is identical for all
 * threads of a block, and the reduction loop runs on a block-uniform width,
 * so every thread reaches every __syncthreads(). Previously the barrier sat
 * inside loops whose trip count depended on threadIdx. The reduction also
 * handles a blockDim.x that is not a power of two.
 */
__global__ void kernel_sum(int* A, int* B, int* v, int n, int m)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x,
              ty = threadIdx.y;
    const int first_col  = 2 * blockIdx.x * blockDim.x + threadIdx.x,
              col_stride = 2 * blockDim.x * gridDim.x,
              row_stride = blockDim.y * gridDim.y;

    for(int row_base = blockIdx.y * blockDim.y; row_base < n; row_base += row_stride)
    {
        const int row = row_base + ty;

        // Accumulate in a register; rows past the end contribute 0 but still
        // take part in the barriers below.
        int acc = 0;
        if(row < n)
        {
          for(int col = first_col; col < m; col += col_stride)
          {
            acc += A[row * m + col] + B[row * m + col];
            if(col + blockDim.x < m)
              acc += A[row * m + col + blockDim.x] + B[row * m + col + blockDim.x];
          }
        }
        sh[ty][tx] = acc;
        __syncthreads();

        // Tree reduction along x: fold the upper part of the active range onto
        // the lower part until one value is left in sh[ty][0].
        for(int width = blockDim.x; width > 1; )
        {
          const int half = (width + 1) >> 1;   // rounds up for odd widths
          if(tx + half < width)
            sh[ty][tx] += sh[ty][tx + half];
          __syncthreads();
          width = half;
        }

        if(tx == 0 && row < n)
          v[row * gridDim.x + blockIdx.x] = sh[ty][0];
    }
}

/*
 * Stage 2 of the row average: for every row, sums the m values
 * v[row * m .. row * m + m - 1] handled by this block column and stores the
 * result at v[row * gridDim.x + blockIdx.x]. When the grid has a single block
 * column (gridDim.x == 1) the stored value is additionally divided by el_numb.
 *
 * Launch layout: 2-D grid of 2-D blocks; threadIdx.y selects the row inside
 * the block, threadIdx.x the lane. Requires blockDim.x <= BLOCK_SIZE and
 * blockDim.y <= BLOCK_SIZE (static shared buffer sh[BLOCK_SIZE][BLOCK_SIZE]).
 *
 * The row loop runs on the block's base row and the reduction on a
 * block-uniform width, so every thread reaches every __syncthreads().
 * Previously the barrier sat inside loops whose trip count depended on
 * threadIdx. The reduction also handles a blockDim.x that is not a power of
 * two.
 *
 * NOTE(review): input and output share the buffer v. With gridDim.x == 1 the
 * result of row r is written to v[r], which lies inside the input range of
 * row r / m whenever m > 1. Inside one block the barrier after the load phase
 * orders all reads of a pass before its writes, but nothing orders different
 * blocks (or later passes of other blocks), so the in-place use looks safe
 * only when m == gridDim.x (for example m == 1). Confirm at the launch site.
 */
__global__ void kernel_reduce(int* v, int n, int m, int el_numb)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    const int tx = threadIdx.x,
              ty = threadIdx.y;
    const int first_col  = 2 * blockIdx.x * blockDim.x + threadIdx.x,
              col_stride = 2 * blockDim.x * gridDim.x,
              row_stride = blockDim.y * gridDim.y;

    for(int row_base = blockIdx.y * blockDim.y; row_base < n; row_base += row_stride)
    {
        const int row = row_base + ty;

        // Accumulate in a register; rows past the end contribute 0 but still
        // take part in the barriers below.
        int acc = 0;
        if(row < n)
        {
          for(int col = first_col; col < m; col += col_stride)
          {
            acc += v[row * m + col];
            if(col + blockDim.x < m)
              acc += v[row * m + col + blockDim.x];
          }
        }
        sh[ty][tx] = acc;
        __syncthreads();

        // Tree reduction along x: fold the upper part of the active range onto
        // the lower part until one value is left in sh[ty][0].
        for(int width = blockDim.x; width > 1; )
        {
          const int half = (width + 1) >> 1;   // rounds up for odd widths
          if(tx + half < width)
            sh[ty][tx] += sh[ty][tx + half];
          __syncthreads();
          width = half;
        }

        if(tx == 0 && row < n)
        {
          int total = sh[ty][0];
          // Only a single block column holds the complete row sum.
          if(gridDim.x == 1)
            total /= el_numb;
          v[row * gridDim.x + blockIdx.x] = total;
        }
    }
}

/*
 * Host-side verification: returns true when, for every row i of the n x m
 * row-major matrices, v[i] equals the integer quotient of the row sum of
 * A + B and m. Stops at the first mismatch; zero rows count as correct.
 */
__host__ bool check_result(int*A, int*B, int*v, int n, int m)
{
    for(int row = 0; row < n; ++row)
    {
        const int* a_row = A + row * m;
        const int* b_row = B + row * m;

        int total = 0;
        for(int col = 0; col < m; ++col)
          total += a_row[col] + b_row[col];

        if(v[row] != total / m)
          return false;
    }

    return true;
}

/*
 * Writes one line to standard output: the label, " = |", the n elements of A
 * each followed by a tab, and a closing "|".
 */
__host__ void print_vector(const char* lbl, int* A, int n)
{
    cout << lbl << " = |\t";

    for(const int* cursor = A; cursor - A < n; ++cursor)
      cout << *cursor << "\t";

    cout << "|\n";
}

In [None]:
# Source file written by the previous cell and the name of the executable to build.
filepath = "jun2_2021.cu"  #@param { type: "string" }
compiled_filepath = "jun2_2021"  #@param { type: "string" }

# Compile with nvcc, generating code for the compute_37 / sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments forwarded to the executable (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the freshly built program.
!./$compiled_filepath $argv