# Januar 2022

In [None]:
%%writefile jan_2022.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;

#define BLOCK_SIZE 256

__host__ void initialize_vector(int** A, int n);
__host__ void operate_on_GPU(int* A, int* B, int n);
__global__ void kernel(int* A, int* B, int n);
__host__ bool check_result(int*A, int*B, int n);
__host__ void print_vector(const char* lbl, int* A, int n);

// Test driver: for every vector length n = 255, 258, ... below 3245 it builds
// the input on the host, runs the GPU computation and prints whether the
// device result agrees with the host reference.
int main(int argc, char** argv)
{
    int* input = NULL;
    int* output = NULL;

    int n = 255;
    while (n < 3245)
    {
        initialize_vector(&input, n);
        // The result has one entry per complete 3-element window.
        output = (int*) malloc(sizeof(int) * (n - 2));

        operate_on_GPU(input, output, n);

        const bool matches = check_result(input, output, n);
        cout << n << (matches ? " Correct!" : " False!") << endl;

        free(output);
        free(input);

        n += 3;
    }

    return 0;
}


// Allocates a host array of n ints with malloc, stores its address in *A and
// fills it with 1, 2, ..., n. The caller releases it with free().
__host__ void initialize_vector(int** A, int n)
{
    int* data = (int*) malloc(sizeof(int) * n);
    *A = data;

    for (int idx = 0; idx < n; ++idx)
        data[idx] = idx + 1;
}

// Prints `tag` followed by the CUDA error text when err is not cudaSuccess.
// Returns true when the call succeeded.
static bool cuda_ok(const char* tag, cudaError_t err)
{
    if(err != cudaSuccess)
      cout << tag << cudaGetErrorString(err) << endl;
    return err == cudaSuccess;
}

// Runs `kernel` on the device: copies the n ints of A to the GPU, launches the
// kernel and copies the n - 2 results back into B.
//
// A - host input,  n ints
// B - host output, n - 2 ints (left untouched when n < 3 or a CUDA call fails)
// n - number of input elements
//
// Every failing CUDA call is reported on cout with a short step label and the
// remaining steps are skipped; device buffers are always released.
__host__ void operate_on_GPU(int* A, int* B, int n)
{
    // With fewer than three elements there is no output, and
    // sizeof(int) * (n - 2) would wrap around to a huge size_t for n < 2.
    if(n < 3)
      return;

    // NULL-initialised so cudaFree() below is a no-op for buffers that were
    // never allocated.
    int *dev_A = NULL, *dev_B = NULL;
    size_t full = sizeof(int) * n,
            part = sizeof(int) * (n - 2);

    bool ok = cuda_ok("1A ", cudaMalloc(&dev_A, full))
           && cuda_ok("1B ", cudaMalloc(&dev_B, part))
           && cuda_ok("2 ", cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice));

    if(ok)
    {
        kernel<<<min(256, n / BLOCK_SIZE + 1), BLOCK_SIZE>>>(dev_A, dev_B, n);

        // A launch does not return an error itself; an invalid configuration
        // is only visible through cudaGetLastError().
        ok = cuda_ok("K ", cudaGetLastError());
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if(ok)
      cuda_ok("3 ", cudaMemcpy(B, dev_B, part, cudaMemcpyDeviceToHost));

    cudaFree(dev_B);
    cudaFree(dev_A);
}

// Three-point moving average with integer division:
//   B[i] = (A[i] + A[i + 1] + A[i + 2]) / 3   for every i in [0, n - 2).
//
// Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE. Any grid
// size is valid; each block advances by blockDim.x * gridDim.x per iteration.
// Shared memory: BLOCK_SIZE ints (static).
__global__ void kernel(int* A, int* B, int n)
{
    __shared__ int sh[BLOCK_SIZE];

    const int stride = blockDim.x * gridDim.x;

    // The loop runs on the block's first index, which is identical for all
    // threads of the block, so every thread executes the same number of
    // iterations and reaches every __syncthreads() below.
    for(int base = blockIdx.x * blockDim.x; base < n - 2; base += stride)
    {
        const int tid = base + threadIdx.x;

        // sh[t] caches the middle element of thread t's window.
        if(tid < n - 1)
          sh[threadIdx.x] = A[tid + 1];
        __syncthreads();

        if(tid < n - 2)
        {
          // The left neighbour of the first thread and the right neighbour of
          // the last thread belong to other tiles, so they come from global
          // memory instead of the shared tile.
          const int left = (threadIdx.x == 0) ? A[tid] : sh[threadIdx.x - 1];
          const int right = (threadIdx.x == blockDim.x - 1) ? A[tid + 2] : sh[threadIdx.x + 1];

          B[tid] = (left + sh[threadIdx.x] + right) / 3;
        }

        // Nobody may overwrite the tile before every thread has finished
        // reading it.
        __syncthreads();
    }
}

// Host reference check: returns true when B[i] equals the integer average
// (A[i] + A[i + 1] + A[i + 2]) / 3 for every i in [0, n - 2), false at the
// first mismatch.
__host__ bool check_result(int*A, int*B, int n)
{
    for (int i = 0; i < n - 2; ++i)
    {
        const int expected = (A[i] + A[i + 1] + A[i + 2]) / 3;
        if (B[i] != expected)
            return false;
    }
    return true;
}

// Writes "<lbl> = |", the n elements of A separated by tabs and a closing
// "|" followed by a newline to cout.
__host__ void print_vector(const char* lbl, int* A, int n)
{
    cout << lbl << " = |\t";

    for (int i = 0; i < n; ++i)
        cout << A[i] << "\t";

    cout << "|\n";
}

In [None]:
# Notebook form fields: the CUDA source written by the %%writefile cell above
# and the name of the executable to produce.
filepath = "jan_2022.cu"  #@param { type: "string" }
compiled_filepath = "jan_2022"  #@param { type: "string" }

# Compile with nvcc, generating code for the sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments handed to the program (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the compiled executable.
!./$compiled_filepath $argv

# Jun 2 2021
# Ovo je takav overkill

In [None]:
%%writefile jun2_2021.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;

#define BLOCK_SIZE 32

__host__ void initialize_vector(int** A, int n);
__host__ void operate_on_GPU(int* A, int* B, int* v, int n, int m);
__global__ void kernel_sum(int* A, int* B, int* v, int n, int m);
__global__ void kernel_reduce(int* v, int n, int m, int el_numb);
__host__ bool check_result(int*A, int*B, int *v, int n, int m);
__host__ void print_vector(const char* lbl, int* A, int n);

// Test driver: keeps the row count fixed at 123 and sweeps the row length m
// over 1, 20, 39, ... below 65432. For each size it builds two n x m inputs,
// runs the GPU computation and prints whether the result matches the host
// reference.
int main(int argc, char** argv)
{
    const int n = 123;

    int m = 1;
    while (m < 65432)
    {
        int* A = NULL;
        int* B = NULL;

        initialize_vector(&A, n * m);
        initialize_vector(&B, n * m);
        // One result per row.
        int* v = (int*) malloc(sizeof(int) * n);

        operate_on_GPU(A, B, v, n, m);

        const char* verdict = check_result(A, B, v, n, m) ? ") Correct!" : ") False!";
        cout << "(" << n << ", " << m << verdict << endl;

        free(v);
        free(B);
        free(A);

        m += 19;
    }

    return 0;
}


// Allocates n ints on the host with malloc, publishes the buffer through *A
// and writes the sequence 1, 2, ..., n into it. Ownership passes to the
// caller, who must free() it.
__host__ void initialize_vector(int** A, int n)
{
    int* buffer = (int*) malloc(sizeof(int) * n);
    *A = buffer;

    int k = 0;
    while (k < n)
    {
        buffer[k] = k + 1;
        ++k;
    }
}

// Prints `tag` followed by the CUDA error text when err is not cudaSuccess.
// Returns true when the call succeeded.
static bool cuda_ok(const char* tag, cudaError_t err)
{
    if(err != cudaSuccess)
      cout << tag << cudaGetErrorString(err) << endl;
    return err == cudaSuccess;
}

// Computes, on the device, one value per row of the n x m row-major inputs:
// kernel_sum produces per-block partial sums of A + B for each row, and
// kernel_reduce folds them into a single row sum divided by m.
//
// A, B - host inputs, n * m ints each
// v    - host output, n ints (left untouched when a CUDA call fails)
// n, m - number of rows / elements per row
//
// Every failing CUDA call is reported on cout with a short step label and the
// remaining steps are skipped; device buffers are always released.
__host__ void operate_on_GPU(int* A, int* B, int* v, int n, int m)
{
    // NULL-initialised so cudaFree() below is a no-op for buffers that were
    // never allocated.
    int *dev_A = NULL, *dev_B = NULL, *dev_v = NULL;

    dim3 grid_size(min(256, (m / BLOCK_SIZE) + 1), min(256, (n / BLOCK_SIZE) + 1)),
        block_size(BLOCK_SIZE, BLOCK_SIZE);

    // dev_v holds grid_size.x partial sums for each of the n rows.
    size_t full = sizeof(int) * n * m,
            part = sizeof(int) * n * grid_size.x;

    bool ok = cuda_ok("1A ", cudaMalloc(&dev_A, full))
           && cuda_ok("1B ", cudaMalloc(&dev_B, full))
           && cuda_ok("1v ", cudaMalloc(&dev_v, part))
           && cuda_ok("2A ", cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice))
           && cuda_ok("2B ", cudaMemcpy(dev_B, B, full, cudaMemcpyHostToDevice));

    if(ok)
    {
        kernel_sum<<<grid_size, block_size>>>(dev_A, dev_B, dev_v, n, m);
        // Launch-configuration errors are only visible via cudaGetLastError().
        ok = cuda_ok("K1 ", cudaGetLastError());
    }

    if(ok)
    {
        int partials_per_row = grid_size.x;

        // kernel_reduce overwrites dev_v in place: the result of row r goes to
        // dev_v[r], a slot that holds a partial sum of row r / partials_per_row.
        // With several blocks one block could store its result before another
        // block has read that slot. A single block handles the rows in
        // increasing order and reads a batch of rows before storing its
        // results (assuming the kernel's barriers separate the two phases),
        // so every slot is consumed before it can be overwritten.
        kernel_reduce<<<dim3(1, 1), block_size>>>(dev_v, n, partials_per_row, m);
        ok = cuda_ok("K2 ", cudaGetLastError());
    }

    // The blocking copy also surfaces errors raised while the kernels ran.
    if(ok)
      cuda_ok("3 ", cudaMemcpy(v, dev_v, sizeof(int) * n, cudaMemcpyDeviceToHost));

    cudaFree(dev_v);
    cudaFree(dev_B);
    cudaFree(dev_A);
}

// Per-block partial row sums of A + B for n x m row-major matrices:
//   v[row * gridDim.x + blockIdx.x] = sum of A[row][c] + B[row][c] over the
//   columns c visited by block column blockIdx.x.
//
// Launch layout: 2D grid of 2D blocks, blockDim.x and blockDim.y at most
// BLOCK_SIZE, blockDim.x a power of two (required by the halving reduction).
// Any grid size is valid: columns are strided by 2 * blockDim.x * gridDim.x
// and rows by blockDim.y * gridDim.y.
// Shared memory: BLOCK_SIZE x BLOCK_SIZE ints (static).
// v must hold n * gridDim.x ints.
__global__ void kernel_sum(int* A, int* B, int* v, int n, int m)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    const int col_start = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    const int col_stride = 2 * blockDim.x * gridDim.x;
    const int row_stride = blockDim.y * gridDim.y;

    // Loop on the block's first row so that all threads of the block run the
    // same number of iterations and reach every __syncthreads() below.
    for(int row_base = blockIdx.y * blockDim.y; row_base < n; row_base += row_stride)
    {
        const int row = row_base + threadIdx.y;

        // Each thread accumulates two columns, blockDim.x apart, per step.
        int acc = 0;
        if(row < n)
        {
          for(int col = col_start; col < m; col += col_stride)
          {
            acc += A[row * m + col] + B[row * m + col];
            if(col + blockDim.x < m)
              acc += A[row * m + col + blockDim.x] + B[row * m + col + blockDim.x];
          }
        }

        sh[threadIdx.y][threadIdx.x] = acc;
        __syncthreads();

        // Halving reduction along x; the barrier is outside the thread-
        // dependent branch so every thread executes it.
        for(int i = blockDim.x >> 1; i > 0; i >>= 1)
        {
          if(threadIdx.x < i)
            sh[threadIdx.y][threadIdx.x] += sh[threadIdx.y][threadIdx.x + i];
          __syncthreads();
        }

        if(row < n && threadIdx.x == 0)
          v[row * gridDim.x + blockIdx.x] = sh[threadIdx.y][0];
    }
}

// Reduces, in place, the m partial sums stored per row in v (row-major,
// n rows of m entries) to v[row * gridDim.x + blockIdx.x]. When the grid has a
// single block column (gridDim.x == 1) the row total is additionally divided
// by el_numb, so v[row] ends up holding total / el_numb.
//
// Launch layout: 2D blocks, blockDim.x and blockDim.y at most BLOCK_SIZE,
// blockDim.x a power of two. Rows are strided by blockDim.y * gridDim.y.
// Shared memory: BLOCK_SIZE x BLOCK_SIZE ints (static).
//
// Because input and output share the buffer, a result slot can coincide with
// an input slot of a different row. Inside one block all reads of an
// iteration are separated from the writes by barriers; between different
// blocks there is no such ordering, so launching more than one block is only
// safe when the written slots do not overlap unread input (e.g. m == 1).
__global__ void kernel_reduce(int* v, int n, int m, int el_numb)
{
    __shared__ int sh[BLOCK_SIZE][BLOCK_SIZE];

    const int col_start = 2 * blockIdx.x * blockDim.x + threadIdx.x;
    const int col_stride = 2 * blockDim.x * gridDim.x;
    const int row_stride = blockDim.y * gridDim.y;

    // Loop on the block's first row so that all threads of the block run the
    // same number of iterations and reach every __syncthreads() below.
    for(int row_base = blockIdx.y * blockDim.y; row_base < n; row_base += row_stride)
    {
        const int row = row_base + threadIdx.y;

        int acc = 0;
        if(row < n)
        {
          for(int col = col_start; col < m; col += col_stride)
          {
            acc += v[row * m + col];
            if(col + blockDim.x < m)
              acc += v[row * m + col + blockDim.x];
          }
        }

        sh[threadIdx.y][threadIdx.x] = acc;
        // Also guarantees that every row of this block has finished reading v
        // before any thread of the block writes a result into it.
        __syncthreads();

        // Halving reduction along x; the barrier is outside the thread-
        // dependent branch so every thread executes it.
        for(int i = blockDim.x >> 1; i > 0; i >>= 1)
        {
          if(threadIdx.x < i)
            sh[threadIdx.y][threadIdx.x] += sh[threadIdx.y][threadIdx.x + i];
          __syncthreads();
        }

        if(row < n && threadIdx.x == 0)
        {
          int total = sh[threadIdx.y][0];
          // Only the final pass (one block column) turns the sum into the
          // quotient.
          if(gridDim.x == 1)
            total /= el_numb;
          v[row * gridDim.x + blockIdx.x] = total;
        }
    }
}

// Host reference check for n x m row-major inputs: returns true when, for
// every row, v[row] equals (sum of A[row][j] + B[row][j] over j) / m using
// integer division; returns false at the first differing row.
__host__ bool check_result(int*A, int*B, int*v, int n, int m)
{
    for (int row = 0; row < n; ++row)
    {
        const int* a_row = A + row * m;
        const int* b_row = B + row * m;

        // NOTE(review): the row total is a plain int; for long rows with large
        // values it may exceed the int range - confirm that this is meant to
        // mirror the device arithmetic.
        int total = 0;
        for (int col = 0; col < m; ++col)
            total += a_row[col] + b_row[col];

        if (v[row] != total / m)
            return false;
    }
    return true;
}

// Prints the label, then the n entries of A each followed by a tab, framed by
// '|' characters, and ends the line.
__host__ void print_vector(const char* lbl, int* A, int n)
{
    cout << lbl << " = |\t";

    int i = 0;
    while (i < n)
    {
        cout << A[i] << "\t";
        ++i;
    }

    cout << "|\n";
}

In [None]:
# Notebook form fields: the CUDA source written by the %%writefile cell above
# and the name of the executable to produce.
filepath = "jun2_2021.cu"  #@param { type: "string" }
compiled_filepath = "jun2_2021"  #@param { type: "string" }

# Compile with nvcc, generating code for the sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments handed to the program (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the compiled executable.
!./$compiled_filepath $argv

# Jun 2021

In [None]:
%%writefile jun_2021.cu

#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#define BLOCK_SIZE 256

__host__ void initialize_vector(float** v, int n);
__host__ void print_vector(const char* lbl, float* v, int n);
__host__ void operate_on_GPU(float* v1, float* v2, float* v3, int n, float* avg, float* deviation);
__global__ void vector_sum(float* v1, float* v2, float* v3, int n);
__global__ void reduce_sum(float* v, float* sum, int n);
__global__ void vector_transform(float *v, int n, float avg);
__host__ bool check_result(float *v1, float* v2, float* v3, int n, float avg, float deviation);

// Test driver: for every length n = 1, 4, 7, ... below 65432 it builds two
// input vectors, lets the GPU compute their sum, its average and its
// deviation, and prints whether the results match the host reference.
int main(int argc, char** argv)
{
    int n = 1;
    while (n < 65432)
    {
        float* v1 = NULL;
        float* v2 = NULL;
        float avg, deviation;

        initialize_vector(&v1, n);
        initialize_vector(&v2, n);
        float* v3 = (float*) malloc(sizeof(float) * n);

        operate_on_GPU(v1, v2, v3, n, &avg, &deviation);

        const bool matches = check_result(v1, v2, v3, n, avg, deviation);
        if (!matches)
            printf("(%d) False!\n", n);
        else
            printf("(%d) Correct!\n", n);

        free(v3);
        free(v2);
        free(v1);

        n += 3;
    }

    return 0;
}

// Allocates a host array of n floats with malloc, stores its address in *v
// and fills it with 0, 1, ..., n - 1. The caller releases it with free().
__host__ void initialize_vector(float** v, int n)
{
    float* data = (float*) malloc(sizeof(float) * n);
    *v = data;

    for (int k = 0; k < n; ++k)
        data[k] = (float) k;
}

// Prints "<lbl> = |", the n values of v in %f format separated by tabs, and a
// closing "|" followed by a newline to stdout.
__host__ void print_vector(const char* lbl, float* v, int n)
{
    printf("%s = |\t", lbl);

    int k = 0;
    while (k < n)
    {
        printf("%f\t", v[k]);
        ++k;
    }

    printf("|\n");
}

// Prints `tag` followed by the CUDA error text to stderr when err is not
// cudaSuccess. Returns true when the call succeeded.
static bool cuda_ok(const char* tag, cudaError_t err)
{
    if(err != cudaSuccess)
      fprintf(stderr, "%s%s\n", tag, cudaGetErrorString(err));
    return err == cudaSuccess;
}

// Computes on the device:
//   v3        = v1 + v2 (element-wise, copied back to the host),
//   *avg      = (sum of v3) / n,
//   *deviation = sqrt((sum of (v3[i] - *avg)^2) / n).
//
// v1, v2    - host inputs, n floats each
// v3        - host output, n floats
// n         - number of elements
// avg, deviation - host outputs; both are set to NAN up front and stay NAN
//                  for whichever result could not be computed (n <= 0 or a
//                  failing CUDA call), so a failed run cannot look valid.
//
// Every failing CUDA call is reported on stderr and the remaining steps are
// skipped; device buffers are always released.
__host__ void operate_on_GPU(float* v1, float* v2, float* v3, int n, float* avg, float* deviation)
{
    *avg = NAN;
    *deviation = NAN;

    // Nothing to average; also avoids dividing by zero below.
    if(n <= 0)
      return;

    // NULL-initialised so cudaFree() below is a no-op for buffers that were
    // never allocated.
    float *dev_v1 = NULL, *dev_v2 = NULL, *dev_v3 = NULL, *dev_sum = NULL, *dev_sum_2 = NULL;

    int grid_size = min(256, (n / BLOCK_SIZE) + 1);
    // dev_sum holds one partial sum per block of the first reduction pass.
    size_t full = sizeof(float) * n,
            part = sizeof(float) * grid_size;

    float total = 0.0f;
    float squares = 0.0f;

    bool ok = cuda_ok("malloc v1: ", cudaMalloc(&dev_v1, full))
           && cuda_ok("malloc v2: ", cudaMalloc(&dev_v2, full))
           && cuda_ok("malloc v3: ", cudaMalloc(&dev_v3, full))
           && cuda_ok("malloc sum: ", cudaMalloc(&dev_sum, part))
           && cuda_ok("malloc sum_2: ", cudaMalloc(&dev_sum_2, sizeof(float)))
           && cuda_ok("copy v1: ", cudaMemcpy(dev_v1, v1, full, cudaMemcpyHostToDevice))
           && cuda_ok("copy v2: ", cudaMemcpy(dev_v2, v2, full, cudaMemcpyHostToDevice));

    if(ok)
    {
        vector_sum<<<grid_size, BLOCK_SIZE>>>(dev_v1, dev_v2, dev_v3, n);
        // Launch errors are only visible via cudaGetLastError(); the blocking
        // copy reports errors raised while the kernel ran.
        ok = cuda_ok("launch vector_sum: ", cudaGetLastError())
          && cuda_ok("copy v3: ", cudaMemcpy(v3, dev_v3, full, cudaMemcpyDeviceToHost));
    }

    if(ok)
    {
        // Two-pass reduction: per-block partial sums, then one block folds
        // the grid_size partials into a single value.
        reduce_sum<<<grid_size, BLOCK_SIZE>>>(dev_v3, dev_sum, n);
        reduce_sum<<<1, BLOCK_SIZE>>>(dev_sum, dev_sum_2, grid_size);
        ok = cuda_ok("launch reduce_sum: ", cudaGetLastError())
          && cuda_ok("copy sum: ", cudaMemcpy(&total, dev_sum_2, sizeof(float), cudaMemcpyDeviceToHost));
    }

    if(ok)
    {
        (*avg) = total / n;

        // Replace v3 on the device by the squared distances from the average
        // and reduce them the same way.
        vector_transform<<<grid_size, BLOCK_SIZE>>>(dev_v3, n, *avg);
        reduce_sum<<<grid_size, BLOCK_SIZE>>>(dev_v3, dev_sum, n);
        reduce_sum<<<1, BLOCK_SIZE>>>(dev_sum, dev_sum_2, grid_size);
        ok = cuda_ok("launch deviation: ", cudaGetLastError())
          && cuda_ok("copy deviation: ", cudaMemcpy(&squares, dev_sum_2, sizeof(float), cudaMemcpyDeviceToHost));
    }

    if(ok)
      *deviation = sqrt(1.0 / n * squares);

    cudaFree(dev_sum_2);
    cudaFree(dev_sum);
    cudaFree(dev_v3);
    cudaFree(dev_v2);
    cudaFree(dev_v1);
}

// Element-wise sum: v3[i] = v1[i] + v2[i] for every i in [0, n).
// Each thread starts at its global 1D index and advances by the total number
// of launched threads (blockDim.x * gridDim.x) until it passes n.
__global__ void vector_sum(float* v1, float* v2, float* v3, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        v3[i] = v1[i] + v2[i];
}

// Block-wise sum reduction: sum[blockIdx.x] receives the sum of the elements
// of v[0 .. n) visited by block blockIdx.x.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE and
// blockDim.x a power of two (required by the halving reduction). Each thread
// adds two elements, blockDim.x apart, per step and advances by
// 2 * blockDim.x * gridDim.x.
// Shared memory: BLOCK_SIZE floats (static).
// sum must hold gridDim.x floats.
__global__ void reduce_sum(float* v, float* sum, int n)
{
    __shared__ float sh[BLOCK_SIZE];

    const int stride = 2 * blockDim.x * gridDim.x;

    float acc = 0;
    for(int tid = 2 * blockIdx.x * blockDim.x + threadIdx.x; tid < n; tid += stride)
    {
        acc += v[tid];
        if(tid + blockDim.x < n)
          acc += v[tid + blockDim.x];
    }

    sh[threadIdx.x] = acc;
    __syncthreads();

    // Halving reduction. The barrier sits outside the thread-dependent branch
    // so that every thread of the block executes it the same number of times.
    for(int i = blockDim.x >> 1; i > 0; i >>= 1)
    {
      if(threadIdx.x < i)
        sh[threadIdx.x] += sh[threadIdx.x + i];
      __syncthreads();
    }

    if(threadIdx.x == 0) {
      sum[blockIdx.x] = sh[0];
    }
}

// In-place transform: v[i] = (v[i] - avg)^2 for every i in [0, n).
// Each thread starts at its global 1D index and advances by the total number
// of launched threads (blockDim.x * gridDim.x) until it passes n.
__global__ void vector_transform(float *v, int n, float avg)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        const float value = v[i];
        v[i] = (value - avg) * (value - avg);
    }
}

// Host reference check for the GPU results.
//
// Returns true only when all of the following hold:
//   - v3[i] == v1[i] + v2[i] for every i in [0, n),
//   - avg is within 1 of the host average of v1 + v2,
//   - deviation is within 1 of sqrt(sum((v1[i] + v2[i] - avg)^2) / n).
// The deviation reference deliberately uses the avg value passed in (already
// accepted by the previous step) rather than the host average.
// Returns false for n <= 0 or when the scratch buffer cannot be allocated.
__host__ bool check_result(float *v1, float* v2, float* v3, int n, float avg, float deviation)
{
    if(n <= 0)
      return false;

    float *check_v3 = (float*) malloc(sizeof(float) * n);
    if(check_v3 == NULL)
      return false;

    bool c = true;
    float sum = 0;

    for(int i = 0; c && i < n; i++)
    {
      check_v3[i] = v1[i] + v2[i];
      c = (v3[i] == check_v3[i]);
      sum += check_v3[i];
    }

    // Only continue while everything so far matched: a wrong element must not
    // be masked by a passing average, and after an early exit check_v3 is
    // only partially filled.
    if(c)
    {
        float check_avg = sum / n;
        c = (fabs(check_avg - avg) <= 1);
    }

    if(c)
    {
        float dev = 0;
        for(int i = 0; i < n; i++)
          dev += (check_v3[i] - avg) * (check_v3[i] - avg);
        dev = sqrt(1.0 / n * dev);
        c = fabs(dev - deviation) <= 1;
    }

    free(check_v3);
    return c;
}

In [None]:
# Notebook form fields: the CUDA source written by the %%writefile cell above
# and the name of the executable to produce.
filepath = "jun_2021.cu"  #@param { type: "string" }
compiled_filepath = "jun_2021"  #@param { type: "string" }

# Compile with nvcc, generating code for the sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments handed to the program (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the compiled executable.
!./$compiled_filepath $argv

# Kolokvijum 2022

In [None]:
%%writefile kolokvijum_2022.cu

#include <stdio.h>
#include <stdlib.h>
#include <iostream>
using namespace std;
#include <math.h>

#define BLOCK_SIZE 256

__host__ void initialize_vector(int** A, int n);
__host__ void operate_on_GPU(int* A, int* B, float*C, int n, float p);
__global__ void kernel(int* A, int* B, float*C, int n, float p);
__host__ bool check_result(int*A, int*B, float*C, int n, float p);

// Test driver: with the weight p fixed at .3, for every length n = 255, 258,
// ... below 3245 it builds two input vectors, runs the GPU computation and
// prints whether the device result matches the host reference.
int main(int argc, char** argv)
{
    int* A = NULL;
    int* B = NULL;
    float* C = NULL;
    float p = .3;

    int n = 255;
    while (n < 3245)
    {
        initialize_vector(&A, n);
        initialize_vector(&B, n);
        // The result has one entry per complete 3-element window.
        C = (float*) malloc(sizeof(float) * (n - 2));

        operate_on_GPU(A, B, C, n, p);

        const bool matches = check_result(A, B, C, n, p);
        cout << n << (matches ? " Correct!" : " False!") << endl;

        free(C);
        free(B);
        free(A);

        n += 3;
    }

    return 0;
}


// Allocates a host array of n ints with malloc, stores its address in *A and
// fills it with the deterministic sequence 1, 2, ..., n. The caller releases
// it with free().
__host__ void initialize_vector(int** A, int n)
{
    int* data = (int*) malloc(sizeof(int) * n);
    *A = data;

    for (int idx = 0; idx < n; ++idx)
        data[idx] = idx + 1; // alternative kept from the original: rand() % 100
}

// Prints `tag` followed by the CUDA error text when err is not cudaSuccess.
// Returns true when the call succeeded.
static bool cuda_ok(const char* tag, cudaError_t err)
{
    if(err != cudaSuccess)
      cout << tag << cudaGetErrorString(err) << endl;
    return err == cudaSuccess;
}

// Runs `kernel` on the device: copies the n ints of A and B to the GPU,
// launches the kernel and copies the n - 2 float results back into C.
//
// A, B - host inputs, n ints each
// C    - host output, n - 2 floats (left untouched when n < 3 or a CUDA call
//        fails)
// n    - number of input elements
// p    - weight forwarded to the kernel
//
// Every failing CUDA call is reported on cout with a short step label and the
// remaining steps are skipped; device buffers are always released.
__host__ void operate_on_GPU(int* A, int* B, float* C, int n, float p)
{
    // With fewer than three elements there is no output, and
    // sizeof(float) * (n - 2) would wrap around to a huge size_t for n < 2.
    if(n < 3)
      return;

    // NULL-initialised so cudaFree() below is a no-op for buffers that were
    // never allocated.
    int *dev_A = NULL, *dev_B = NULL;
    float *dev_C = NULL;
    size_t full = sizeof(int) * n,
            part = sizeof(float) * (n - 2);

    // The copy of A is labelled "2A " (it used to report under "2B ", which
    // made it indistinguishable from the copy of B).
    bool ok = cuda_ok("1A ", cudaMalloc(&dev_A, full))
           && cuda_ok("1B ", cudaMalloc(&dev_B, full))
           && cuda_ok("1C ", cudaMalloc(&dev_C, part))
           && cuda_ok("2A ", cudaMemcpy(dev_A, A, full, cudaMemcpyHostToDevice))
           && cuda_ok("2B ", cudaMemcpy(dev_B, B, full, cudaMemcpyHostToDevice));

    if(ok)
    {
        kernel<<<min(256, (n / BLOCK_SIZE) + 1), BLOCK_SIZE>>>(dev_A, dev_B, dev_C, n, p);

        // A launch does not return an error itself; an invalid configuration
        // is only visible through cudaGetLastError().
        ok = cuda_ok("K ", cudaGetLastError());
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if(ok)
      cuda_ok("3 ", cudaMemcpy(C, dev_C, part, cudaMemcpyDeviceToHost));

    cudaFree(dev_C);
    cudaFree(dev_B);
    cudaFree(dev_A);
}

// Weighted three-element window sum:
//   C[i] = (A[i] + A[i+1] + A[i+2]) * p + (B[i] + B[i+1] + B[i+2]) * (1 - p)
// for every i in [0, n - 2).
//
// Launch layout: 1D grid of 1D blocks with blockDim.x <= BLOCK_SIZE. Any grid
// size is valid; each block advances by blockDim.x * gridDim.x per iteration.
// Shared memory: 2 * BLOCK_SIZE ints (static).
__global__ void kernel(int* A, int* B, float *C, int n, float p)
{
    __shared__ int shA[BLOCK_SIZE];
    __shared__ int shB[BLOCK_SIZE];

    const int stride = blockDim.x * gridDim.x;

    // The loop runs on the block's first index, which is identical for all
    // threads of the block, so every thread executes the same number of
    // iterations and reaches every __syncthreads() below.
    for(int base = blockIdx.x * blockDim.x; base < n - 2; base += stride)
    {
        const int tid = base + threadIdx.x;

        // Tile entry t holds element tid of thread t.
        if(tid < n)
        {
          shA[threadIdx.x] = A[tid];
          shB[threadIdx.x] = B[tid];
        }
        __syncthreads();

        if(tid < n - 2)
        {
          // Elements tid + 1 and tid + 2 are in the tile only while their
          // tile index stays below blockDim.x; the last two threads of the
          // block fetch the missing ones from global memory.
          const bool next1_in_tile = threadIdx.x + 1 < blockDim.x;
          const bool next2_in_tile = threadIdx.x + 2 < blockDim.x;

          const int a1 = next1_in_tile ? shA[threadIdx.x + 1] : A[tid + 1];
          const int a2 = next2_in_tile ? shA[threadIdx.x + 2] : A[tid + 2];
          const int b1 = next1_in_tile ? shB[threadIdx.x + 1] : B[tid + 1];
          const int b2 = next2_in_tile ? shB[threadIdx.x + 2] : B[tid + 2];

          C[tid] = (shA[threadIdx.x] + a1 + a2) * p + (shB[threadIdx.x] + b1 + b2) * (1 - p);
        }

        // Nobody may overwrite the tiles before every thread has finished
        // reading them.
        __syncthreads();
    }
}

// Host reference check: returns true when, for every i in [0, n - 2), C[i]
// matches (A[i] + A[i+1] + A[i+2]) * p + (B[i] + B[i+1] + B[i+2]) * (1 - p)
// within a small absolute-plus-relative tolerance; false at the first
// mismatch.
//
// The tolerance only has to absorb float rounding differences between host
// and device. A fixed tolerance of 1 would also accept a result in which one
// input element is off by one, which is exactly the size of error an
// indexing mistake produces with consecutive integer inputs.
__host__ bool check_result(int*A, int*B, float*C, int n, float p)
{
    const float abs_tol = 1e-3f;
    const float rel_tol = 1e-5f;

    for(int i = 0; i < n - 2; i++)
    {
      const float expected = (A[i] + A[i + 1] + A[i + 2]) * p + (B[i] + B[i + 1] + B[i + 2]) * (1 - p);
      if(!(fabs(C[i] - expected) <= abs_tol + rel_tol * fabs(expected)))
        return false;
    }
    return true;
}

In [None]:
# Notebook form fields: the CUDA source written by the %%writefile cell above
# and the name of the executable to produce.
filepath = "kolokvijum_2022.cu"  #@param { type: "string" }
compiled_filepath = "kolokvijum_2022"  #@param { type: "string" }

# Compile with nvcc, generating code for the sm_37 architecture.
!nvcc -arch=sm_37 -gencode=arch=compute_37,code=sm_37 $filepath -o $compiled_filepath

# Command-line arguments handed to the program (empty by default).
argv = "" #@param [] { allow-input: true }

# Run the compiled executable.
!./$compiled_filepath $argv