<a href="https://colab.research.google.com/github/rbdus0715/Machine-Learning/blob/main/study/cuda/01.intro-cuda/09.parallel_summation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

In [7]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// for random initialization
#include <stdlib.h>
#include <time.h>

// for memset
#include <cstring>

// Element-wise addition kernel.
// Each thread derives one flat index from its block and thread coordinates
// (x dimension only) and, when that index is below `size`, stores
// a[idx] + b[idx] into c[idx]. Threads whose index is >= size do nothing,
// so the launch may contain more threads than elements.
__global__ void sum_array_gpu(int * a, int * b, int * c, int size)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= size)
    {
        return;
    }

    const int lhs = a[idx];
    const int rhs = b[idx];
    c[idx] = lhs + rhs;
}

// Host reference implementation: writes a[k] + b[k] into c[k] for every
// k in [0, size). Does nothing when size is zero or negative.
void sum_array_cpu(int * a, int * b, int * c, int size)
{
    int k = 0;
    while (k < size)
    {
        const int total = a[k] + b[k];
        c[k] = total;
        ++k;
    }
}

// Compares the first `size` elements of a and b and prints a one-line
// verdict to stdout. The scan stops at the first mismatching position.
void compare_arrays(int * a, int * b, int size)
{
    // Advance while elements agree; pos ends at the first mismatch or at size.
    int pos = 0;
    while (pos < size && a[pos] == b[pos])
    {
        ++pos;
    }

    if (pos < size)
    {
        printf("Array are different \n");
    }
    else
    {
        printf("Arrays are same \n");
    }
}

// Terminates the program with a diagnostic on stderr when a CUDA runtime
// call reports anything other than cudaSuccess.
static void check_cuda(cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two randomly filled integer arrays on both the host and the device,
// then prints whether the two results match.
// Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call fails.
int main() {
    const int size = 10000;
    const int block_size = 128;

    // Byte count is kept in size_t so it cannot overflow int for large sizes.
    const size_t NO_BYTES = (size_t)size * sizeof(int);

    // Host pointers.
    // h_c holds the CPU result; gpu_results receives the device result.
    int * h_a, * h_b, * gpu_results, * h_c;

    // Allocate host memory.
    h_a = (int*)malloc(NO_BYTES);
    h_b = (int*)malloc(NO_BYTES);
    gpu_results = (int*)malloc(NO_BYTES);
    h_c = (int*)malloc(NO_BYTES);
    if (h_a == NULL || h_b == NULL || gpu_results == NULL || h_c == NULL)
    {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_a);
        free(h_b);
        free(gpu_results);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Fill the host inputs with random values in [0, 255].
    time_t t;
    srand((unsigned)time(&t));
    for(int i=0; i<size; i++) {
        h_a[i] = (int)(rand() & 0xFF);
    }
    for(int i=0; i<size; i++) {
        h_b[i] = (int)(rand() & 0xFF);
    }

    // Host reference calculation.
    sum_array_cpu(h_a, h_b, h_c, size);

    // Start from a known state so stale data cannot masquerade as a result.
    memset(gpu_results, 0, NO_BYTES);

    // Device pointers.
    int * d_a, * d_b, * d_c;
    check_cuda(cudaMalloc((void **)&d_a, NO_BYTES), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc((void **)&d_b, NO_BYTES), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc((void **)&d_c, NO_BYTES), "cudaMalloc(d_c)");

    // Transfer the inputs from host to device.
    check_cuda(cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(h_a -> d_a)");
    check_cuda(cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice), "cudaMemcpy(h_b -> d_b)");

    // Launch configuration.
    dim3 block(block_size);
    // Ceiling division: adds a partial block only when size is not an exact
    // multiple of block.x, instead of always launching one extra block.
    dim3 grid((size + block.x - 1) / block.x);

    sum_array_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
    // A launch does not return a status; configuration errors are read here.
    check_cuda(cudaGetLastError(), "sum_array_gpu launch");
    // Errors raised while the kernel runs surface at the next sync point.
    check_cuda(cudaDeviceSynchronize(), "sum_array_gpu execution");

    check_cuda(cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> gpu_results)");

    // Compare the device result against the host reference.
    compare_arrays(gpu_results, h_c, size);

    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");

    free(h_a);
    free(h_b);
    free(gpu_results);
    free(h_c);  // previously leaked: allocated but never released

    check_cuda(cudaDeviceReset(), "cudaDeviceReset");
    return 0;
}

Arrays are same 

