<a href="https://colab.research.google.com/github/rbdus0715/Machine-Learning/blob/main/study/cuda/01.intro-cuda/11.sum_array_with_timing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **(1) CPU와 GPU 작동 시간 측정하는 방법**
```cpp
clock_t start = clock();
// ... 측정할 작업 수행 ...
clock_t end = clock();

clock_t difference = end - start;
double elapsed_sec = (double)difference / CLOCKS_PER_SEC;  // 초 단위 경과 시간
```

In [None]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

- GPU 연산에 걸리는 시간의 대부분은 호스트와 디바이스 사이의 메모리 전송(memory transfer)이 차지한다.

In [3]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// for random initialize
#include <stdlib.h>
#include <time.h>

// for memset
#include <cstring>

/*
 * Element-wise addition kernel: c[i] = a[i] + b[i].
 *
 * Each thread derives a single flat index from the x components of its block
 * and thread coordinates and handles at most one element. Threads whose index
 * is not below `size` do nothing.
 *
 *   a, b : input arrays (read at the thread's index)
 *   c    : output array (written at the thread's index)
 *   size : number of elements to process
 */
__global__ void sum_array_gpu(int * a, int * b, int * c, int size)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the arrays have nothing to do.
    if(idx >= size)
    {
        return;
    }

    c[idx] = a[idx] + b[idx];
}

/*
 * Host reference implementation of the element-wise sum.
 *
 * Writes a[i] + b[i] into c[i] for every i in [0, size), in ascending order.
 * Nothing is written when size is zero or negative.
 */
void sum_array_cpu(int * a, int * b, int * c, int size)
{
    int idx = 0;
    while(idx < size)
    {
        c[idx] = a[idx] + b[idx];
        ++idx;
    }
}

/*
 * Compares the first `size` elements of a and b and prints the verdict.
 *
 * Scans forward until the first mismatch (or the end of the range) and then
 * prints exactly one line to stdout saying whether the arrays differ.
 */
void compare_arrays(int * a, int * b, int size)
{
    int pos = 0;

    // Advance while the elements agree; stops at the first mismatch.
    while(pos < size && a[pos] == b[pos])
    {
        ++pos;
    }

    if(pos < size)
    {
        printf("Array are different \n");
    }
    else
    {
        printf("Arrays are same \n");
    }
}

/*
 * gpuErrchk(call): evaluates a CUDA runtime call and passes its status to
 * gpuAssert together with the current source file and line.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a single
 * statement that consumes the trailing semicolon. A bare { ... } block would
 * break an unbraced `if (...) gpuErrchk(x); else ...`.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA status.
 *
 *   code  : status returned by a CUDA runtime call
 *   file  : source file of the call site (printed in the message)
 *   line  : source line of the call site (printed in the message)
 *   abort : when true (the default), terminate the process with `code` as the
 *           exit status after printing; when false, only print
 *
 * Does nothing when code is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if(code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d \n", cudaGetErrorString(code), file, line);
        if(abort) exit(code);
    }
}

/*
 * Times element-wise addition of two int arrays on the CPU and on the GPU.
 *
 * Steps: fill two host arrays with random bytes, sum them on the CPU, copy the
 * inputs to the device, run the kernel, copy the result back, compare both
 * results, and print the time spent in each phase.
 *
 * Every interval is measured with clock() and converted to seconds with
 * CLOCKS_PER_SEC. Every CUDA runtime call is checked through gpuErrchk, so a
 * failure stops the program instead of producing meaningless timings.
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails.
 */
int main() {
    int size = 1 << 25;
    int block_size = 1024;

    size_t NO_BYTES = size * sizeof(int);

    // host pointers
    // cpu_results stores the result of the CPU computation
    int * h_a, * h_b, * gpu_results, * cpu_results;

    // allocate host memory
    h_a = (int*)malloc(NO_BYTES);
    h_b = (int*)malloc(NO_BYTES);
    gpu_results = (int*)malloc(NO_BYTES);
    cpu_results = (int*)malloc(NO_BYTES);

    // Stop early if any host buffer could not be allocated; free(NULL) is a
    // no-op, so releasing all four unconditionally is safe.
    if(h_a == NULL || h_b == NULL || gpu_results == NULL || cpu_results == NULL)
    {
        fprintf(stderr, "Host memory allocation failed \n");
        free(h_a);
        free(h_b);
        free(gpu_results);
        free(cpu_results);
        return EXIT_FAILURE;
    }

    // initialize the host inputs with random values in [0, 255]
    time_t t;
    srand((unsigned)time(&t));
    for(int i=0; i<size; i++) {
        h_a[i] = (int)(rand() & 0xFF);
    }
    for(int i=0; i<size; i++) {
        h_b[i] = (int)(rand() & 0xFF);
    }

    ////////////////////////////////////////////////
    // host calculation, timed
    clock_t cpu_start, cpu_end;
    cpu_start = clock();
    sum_array_cpu(h_a, h_b, cpu_results, size);
    cpu_end = clock();

    // Clear the GPU result buffer so stale data cannot pass the comparison.
    memset(gpu_results, 0, NO_BYTES);

    // device pointers
    int * d_a, * d_b, * d_c;

    // every CUDA call returns an error code; check each one
    gpuErrchk(cudaMalloc((int **)&d_a, NO_BYTES));
    gpuErrchk(cudaMalloc((int **)&d_b, NO_BYTES));
    gpuErrchk(cudaMalloc((int **)&d_c, NO_BYTES));

    ////////////////////////////////////////////////
    // host-to-device transfer, timed
    clock_t htod_start, htod_end;
    htod_start = clock();
    gpuErrchk(cudaMemcpy(d_a, h_a, NO_BYTES, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_b, h_b, NO_BYTES, cudaMemcpyHostToDevice));
    htod_end = clock();

    // launch configuration
    dim3 block(block_size);
    // Ceil-division: enough blocks to cover `size` even when it is not a
    // multiple of block.x, without a spare block when it is.
    dim3 grid((size + block.x - 1) / block.x);

    ////////////////////////////////////////////////
    // kernel execution, timed
    clock_t gpu_start, gpu_end;
    gpu_start = clock();
    sum_array_gpu<<<grid, block>>>(d_a, d_b, d_c, size);
    // A launch does not return a status itself; query it explicitly.
    gpuErrchk(cudaGetLastError());
    // Wait for the kernel to finish so the interval covers its execution.
    gpuErrchk(cudaDeviceSynchronize());
    gpu_end = clock();

    ////////////////////////////////////////////////
    // device-to-host transfer, timed
    clock_t dtoh_start, dtoh_end;
    dtoh_start = clock();
    gpuErrchk(cudaMemcpy(gpu_results, d_c, NO_BYTES, cudaMemcpyDeviceToHost));
    dtoh_end = clock();

    // array comparison
    compare_arrays(gpu_results, cpu_results, size);

    ////////////////////////////////////////////////
    // CPU sum timing
    printf("Sum array CPU execution time : %4.6f \n",
           (double)((double)(cpu_end - cpu_start) / CLOCKS_PER_SEC));
    // GPU kernel timing
    printf("Sum array GPU execution time : %4.6f \n",
           (double)((double)(gpu_end - gpu_start) / CLOCKS_PER_SEC));
    // host-to-device timing
    printf("h to d memory transfer time : %4.6f \n",
           (double)((double)(htod_end - htod_start) / CLOCKS_PER_SEC));
    // device-to-host timing
    printf("d to h memory transfer time : %4.6f \n",
           (double)((double)(dtoh_end - dtoh_start) / CLOCKS_PER_SEC));
    // total GPU timing (h to d -> kernel -> d to h)
    printf("total GPU time : %4.6f \n",
           (double)((double)(dtoh_end - htod_start) / CLOCKS_PER_SEC));

    gpuErrchk(cudaFree(d_a));
    gpuErrchk(cudaFree(d_b));
    gpuErrchk(cudaFree(d_c));

    free(h_a);
    free(h_b);
    free(gpu_results);
    free(cpu_results);

    gpuErrchk(cudaDeviceReset());
    return 0;
}

Arrays are same 
Sum array CPU execution time : 0.159459 
Sum array GPU execution time : 0.001651 
h to d memory transfer time : 0.056667 
d to h memory transfer time : 0.030207 
total GPU time : 0.088529 



### **(2) 쿠다 어플리케이션의 성능**
- Execution time
- Power consumption
- Floor space
- Cost of hardware