<a href="https://colab.research.google.com/github/rbdus0715/Machine-Learning/blob/main/study/cuda/01.intro-cuda/06.unique_index_calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

### **(1) 인덱싱**
[예시1]

|23|9|4|53|65|12|1|33|
|-|-|-|-|-|-|-|-|

In [5]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Kernel: every thread prints its block-local index (threadIdx.x) together
// with the element of `input` stored at that same position.
//
// input : device pointer to an int array; the kernel reads input[threadIdx.x]
//         without any bounds check, so the array must hold at least
//         blockDim.x elements.
__global__ void unique_idx_calc_threadIdx(int * input) {
    const int local_idx = threadIdx.x;
    const int element = input[local_idx];
    printf("threadIdx : %d, value : %d\n", local_idx, element);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char * what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints an 8-element host array, copies it to the device and launches
// unique_idx_calc_threadIdx with one block of 8 threads (one thread per element).
// Returns 0 on success and 1 if any CUDA runtime call or the kernel failed.
int main() {
    int array_size = 8;
    // size_t avoids narrowing the result of sizeof(int) * array_size.
    size_t array_byte_size = sizeof(int) * array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33};

    for(int i=0; i<array_size; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n \n");

    int * d_data = NULL;
    int exit_code = 1;

    if (cuda_ok(cudaMalloc((void**)&d_data, array_byte_size), "cudaMalloc") &&
        cuda_ok(cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice), "cudaMemcpy")) {
        dim3 block(8);
        dim3 grid(1);

        unique_idx_calc_threadIdx<<<grid, block>>>(d_data);

        // A launch does not return a status: cudaGetLastError() reports an
        // invalid launch configuration, and the synchronize call reports
        // faults raised while the kernel ran (it also flushes device printf).
        if (cuda_ok(cudaGetLastError(), "kernel launch") &&
            cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
            exit_code = 0;
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    if (!cuda_ok(cudaFree(d_data), "cudaFree")) {
        exit_code = 1;
    }
    cudaDeviceReset();

    return exit_code;
}

23 9 4 53 65 12 1 33 
 
threadIdx : 0, value : 23
threadIdx : 1, value : 9
threadIdx : 2, value : 4
threadIdx : 3, value : 53
threadIdx : 4, value : 65
threadIdx : 5, value : 12
threadIdx : 6, value : 1
threadIdx : 7, value : 33



[예시2]

|23|9|4|53|
|-|-|-|-|

<br>

|65|12|1|33|
|-|-|-|-|

[결과]
```
23 9 4 53 65 12 1 33

threadIdx : 0, value : 23
threadIdx : 1, value : 9
threadIdx : 2, value : 4
threadIdx : 3, value : 53
threadIdx : 0, value : 23
threadIdx : 1, value : 9
threadIdx : 2, value : 4
threadIdx : 3, value : 53
```
위처럼 65, 12, 1, 33에 대해서는 접근하지 않았다.

[결과에 대한 이유]<br>
threadIdx.x는 블록 안에서만 고유한 인덱스이므로, "23 9 4 53"을 담당하는 블록과 "65 12 1 33"을 담당해야 할 블록 모두에서 threadIdx.x가 0~3으로 같았기 때문

In [6]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Kernel: every thread prints its block-local index (threadIdx.x) together
// with the element of `input` stored at that same position.
//
// The index is derived from threadIdx.x only, so threads that share the same
// threadIdx.x in different blocks read the same element of `input`.
//
// input : device pointer to an int array; the kernel reads input[threadIdx.x]
//         without any bounds check, so the array must hold at least
//         blockDim.x elements.
__global__ void unique_idx_calc_threadIdx(int * input) {
    const int local_idx = threadIdx.x;
    const int element = input[local_idx];
    printf("threadIdx : %d, value : %d\n", local_idx, element);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char * what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints an 8-element host array, copies it to the device and launches
// unique_idx_calc_threadIdx with 2 blocks of 4 threads each.
// Returns 0 on success and 1 if any CUDA runtime call or the kernel failed.
int main() {
    int array_size = 8;
    // size_t avoids narrowing the result of sizeof(int) * array_size.
    size_t array_byte_size = sizeof(int) * array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33};

    for(int i=0; i<array_size; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n \n");

    int * d_data = NULL;
    int exit_code = 1;

    if (cuda_ok(cudaMalloc((void**)&d_data, array_byte_size), "cudaMalloc") &&
        cuda_ok(cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice), "cudaMemcpy")) {
        dim3 block(4);
        dim3 grid(2);

        unique_idx_calc_threadIdx<<<grid, block>>>(d_data);

        // A launch does not return a status: cudaGetLastError() reports an
        // invalid launch configuration, and the synchronize call reports
        // faults raised while the kernel ran (it also flushes device printf).
        if (cuda_ok(cudaGetLastError(), "kernel launch") &&
            cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
            exit_code = 0;
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    if (!cuda_ok(cudaFree(d_data), "cudaFree")) {
        exit_code = 1;
    }
    cudaDeviceReset();

    return exit_code;
}

23 9 4 53 65 12 1 33 
 
threadIdx : 0, value : 23
threadIdx : 1, value : 9
threadIdx : 2, value : 4
threadIdx : 3, value : 53
threadIdx : 0, value : 23
threadIdx : 1, value : 9
threadIdx : 2, value : 4
threadIdx : 3, value : 53



### **(2) global unique id (gid)**
- gid = tid + offset
- gid = tid + blockIdx.x * blockDim.x

In [8]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Kernel: every thread computes a grid-wide index
//     gid = threadIdx.x + blockIdx.x * blockDim.x
// and prints that index together with input[gid].
//
// input : device pointer to an int array; the kernel reads input[gid] without
//         any bounds check, so the array must hold at least
//         gridDim.x * blockDim.x elements for a 1D launch.
__global__ void unique_gid_calculation(int * input) {
    const int lane = threadIdx.x;
    // Index of the first element handled by this block.
    const int block_start = blockIdx.x * blockDim.x;
    const int gid = lane + block_start;
    const int element = input[gid];
    printf("gid : %d, value : %d \n", gid, element);
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char * what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Prints an 8-element host array, copies it to the device and launches
// unique_gid_calculation with 2 blocks of 4 threads each (8 threads in total,
// one per element).
// Returns 0 on success and 1 if any CUDA runtime call or the kernel failed.
int main() {
    int array_size = 8;
    // size_t avoids narrowing the result of sizeof(int) * array_size.
    size_t array_byte_size = sizeof(int) * array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33};

    for(int i=0; i<array_size; i++) {
        printf("%d ", h_data[i]);
    }
    printf("\n \n");

    int * d_data = NULL;
    int exit_code = 1;

    if (cuda_ok(cudaMalloc((void**)&d_data, array_byte_size), "cudaMalloc") &&
        cuda_ok(cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice), "cudaMemcpy")) {
        dim3 block(4);
        dim3 grid(2);

        unique_gid_calculation<<<grid, block>>>(d_data);

        // A launch does not return a status: cudaGetLastError() reports an
        // invalid launch configuration, and the synchronize call reports
        // faults raised while the kernel ran (it also flushes device printf).
        if (cuda_ok(cudaGetLastError(), "kernel launch") &&
            cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
            exit_code = 0;
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe even if cudaMalloc failed.
    if (!cuda_ok(cudaFree(d_data), "cudaFree")) {
        exit_code = 1;
    }
    cudaDeviceReset();

    return exit_code;
}

23 9 4 53 65 12 1 33 
 
gid : 4, value : 65 
gid : 5, value : 12 
gid : 6, value : 1 
gid : 7, value : 33 
gid : 0, value : 23 
gid : 1, value : 9 
gid : 2, value : 4 
gid : 3, value : 53 

