<a href="https://colab.research.google.com/github/rbdus0715/Machine-Learning/blob/main/study/cuda/01.intro-cuda/07.unique_index_cal_for2Dgrid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvcc --version
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

### **(1) 2차원 그리드에서 잘못된 인덱싱**

```python
# X 차원만 생각한 인덱싱
[
    [[A, B, C, D], [E, F, G, H]],
    # 0, 1, 2, 3,   4, 5, 6, 7
    [[I, J, K, L], [M, N, O, P]]
    # 0, 1, 2, 3,   4, 5, 6, 7

]
```

- 앞에서 배운 index = blockIdx.x * blockDim.x + threadIdx.x 만 사용하면 안된다. Y 차원도 생각해야한다.

In [2]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

__global__ void unique_gid_calculation_2d(int * input) {
    int tid = threadIdx.x;
    int offset = blockIdx.x * blockDim.x;
    int gid = tid + offset;
    printf("blockIdx.x : %d, blockIdx.y : %d, threadIdx.x : %d, gid : %d - data : %d \n",
           blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}


int main() {
    int array_size = 16;
    int array_byte_size = sizeof(int)*array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33, 22, 43, 56, 4, 76, 81, 94, 32};

    int * d_data;
    cudaMalloc((void**)&d_data, array_byte_size);
    cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice);

    dim3 block(4);
    dim3 grid(2, 2);

    unique_gid_calculation_2d<<<grid, block>>>(d_data);
    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}

blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 0, gid : 4 - data : 65 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 1, gid : 5 - data : 12 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 2, gid : 6 - data : 1 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 3, gid : 7 - data : 33 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 0, gid : 4 - data : 65 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 1, gid : 5 - data : 12 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 2, gid : 6 - data : 1 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 3, gid : 7 - data : 33 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 0, gid : 0 - data : 23 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 1, gid : 1 - data : 9 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 2, gid : 2 - data : 4 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 3, gid : 3 - data : 53 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 0, gid : 0 - data : 23 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 1, gid : 1 - data : 9 
blockIdx.x : 0, blockIdx.

### **(2) (2D grid에서 스레드 블럭이 1차원일 때) grid의 Y차원까지 고려한 인덱싱**

- index = row offset + block offset + tid
- index = 한 스레드 블럭의 row에 길이만큼의 스레드 개수 * blockIdx.y + 스레드 블럭에 담겨있는 스레드 개수 * blockIdx.x + threadIdx.x
    - 한 스레드 블럭의 row에 길이만큼의 스레드 개수 = gridDim.x * blockDim.x
    - 스레드 블럭에 담겨있는 스레드 개수 = blockDim.x

**[정리]**</br>
$gid = (gridDim.x \times blockDim.x \times blockIdx.y) + (blockIdx.X \times blockDim.x) + threadIdx.x$


```python
# 스레드 블럭이 1차원일 경우에
# X 차원, Y차원을 모두 고려한 인덱싱
[
    [[A, B, C, D], [E, F, G, H]],
    # 0, 1, 2, 3,   4, 5, 6, 7
    [[I, J, K, L], [M, N, O, P]]
    # 8, 9, 10, 11 12, 13, 14, 15

]
```

In [3]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

__global__ void unique_gid_calculation_2d(int * input) {
    int tid = threadIdx.x;
    int block_offset = blockIdx.x * blockDim.x;
    int row_offset = gridDim.x * blockDim.x * blockIdx.y;

    int gid = tid + block_offset + row_offset;
    printf("blockIdx.x : %d, blockIdx.y : %d, threadIdx.x : %d, gid : %d - data : %d \n",
           blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}


int main() {
    int array_size = 16;
    int array_byte_size = sizeof(int)*array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33, 22, 43, 56, 4, 76, 81, 94, 32};

    int * d_data;
    cudaMalloc((void**)&d_data, array_byte_size);
    cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice);

    dim3 block(4);
    dim3 grid(2, 2);

    unique_gid_calculation_2d<<<grid, block>>>(d_data);
    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}

blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 0, gid : 4 - data : 65 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 1, gid : 5 - data : 12 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 2, gid : 6 - data : 1 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 3, gid : 7 - data : 33 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 0, gid : 12 - data : 76 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 1, gid : 13 - data : 81 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 2, gid : 14 - data : 94 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 3, gid : 15 - data : 32 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 0, gid : 8 - data : 22 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 1, gid : 9 - data : 43 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 2, gid : 10 - data : 56 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 3, gid : 11 - data : 4 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 0, gid : 0 - data : 23 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 1, gid : 1 - data : 9 
blockIdx.x : 0, b

### **(3) (2D grid에서 스레드 블럭이 2차원일 때) 인덱싱**

```python
# 총 16개의 스레드
# dim3 grid(2, 2);
# dim3 block(2, 2);
[
    [
        [[0, 1], [2, 3]], [[4, 5], [6, 7]]
    ],
    [
        [[8, 9], [10, 11]], [[12, 13], [14, 15]]
    ]
]
```

tid $= threadIdx.y * blockDim.x + threadIdx.x$

block_offset $= (blockDim.x * blockDim.y) * blockIdx.x$

row_offset $= (blockDim.x * blockDim.y * gridDim.x) * blockIdx.y$

In [10]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

__global__ void unique_gid_calculation_2d_2d(int * input) {
    int tid = blockDim.x * threadIdx.y + threadIdx.x;
    int block_offset = blockDim.x * blockDim.y * blockIdx.x;
    int row_offset = (blockDim.x * blockDim.y * gridDim.x) * blockIdx.y;

    int gid = tid + block_offset + row_offset;
    printf("blockIdx.x : %d, blockIdx.y : %d, threadIdx.x : %d, gid : %d - data : %d \n",
           blockIdx.x, blockIdx.y, tid, gid, input[gid]);
}


int main() {
    int array_size = 16;
    int array_byte_size = sizeof(int)*array_size;
    int h_data[] = {23, 9, 4, 53, 65, 12, 1, 33, 22, 43, 56, 4, 76, 81, 94, 32};

    int * d_data;
    cudaMalloc((void**)&d_data, array_byte_size);
    cudaMemcpy(d_data, h_data, array_byte_size, cudaMemcpyHostToDevice);

    dim3 block(2, 2);
    dim3 grid(2, 2);

    unique_gid_calculation_2d_2d<<<grid, block>>>(d_data);
    cudaDeviceSynchronize();
    cudaDeviceReset();

    return 0;
}

blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 0, gid : 4 - data : 65 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 1, gid : 5 - data : 12 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 2, gid : 6 - data : 1 
blockIdx.x : 1, blockIdx.y : 0, threadIdx.x : 3, gid : 7 - data : 33 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 0, gid : 12 - data : 76 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 1, gid : 13 - data : 81 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 2, gid : 14 - data : 94 
blockIdx.x : 1, blockIdx.y : 1, threadIdx.x : 3, gid : 15 - data : 32 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 0, gid : 8 - data : 22 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 1, gid : 9 - data : 43 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 2, gid : 10 - data : 56 
blockIdx.x : 0, blockIdx.y : 1, threadIdx.x : 3, gid : 11 - data : 4 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 0, gid : 0 - data : 23 
blockIdx.x : 0, blockIdx.y : 0, threadIdx.x : 1, gid : 1 - data : 9 
blockIdx.x : 0, b