In [1]:
%%writefile reduction_a.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_ELEMENT 100000

int main() {
	int* arr;
	int max = 0;
	clock_t start, end;

  // random number creation
	srand(time(NULL));
	arr = (int*)malloc(sizeof(int) * NUM_ELEMENT);
	for (int i = 0; i < NUM_ELEMENT; i++)
		arr[i] = rand() % (NUM_ELEMENT * 10);

  // finding max num
	start = clock();
	for (int i = 0; i < NUM_ELEMENT; i++) {
		if (max < arr[i])
			max = arr[i];
	}
	end = clock();

	printf("MAX NUM : %d\n", max);
	printf("EXEC TIME : %f ms\n", (float)(end - start));
	
	return 0;
}

Overwriting reduction_a.c


In [2]:
!nvcc -o reduction_a reduction_a.c

In [3]:
!./reduction_a

MAX NUM : 999996
EXEC TIME : 250.000000 ms


In [4]:
%%writefile reduction_b.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_ELEMENT 100000
#define BLOCK_SIZE 128
#define GRID_SIZE ((NUM_ELEMENT + BLOCK_SIZE - 1) / BLOCK_SIZE)

__global__ void reduction_max(int *Arr, int *Max){
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 1; i < blockDim.x; i *= 2){
        if (id % (2*i) == 0){
            if (Arr[id] < Arr[id + i])
              Arr[id] = Arr[id + i];
        }
        __syncthreads();
    }

    if (id == 0)
      Max[0] = Arr[0];
}

int main(){
	int* arr;
  int* d_arr, *d_max;
	int max = 0;
  cudaEvent_t start, end;
  float etime;
  dim3 block(BLOCK_SIZE);
  dim3 grid(GRID_SIZE);

  cudaEventCreate(&start);
	cudaEventCreate(&end);
	srand(time(NULL));

  // random number creation
	arr = (int*)malloc(sizeof(int) * NUM_ELEMENT);
	for (int i = 0; i < NUM_ELEMENT; i++)
		arr[i] = rand() % (NUM_ELEMENT * 10);

  // tmp print
	//for (int i = 0; i < NUM_ELEMENT; i++)
	//	printf("%d\n", arr[i]);

  // cuda var initialization
  cudaMalloc((void**)&d_arr, sizeof(int)*NUM_ELEMENT);
  cudaMalloc((void**)&d_max, sizeof(int));
  cudaMemcpy(d_arr, arr, sizeof(int)*NUM_ELEMENT, cudaMemcpyHostToDevice);

  // kernel call & exec time check
  cudaEventRecord(start, 0);
	reduction_max<<<grid, block>>>(d_arr, d_max);
	cudaEventRecord(end, 0);
	cudaEventSynchronize(end);
	cudaEventElapsedTime(&etime, start, end);

  cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost);

	printf("MAX NUM : %d\n", max);
	printf("EXEC TIME : %f ms\n", etime);

  cudaEventDestroy(start);
	cudaEventDestroy(end);
  cudaFree(d_arr);
  cudaFree(d_max);

	return 0;
}

Overwriting reduction_b.cu


In [5]:
!nvcc -o reduction_b reduction_b.cu

In [6]:
!./reduction_b

MAX NUM : 987896
EXEC TIME : 0.023008 ms


In [7]:
%%writefile reduction_c.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_ELEMENT 100000
#define BLOCK_SIZE 128
#define GRID_SIZE ((NUM_ELEMENT + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define WARP_SIZE 32

__global__ void reduction_max(int *Arr, int *Max){
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 1; i < blockDim.x; i *= 2){
        if ((id / WARP_SIZE) % (2*i) == 0){
            if (Arr[id] < Arr[id + i])
              Arr[id] = Arr[id + i];
        }
        __syncthreads();
    }

    if (id == 0)
      Max[0] = Arr[0];
}

int main(){
	int* arr;
  int* d_arr, *d_max;
	int max = 0;
  cudaEvent_t start, end;
  float etime;
  dim3 block(BLOCK_SIZE);
  dim3 grid(GRID_SIZE);

  cudaEventCreate(&start);
	cudaEventCreate(&end);
	srand(time(NULL));

  // random number creation
	arr = (int*)malloc(sizeof(int) * NUM_ELEMENT);
	for (int i = 0; i < NUM_ELEMENT; i++)
		arr[i] = rand() % (NUM_ELEMENT * 10);

  // tmp print
	//for (int i = 0; i < NUM_ELEMENT; i++)
	//	printf("%d\n", arr[i]);

  // cuda var initialization
  cudaMalloc((void**)&d_arr, sizeof(int)*NUM_ELEMENT);
  cudaMalloc((void**)&d_max, sizeof(int));
  cudaMemcpy(d_arr, arr, sizeof(int)*NUM_ELEMENT, cudaMemcpyHostToDevice);

  // kernel call & exec time check
  cudaEventRecord(start, 0);
	reduction_max<<<grid, block>>>(d_arr, d_max);
	cudaEventRecord(end, 0);
	cudaEventSynchronize(end);
	cudaEventElapsedTime(&etime, start, end);

  cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost);

	printf("MAX NUM : %d\n", max);
	printf("EXEC TIME : %f ms\n", etime);

  cudaEventDestroy(start);
	cudaEventDestroy(end);
  cudaFree(d_arr);
  cudaFree(d_max);

	return 0;
}

Overwriting reduction_c.cu


In [8]:
!nvcc -o reduction_c reduction_c.cu

In [9]:
!./reduction_c

MAX NUM : 989165
EXEC TIME : 0.021088 ms


In [10]:
%%writefile reduction_d.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_ELEMENT 100000
#define BLOCK_SIZE 32
#define GRID_SIZE ((NUM_ELEMENT + BLOCK_SIZE - 1) / BLOCK_SIZE)
#define WARP_SIZE 32
#define SHARED_SIZE (BLOCK_SIZE * 4)

__global__ void reduction_max(int *Arr, int *Max){
    extern __shared__ int sharedmem[];
    int tid = threadIdx.x;
    int id = blockIdx.x * blockDim.x + threadIdx.x;

    sharedmem[tid] = Arr[id];
    __syncthreads();

    for (int i = 1; i < blockDim.x; i *= 2){
        if ((tid / WARP_SIZE) % (2*i) == 0){
            if (sharedmem[tid] < sharedmem[tid + i])
              sharedmem[tid] = sharedmem[tid + i];
        }
        __syncthreads();
    }

    if (tid == 0)
      Max[0] = sharedmem[0];
}

int main(){
	int* arr;
  int* d_arr, *d_max;
	int max = 0;
  cudaEvent_t start, end;
  float etime;
  dim3 block(BLOCK_SIZE);
  dim3 grid(GRID_SIZE);

  cudaEventCreate(&start);
	cudaEventCreate(&end);
	srand(time(NULL));

  // random number creation
	arr = (int*)malloc(sizeof(int) * NUM_ELEMENT);
	for (int i = 0; i < NUM_ELEMENT; i++)
		arr[i] = rand() % (NUM_ELEMENT * 10);

  // tmp print
	//for (int i = 0; i < NUM_ELEMENT; i++)
	//	printf("%d\n", arr[i]);

  // cuda var initialization
  cudaMalloc((void**)&d_arr, sizeof(int)*NUM_ELEMENT);
  cudaMalloc((void**)&d_max, sizeof(int));
  cudaMemcpy(d_arr, arr, sizeof(int)*NUM_ELEMENT, cudaMemcpyHostToDevice);

  // kernel call & exec time check
  cudaEventRecord(start, 0);
	reduction_max<<<grid, block, SHARED_SIZE>>>(d_arr, d_max);
	cudaEventRecord(end, 0);
	cudaEventSynchronize(end);
	cudaEventElapsedTime(&etime, start, end);

  cudaMemcpy(&max, d_max, sizeof(int), cudaMemcpyDeviceToHost);

	printf("MAX NUM : %d\n", max);
	printf("EXEC TIME : %f ms\n", etime);

  cudaEventDestroy(start);
	cudaEventDestroy(end);
  cudaFree(d_arr);
  cudaFree(d_max);

	return 0;
}

Overwriting reduction_d.cu


In [11]:
!nvcc -o reduction_d reduction_d.cu

In [12]:
!./reduction_d

MAX NUM : 964131
EXEC TIME : 0.019616 ms
