In [None]:
%%writefile Min_Max.cu
#include <iostream>
#include <vector>
#include <climits>
#include <cassert>

// Checks the status of a CUDA runtime call and terminates the program on failure.
//
// `err` is any expression yielding a cudaError_t (typically the API call itself).
// It is evaluated exactly once and its value is stored in a local: re-evaluating
// the expression on the failure path would repeat the call, and for
// cudaGetLastError() the second evaluation would return the already-cleared
// status and print "no error".
// On failure the error string plus file/line of the call site is written to
// std::cerr and the process exits with status 1.
#define CUDA_CHECK(err) do { \
    const cudaError_t cuda_check_status_ = (err); \
    if (cuda_check_status_ != cudaSuccess) { \
        std::cerr << "CUDA Error: " << cudaGetErrorString(cuda_check_status_) << " at " << __FILE__ << ":" << __LINE__ << std::endl; \
        exit(1); \
    } \
} while(0)

// Folds the minimum of arr[0 .. size) into *result using atomicMin.
//
// Parameters:
//   arr    - device pointer to `size` ints (read-only).
//   size   - number of elements; size <= 0 leaves *result untouched.
//   result - device pointer to a single int. The caller must initialise it
//            before the launch (e.g. to INT_MAX); the kernel only lowers it.
//
// Launch layout: 1D grid of 1D blocks of any size. A grid-stride loop makes the
// result independent of whether the grid covers the whole array. No shared
// memory is used.
__global__ void min_reduction_kernel(const int* arr, int size, int* result) {
    // Index in 64 bits: blockIdx.x * blockDim.x + threadIdx.x can exceed INT_MAX
    // for sizes close to INT_MAX, and narrowing it to int would turn it negative
    // and let it pass the bounds check.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= size) {
        return;  // this thread owns no element; skip the atomic entirely
    }

    // Reduce this thread's elements locally, then publish with one atomic.
    int local_min = arr[i];
    for (i += stride; i < size; i += stride) {
        local_min = min(local_min, arr[i]);
    }
    atomicMin(result, local_min);
}

// Folds the maximum of arr[0 .. size) into *result using atomicMax.
//
// Parameters:
//   arr    - device pointer to `size` ints (read-only).
//   size   - number of elements; size <= 0 leaves *result untouched.
//   result - device pointer to a single int. The caller must initialise it
//            before the launch (e.g. to INT_MIN); the kernel only raises it.
//
// Launch layout: 1D grid of 1D blocks of any size. A grid-stride loop makes the
// result independent of whether the grid covers the whole array. No shared
// memory is used.
__global__ void max_reduction_kernel(const int* arr, int size, int* result) {
    // Index in 64 bits: blockIdx.x * blockDim.x + threadIdx.x can exceed INT_MAX
    // for sizes close to INT_MAX, and narrowing it to int would turn it negative
    // and let it pass the bounds check.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= size) {
        return;  // this thread owns no element; skip the atomic entirely
    }

    // Reduce this thread's elements locally, then publish with one atomic.
    int local_max = arr[i];
    for (i += stride; i < size; i += stride) {
        local_max = max(local_max, arr[i]);
    }
    atomicMax(result, local_max);
}

// Adds every element of arr[0 .. size) to *result using atomicAdd.
//
// Parameters:
//   arr    - device pointer to `size` ints (read-only).
//   size   - number of elements; size <= 0 leaves *result untouched.
//   result - device pointer to a single int accumulator. The caller must
//            initialise it before the launch (e.g. to 0). The accumulator is a
//            32-bit int, so the caller is responsible for the total fitting.
//
// Launch layout: 1D grid of 1D blocks of any size. A grid-stride loop makes the
// result independent of whether the grid covers the whole array. No shared
// memory is used.
__global__ void sum_reduction_kernel(const int* arr, int size, int* result) {
    // Index in 64 bits: blockIdx.x * blockDim.x + threadIdx.x can exceed INT_MAX
    // for sizes close to INT_MAX, and narrowing it to int would turn it negative
    // and let it pass the bounds check.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    long long i = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= size) {
        return;  // this thread owns no element; skip the atomic entirely
    }

    // Accumulate this thread's elements locally, then publish with one atomic.
    // The partial is kept unsigned so that wrap-around is well defined instead
    // of being signed-overflow undefined behaviour.
    unsigned int partial = static_cast<unsigned int>(arr[i]);
    for (i += stride; i < size; i += stride) {
        partial += static_cast<unsigned int>(arr[i]);
    }
    atomicAdd(result, static_cast<int>(partial));
}

// Demo driver: copies a small fixed array to the device, runs the min, max and
// sum reduction kernels on it one after another (reusing a single device
// result slot), and prints minimum, maximum, sum and average.
// Returns 0 on success; any CUDA failure terminates the process via CUDA_CHECK.
int main() {
    std::vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    // The kernels take an int element count, so narrow explicitly.
    const int size = static_cast<int>(arr.size());

    // One launch configuration shared by all three kernels (ceil-division so
    // the grid covers every element).
    const int threads_per_block = 256;
    const int num_blocks = (size + threads_per_block - 1) / threads_per_block;

    int* d_arr = nullptr;
    int* d_result = nullptr;
    int result_min = INT_MAX;  // identity for min
    int result_max = INT_MIN;  // identity for max
    int result_sum = 0;        // identity for sum

    // Kernel launches return no status; launch-configuration errors are only
    // visible through cudaGetLastError(). The status is stored in a variable
    // before being checked so it is read (and cleared) exactly once.
    cudaError_t launch_status = cudaSuccess;

    // Allocate device memory
    CUDA_CHECK(cudaMalloc(&d_arr, size * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_result, sizeof(int)));

    // Copy data to device
    CUDA_CHECK(cudaMemcpy(d_arr, arr.data(), size * sizeof(int), cudaMemcpyHostToDevice));

    // Minimum reduction: seed the result slot, launch, read back.
    // The blocking cudaMemcpy also waits for the kernel to finish.
    CUDA_CHECK(cudaMemcpy(d_result, &result_min, sizeof(int), cudaMemcpyHostToDevice));
    min_reduction_kernel<<<num_blocks, threads_per_block>>>(d_arr, size, d_result);
    launch_status = cudaGetLastError();
    CUDA_CHECK(launch_status);
    CUDA_CHECK(cudaMemcpy(&result_min, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    // Maximum reduction
    CUDA_CHECK(cudaMemcpy(d_result, &result_max, sizeof(int), cudaMemcpyHostToDevice));
    max_reduction_kernel<<<num_blocks, threads_per_block>>>(d_arr, size, d_result);
    launch_status = cudaGetLastError();
    CUDA_CHECK(launch_status);
    CUDA_CHECK(cudaMemcpy(&result_max, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    // Sum reduction
    CUDA_CHECK(cudaMemcpy(d_result, &result_sum, sizeof(int), cudaMemcpyHostToDevice));
    sum_reduction_kernel<<<num_blocks, threads_per_block>>>(d_arr, size, d_result);
    launch_status = cudaGetLastError();
    CUDA_CHECK(launch_status);
    CUDA_CHECK(cudaMemcpy(&result_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost));

    std::cout << "Minimum: " << result_min << "\n"
              << "Maximum: " << result_max << "\n"
              << "Sum: " << result_sum << "\n"
              << "Average: " << static_cast<double>(result_sum)/size << std::endl;

    // Cleanup
    CUDA_CHECK(cudaFree(d_arr));
    CUDA_CHECK(cudaFree(d_result));

    return 0;
}


Writing Min_Max.cu


In [None]:
!nvcc -arch=sm_70 Min_Max.cu -o Min_Max

In [None]:
!./Min_Max

Minimum: 1
Maximum: 9
Sum: 45
Average: 5
