In [None]:
!nvcc --version
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [None]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

// Number of ints in the input array; the kernel also uses it as the upper
// bound when checking whether a thread's global index is in range.
#define N 20
// Threads per block used for the launch, and the length of each shared-memory
// scratch array inside findMinMax.
#define BLOCK_SIZE 256

/*
 * Per-block minimum / maximum reduction over the first N ints of `input`.
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are read).
 * Each block writes exactly one result pair:
 *   minOutput[blockIdx.x] - smallest value seen by that block
 *   maxOutput[blockIdx.x] - largest value seen by that block
 * Threads whose global index is >= N contribute INT_MAX / INT_MIN, which are
 * neutral for min / max respectively.
 *
 * Preconditions: blockDim.x must not exceed BLOCK_SIZE (the static shared
 * arrays hold BLOCK_SIZE ints each), and it should be a power of two,
 * otherwise the halving loop below skips some slots.
 */
__global__ void findMinMax(int *input, int *minOutput, int *maxOutput) {
    __shared__ int blockMin[BLOCK_SIZE];
    __shared__ int blockMax[BLOCK_SIZE];

    const int lane = threadIdx.x;
    const int globalIdx = blockIdx.x * blockDim.x + lane;

    // Start from the neutral elements; overwrite them only when this thread
    // maps to a real input element (the out-of-bounds case must be handled too).
    int lo = INT_MAX;
    int hi = INT_MIN;
    if (globalIdx < N) {
        const int value = input[globalIdx];
        lo = value;
        hi = value;
    }
    blockMin[lane] = lo;
    blockMax[lane] = hi;
    __syncthreads();

    // Tree reduction in shared memory: each pass folds the upper half of the
    // active range onto the lower half.
    int half = blockDim.x >> 1;
    while (half > 0) {
        if (lane < half) {
            const int partner = lane + half;
            blockMin[lane] = min(blockMin[lane], blockMin[partner]);
            blockMax[lane] = max(blockMax[lane], blockMax[partner]);
        }
        // Barrier sits outside the branch so every thread in the block reaches it.
        __syncthreads();
        half /= 2;
    }

    // Only the first thread of the block publishes the block's result.
    if (lane != 0) {
        return;
    }
    minOutput[blockIdx.x] = blockMin[0];
    maxOutput[blockIdx.x] = blockMax[0];
}

/*
 * Evaluates a CUDA runtime call and terminates the program with a diagnostic
 * (file, line, error string) if it does not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cudaCheckErr_ = (call);                                  \
        if (cudaCheckErr_ != cudaSuccess) {                                  \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cudaCheckErr_));                      \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

/*
 * Host driver: fills an N-element array, prints it, runs findMinMax on the
 * device, combines the per-block results on the host and prints the overall
 * minimum and maximum.
 *
 * Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or any
 * CUDA runtime call fails.
 */
int main() {
    const size_t sz = N * sizeof(int);
    // Ceil-division so a partially filled last block is still launched.
    const int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const size_t szB = numBlocks * sizeof(int);

    int *h_array = (int *)malloc(sz);
    int *h_minBlock = (int *)malloc(szB);
    int *h_maxBlock = (int *)malloc(szB);
    if (h_array == NULL || h_minBlock == NULL || h_maxBlock == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_array);
        free(h_minBlock);
        free(h_maxBlock);
        return EXIT_FAILURE;
    }

    int *d_array = NULL, *d_min = NULL, *d_max = NULL;
    int finalMin = INT_MAX, finalMax = INT_MIN;

    for (int i = 0; i < N; i++) {
        h_array[i] = 2*89*(i+2*(i+6));
        printf("%d ",h_array[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaMalloc((void **)&d_array, sz));
    CUDA_CHECK(cudaMalloc((void **)&d_min, szB));
    CUDA_CHECK(cudaMalloc((void **)&d_max, szB));

    CUDA_CHECK(cudaMemcpy(d_array, h_array, sz, cudaMemcpyHostToDevice));

    findMinMax<<<numBlocks, BLOCK_SIZE>>>(d_array, d_min, d_max);
    // A kernel launch returns nothing: configuration errors are reported by
    // cudaGetLastError(), execution faults by the next synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_minBlock, d_min, szB, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(h_maxBlock, d_max, szB, cudaMemcpyDeviceToHost));

    // Final pass on the host: fold the per-block partial results together.
    for (int i = 0; i < numBlocks; i++) {
        if (h_minBlock[i] < finalMin) finalMin = h_minBlock[i];
        if (h_maxBlock[i] > finalMax) finalMax = h_maxBlock[i];
    }

    printf("Minimum value: %d\n", finalMin);
    printf("Maximum value: %d\n", finalMax);

    CUDA_CHECK(cudaFree(d_array));
    CUDA_CHECK(cudaFree(d_min));
    CUDA_CHECK(cudaFree(d_max));
    free(h_array);
    free(h_minBlock);
    free(h_maxBlock);

    return 0;
}


2136 2670 3204 3738 4272 4806 5340 5874 6408 6942 7476 8010 8544 9078 9612 10146 10680 11214 11748 12282 
Minimum value: 2136
Maximum value: 12282

