In [None]:
!nvcc --version
%pip install nvcc4jupyter
%load_ext nvcc4jupyter

In [4]:
%%cuda
#include <stdio.h>
#include <stdlib.h>

#define N 16

/*
 * One pass of a bottom-up merge sort.
 *
 * `input` is treated as consecutive runs of `width` elements (the last run may
 * be shorter). Each thread takes one pair of adjacent runs,
 * [start, mid) and [mid, end), and merges it into the same index range of
 * `output`. Ties are taken from the left run first, so the merge is stable.
 *
 * Launch layout: 1D grid of 1D blocks; at least ceil(size / (2 * width))
 * threads are needed in total. Surplus threads return without touching memory.
 * No shared memory is used. `input` and `output` must not overlap and must each
 * hold at least `size` ints.
 *
 * Parameters:
 *   input  - source array whose runs of length `width` are already sorted
 *   output - destination array receiving the merged runs
 *   width  - length of each sorted run in `input` (must be > 0)
 *   size   - number of elements in `input` / `output`
 */
__global__
void mergeKernel(int *input, int *output, int width, int size) {
    // Nothing to merge; also keeps the division below well-defined.
    if (width <= 0 || size <= 0) return;

    // All index arithmetic is done in 64 bits: 2 * width and tid * 2 * width
    // overflow a 32-bit int for large widths / sizes, which previously turned
    // `start` negative and slipped past the bounds check.
    const long long span    = 2LL * width;
    const long long numRuns = ((long long)size + span - 1) / span;
    const long long tid     = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Comparing the thread id against the number of run pairs (instead of
    // comparing start against size) means tid * span is only ever evaluated
    // for threads where the product is known to be < size.
    if (tid >= numRuns) return;

    const long long start   = tid * span;
    const long long midWide = start + width;
    const long long endWide = start + span;

    // Clamp both boundaries to the array; after clamping they fit in an int.
    const int mid = (int)(midWide < size ? midWide : size);
    const int end = (int)(endWide < size ? endWide : size);

    int i = (int)start;  // cursor in the left run
    int j = mid;         // cursor in the right run
    int k = (int)start;  // write cursor in output

    // Standard two-way merge; `<=` prefers the left run on ties.
    while (i < mid && j < end) {
        if (input[i] <= input[j])
            output[k++] = input[i++];
        else
            output[k++] = input[j++];
    }

    // Drain whichever run still has elements left.
    while (i < mid)
        output[k++] = input[i++];
    while (j < end)
        output[k++] = input[j++];
}


// Prints a readable message and terminates the program if a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills an N-element array with pseudo-random values in [0, 99], sorts it on
 * the GPU with a bottom-up merge sort (one mergeKernel launch per run width),
 * and prints the array before and after sorting.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main() {
    int h_input[N], h_output[N];
    int *d_input = NULL, *d_output = NULL;

    printf("Unsorted array:\n");
    for (int i = 0; i < N; i++) {
        h_input[i] = rand() % 100;
        printf("%d ", h_input[i]);
    }
    printf("\n");

    const size_t bytes = N * sizeof(int);
    checkCuda(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)");
    checkCuda(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)");
    checkCuda(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice),
              "host-to-device copy");

    // Ping-pong buffers: each pass reads `in` and writes `out`, then the two
    // are swapped so `in` always refers to the most recently written data.
    int *in = d_input;
    int *out = d_output;

    const int threadsPerBlock = 256;

    for (int width = 1; width < N; width *= 2) {
        // One thread per pair of adjacent runs of length `width`.
        const int threads = (N + 2 * width - 1) / (2 * width);
        const int blocks = (threads + threadsPerBlock - 1) / threadsPerBlock;

        mergeKernel<<<blocks, threadsPerBlock>>>(in, out, width, N);
        // Launch-configuration errors are only reported via cudaGetLastError.
        checkCuda(cudaGetLastError(), "mergeKernel launch");

        // No device-wide sync is needed here: launches on the same stream
        // execute in issue order, so the next pass sees this pass's output.
        int *temp = in;
        in = out;
        out = temp;
    }

    // Surface any fault raised while the kernels were executing before the
    // result is read back.
    checkCuda(cudaDeviceSynchronize(), "mergeKernel execution");

    // After the final swap, `in` is the buffer holding the sorted data.
    checkCuda(cudaMemcpy(h_output, in, bytes, cudaMemcpyDeviceToHost),
              "device-to-host copy");

    printf("\nSorted array:\n");
    for (int i = 0; i < N; i++)
        printf("%d ", h_output[i]);
    printf("\n");

    checkCuda(cudaFree(d_input), "cudaFree(d_input)");
    checkCuda(cudaFree(d_output), "cudaFree(d_output)");

    return 0;
}


Unsorted array:
41 67 34 0 69 24 78 58 62 64 5 45 81 27 61 91 

Sorted array:
0 5 24 27 34 41 45 58 61 62 64 67 69 78 81 91 

