In [1]:
cuda_code = r'''
#include <iostream>
#include <cuda.h>
#include <cassert>
#include <sys/time.h>
#include <cmath>

#define N 5
#define BLOCKSIZE 5

// Sequential reference implementation of an inclusive prefix sum.
//   hostInput     - source array of N floats
//   hostOutputCPU - destination array of N floats; element i receives
//                   hostInput[0] + ... + hostInput[i], accumulated left to right
void PrefixSumCPU(float *hostInput, float *hostOutputCPU) {
    // Seed the running total with the first element, then extend it one
    // element at a time so each output is the previous output plus one input.
    float runningTotal = hostInput[0];
    hostOutputCPU[0] = runningTotal;

    int idx = 1;
    while (idx < N) {
        runningTotal = runningTotal + hostInput[idx];
        hostOutputCPU[idx] = runningTotal;
        ++idx;
    }
}

// Inclusive prefix sum of a short array, computed by a single thread block
// with a Hillis-Steele (offset-doubling) scan in shared memory.
//
// Launch layout : 1-D block; thread `threadIdx.x` owns element `threadIdx.x`.
//                 blockIdx is not used, so additional blocks would only
//                 repeat the same work on the same elements.
// Shared memory : static array of N floats.
// Preconditions : for a complete result, arrayLength <= N and
//                 arrayLength <= blockDim.x. Elements beyond either limit are
//                 not scanned or written; extra threads stay idle instead of
//                 indexing past the shared buffer.
//
//   deviceInput  - device pointer to at least arrayLength floats (read only)
//   deviceOutput - device pointer to at least arrayLength floats (written)
//   arrayLength  - number of elements to scan
//   threadCount  - unused; retained so existing launch sites keep compiling
__global__ void PrefixSumKernel(float *deviceInput, float *deviceOutput, int arrayLength, int threadCount) {
    (void)threadCount;

    __shared__ float tempShared[N];
    const int tid = threadIdx.x;

    // Never process more elements than the shared buffer can hold.
    const int count = (arrayLength < N) ? arrayLength : N;
    // A thread takes part only when the element it owns actually exists.
    const bool active = tid < count;

    if (active) {
        tempShared[tid] = deviceInput[tid];
    }

    __syncthreads();

    // `count` is identical for every thread, so all threads run the same
    // number of iterations and reach every barrier below.
    for (int offset = 1; offset < count; offset *= 2) {
        float val = 0.0f;
        if (active && tid >= offset)
            val = tempShared[tid - offset];
        // Every read of this round must finish before any element is updated.
        __syncthreads();
        if (active)
            tempShared[tid] += val;
        // Every update must be visible before the next round starts reading.
        __syncthreads();
    }

    if (active) {
        deviceOutput[tid] = tempShared[tid];
    }
}

// Elapsed time from `start` to `end`, expressed in seconds.
// The seconds and microseconds fields are differenced independently, so a
// negative microsecond difference simply subtracts from the whole seconds.
double getTimeDiff(timeval start, timeval end) {
    const double wholeSeconds = static_cast<double>(end.tv_sec - start.tv_sec);
    const double microseconds = static_cast<double>(end.tv_usec - start.tv_usec);
    return wholeSeconds + microseconds / 1e6;
}

// Host driver: fills an N-element input with 1..N, runs the prefix sum on the
// GPU (one block of BLOCKSIZE threads) and on the CPU, prints both results and
// the wall-clock time of the kernel launch plus synchronisation.
//
// Returns 0 when every CUDA call succeeds and the GPU result matches the CPU
// reference within 1e-5 per element; returns 1 otherwise.
int main() {
    float hostInput[N], hostOutputGPU[N], hostOutputCPU[N];
    float *deviceInput = nullptr, *deviceOutput = nullptr;
    const size_t bytes = N * sizeof(float);

    // Reports a failed CUDA runtime call on stderr; true means "carry on".
    auto cudaOk = [](cudaError_t status, const char *what) {
        if (status == cudaSuccess) return true;
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << "\n";
        return false;
    };

    // Releases whatever was allocated so far and yields the failure exit code.
    // cudaFree accepts a null pointer, so this is safe at any stage.
    auto fail = [&]() {
        cudaFree(deviceInput);
        cudaFree(deviceOutput);
        return 1;
    };

    // Prints one array as "label: [a, b, c]".
    auto printArray = [](const char *label, const float *values) {
        std::cout << label << ": [";
        for (int i = 0; i < N; i++) {
            std::cout << values[i];
            if (i < N - 1) std::cout << ", ";
        }
        std::cout << "]\n";
    };

    // Initialize host input
    for (int i = 0; i < N; i++) {
        hostInput[i] = static_cast<float>(i + 1);
    }

    if (!cudaOk(cudaMalloc((void**)&deviceInput, bytes), "cudaMalloc(deviceInput)")) return fail();
    if (!cudaOk(cudaMalloc((void**)&deviceOutput, bytes), "cudaMalloc(deviceOutput)")) return fail();
    if (!cudaOk(cudaMemcpy(deviceInput, hostInput, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy(host to device)")) return fail();

    dim3 dimBlock(BLOCKSIZE);
    dim3 dimGrid(1);

    timeval start, end;
    gettimeofday(&start, nullptr);
    PrefixSumKernel<<<dimGrid, dimBlock>>>(deviceInput, deviceOutput, N, BLOCKSIZE);
    // A launch does not return a status: configuration errors are picked up
    // with cudaGetLastError, faults during execution by the synchronise call.
    const cudaError_t launchStatus = cudaGetLastError();
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    gettimeofday(&end, nullptr);

    if (!cudaOk(launchStatus, "kernel launch")) return fail();
    if (!cudaOk(syncStatus, "kernel execution")) return fail();

    double gpuTime = getTimeDiff(start, end);
    if (!cudaOk(cudaMemcpy(hostOutputGPU, deviceOutput, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy(device to host)")) return fail();
    PrefixSumCPU(hostInput, hostOutputCPU);

    // Output formatted results
    printArray("Input Array", hostInput);
    printArray("CPU Prefix Sum", hostOutputCPU);
    printArray("GPU Prefix Sum", hostOutputGPU);

    // Compare against the CPU reference explicitly rather than with assert,
    // so the check still runs when the program is built with NDEBUG.
    bool matches = true;
    for (int i = 0; i < N; i++) {
        if (!(std::fabs(hostOutputCPU[i] - hostOutputGPU[i]) < 1e-5)) {
            std::cerr << "Mismatch at index " << i << ": CPU " << hostOutputCPU[i]
                      << " vs GPU " << hostOutputGPU[i] << "\n";
            matches = false;
        }
    }

    std::cout << "GPU Execution Time: " << gpuTime << " seconds\n";

    const bool freedInput = cudaOk(cudaFree(deviceInput), "cudaFree(deviceInput)");
    const bool freedOutput = cudaOk(cudaFree(deviceOutput), "cudaFree(deviceOutput)");

    return (matches && freedInput && freedOutput) ? 0 : 1;
}
'''

# Write the CUDA source held in `cuda_code` to prefix_sum_cleaned.cu,
# overwriting any previous copy, so that nvcc has a file to compile
with open("prefix_sum_cleaned.cu", "w") as f:
    f.write(cuda_code)

# Compile for T4 GPU (-arch=sm_75 targets compute capability 7.5).
# The leading "!" is IPython/Jupyter shell-escape syntax, not plain Python.
!nvcc -arch=sm_75 prefix_sum_cleaned.cu -o prefix_sum_cleaned

# Run the compiled binary; its stdout becomes the cell output
!./prefix_sum_cleaned

Input Array: [1, 2, 3, 4, 5]
CPU Prefix Sum: [1, 3, 6, 10, 15]
GPU Prefix Sum: [1, 3, 6, 10, 15]
GPU Execution Time: 0.000151 seconds
