<a href="https://colab.research.google.com/github/siavashadpey/gpu_intro/blob/master/cs179_lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Lab 1 of CS179 (http://courses.cms.caltech.edu/cs179/)

In [19]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-38vlorot
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-38vlorot
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=b5b5ab6bfe342757979fd498e1b1f48371c794dd8807714efa392b676d44e730
  Stored in directory: /tmp/pip-ephem-wheel-cache-28uwl94l/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [42]:
%%cu

/* 
 * CUDA blur
 * Kevin Yuh, 2014 
 * Revised by Nailen Matschke, 2016
 * Revised by Loko Kung, 2018
 */

#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cstring>
#include <vector>
#include <fstream>
#include <iostream>
#include <time.h>

#include <cuda_runtime.h>
#include <algorithm>
#include <cassert>

// Bring the console streams used throughout this file into scope.
using std::cerr;
using std::cout;
using std::endl;

// Pi stored as a float; used by gaussian() to normalise the bell curve.
const float PI = 3.14159265358979;
// Compile-time switch: when non-zero the libsndfile audio path is compiled in;
// when 0 the test runs on randomly generated data instead.
#define AUDIO_ON 0

// libsndfile is only needed when the audio path is enabled.
#if AUDIO_ON
#include <sndfile.h>
#endif

/*
 * Computes one output sample of the causal convolution of the raw signal
 * with the blur vector and adds it onto the output buffer:
 *
 *     out[i] += sum over j in [0, min(i, blur_v_size - 1)] of
 *               raw[i - j] * blur_v[j]
 *
 * Callable from both host and device code.
 *
 * thread_index: index i of the output sample to compute.
 * gpu_raw_data: input signal, n_frames elements.
 * gpu_blur_v:   blur (impulse response) vector, blur_v_size elements.
 * gpu_out_data: output signal, n_frames elements. The result is accumulated
 *               onto the value already stored there, so the buffer must be
 *               zero-initialised for a plain convolution.
 * n_frames:     number of samples in the input/output signals.
 * blur_v_size:  number of taps in the blur vector.
 *
 * Indices at or beyond n_frames are ignored.
 */
__host__ __device__ void cuda_blur_kernel_convolution(uint thread_index, 
                                  const float* gpu_raw_data,
                                  const float* gpu_blur_v, float* gpu_out_data,
                                  const unsigned int n_frames,
                                  const unsigned int blur_v_size) {
    // Never touch memory past the end of the signal.
    if (thread_index >= n_frames)
        return;

    // Near the start of the signal only samples 0..thread_index exist, so
    // fewer taps contribute; everywhere else the full blur vector is used.
    const unsigned int n_taps =
        (thread_index < blur_v_size) ? thread_index + 1 : blur_v_size;

    // Accumulate in a local variable and store once, instead of doing a
    // read-modify-write on the output buffer for every tap. Seeding with the
    // current output value keeps the "+=" semantics and summation order.
    float acc = gpu_out_data[thread_index];
    for (unsigned int j = 0; j < n_taps; j++)
        acc += gpu_raw_data[thread_index - j] * gpu_blur_v[j];
    gpu_out_data[thread_index] = acc;
}

/*
 * Kernel: blurs the whole signal by calling cuda_blur_kernel_convolution for
 * every sample index in [0, n_frames).
 *
 * Launch layout: 1D grid of 1D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads (grid-stride
 * loop), so any grid/block size covers the full signal.
 *
 * gpu_raw_data: device pointer to the input signal, n_frames elements.
 * gpu_blur_v:   device pointer to the blur vector, blur_v_size elements.
 * gpu_out_data: device pointer to the output signal, n_frames elements;
 *               results are accumulated onto its current contents.
 * n_frames, blur_v_size: element counts; non-positive values make the
 *               kernel a no-op.
 */
__global__ void cuda_blur_kernel(const float *gpu_raw_data, const float *gpu_blur_v,
                      float *gpu_out_data, int n_frames, int blur_v_size) {
    // A negative count would otherwise be converted to a huge unsigned value;
    // a zero count leaves nothing to compute.
    if (n_frames <= 0 || blur_v_size <= 0)
        return;

    // 64-bit index arithmetic: with 32-bit math the products and the running
    // index can wrap around for very large grids and revisit samples.
    const size_t total = (size_t) n_frames;
    const size_t stride = (size_t) blockDim.x * gridDim.x;

    for (size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
         i < total; i += stride) {
        // i < n_frames <= INT_MAX here, so narrowing to uint is lossless.
        cuda_blur_kernel_convolution((uint) i, gpu_raw_data,
                                     gpu_blur_v, gpu_out_data,
                                     (unsigned int) n_frames,
                                     (unsigned int) blur_v_size);
    }
}

// Reports a failed CUDA runtime call on stderr, naming the step that failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool blur_cuda_ok(cudaError_t status, const char *step) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "Error %s (%s)\n", cudaGetErrorString(status), step);
    return false;
}

// This function will be called from the host code to invoke the kernel
// function. Any memory address/pointer locations passed to this function
// must be host addresses. This function is in charge of allocating GPU
// memory, invoking the kernel, and cleaning up afterwards. The result is
// stored in out_data.
//
// blocks, threads_per_block: launch configuration (1D grid of 1D blocks).
// raw_data:    host input signal, n_frames floats.
// blur_v:      host blur vector, blur_v_size floats.
// out_data:    host output buffer, n_frames floats.
//
// Returns the time in milliseconds between the start and stop events, which
// bracket the allocations, copies, kernel run and frees; -1 if the timing
// events could not be created or recorded. Failures of CUDA runtime calls are
// reported on stderr; if device setup fails the kernel is not launched and
// out_data is left untouched.
float cuda_call_blur_kernel(const unsigned int blocks,
                            const unsigned int threads_per_block,
                            const float *raw_data,
                            const float *blur_v,
                            float *out_data,
                            const unsigned int n_frames,
                            const unsigned int blur_v_size) {
    // Use the CUDA machinery for recording time
    cudaEvent_t start_gpu, stop_gpu;
    float time_milli = -1;
    const bool have_start =
        blur_cuda_ok(cudaEventCreate(&start_gpu), "creating the start event");
    const bool have_stop =
        blur_cuda_ok(cudaEventCreate(&stop_gpu), "creating the stop event");
    bool timing = have_start && have_stop
        && blur_cuda_ok(cudaEventRecord(start_gpu), "recording the start event");

    const size_t signal_bytes = n_frames * sizeof(float);
    const size_t blur_bytes = blur_v_size * sizeof(float);

    // Null-initialised so the unconditional cudaFree calls below are safe even
    // when an allocation was never attempted (cudaFree(NULL) is a no-op).
    float *gpu_raw_data = NULL;
    float *gpu_blur_v = NULL;
    float *gpu_out_data = NULL;

    // Stage the input signal and blur vector on the device and zero the
    // output buffer (the kernel accumulates onto it). Short-circuits at the
    // first failing step.
    const bool setup_ok =
        blur_cuda_ok(cudaMalloc((void **) &gpu_raw_data, signal_bytes),
                     "allocating the input signal")
        && blur_cuda_ok(cudaMemcpy(gpu_raw_data, raw_data, signal_bytes,
                                   cudaMemcpyHostToDevice),
                        "copying the input signal to the device")
        && blur_cuda_ok(cudaMalloc((void **) &gpu_blur_v, blur_bytes),
                        "allocating the blur vector")
        && blur_cuda_ok(cudaMemcpy(gpu_blur_v, blur_v, blur_bytes,
                                   cudaMemcpyHostToDevice),
                        "copying the blur vector to the device")
        && blur_cuda_ok(cudaMalloc((void **) &gpu_out_data, signal_bytes),
                        "allocating the output signal")
        && blur_cuda_ok(cudaMemset(gpu_out_data, 0, signal_bytes),
                        "zeroing the output signal");

    if (setup_ok) {
        cuda_blur_kernel<<<blocks, threads_per_block>>>(
            gpu_raw_data, gpu_blur_v, gpu_out_data, n_frames, blur_v_size);

        // Check for errors on kernel call (launch-configuration errors)
        cudaError err = cudaGetLastError();
        if (cudaSuccess != err)
            fprintf(stderr, "Error %s\n", cudaGetErrorString(err));
        else
            fprintf(stderr, "No kernel error detected\n");

        // Blocking copy: waits for the kernel to finish, so errors raised
        // while the kernel was running surface here.
        blur_cuda_ok(cudaMemcpy(out_data, gpu_out_data, signal_bytes,
                                cudaMemcpyDeviceToHost),
                     "copying the result back to the host");
    }

    // Free GPU resources
    blur_cuda_ok(cudaFree(gpu_raw_data), "freeing the input signal");
    blur_cuda_ok(cudaFree(gpu_blur_v), "freeing the blur vector");
    blur_cuda_ok(cudaFree(gpu_out_data), "freeing the output signal");

    // Stop the recording timer and compute the elapsed time
    if (timing) {
        timing = blur_cuda_ok(cudaEventRecord(stop_gpu),
                              "recording the stop event")
            && blur_cuda_ok(cudaEventSynchronize(stop_gpu),
                            "waiting for the stop event")
            && blur_cuda_ok(cudaEventElapsedTime(&time_milli, start_gpu,
                                                 stop_gpu),
                            "reading the elapsed time");
        if (!timing)
            time_milli = -1;
    }

    // Events are runtime resources; release them so repeated calls don't leak.
    if (have_start)
        cudaEventDestroy(start_gpu);
    if (have_stop)
        cudaEventDestroy(stop_gpu);

    return time_milli;
}

// Evaluates the Gaussian (normal) density with the given mean and standard
// deviation at x: exp(-((x - mean) / std)^2 / 2) / (std * sqrt(2 * PI)).
float gaussian(float x, float mean, float std) {
    // Distance of x from the mean, in units of the standard deviation.
    const float z = (x - mean) / std;
    return exp(-1.0 / 2.0 * pow(z, 2))
        * (1 / (std * sqrt(2 * PI)));
}

/*
 * NOTE: You can use this macro to easily check cuda error codes 
 * and get more information. 
 * 
 * Modified from:
 * http://stackoverflow.com/questions/14038589/
 *   what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
#define gpu_errchk(ans) { gpu_assert((ans), __FILE__, __LINE__); }
/*
 * Reports a failed CUDA runtime call; normally used through gpu_errchk, which
 * fills in the call site's file and line.
 *
 * code:  status returned by a CUDA runtime call.
 * file:  source file name to include in the message.
 * line:  source line number to include in the message.
 * abort: when true (the default) the process exits with `code` as its exit
 *        status after printing; when false the error is only reported and
 *        control returns to the caller.
 *
 * Does nothing when code is cudaSuccess.
 */
inline void gpu_assert(cudaError_t code, const char *file, int line,
                       bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "gpu_assert: %s %s %d\n",
                cudaGetErrorString(code), file, line);
        // Honour the caller's choice: previously the flag was ignored and the
        // process exited unconditionally.
        if (abort)
            exit(code);
    }
}

/*
 * Validates the number of command-line arguments. When the count is wrong, a
 * usage message is written to stderr and the process exits with EXIT_FAILURE.
 * The expected count depends on AUDIO_ON (the audio build additionally takes
 * input and output file names). The contents of argv are not inspected.
 */
void check_args(int argc, char **argv) {
#if AUDIO_ON
    const int expected_argc = 5;
    const char *usage =
        "Arguments: <threads per block> <max number of blocks> "
        "<input file> <output file>\n";
#else
    const int expected_argc = 3;
    const char *usage =
        "Arguments: <threads per block> <max number of blocks>\n";
#endif

    // Nothing to report when the count matches.
    if (argc == expected_argc)
        return;

    cerr << "Incorrect number of arguments.\n" << usage;
    exit(EXIT_FAILURE);
}


/*
 * Reads in audio data (alternatively, generates random data), and convolves
 * each channel with a normalised Gaussian filtering function h[n], producing
 * output data.
 *
 * Uses both CPU and GPU implementations, compares the results element by
 * element (absolute tolerance 1e-6) and prints both timings and the speedup.
 *
 * local_size: threads per block used for the GPU launch; must be non-zero.
 * max_blocks: upper bound on the number of blocks launched.
 *
 * Returns EXIT_SUCCESS once the run completes (a CPU/GPU mismatch is reported
 * on stderr but does not change the return value), or EXIT_FAILURE when
 * local_size is zero or a host buffer cannot be allocated.
 */
int large_gauss_test(const unsigned int local_size, const unsigned int max_blocks){ //(int argc, char **argv) {
    //check_args(argc, argv);

    // A zero block size cannot be launched and would divide by zero in the
    // block-count computation below.
    if (local_size == 0) {
        cerr << "Threads per block must be non-zero, exiting\n";
        return EXIT_FAILURE;
    }

    /* Form Gaussian blur vector */
    float mean = 0.0;
    float std = 5.0;

    int GAUSSIAN_SIDE_WIDTH = 10;
    int GAUSSIAN_SIZE = 2 * GAUSSIAN_SIDE_WIDTH + 1;

    // Space for both sides of the gaussian blur vector, plus the middle,
    // gives this size requirement
    float *blur_v = (float *) malloc(sizeof (float) * GAUSSIAN_SIZE );
    if (!blur_v) {
        cerr << "Cannot allocate blur vector, exiting\n";
        return EXIT_FAILURE;
    }

    // Fill it from the middle out
    for (int i = -GAUSSIAN_SIDE_WIDTH; i <= GAUSSIAN_SIDE_WIDTH; i++)
        blur_v[ GAUSSIAN_SIDE_WIDTH + i ] = gaussian(i, mean, std);

    // Normalize to avoid clipping and/or hearing loss (brackets for scoping)
    {
        float total = 0.0;
        for (int i = 0; i < GAUSSIAN_SIZE; i++)
            total += blur_v[i];
        for (int i = 0; i < GAUSSIAN_SIZE; i++)
            blur_v[i] /= total;

        cout << "Normalized by factor of: " << total << endl;
    }


#if 1
    for (int i = 0; i < GAUSSIAN_SIZE; i++)
        cout << "gaussian[" << i << "] = " << blur_v[i] << endl;
#endif


#if AUDIO_ON
    // NOTE(review): this branch reads argv[3]/argv[4], but argv is not a
    // parameter of this function (see the commented-out signature above), so
    // enabling AUDIO_ON needs the file names passed in some other way.
    SNDFILE *in_file, *out_file;
    SF_INFO in_file_info, out_file_info;

    int amt_read;

    // Open input audio file
    in_file = sf_open(argv[3], SFM_READ, &in_file_info);
    if (!in_file) {
        cerr << "Cannot open input file, exiting\n";
        exit(EXIT_FAILURE);
    }

    // Read audio
    float *all_channel_input =
        new float[in_file_info.frames * in_file_info.channels];
    amt_read =
        sf_read_float(in_file, all_channel_input,
            in_file_info.frames * in_file_info.channels);
    assert(amt_read == in_file_info.frames * in_file_info.channels);

    // Prepare output storage
    float *all_channel_output =
        new float[in_file_info.frames * in_file_info.channels];

    int n_channels = in_file_info.channels;
    int n_frames = in_file_info.frames;


#else
    // If we're using random data instead of audio data, we can control the size
    // of our input signal, and use the "channels" parameter to control how many
    // trials we run.

    // Can set as the number of trials
    int n_channels = 1;
    // Can set how many data points arbitrarily
    int n_frames = 1e7;
#endif


    // Per-channel input data
    float *input_data = (float *) malloc(sizeof (float) * n_frames);

    // Output data storage for GPU implementation (will write to this from GPU)
    float *output_data = (float *) malloc(n_frames * sizeof (float));

    // Output data storage for CPU implementation
    float *output_data_host = (float *) malloc(n_frames * sizeof (float));

    // Bail out cleanly rather than dereferencing a null buffer below.
    if (!input_data || !output_data || !output_data_host) {
        cerr << "Cannot allocate host buffers, exiting\n";
        free(input_data);
        free(output_data);
        free(output_data_host);
        free(blur_v);
    #if AUDIO_ON
        delete[] all_channel_input;
        delete[] all_channel_output;
        sf_close(in_file);
    #endif
        return EXIT_FAILURE;
    }

    // Iterate through each audio channel (e.g. 2 iterations for stereo files)
    for (int ch = 0; ch < n_channels; ch++) {
    #if AUDIO_ON
        // Load this channel's data
        for (int i = 0; i < n_frames; i++)
            input_data[i] = all_channel_input[(i * n_channels) + ch];
    #else
        // Generate random data if not using audio
        for (int i = 0; i < n_frames; i++)
            input_data[i] = ((float) rand()) / RAND_MAX;
    #endif
        
        // CPU Blurring
        cout << "CPU blurring..." << endl;

        memset(output_data_host, 0, n_frames * sizeof (float));

        // Use the CUDA machinery for recording time
        cudaEvent_t start_cpu, stop_cpu;
        cudaEventCreate(&start_cpu);
        cudaEventCreate(&stop_cpu);
        cudaEventRecord(start_cpu);
        	
        // (For scoping)
        {
            // The first GAUSSIAN_SIZE outputs only see the samples that exist
            // so far; clamp to n_frames so a short signal is not overrun.
            const int head = std::min(GAUSSIAN_SIZE, n_frames);
            for (int i = 0; i < head; i++) {
                for (int j = 0; j <= i; j++)
                    output_data_host[i] += input_data[i - j] * blur_v[j]; 
            }
            for (int i = GAUSSIAN_SIZE; i < n_frames; i++) {
                for (int j = 0; j < GAUSSIAN_SIZE; j++)
                    output_data_host[i] += input_data[i - j] * blur_v[j]; 
            }
        }

        // Stop timer
        cudaEventRecord(stop_cpu);
        cudaEventSynchronize(stop_cpu);

        // GPU blurring
        cout << "GPU blurring..." << endl;

        // Cap the number of blocks. Integer ceiling division avoids the
        // rounding of a float quotient for large n_frames.
        const unsigned int blocks_needed =
            ((unsigned int) n_frames + local_size - 1) / local_size;
        const unsigned int blocks = std::min(max_blocks, blocks_needed);
        printf("nframes: %d\n", n_frames);
        printf("nblocks: %u\n", blocks);

        // Call our exposed entry point to our GPU kernel handler
        float gpu_time_milliseconds = cuda_call_blur_kernel(blocks, local_size,
                                                            input_data, blur_v,
                                                            output_data, n_frames,
                                                            GAUSSIAN_SIZE);

        cout << "Comparing..." << endl;

        // Compare results
        bool success = true;
        for (int i = 0; i < n_frames; i++) {
            if (fabs(output_data_host[i] - output_data[i]) < 1e-6) {
            #if 0
                cout << "Correct output at index " << i << ": " << output_data_host[i] << ", " 
                    << output_data[i] << endl;
            #endif
            }
            else {
                success = false;
                cerr << "Incorrect output at index " << i << ": " <<
                    output_data_host[i] << ", "  << output_data[i] << endl;
            }
        }

        if (success)
            cout << endl << "Successful output" << endl;

        float cpu_time_milliseconds;
        cudaEventElapsedTime(&cpu_time_milliseconds, start_cpu, stop_cpu);

        // The events are created afresh for every channel; release them so
        // they do not accumulate.
        cudaEventDestroy(start_cpu);
        cudaEventDestroy(stop_cpu);

        cout << endl;
        cout << "CPU time: " << cpu_time_milliseconds << " milliseconds" << endl;
        cout << "GPU time: " << gpu_time_milliseconds << " milliseconds" << endl;
        cout << endl << "Speedup factor: " <<
            cpu_time_milliseconds / gpu_time_milliseconds << endl << endl;

        // Write output audio data to multichannel array
    #if AUDIO_ON
        for (int i = 0; i < n_frames; i++){
            all_channel_output[i * n_channels + ch] = output_data[i];
        }
    #endif


    }

    // Free memory on host
    free(input_data);
    free(output_data);
    free(output_data_host);
    free(blur_v);


    // Write audio output to file
#if AUDIO_ON
    out_file_info = in_file_info;
    out_file = sf_open(argv[4], SFM_WRITE, &out_file_info);
    if (!out_file) {
        cerr << "Cannot open output file, exiting\n";
        exit(EXIT_FAILURE);
    }

    sf_write_float(out_file, all_channel_output, amt_read); 
    sf_close(in_file);
    sf_close(out_file);

    // Release the interleaved multichannel buffers.
    delete[] all_channel_input;
    delete[] all_channel_output;

#endif

    return EXIT_SUCCESS;
}

// Program entry point: runs the blur test with a fixed launch configuration
// (512 threads per block, at most 200 blocks) and returns its status as the
// process exit code. Command-line arguments are not used.
int main(int argc, char **argv) {
    const unsigned int threads_per_block = 512;
    const unsigned int block_cap = 200;

    const int status = large_gauss_test(threads_per_block, block_cap);
    return status;
}

Normalized by factor of: 0.964579
gaussian[0] = 0.0111947
gaussian[1] = 0.0163699
gaussian[2] = 0.0229988
gaussian[3] = 0.0310452
gaussian[4] = 0.0402634
gaussian[5] = 0.0501713
gaussian[6] = 0.0600659
gaussian[7] = 0.0690923
gaussian[8] = 0.0763588
gaussian[9] = 0.0810805
gaussian[10] = 0.0827185
gaussian[11] = 0.0810805
gaussian[12] = 0.0763588
gaussian[13] = 0.0690923
gaussian[14] = 0.0600659
gaussian[15] = 0.0501713
gaussian[16] = 0.0402634
gaussian[17] = 0.0310452
gaussian[18] = 0.0229988
gaussian[19] = 0.0163699
gaussian[20] = 0.0111947
CPU blurring...
GPU blurring...
No kernel error detected
nframes: 10000000nblocks: 200Comparing...

Successful output

CPU time: 727.829 milliseconds
GPU time: 39.0294 milliseconds

Speedup factor: 18.6482


