In [None]:
# Remove any CUDA / NVIDIA packages already present so the toolkit installed below is the only one.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

# Download the CUDA 9.2 local repository package, register it and its signing key, then install the toolkit.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

# Confirm which compiler version is now on the PATH.
!nvcc --version

# Install the notebook plugin over HTTPS: GitHub no longer serves the
# unauthenticated git:// protocol, so a git+git:// URL fails to clone.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

# Load the plugin extension (the CUDA cell below relies on its %%cu cell magic).
%load_ext nvcc_plugin

In [None]:
%%cu
#include <iostream>
#include <stdio.h>
#include <cuda.h>
#include <math.h>
#include <chrono>
#include <bits/stdc++.h> 

using namespace std;
using namespace std::chrono;

// # calculate maximum
// In-place pairwise reduction: on return input[0] holds the largest of the
// first 2 * blockDim.x elements of `input`; other elements in that range may
// be overwritten with partial results.
//
// Launch layout: only threadIdx.x is used, so the kernel must be launched as
// a single block. The kernel receives no length argument, therefore the
// reduction is confined to the 2 * blockDim.x elements paired up by the first
// pass; `input` must hold at least that many elements, and anything at index
// 2 * blockDim.x or beyond is never read.
__global__ 
void maximum(int *input) {
    const int tid = threadIdx.x;
    const int limit = 2 * blockDim.x;

    // After the pass with a given step_size, every index that is a multiple
    // of 2 * step_size holds the maximum of its 2 * step_size-wide window.
    for(int step_size = 1; step_size < limit; step_size <<= 1) {
        const int first = tid * step_size * 2;
        const int second = first + step_size;
        // The guard keeps the partner index inside the reduced range; threads
        // whose pair falls outside simply idle for this pass.
        if(second < limit && input[second] > input[first])
            input[first] = input[second];
        // Every thread reaches this barrier (the loop bounds do not depend on
        // tid), so the next pass only reads fully written partial results.
        __syncthreads();
    }
}

// # calculate minimum
// In-place pairwise reduction: on return input[0] holds the smallest of the
// first n elements of `input`; other elements may be overwritten with partial
// results. Nothing is read or written when n <= 1.
//
// Launch layout: only threadIdx.x is used, so the kernel must be launched as
// a single block. Any block size works: each thread walks over the pairs of a
// pass with a stride of blockDim.x pairs, so no element is skipped when n is
// odd or larger than 2 * blockDim.x.
__global__ 
void minimum(int *input, int n) {
    const int tid = threadIdx.x;

    // 64-bit indices so that doubling the step cannot overflow for large n.
    // After the pass with a given step, every index that is a multiple of
    // 2 * step holds the minimum of its 2 * step-wide window.
    for(long long step = 1; step < n; step <<= 1) {
        const long long stride = 2 * step * blockDim.x;
        for(long long first = 2 * step * tid; first + step < n; first += stride) {
            const long long second = first + step;
            if(input[second] < input[first])
                input[first] = input[second];
        }
        // Every thread reaches this barrier (the outer loop depends only on
        // n), so the next pass only reads fully written partial results.
        __syncthreads();
    }
}

// # calculate sum
// In-place pairwise reduction: on return input[0] holds the sum of the first
// 2 * blockDim.x elements of `input`; other elements in that range are
// overwritten with partial sums.
//
// Launch layout: only threadIdx.x is used, so the kernel must be launched as
// a single block. The kernel receives no length argument, therefore the
// reduction is confined to the 2 * blockDim.x elements paired up by the first
// pass; `input` must hold at least that many elements, and anything at index
// 2 * blockDim.x or beyond is never read.
__global__ 
void gpu_sum(int *input) {
    const int tid = threadIdx.x;
    const int limit = 2 * blockDim.x;

    // After the pass with a given step_size, every index that is a multiple
    // of 2 * step_size holds the sum of its 2 * step_size-wide window, so the
    // last pass leaves the complete total in input[0].
    for(int step_size = 1; step_size < limit; step_size <<= 1) {
        const int first = tid * step_size * 2;
        const int second = first + step_size;
        // The guard keeps the partner index inside the reduced range; threads
        // whose pair falls outside simply idle for this pass.
        if(second < limit)
            input[first] += input[second];
        // Every thread reaches this barrier (the loop bounds do not depend on
        // tid), so the next pass only reads fully written partial sums.
        __syncthreads();
    }
}

// # calculate square of mean difference
// Each thread replaces its own element, selected by threadIdx.x, with the
// square of that element's difference from `mean`. There is no length
// argument, so `input` must hold at least blockDim.x elements.
__global__ 
void mean_diff_sq(float *input, float mean) {
    const unsigned int idx = threadIdx.x;
    const float deviation = input[idx] - mean;
    input[idx] = deviation * deviation;
}

// # convert array from int to float
// Host helper: writes the float conversion of src[k] into dest[k] for every
// k in [0, size). Does nothing when size <= 0.
void copy_int_to_float(float *dest, int *src, int size){
    int idx = 0;
    while(idx < size) {
        dest[idx] = static_cast<float>(src[idx]);
        ++idx;
    }
}

// # calculate standard deviation (summation step)
// In-place pairwise reduction: on return input[0] holds the sum of the first
// 2 * blockDim.x elements of `input`; other elements in that range are
// overwritten with partial sums. The kernel only adds values; the division by
// the element count and the square root are left to the caller.
//
// Launch layout: only threadIdx.x is used, so the kernel must be launched as
// a single block. The kernel receives no length argument, therefore the
// reduction is confined to the 2 * blockDim.x elements paired up by the first
// pass; `input` must hold at least that many elements, and anything at index
// 2 * blockDim.x or beyond is never read.
__global__ 
void gpu_sd(float *input) {
    const int tid = threadIdx.x;
    const int limit = 2 * blockDim.x;

    // After the pass with a given step_size, every index that is a multiple
    // of 2 * step_size holds the sum of its 2 * step_size-wide window, so the
    // last pass leaves the complete total in input[0].
    for(int step_size = 1; step_size < limit; step_size <<= 1) {
        const int first = tid * step_size * 2;
        const int second = first + step_size;
        // The guard keeps the partner index inside the reduced range; threads
        // whose pair falls outside simply idle for this pass.
        if(second < limit)
            input[first] += input[second];
        // Every thread reaches this barrier (the loop bounds do not depend on
        // tid), so the next pass only reads fully written partial sums.
        __syncthreads();
    }
}

// Host reference: returns the sum of input[0..n-1] accumulated in a long;
// returns 0 when n <= 0.
long cpu_sum(int *input, int n) {
    long total = 0;
    int idx = 0;
    while(idx < n) {
        total += input[idx];
        ++idx;
    }
    return total;
}

// Host reference: returns the smallest of arr[0..n-1]. arr[0] is read
// unconditionally, so the array must contain at least one element.
long cpu_min(int *arr, int n) {
    int smallest = arr[0];
    int idx = 1;
    while(idx < n) {
        smallest = (arr[idx] < smallest) ? arr[idx] : smallest;
        ++idx;
    }
    return smallest;
}

// Host reference: returns the largest of arr[0..n-1]. arr[0] is read
// unconditionally, so the array must contain at least one element.
long cpu_max(int *arr, int n) {
    int largest = arr[0];
    int idx = 1;
    while(idx < n) {
        largest = (arr[idx] > largest) ? arr[idx] : largest;
        ++idx;
    }
    return largest;
}

// Host reference: returns the population standard deviation of arr[0..n-1]
// around the supplied `mean`, i.e. sqrt(sum((arr[i] - mean)^2) / n).
// The squared deviations are accumulated directly instead of being staged in
// a heap buffer, which the previous version allocated and never released.
double cpu_sd(int *arr, int n, float mean) {
    double total = 0;
    for(int i = 0 ; i < n ; i++) {
        // Each term is rounded to float before being added, matching the
        // precision of the float scratch array used previously.
        float sq_diff = pow(((float)arr[i] - mean),2);
        total += sq_diff;
    }
    total = total / n;
    return sqrt(total);
}

// Fills arr[0..n-1] with values drawn from rand() reduced modulo 1000, one
// rand() call per element in index order. Does nothing when n <= 0.
void random_init(int *arr, int n) {
    int idx = 0;
    while(idx < n) {
        arr[idx] = rand() % 1000;
        ++idx;
    }
}

// Generates a random integer array, computes sum, mean, standard deviation,
// minimum and maximum with the single-block GPU kernels, and prints each
// result next to the CPU reference value. Returns 0 on success; exits with
// EXIT_FAILURE as soon as any CUDA runtime call or kernel launch fails.
int main() {
    // Report a failed CUDA call and stop: later results would otherwise be
    // computed from buffers that were never allocated or filled.
    auto check = [](cudaError_t status, const char *what) {
        if(status != cudaSuccess) {
            cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<endl;
            exit(EXIT_FAILURE);
        }
    };

    int *d = nullptr;   // # array for device
    int n = 500;   // the launches below use n/2 and n threads in one block
    int *arr = new int[n];
    int result;   // # display result on host side
    int size = n * sizeof(int);
    random_init(arr, n);   // # generate random array elements

    // # display array
    cout<<"Input Array: [";
    for(int i = 0 ; i < n ; i++) {
        cout<<arr[i]<<", ";
    }
    cout<<"]"<<endl;

    check(cudaMalloc((void **)&d, size), "cudaMalloc d");   // # allocate space for device array
    check(cudaMemcpy(d, arr, size, cudaMemcpyHostToDevice), "copy arr to d");    // # copy arr into d (from host to device)

    gpu_sum<<<1, n/2>>>(d);    // # calculate sum of array elements
    check(cudaGetLastError(), "gpu_sum launch");

    check(cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost), "copy sum to host");   // # copy d into result (from device to host)
    cout<<"======================================="<<endl;

    // # display sum and mean
    cout<<"GPU Sum is: "<<result<<endl;
    float mean = (double)result/n;
    cout<<"GPU Mean is: "<<mean<<endl;

    float *arr_float = new float[n];    // # float array
    float *arr_std = nullptr;    // # device array of squared deviations
    float sd_value;    // # host-side result (GPU sum of squares, then CPU standard deviation)

    check(cudaMalloc((void **)&arr_std, n*sizeof(float)), "cudaMalloc arr_std");  // # allocate space for arr_std
    copy_int_to_float(arr_float, arr, n);   // # convert array to float
    check(cudaMemcpy(arr_std, arr_float, n*sizeof(float), cudaMemcpyHostToDevice), "copy arr_float to arr_std");  // # copy arr_float into arr_std

    mean_diff_sq<<<1, n>>>(arr_std, mean);    // # calculate mean difference square
    check(cudaGetLastError(), "mean_diff_sq launch");
    // Wait here so the timer below covers gpu_sd only.
    check(cudaDeviceSynchronize(), "mean_diff_sq");

    auto start = high_resolution_clock::now();
    gpu_sd<<<1, n/2>>>(arr_std);    // # calculate sum
    check(cudaGetLastError(), "gpu_sd launch");
    // Kernel launches return immediately; without this wait the timer would
    // measure only the launch overhead, not the kernel itself.
    check(cudaDeviceSynchronize(), "gpu_sd");
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<microseconds>(stop - start);

    // # display standard deviation (the duration is in microseconds)
    check(cudaMemcpy(&sd_value, arr_std, sizeof(float), cudaMemcpyDeviceToHost), "copy sum of squares to host");
    cout<<"GPU Standard Deviation: "<<sqrt(sd_value/n)<<", Time taken: " << duration.count() << "us" << endl;
    cout<<"======================================="<<endl;

    // # cpu calculations
    result = cpu_sum(arr,n);
    cout<<"CPU Sum is: "<<result<< endl;

    mean = (float)result/n;
    cout<<"CPU Mean is: "<<mean<<endl;

    start = high_resolution_clock::now();
    sd_value = cpu_sd(arr, n, mean);
    stop = high_resolution_clock::now();
    duration = duration_cast<microseconds>(stop - start);
    cout<<"CPU Standard Deviation: "<<sd_value<<", Time taken: " << duration.count() << "us" << endl;
    cout<<"======================================="<<endl;

    // # calculate minimum (the reductions work in place, so reload the input first)
    check(cudaMemcpy(d,arr,size,cudaMemcpyHostToDevice), "reload d for minimum");
    minimum<<<1,n/2>>>(d,n);
    check(cudaGetLastError(), "minimum launch");
    check(cudaMemcpy(&result,d,sizeof(int),cudaMemcpyDeviceToHost), "copy minimum to host");
    cout<<"GPU Min is: "<<result<<endl;

    // # calculate minimum cpu
    result = cpu_min(arr,n);
    cout<<"CPU Min is: "<<result<<endl;
    cout<<"======================================="<<endl;

    // # calculate maximum
    check(cudaMemcpy(d,arr,size,cudaMemcpyHostToDevice), "reload d for maximum");
    maximum<<<1,n/2>>>(d);
    check(cudaGetLastError(), "maximum launch");
    check(cudaMemcpy(&result,d,sizeof(int),cudaMemcpyDeviceToHost), "copy maximum to host");
    cout<<"GPU Max is: "<<result<<endl;

    // # calculate maximum cpu
    result = cpu_max(arr,n);
    cout<<"CPU Max is: "<<result<<"\n";
    cout<<"======================================="<<endl;

    // # release device and host buffers
    check(cudaFree(d), "cudaFree d");
    check(cudaFree(arr_std), "cudaFree arr_std");
    delete[] arr_float;
    delete[] arr;
    return 0;
}

Input Array: [383, 886, 777, 915, 793, 335, 386, 492, 649, 421, 362, 27, 690, 59, 763, 926, 540, 426, 172, 736, 211, 368, 567, 429, 782, 530, 862, 123, 67, 135, 929, 802, 22, 58, 69, 167, 393, 456, 11, 42, 229, 373, 421, 919, 784, 537, 198, 324, 315, 370, 413, 526, 91, 980, 956, 873, 862, 170, 996, 281, 305, 925, 84, 327, 336, 505, 846, 729, 313, 857, 124, 895, 582, 545, 814, 367, 434, 364, 43, 750, 87, 808, 276, 178, 788, 584, 403, 651, 754, 399, 932, 60, 676, 368, 739, 12, 226, 586, 94, 539, 795, 570, 434, 378, 467, 601, 97, 902, 317, 492, 652, 756, 301, 280, 286, 441, 865, 689, 444, 619, 440, 729, 31, 117, 97, 771, 481, 675, 709, 927, 567, 856, 497, 353, 586, 965, 306, 683, 219, 624, 528, 871, 732, 829, 503, 19, 270, 368, 708, 715, 340, 149, 796, 723, 618, 245, 846, 451, 921, 555, 379, 488, 764, 228, 841, 350, 193, 500, 34, 764, 124, 914, 987, 856, 743, 491, 227, 365, 859, 936, 432, 551, 437, 228, 275, 407, 474, 121, 858, 395, 29, 237, 235, 793, 818, 428, 143, 11, 928, 529, 776, 404