In [None]:
# Remove any CUDA / NVIDIA packages already present on the machine.
# NOTE(review): none of the apt commands below pass -y; in a non-interactive
# notebook they may stop at the confirmation prompt - confirm this is intended.
!apt-get --purge remove cuda nvidia* libnvidia-*
!dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
!apt-get remove cuda-*
!apt autoremove
!apt-get update

# Download the CUDA 9.2 local repository package, register it and its key,
# then install the toolkit from it.
!wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
!apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
!apt-get update
!apt-get install cuda-9.2

# Show which nvcc is now on the PATH.
!nvcc --version

# Install the %%cu cell-magic plugin. Fetched over https because GitHub no
# longer serves the unauthenticated git:// protocol.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

# Register the %%cu magic used by the cells below.
%load_ext nvcc_plugin

In [3]:
%%cu
#include <iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

// # gpu addition
// Element-wise vector sum on the device: result[i] = a[i] + b[i] for i in [0, n).
// Each thread handles the single element at its flat 1D global index, so the
// launch must supply at least n threads in total (1D grid of 1D blocks).
// a, b and result are dereferenced on the device and must each hold n ints.
__global__
void vectorAdd(int *a, int *b, int *result, int n) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // strict bound: valid indices are 0..n-1, so tid == n must not touch memory
    if(tid < n) {
        result[tid] = a[tid] + b[tid];
    }
}

// # cpu addition
// Host reference implementation: stores a[i] + b[i] into result[i] for the
// first n elements, visiting the indices in increasing order.
void vectorAddCpu(int *a, int *b, int *result, int n) {
    int idx = 0;
    while (idx < n) {
        result[idx] = a[idx] + b[idx];
        ++idx;
    }
}

// Writes the first N entries of a to stdout on a single line, each entry
// preceded by two spaces, then terminates the line with endl (which flushes).
void print_array(int *a, int N) {
    int pos = 0;
    while (pos < N) {
        std::cout << "  " << a[pos];
        ++pos;
    }
    std::cout << std::endl;
}

// Fills a[0..N-1] with values drawn from rand(), each mapped into 1..10.
// Exactly one rand() call is made per element, in index order.
void init_array(int *a, int N) {
    for (int filled = 0; filled < N; ++filled) {
        const int value = 1 + rand() % 10;
        a[filled] = value;
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true if status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Adds two random 1000-element vectors on the GPU and again on the CPU,
// printing the inputs, both results and both timings (in microseconds).
// Returns 0 on success, 1 if any CUDA runtime call failed.
int main() {
    int *a, *b, *c;     // # host vectors
    int *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;   // # device vectors
    int n = 1000;           //   # size of vector

    // # allocate space
    // new[] (rather than malloc) so the delete[] calls at the end are a matched pair
    a = new int[n];
    b = new int[n];
    c = new int[n];

    size_t size = n * sizeof(int);

    // # allocate space for device vectors
    bool ok = cudaOk(cudaMalloc(&a_dev, size), "cudaMalloc a_dev")
           && cudaOk(cudaMalloc(&b_dev, size), "cudaMalloc b_dev")
           && cudaOk(cudaMalloc(&c_dev, size), "cudaMalloc c_dev");

    // # initialize array a, b
    init_array(a, n);
    init_array(b, n);

    // # print array a, b
    print_array(a, n);
    print_array(b, n);

    // # copy a, b into a_dev, b_dev (from host to device)
    ok = ok
      && cudaOk(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // enough 256-thread blocks to cover all n elements (ceiling division)
        const int threads = 256;
        const int blocks = (n + threads - 1) / threads;

        // # addition of 2 vectors - gpu
        auto gpu_start = high_resolution_clock::now();
        vectorAdd<<<blocks, threads>>>(a_dev, b_dev, c_dev, n);
        // kernel launches are asynchronous: wait for completion before
        // stopping the clock, otherwise only the launch overhead is measured
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch")
          && cudaOk(cudaDeviceSynchronize(), "vectorAdd execution");
        auto gpu_stop = high_resolution_clock::now();
        auto gpu_duration = duration_cast<microseconds>(gpu_stop - gpu_start);

        // # store result in c
        ok = ok && cudaOk(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost),
                          "copy result to host");

        if (ok) {
            // # display vector c
            cout<<"GPU Results : "<<endl;
            print_array(c, n);

            // duration is in microseconds, so label it "us"
            cout << "GPU Time Taken: " << gpu_duration.count() << "us" << endl;
        }
    }

    // # cpu calculations
    auto cpu_start = high_resolution_clock::now();
    vectorAddCpu(a, b, c, n);
    auto cpu_stop = high_resolution_clock::now();
    auto cpu_duration = duration_cast<microseconds>(cpu_stop - cpu_start);

    cout<<"CPU Results : "<<endl;
    print_array(c, n);

    cout << "CPU Time Taken: " << cpu_duration.count() << "us" << endl;

    // # free space
    // cudaFree accepts a null pointer, so this is safe after a failed allocation
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

  4  7  8  6  4  6  7  3  10  2  3  8  1  10  4  7  1  7  3  7  2  9  8  10  3  1  3  4  8  6  10  3  3  9  10  8  4  7  2  3  10  4  2  10  5  8  9  5  6  1  4  7  2  1  7  4  3  1  7  2  6  6  5  8  7  6  7  10  4  8  5  6  3  6  5  8  5  5  4  1  8  9  7  9  9  5  4  2  5  10  3  1  7  9  10  3  7  7  5  10  6  1  5  9  8  2  8  3  8  3  3  7  2  1  7  2  6  10  5  10  1  10  2  8  8  2  2  6  10  8  8  7  8  4  7  6  7  4  10  5  9  2  3  10  4  10  1  9  9  6  1  10  7  4  9  6  7  2  2  6  10  9  5  9  2  1  4  1  5  5  5  5  8  7  4  2  8  6  10  7  3  2  8  9  6  8  5  2  9  6  10  8  6  4  9  9  4  2  9  10  7  5  4  4  4  9  7  1  5  9  9  9  10  8  8  7  5  4  1  4  1  10  3  6  5  1  6  10  5  7  10  3  3  5  8  8  6  5  9  2  3  9  10  4  7  9  1  3  2  1  6  2  2  1  9  6  1  7  5  7  3  6  9  7  3  9  5  8  3  5  1  7  3  10  10  1  9  2  4  2  2  1  4  5  1  4  10  2  10  7  10  4  4  9  1  6  7  7  5  1  1  5  7  3  7  8  6  7  10  9  8  3  9  3  10  10  7  1  3  8  7 

In [5]:
%%cu
#include<iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

// # multiply matrix and vector - gpu
// Computes result = vec * mat on the device, where vec has n entries and mat
// is an n x m row-major matrix: result[j] = sum over i of vec[i] * mat[i*m + j].
// One thread per output column j (flat 1D global index); the launch must
// supply at least m threads in total. result must hold m ints.
__global__
void matrixVector(int *vec, int *mat, int *result, int n, int m) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // tid selects a column, so it is bounded by the column count m (not by n);
    // threads beyond the last column must not read mat or write result
    if(tid < m) {
        int sum = 0;
        for(int i=0; i<n; i++) {
            sum += vec[i]*mat[(i*m) + tid];
        }
        result[tid] = sum;
    }
}

// # multiply matrix and vector - cpu
// Host reference for the vector-matrix product: for every column of the
// n x m row-major matrix mat, accumulates vec[row] * mat[row][col] over all
// rows and stores the total in result[col].
void matrixVectorCpu(int *vec, int *mat, int *result, int n, int m) {
    int col = 0;
    while (col < m) {
        int acc = 0;
        int row = 0;
        while (row < n) {
            acc += vec[row] * mat[row*m + col];
            ++row;
        }
        result[col] = acc;
        ++col;
    }
}


// Fills a[0..n-1] with values drawn from rand(), each mapped into 1..n.
// Exactly one rand() call is made per element, in index order.
void init_array(int *a, int n) {
    int pos = 0;
    while (pos < n) {
        a[pos] = 1 + rand() % n;
        ++pos;
    }
}

// Fills the n x m row-major matrix a with values drawn from rand(), each
// mapped into 1..n (the modulus is the row count). Cells are filled row by
// row, left to right, with one rand() call per cell.
void init_matrix(int *a, int n, int m) {
    for (int row = 0; row < n; ++row) {
        const int row_start = row * m;
        for (int col = 0; col < m; ++col) {
            a[row_start + col] = 1 + rand() % n;
        }
    }
}

// Writes the first n entries of a to stdout on a single line, each entry
// preceded by two spaces, then terminates the line with endl (which flushes).
void print_array(int *a, int n) {
    int pos = 0;
    while (pos < n) {
        std::cout << "  " << a[pos];
        ++pos;
    }
    std::cout << std::endl;
}

// Prints the n x m row-major matrix a to stdout, one row per line; every
// value is preceded by two spaces and each row is ended with endl.
void print_matrix(int *a, int n, int m) {
    for (int row = 0; row < n; ++row) {
        const int row_start = row * m;
        for (int col = 0; col < m; ++col) {
            std::cout << "  " << a[row_start + col];
        }
        std::cout << std::endl;
    }
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true if status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Multiplies a random 1000-element vector by a random 1000 x 100 matrix on
// the GPU and again on the CPU, printing both results and both timings
// (in microseconds). Returns 0 on success, 1 if any CUDA runtime call failed.
int main() {
    int *a, *b, *c;   // # host variables
    int *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;   // # device variables

    int n = 1000;
    int m = 100;

    a = new int[n];
    b = new int[n*m];
    // trailing () value-initialises all m results to 0; sizeof on the pointer
    // would only cover the pointer itself, not the array
    c = new int[m]();

    // # initialize array and matrix
    init_array(a, n);
    init_matrix(b, n, m);

    const size_t vec_bytes = sizeof(int)*n;
    const size_t mat_bytes = sizeof(int)*n*m;
    const size_t res_bytes = sizeof(int)*m;

    // # allocate space for device
    bool ok = cudaOk(cudaMalloc(&a_dev, vec_bytes), "cudaMalloc a_dev")
           && cudaOk(cudaMalloc(&b_dev, mat_bytes), "cudaMalloc b_dev")
           && cudaOk(cudaMalloc(&c_dev, res_bytes), "cudaMalloc c_dev");

    // # copy vector and matrix (from host to device)
    ok = ok
      && cudaOk(cudaMemcpy(a_dev, a, vec_bytes, cudaMemcpyHostToDevice), "copy vector to device")
      && cudaOk(cudaMemcpy(b_dev, b, mat_bytes, cudaMemcpyHostToDevice), "copy matrix to device");

    if (ok) {
        // one thread per output column: enough 256-thread blocks to cover m
        const int threads = 256;
        const int blocks = (m + threads - 1) / threads;

        // # multiply vector and matrix - gpu
        auto gpu_start = high_resolution_clock::now();
        matrixVector<<<blocks, threads>>>(a_dev, b_dev, c_dev, n, m);
        // kernel launches are asynchronous: wait for completion before
        // stopping the clock, otherwise only the launch overhead is measured
        ok = cudaOk(cudaGetLastError(), "matrixVector launch")
          && cudaOk(cudaDeviceSynchronize(), "matrixVector execution");
        auto gpu_stop = high_resolution_clock::now();
        auto gpu_duration = duration_cast<microseconds>(gpu_stop - gpu_start);

        // # copy result in c
        ok = ok && cudaOk(cudaMemcpy(c, c_dev, res_bytes, cudaMemcpyDeviceToHost),
                          "copy result to host");

        if (ok) {
            // # display resultant vector
            cout<<"Results : ";
            print_array(c, m);
            // duration is in microseconds, so label it "us"
            cout << "GPU Time Taken: " << gpu_duration.count() << "us" << endl;
        }
    }

    // # cpu calculations
    auto cpu_start = high_resolution_clock::now();
    matrixVectorCpu(a, b, c, n, m);
    auto cpu_stop = high_resolution_clock::now();
    auto cpu_duration = duration_cast<microseconds>(cpu_stop - cpu_start);

    cout<<"Results : ";
    print_array(c, m);
    cout << "CPU Time Taken: " << cpu_duration.count() << "us" << endl;

    // # free space
    // cudaFree accepts a null pointer, so this is safe after a failed allocation
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

Results :   257749280  251306881  249622518  249179775  243393237  253630484  246770864  244834826  249894143  249914005  242578666  248488753  241468047  250467247  248966870  251458711  251279543  259169373  260851125  241413129  248803927  252958306  251882019  247460470  254778440  246926372  252475005  255596076  249022149  252176778  247761738  252195534  255024786  253577878  252875729  255027220  254529012  245455899  247590340  240480392  245830055  251471079  246170504  253634927  248446974  252055770  242557841  248983999  249071619  244537158  247577737  248996028  259722387  256804914  245249819  247094247  246384697  255191103  250553228  244025865  248288857  251259785  239730874  255791683  249691300  255898895  249567357  255028858  248358764  245142887  245984387  251155972  247386540  247169846  253620512  250264051  249361895  249333812  250783270  246709766  252669472  242625042  241412188  249125586  255828038  254813957  254079861  253770653  257526342  258197058

In [8]:
%%cu
#include<iostream>
#include <chrono>

using namespace std;
using namespace std::chrono;

// # multiply matrix - gpu
// Device kernel for c = a * b with row-major a (m x n), b (n x k), c (m x k).
// Each thread of the 2D launch owns one output cell: its global y index is
// the row and its global x index the column. Threads whose cell lies outside
// the m x k result return without touching memory.
__global__
void matrixMultiplication(int *a, int *b, int *c, int m, int n, int k) {
    const int col = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = blockIdx.y*blockDim.y + threadIdx.y;

    if (row >= m || col >= k) {
        return;
    }

    // dot product of row `row` of a with column `col` of b
    int acc = 0;
    for (int inner = 0; inner < n; ++inner) {
        acc += a[row*n + inner] * b[inner*k + col];
    }
    c[row*k + col] = acc;
}

// # matrix multiplication - cpu
// Host reference for c = a * b with row-major a (m x n), b (n x k), c (m x k).
// Every output cell is first reset to 0 and then accumulated in place.
void matrixMultiplicationCpu(int *a, int *b, int *c, int m, int n, int k) {
    for (int row = 0; row < m; ++row) {
        for (int col = 0; col < k; ++col) {
            int &cell = c[row*k + col];
            cell = 0;
            for (int inner = 0; inner < n; ++inner) {
                cell += a[row*n + inner] * b[inner*k + col];
            }
        }
    }
}

// Sets every cell of the m x k row-major matrix a to 0, row by row.
void init_result(int *a, int m, int k) {
    for (int row = 0; row < m; ++row) {
        const int row_start = row * k;
        int col = 0;
        while (col < k) {
            a[row_start + col] = 0;
            ++col;
        }
    }
}

// Fills the n x m row-major matrix a with values drawn from rand(), each
// mapped into 1..10. Cells are filled row by row, left to right, with one
// rand() call per cell.
void init_matrix(int *a, int n, int m) {
    for (int row = 0; row < n; ++row) {
        const int row_start = row * m;
        for (int col = 0; col < m; ++col) {
            a[row_start + col] = 1 + rand() % 10;
        }
    }
}

// Prints the n x m row-major matrix a to stdout, one row per line with every
// value preceded by two spaces, followed by one extra blank line at the end.
void print_matrix(int *a, int n, int m) {
    for (int row = 0; row < n; ++row) {
        const int row_start = row * m;
        int col = 0;
        while (col < m) {
            std::cout << "  " << a[row_start + col];
            ++col;
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// Returns true if status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Multiplies two random 32 x 32 matrices on the GPU and again on the CPU,
// printing both results and both timings (in microseconds).
// Returns 0 on success, 1 if any CUDA runtime call failed.
int main() {
    int *a, *b, *c;   // # host matrices
    int *a_dev = nullptr, *b_dev = nullptr, *c_dev = nullptr;   // # device matrices
    int m = 32, n = 32, k = 32;

    a = new int[m*n];
    b = new int[n*k];
    c = new int[m*k];

    // # initialize matrices
    init_matrix(a, m, n);
    init_matrix(b, n ,k);
    init_result(c, m, k);

    const size_t a_bytes = sizeof(int)*m*n;
    const size_t b_bytes = sizeof(int)*n*k;
    const size_t c_bytes = sizeof(int)*m*k;

    // # allocate space
    bool ok = cudaOk(cudaMalloc(&a_dev, a_bytes), "cudaMalloc a_dev")
           && cudaOk(cudaMalloc(&b_dev, b_bytes), "cudaMalloc b_dev")
           && cudaOk(cudaMalloc(&c_dev, c_bytes), "cudaMalloc c_dev");

    // # copy a, b, into a_dev, b_dev (from host to device)
    ok = ok
      && cudaOk(cudaMemcpy(a_dev, a, a_bytes, cudaMemcpyHostToDevice), "copy a to device")
      && cudaOk(cudaMemcpy(b_dev, b, b_bytes, cudaMemcpyHostToDevice), "copy b to device");

    if (ok) {
        // # block with 16x16 = 256 threads
        dim3 dimBlock(16,16);

        // # grid sized by ceiling division so the blocks cover the whole
        // # m x k result: x spans the k columns, y spans the m rows
        dim3 dimGrid((k + dimBlock.x - 1) / dimBlock.x,
                     (m + dimBlock.y - 1) / dimBlock.y);

        // # multiply matrices - gpu
        auto gpu_start = high_resolution_clock::now();
        matrixMultiplication<<<dimGrid, dimBlock>>>(a_dev,b_dev,c_dev, m, n, k);
        // kernel launches are asynchronous: wait for completion before
        // stopping the clock, otherwise only the launch overhead is measured
        ok = cudaOk(cudaGetLastError(), "matrixMultiplication launch")
          && cudaOk(cudaDeviceSynchronize(), "matrixMultiplication execution");
        auto gpu_stop = high_resolution_clock::now();
        auto gpu_duration = duration_cast<microseconds>(gpu_stop - gpu_start);

        // # copy result (from device to host)
        ok = ok && cudaOk(cudaMemcpy(c, c_dev, c_bytes, cudaMemcpyDeviceToHost),
                          "copy result to host");

        if (ok) {
            // # display result
            cout<<"Result : \n";
            print_matrix(c, m, k);

            // duration is in microseconds, so label it "us"
            cout << "GPU Time Taken: " << gpu_duration.count() << "us" << endl;
        }
    }

    // # cpu calculations
    auto cpu_start = high_resolution_clock::now();
    matrixMultiplicationCpu(a, b, c, m, n, k);
    auto cpu_stop = high_resolution_clock::now();
    auto cpu_duration = duration_cast<microseconds>(cpu_stop - cpu_start);

    cout<<"Result : \n";
    print_matrix(c, m, k);

    cout << "CPU Time Taken: " << cpu_duration.count() << "us" << endl;

    // # free space
    // cudaFree accepts a null pointer, so this is safe after a failed allocation
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    delete[] a;
    delete[] b;
    delete[] c;

    return ok ? 0 : 1;
}

Result : 
  1046  888  1071  909  999  984  1025  1061  1018  894  738  1007  1052  789  996  890  817  993  966  971  1162  1020  986  972  663  806  896  797  1172  1048  942  919
  976  869  973  956  1034  1038  1057  955  1016  876  691  933  1002  787  984  890  787  958  909  830  1093  948  902  913  708  796  844  819  1067  1077  932  850
  1159  1024  1011  1019  1117  1062  1167  1200  1119  958  832  1011  1147  807  1060  985  963  1108  1003  1024  1279  1123  997  1002  754  870  1021  989  1183  1225  988  1018
  1011  854  1021  974  930  1090  1112  1009  981  782  719  1100  1020  818  1014  903  812  1056  849  936  1208  887  978  996  769  808  931  754  1130  969  856  887
  1226  979  1095  1015  1104  1215  1221  1103  1174  1015  800  1145  1223  880  1105  1109  1021  1140  1034  1112  1312  1027  1203  1137  920  839  957  861  1275  1118  1026  945
  1136  1017  1022  1062  1033  1107  1139  1111  1121  934  844  1047  1159  828  1019  908  929  1103  1014