In [10]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Wed_Apr_11_23:16:29_CDT_2018
Cuda compilation tools, release 9.2, V9.2.88


In [11]:
code = """
#include<iostream>
#include<cstdlib>
using namespace std;

// Element-wise vector addition kernel: result[i] = a[i] + b[i] for 0 <= i < n.
// Each thread handles the single element at its global index
// (blockIdx.x*blockDim.x + threadIdx.x); threads whose index lies outside
// the n-element arrays do nothing.
//   a, b    input arrays of n ints
//   result  output array of n ints
//   n       number of elements in each array
__global__ void vectorAdd(int *a, int *b, int *result, int n) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    // Strict bound: valid indices are 0..n-1, so letting tid == n through
    // would read and write one element past the end of every buffer.
    if(tid < n) {
        result[tid] = a[tid] + b[tid];
    }
}

// Writes the first N elements of `a` to standard output on one line,
// each value preceded by two spaces, then ends the line and flushes.
void print_array(int *a, int N) {
    int idx = 0;
    while(idx < N) {
        std::cout << "  " << a[idx];
        ++idx;
    }
    std::cout << std::endl;
}

// Fills the first N elements of `a`, in ascending index order, with
// pseudo-random integers in the range 1..10 drawn from rand().
void init_array(int *a, int N) {
    int *cursor = a;
    for(int remaining = N; remaining > 0; --remaining) {
        const int value = rand() % 10 + 1;
        *cursor = value;
        ++cursor;
    }
}

// Stops the program with a diagnostic when a CUDA runtime call reports failure.
//   status  value returned by the CUDA runtime call
//   what    short label for the failed step, included in the message
static void checkCuda(cudaError_t status, const char *what) {
    if(status != cudaSuccess) {
        cerr<<"CUDA error ("<<what<<"): "<<cudaGetErrorString(status)<<endl;
        exit(EXIT_FAILURE);
    }
}

// Demo driver: fills two n-element host vectors with random values, adds
// them on the GPU with vectorAdd, and prints both inputs and the result.
// Returns 0 on success; exits with a failure status if a host allocation
// or any CUDA call fails.
int main() {
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 8;           // number of elements in each vector

    // One byte count shared by the host and device buffers.
    int size = n * sizeof(int);

    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    if(a == NULL || b == NULL || c == NULL) {
        cerr<<"Host allocation failed"<<endl;
        return EXIT_FAILURE;
    }

    checkCuda(cudaMalloc(&a_dev, size), "cudaMalloc a_dev");
    checkCuda(cudaMalloc(&b_dev, size), "cudaMalloc b_dev");
    checkCuda(cudaMalloc(&c_dev, size), "cudaMalloc c_dev");

    init_array(a, n);
    init_array(b, n);

    print_array(a, n);
    print_array(b, n);

    checkCuda(cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice), "copy b to device");

    // Round the block count up so that every element gets a thread even
    // when n is larger than, or not a multiple of, the block size.
    int threads = 1024;
    int blocks = (n + threads - 1) / threads;

    vectorAdd<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);
    // A kernel launch returns nothing itself; launch failures are picked up here.
    checkCuda(cudaGetLastError(), "vectorAdd launch");

    // This copy also reports any error raised while the kernel was running.
    checkCuda(cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost), "copy result to host");

    cout<<"Results : "<<endl;
    print_array(c, n);

    checkCuda(cudaFree(a_dev), "cudaFree a_dev");
    checkCuda(cudaFree(b_dev), "cudaFree b_dev");
    checkCuda(cudaFree(c_dev), "cudaFree c_dev");

    free(a);
    free(b);
    free(c);

    return 0;
}
"""

In [12]:
# Write the CUDA source held in `code` to ass2.cu so nvcc can compile it.
# The context manager closes the file even if the write raises.
with open("ass2.cu", "w") as text_file:
    text_file.write(code)

In [13]:
!nvcc ass2.cu

In [14]:
!./a.out

  4  7  8  6  4  6  7  3
  10  2  3  8  1  10  4  7
Results : 
  14  9  11  14  5  16  11  10


In [15]:
!nvprof ./a.out

==15363== NVPROF is profiling process 15363, command: ./a.out
  4  7  8  6  4  6  7  3
  10  2  3  8  1  10  4  7
Results : 
  14  9  11  14  5  16  11  10
==15363== Profiling application: ./a.out
==15363== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   42.44%  4.2240us         2  2.1120us  1.7600us  2.4640us  [CUDA memcpy HtoD]
                   33.76%  3.3600us         1  3.3600us  3.3600us  3.3600us  vectorAdd(int*, int*, int*, int)
                   23.79%  2.3680us         1  2.3680us  2.3680us  2.3680us  [CUDA memcpy DtoH]
      API calls:   99.24%  167.41ms         3  55.804ms  5.9250us  167.40ms  cudaMalloc
                    0.37%  620.27us         1  620.27us  620.27us  620.27us  cuDeviceTotalMem
                    0.18%  309.48us        96  3.2230us     130ns  135.84us  cuDeviceGetAttribute
                    0.08%  128.77us         3  42.924us  8.3680us  102.97us  cudaFree
                    0.06%