In [1]:
code = """
#include<iostream>
#include<cstdlib>
using namespace std;

__global__ void vectorAdd(int *a, int *b, int *result, int n) {
    int tid = blockIdx.x*blockDim.x + threadIdx.x;
    if(tid <= n) {
        result[tid] = a[tid] + b[tid];
    }
}

void print_array(int *a, int N) {
    for(int i=0; i<N; i++) {
        cout<<"  "<<a[i];
    }
    cout<<endl;
}

void init_array(int *a, int N) {
    for(int i=0; i<N; i++) {
        a[i] = rand()%10 + 1;
    }
}

int main() {
    int *a, *b, *c;
    int *a_dev, *b_dev, *c_dev;
    int n = 8;           //24
    
    a = (int*)malloc(n * sizeof(n));
    b = (int*)malloc(n * sizeof(n));
    c = (int*)malloc(n * sizeof(n));

    int size = n * sizeof(int);
    
    cudaMalloc(&a_dev, size);
    cudaMalloc(&b_dev, size);
    cudaMalloc(&c_dev, size);
    
    init_array(a, n);
    init_array(b, n);
    
    print_array(a, n);
    print_array(b, n);
        
    //cudaEvent_t start, end;
    //cudaEventCreate(&start);
    //cudaEventCreate(&end);
    
    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);
    
    //int threads = 1024;
    //int blocks = (n+threads-1)/threads;
    
    //cudaEventRecord(start);
    
    //vectorAdd<<<blocks,threads>>>(a_dev, b_dev, c_dev, n);
    vectorAdd<<<1,1024>>>(a_dev, b_dev, c_dev, n);
    
    //cudaEventRecord(end);
    
    //cudaDeviceSynchronize();
       
    //float time = 0.0;
    //cudaEventElapsedTime(&time, start, end);
    
    cudaMemcpy(c, c_dev, size, cudaMemcpyDeviceToHost);
    
    cout<<"Results : "<<endl;
    print_array(c, n);
 
    //cout<<"Time elapsed : "<<time<<endl;
        
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
        
    return 0;
}
"""

In [2]:

text_file = open("assign2.cu", "w")
text_file.write(code)
text_file.close()

In [3]:

!nvcc assign2.cu


In [4]:
!nvprof ./a.out

==158== NVPROF is profiling process 158, command: ./a.out
  4  7  8  6  4  6  7  3
  10  2  3  8  1  10  4  7
Results : 
  0  0  0  0  0  0  0  0
==158== Profiling application: ./a.out
==158== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.30%  3.8400us         2  1.9200us  1.5360us  2.3040us  [CUDA memcpy HtoD]
                   39.70%  2.5280us         1  2.5280us  2.5280us  2.5280us  [CUDA memcpy DtoH]
      API calls:   99.45%  273.66ms         3  91.221ms  3.1900us  273.66ms  cudaMalloc
                    0.23%  640.64us         1  640.64us  640.64us  640.64us  cuDeviceGetPCIBusId
                    0.16%  431.47us         1  431.47us  431.47us  431.47us  cuDeviceTotalMem
                    0.07%  198.46us       101  1.9640us     152ns  77.622us  cuDeviceGetAttribute
                    0.05%  150.63us         3  50.210us  6.4180us  130.98us  cudaFree
                    0.02%  67.437us         3  22.47