In [None]:
!ls /usr/local/cuda*

/usr/local/cuda:
bin		   EULA.txt  libnvvp	       nvvm-prev  src
compat		   extras    nsightee_plugins  README	  targets
compute-sanitizer  include   nvml	       samples	  tools
DOCS		   lib64     nvvm	       share	  version.json

/usr/local/cuda-11:
bin		   EULA.txt  libnvvp	       nvvm-prev  src
compat		   extras    nsightee_plugins  README	  targets
compute-sanitizer  include   nvml	       samples	  tools
DOCS		   lib64     nvvm	       share	  version.json

/usr/local/cuda-11.2:
bin		   EULA.txt  libnvvp	       nvvm-prev  src
compat		   extras    nsightee_plugins  README	  targets
compute-sanitizer  include   nvml	       samples	  tools
DOCS		   lib64     nvvm	       share	  version.json


In [None]:
!nvidia-smi

Mon Jan  2 10:54:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
%%writefile vectorAdd_v1.cu


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>
 
// CUDA kernel: element-wise vector sum, c[i] = a[i] + b[i] for i in [0, n).
// Uses only the x dimension of the launch; every thread handles at most one
// element, so the launch must supply at least n threads in total.
__global__ void vecAdd(double *a, double *b, double *c, int n)
{
    // Flat index of this thread across the whole 1D grid
    const int idx = threadIdx.x + blockDim.x*blockIdx.x;

    // Surplus threads in the last block have no element to process
    if (idx >= n)
        return;

    c[idx] = a[idx] + b[idx];
}

// Returns the current wall-clock time in seconds (microsecond resolution),
// as reported by gettimeofday(). Subtract two readings to get an interval.
double cpuSecond() {
   struct timeval now;
   gettimeofday(&now, NULL);

   const double wholeSeconds = (double)now.tv_sec;
   const double fraction     = (double)now.tv_usec*1.e-6;
   return (wholeSeconds + fraction);
}
 
// Aborts with a readable message when a CUDA runtime call reports an error.
// `what` names the operation so the failing call can be located.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two n-element vectors on the GPU using the default stream and reports
// host wall-clock timings for the host-to-device copy, the kernel, and the
// device-to-host copy. The inputs are sin^2(i) and cos^2(i), so the mean of
// the result vector should be 1 within rounding error.
int main( int argc, char* argv[] )
{
    // Size of vectors
    int n = 2048;

    // Size, in bytes, of each vector
    size_t bytes = (size_t)n*sizeof(double);

    // Host input vectors (h_a, h_b) and host output vector (h_c)
    double *h_a = (double*)malloc(bytes);
    double *h_b = (double*)malloc(bytes);
    double *h_c = (double*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(h_a);
        free(h_b);
        free(h_c);
        return 1;
    }

    // Device input vectors (d_a, d_b) and device output vector (d_c)
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

    int i;
    // Initialize vectors on host: a[i] + b[i] == sin^2 + cos^2 == 1
    for( i = 0; i < n; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }

    // Copy host vectors to device (timed)
    double hTdTime = cpuSecond();
    checkCuda(cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice), "copy h_a -> d_a");
    checkCuda(cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice), "copy h_b -> d_b");
    checkCuda(cudaDeviceSynchronize(), "sync after host-to-device copies");
    double hTdTimeElapsed = cpuSecond() - hTdTime;

    // Number of threads in each thread block
    int blockSize = 1024;

    // Number of thread blocks in grid: integer ceil-div, exact for any n
    // (the float-based ceil loses precision once n exceeds 2^24)
    int gridSize = (n + blockSize - 1)/blockSize;

    // Execute the kernel (timed). The launch is asynchronous, so the timer is
    // stopped only after the device has finished; the interval therefore also
    // contains the launch overhead of this first kernel call.
    double kernelTime = cpuSecond();
    vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
    checkCuda(cudaGetLastError(), "vecAdd launch");          // bad launch configuration
    checkCuda(cudaDeviceSynchronize(), "vecAdd execution");  // faults raised while running
    double kernelTimeElapsed = cpuSecond() - kernelTime;

    // Copy array back to host (timed)
    double dThTime = cpuSecond();
    checkCuda(cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost ), "copy d_c -> h_c");
    checkCuda(cudaDeviceSynchronize(), "sync after device-to-host copy");
    double dThTimeElapsed = cpuSecond() - dThTime;

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++)
        sum += h_c[i];
    printf("final result: %f\n", sum/n);

    // Release device memory
    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");

    // Release host memory
    free(h_a);
    free(h_b);
    free(h_c);

    // Timings are in seconds
    printf("Host to Device: %f\n", hTdTimeElapsed);
    printf("Kernel: %f\n", kernelTimeElapsed);
    printf("Device to Host: %f\n", dThTimeElapsed);

    return 0;
}

Overwriting vectorAdd_v1.cu


In [None]:
%%writefile vectorAdd.cu


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>

// CUDA kernel: element-wise vector sum over one slice of the vectors.
// Each thread handles the element at (its flat 1D thread index + offset).
// Note that n is compared against that shifted index, i.e. it is the
// exclusive upper bound on the global element index, not a per-launch
// element count; a, b and c are indexed with the shifted index as well.
__global__ void vecAdd(double *a, double *b, double *c, int n, int offset)
{
    // Flat thread index within the 1D grid, shifted to the start of the slice
    const int idx = offset + threadIdx.x + blockDim.x*blockIdx.x;

    // Threads whose shifted index reaches the bound have nothing to do
    if (idx >= n)
        return;

    c[idx] = a[idx] + b[idx];
}

// Wall-clock timestamp in seconds, built from gettimeofday()'s seconds and
// microseconds fields. Intended for measuring intervals by subtraction.
double cpuSecond() {
   struct timeval stamp;
   gettimeofday(&stamp, NULL);

   double seconds = (double)stamp.tv_sec;
   double micros  = (double)stamp.tv_usec;
   return (seconds + micros*1.e-6);
}

// Aborts with a readable message when a CUDA runtime call reports an error.
// `what` names the operation so the failing call can be located.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Adds two n-element vectors on the GPU in segments of S_seg elements.
// Each segment's host-to-device copies, kernel launch and device-to-host copy
// are queued on one of nStreams streams (round-robin) so that work from
// different segments can overlap. The inputs are sin^2(i) and cos^2(i), so
// the mean of the result vector should be 1 within rounding error.
int main( int argc, char* argv[] )
{
    // Size of vectors
    const int n = 1024;

    // Number of streams
    const int nStreams = 4;

    // Segment size (elements per pipeline stage)
    const int S_seg = 256;

    // Number of segments, rounded up so a trailing partial segment is not dropped
    const int num_seg = (n + S_seg - 1) / S_seg;

    // Size, in bytes, of each FULL vector. Host and device buffers must hold
    // all n elements: the host loops below touch indices 0..n-1, and giving
    // every segment its own device slice keeps the streams from overwriting
    // each other's data.
    const size_t bytes = (size_t)n*sizeof(double);

    // Host input vectors (h_a, h_b) and host output vector (h_c).
    // Page-locked memory is required for cudaMemcpyAsync to be truly asynchronous.
    double *h_a = NULL;
    double *h_b = NULL;
    double *h_c = NULL;
    checkCuda(cudaMallocHost((void**)&h_a, bytes), "cudaMallocHost h_a");
    checkCuda(cudaMallocHost((void**)&h_b, bytes), "cudaMallocHost h_b");
    checkCuda(cudaMallocHost((void**)&h_c, bytes), "cudaMallocHost h_c");

    // Device input vectors (d_a, d_b) and device output vector (d_c)
    double *d_a = NULL;
    double *d_b = NULL;
    double *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, bytes), "cudaMalloc d_a");
    checkCuda(cudaMalloc(&d_b, bytes), "cudaMalloc d_b");
    checkCuda(cudaMalloc(&d_c, bytes), "cudaMalloc d_c");

    // Create the CUDA streams
    cudaStream_t streams[nStreams];
    for (int s = 0; s < nStreams; s++) {
        checkCuda(cudaStreamCreate(&streams[s]), "cudaStreamCreate");
    }

    // One event pair per segment brackets that segment's kernel in its stream.
    // A host timer around the launch would only measure launch overhead,
    // because kernel launches return before the kernel has run.
    cudaEvent_t kernelStart[num_seg];
    cudaEvent_t kernelStop[num_seg];
    for (int seg = 0; seg < num_seg; seg++) {
        checkCuda(cudaEventCreate(&kernelStart[seg]), "cudaEventCreate start");
        checkCuda(cudaEventCreate(&kernelStop[seg]), "cudaEventCreate stop");
    }

    int i;
    // Initialize vectors on host -- when summing should add up to around 1
    for( i = 0; i < n; i++ ) {
        h_a[i] = sin(i)*sin(i);
        h_b[i] = cos(i)*cos(i);
    }

    // Threads per block
    const int Db = 32;

    // Process each segment
    for (int seg = 0; seg < num_seg; seg++) {
        const int offset = seg * S_seg;
        // The last segment may be shorter when S_seg does not divide n
        const int count = (n - offset < S_seg) ? (n - offset) : S_seg;
        const size_t segBytes = (size_t)count*sizeof(double);
        // Blocks needed to cover this segment (integer ceil-div)
        const int Dg = (count + Db - 1)/ Db;
        // Round-robin so any number of segments maps onto the available streams
        cudaStream_t stream = streams[seg % nStreams];

        checkCuda(cudaMemcpyAsync(d_a + offset, h_a + offset, segBytes,
                                  cudaMemcpyHostToDevice, stream), "async copy h_a -> d_a");
        checkCuda(cudaMemcpyAsync(d_b + offset, h_b + offset, segBytes,
                                  cudaMemcpyHostToDevice, stream), "async copy h_b -> d_b");

        // Execute kernel. The kernel adds `offset` to the thread index and
        // compares the result against its bound argument, so the bound passed
        // here is the END of this segment, not the segment length.
        checkCuda(cudaEventRecord(kernelStart[seg], stream), "cudaEventRecord start");
        vecAdd<<<Dg, Db, 0, stream>>>(d_a, d_b, d_c, offset + count, offset);
        checkCuda(cudaGetLastError(), "vecAdd launch");
        checkCuda(cudaEventRecord(kernelStop[seg], stream), "cudaEventRecord stop");

        checkCuda(cudaMemcpyAsync(h_c + offset, d_c + offset, segBytes,
                                  cudaMemcpyDeviceToHost, stream), "async copy d_c -> h_c");
    }

    // Wait for all queued work; h_c is only valid after this point
    for (int s = 0; s < nStreams; s++) {
        checkCuda(cudaStreamSynchronize(streams[s]), "cudaStreamSynchronize");
    }

    // Total kernel time in seconds: sum of the per-segment event intervals.
    // Segments on different streams may run concurrently, so this sum can
    // exceed the wall-clock time of the whole loop.
    double kernelTime = 0;
    for (int seg = 0; seg < num_seg; seg++) {
        float ms = 0.0f;
        checkCuda(cudaEventElapsedTime(&ms, kernelStart[seg], kernelStop[seg]),
                  "cudaEventElapsedTime");
        kernelTime += (double)ms*1.e-3;
    }

    // Sum up vector c and print result divided by n, this should equal 1 within error
    double sum = 0;
    for(i=0; i<n; i++) {
        sum += h_c[i];
    }
    printf("final result: %f\n", sum/n);

    // Print timing results. Host-to-device and device-to-host transfers are
    // not timed separately here because they overlap with other segments' work.
    printf("Kernel Time: %f\n", kernelTime);

    // Cleanup
    for (int seg = 0; seg < num_seg; seg++) {
        checkCuda(cudaEventDestroy(kernelStart[seg]), "cudaEventDestroy start");
        checkCuda(cudaEventDestroy(kernelStop[seg]), "cudaEventDestroy stop");
    }
    for (int s = 0; s < nStreams; s++) {
        checkCuda(cudaStreamDestroy(streams[s]), "cudaStreamDestroy");
    }

    checkCuda(cudaFree(d_a), "cudaFree d_a");
    checkCuda(cudaFree(d_b), "cudaFree d_b");
    checkCuda(cudaFree(d_c), "cudaFree d_c");
    checkCuda(cudaFreeHost(h_a), "cudaFreeHost h_a");
    checkCuda(cudaFreeHost(h_b), "cudaFreeHost h_b");
    checkCuda(cudaFreeHost(h_c), "cudaFreeHost h_c");

    return 0;
}

Overwriting vectorAdd.cu


In [None]:
!nvcc vectorAdd.cu -o vectorAdd






In [None]:
!nvcc vectorAdd_v1.cu -o vectorAdd_v1

In [None]:
!./vectorAdd

In [None]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli vectorAdd_v1

==PROF== Connected to process 21904 (/content/vectorAdd_v1)
==PROF== Profiling "vecAdd" - 1: 0%....50%....100% - 8 passes
final result: 1.000000
Host to Device: 0.000055
Kernel: 0.661935
Device to Host: 0.000050
==PROF== Disconnected from process 21904
[21904] vectorAdd_v1@127.0.0.1
  vecAdd(double*, double*, double*, int), 2023-Jan-02 18:11:19, Context 1, Stream 7
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.86
    SM Frequency                                                             cycle/usecond                         568.75
    Elapsed Cycles                                                                   cycle                          2,877
    Memory [%]                                                                           %                      

In [None]:
!/usr/local/cuda-11/bin/nv-nsight-cu-cli vectorAdd

==PROF== Connected to process 28981 (/content/vectorAdd)
==PROF== Profiling "vecAdd" - 1: 0%....50%....100% - 8 passes
==PROF== Profiling "vecAdd" - 2: 0%....50%....100% - 8 passes
==PROF== Profiling "vecAdd" - 3: 0%....50%....100% - 8 passes
==PROF== Profiling "vecAdd" - 4: 0%....50%....100% - 8 passes
final result: 1.000087
Kernel Time: 1.941233
==PROF== Disconnected from process 28981
[28981] vectorAdd@127.0.0.1
  vecAdd(double*, double*, double*, int, int), 2023-Jan-02 18:37:56, Context 1, Stream 13
    Section: GPU Speed Of Light
    ---------------------------------------------------------------------- --------------- ------------------------------
    DRAM Frequency                                                           cycle/nsecond                           4.55
    SM Frequency                                                             cycle/usecond                         534.02
    Elapsed Cycles                                                                   cycle   

In [None]:
!nvprof --print-gpu-trace ./vectorAdd

==18095== NVPROF is profiling process 18095, command: ./vectorAdd
==18095== Profiling application: ./vectorAdd
==18095== Profiling result:
No kernels were profiled.
