# 

In [1]:
%%writefile add.cpp

#include <iostream>
#include <math.h>

// function to add the elements of two arrays
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
      y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20; // 1M elements

  float *x = new float[N];
  float *y = new float[N];

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the CPU
  add(N, x, y);

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  return 0;
}

Writing add.cpp


In [2]:
%%shell
g++ add.cpp -o add



In [3]:
%%shell
./add

Max error: 0




In [4]:
%%writefile add.cu

#include <iostream>
#include <math.h>
// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20
 ;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

Writing add.cu


In [5]:
%%shell

nvcc add.cu -o add_cuda
./add_cuda

Max error: 1




In [6]:
%%writefile add_block.cu

#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  int index = threadIdx.x;
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride)
      y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 256>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);
  
  return 0;
}

Writing add_block.cu


In [8]:
%%shell

nvcc add_block.cu -o add_block
nvprof ./add_block

==10401== NVPROF is profiling process 10401, command: ./add_block
Max error: 1
==10401== Profiling application: ./add_block
==10401== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   93.38%  113.67ms         2  56.835ms  43.248us  113.63ms  cudaMallocManaged
                    6.20%  7.5520ms         1  7.5520ms  7.5520ms  7.5520ms  cudaLaunchKernel
                    0.26%  318.99us         2  159.50us  120.73us  198.26us  cudaFree
                    0.12%  152.03us       114  1.3330us     103ns  61.332us  cuDeviceGetAttribute
                    0.01%  14.579us         1  14.579us  14.579us  14.579us  cuDeviceGetName
                    0.01%  8.7100us         1  8.7100us  8.7100us  8.7100us  cudaDeviceSynchronize
                    0.01%  7.9880us         1  7.9880us  7.9880us  7.9880us  cuDeviceGetPCIBusId
                    0.00%  2.5170us         2  1.2580us     219ns  2.2980us  c



In [9]:
%%writefile vec_add_openacc.cu
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    int n = 1 << 20;          // 1M elements
    size_t size = n * sizeof(float);

    float *a = (float*) malloc(size);
    float *b = (float*) malloc(size);
    float *c = (float*) malloc(size);

    for (int i = 0; i < n; ++i) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    // Offload this loop to GPU with OpenACC
    #pragma acc parallel loop copyin(a[0:n], b[0:n]) copyout(c[0:n])
    for (int i = 0; i < n; ++i) {
        c[i] = a[i] + b[i];
    }

    printf("c[0] = %f\n", c[0]);
    printf("c[n-1] = %f\n", c[n-1]);

    free(a);
    free(b);
    free(c);
    return 0;
}

Overwriting vec_add_openacc.cu


In [11]:
%%shell
nvcc -acc -Minfo=accel vec_add_openacc.c -o vec_add_openacc
./vec_add_openacc

nvcc fatal   : Unknown option '-acc'
/bin/bash: line 2: ./vec_add_openacc: No such file or directory


CalledProcessError: Command 'nvcc -acc -Minfo=accel vec_add_openacc.c -o vec_add_openacc
./vec_add_openacc
' returned non-zero exit status 127.