<a href="https://colab.research.google.com/github/ronglu-stanford/RL_reference_public/blob/main/Copy_of_2_shared_memory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Shared Memory Tutorial 

# C++ threads Hello World

In [None]:
%%file hello_world_threads.cpp

#include <thread>
#include <cassert>
#include <iostream>

using namespace std;

// Thread entry point that takes no arguments: writes a fixed greeting.
void f1()
{
    static const char greeting[] = "Hello World!\n";
    cout << greeting;
}

// Thread entry point that receives its argument by value and prints it.
void f2(int m)
{
    const char *const prefix = "Hello World with m = ";
    cout << prefix << m << endl;
}

// Thread entry point that receives its argument by reference: prints the
// current value, then adds 3 to the caller's variable.
void f3(int &k)
{
    const char *const prefix = "Hello World; k was passed by reference; k = ";
    cout << prefix << k << endl;
    k = k + 3;
}

int main(void)
{
  // Demonstrate using thread constructor
  thread t1(f1);

  int m = 5;
  // With an argument
  thread t2(f2, m);

  int k = 7;
  // With a reference
  thread t3(f3, ref(k)); /* use ref to pass a reference */

  // Wait for t3 to complete
  t3.join();
  cout << "k is now equal to " << k << endl;
  assert(k == 10);
 
  /* wait for t1 and t2 to finish */
  t1.join();
  t2.join(); 

  return 0;
}

Overwriting hello_world_threads.cpp


In [None]:
!ls -l

total 392
-rwxr-xr-x 1 root root  21248 Aug 12 18:56 entropy
-rw-r--r-- 1 root root   1776 Aug 12 18:56 entropy.cpp
-rwxr-xr-x 1 root root 134256 Aug 12 18:56 future
-rw-r--r-- 1 root root   1443 Aug 12 18:56 future.cpp
-rwxr-xr-x 1 root root  44048 Aug 12 18:56 hello_world_thread
-rw-r--r-- 1 root root    716 Aug 12 21:54 hello_world_threads.cpp
-rwxr-xr-x 1 root root  21480 Aug 12 18:56 nbody
-rw-r--r-- 1 root root   2333 Aug 12 18:56 nbody.cpp
-rwxr-xr-x 1 root root  21648 Aug 12 18:56 nbody_omp
-rw-r--r-- 1 root root   2403 Aug 12 18:56 nbody_omp.cpp
-rwxr-xr-x 1 root root  40912 Aug 12 18:56 pi
-rw-r--r-- 1 root root   1525 Aug 12 18:56 pi.cpp
-rwxr-xr-x 1 root root  14144 Aug 12 18:56 pi_omp_1
-rw-r--r-- 1 root root    751 Aug 12 18:56 pi_omp_1.cpp
-rwxr-xr-x 1 root root  14248 Aug 12 18:56 pi_omp_2
-rw-r--r-- 1 root root    751 Aug 12 18:56 pi_omp_2.cpp
-rwxr-xr-x 1 root root   8792 Aug 12 18:56 pi_openmp
-rw-r--r-- 1 root root    322 Aug 12 18:56 pi_openmp.cpp
drwxr-xr-x 1 root

In [None]:
!g++ -o hello_world_thread hello_world_threads.cpp -lpthread && ./hello_world_thread

Hello World!
Hello World; k was passed by reference; k = 7
k is now equal to 10
Hello World with m = 5


# C++ threads and future/promise

In [None]:
%%file future.cpp 

#include <cassert>
#include <future>
#include <iostream>
#include <thread>
#include <valarray>
#include <numeric>

using namespace std;
// Shorthand for a value array of ints.
typedef valarray<int> vint;

// Returns the sum of all elements of v.
int accumulate(vint &v) {
  const int total = v.sum();
  return total;
}

// Result slot written by accumulate_global.
int global_sum;

// Sums v and stores the result in the global variable global_sum.
void accumulate_global(vint &v) {
  const int total = accumulate(v);
  global_sum = total;
}

// Sums v and writes the result through the reference parameter sum.
void accumulate_ref(vint &v, int &sum) {
  const int total = accumulate(v);
  sum = total;
}

// Sums v and publishes the result through the promise, which makes the value
// available to whoever holds the matching future.
void accumulate_promise(vint &v, promise<int> accumulate_promise) {
  const int total = accumulate(v);
  accumulate_promise.set_value(total);
}

// Shows three ways of getting a result back from a worker thread:
// a global variable, a reference argument, and a promise/future pair.
int main(void) {
  vint vec0 = {1, 2, 3, 4, 5, 6};
  const int expected = vec0.sum();

  // 1) Through a global variable; it is read only after the join.
  thread global_worker(accumulate_global, ref(vec0));
  global_worker.join();
  printf("result of accumulate_global [21 expected]  = %d\n", global_sum);
  assert(global_sum == expected);

  // 2) Through a reference argument; the worker writes it before the join
  //    returns, so it does not need an initial value here.
  int ref_result;
  thread ref_worker(accumulate_ref, ref(vec0), ref(ref_result));
  ref_worker.join();
  printf("result of accumulate_ref [21 expected]     = %d\n", ref_result);
  assert(ref_result == expected);

  // 3) Through a promise/future pair. The future is taken from the promise
  //    first; the promise itself is then moved into the worker thread.
  promise<int> result_promise;
  future<int> result_future = result_promise.get_future();
  thread promise_worker(accumulate_promise, ref(vec0), move(result_promise));

  // get() blocks until the worker has called set_value().
  const int promised = result_future.get();
  printf("result of accumulate_promise [21 expected] = %d\n", promised);
  assert(promised == expected);

  promise_worker.join();
  return 0;
}

Overwriting future.cpp


In [None]:
!g++ -o future future.cpp -lpthread && ./future

result of accumulate_global [21 expected]  = 21
result of accumulate_ref [21 expected]     = 21
result of accumulate_promise [21 expected] = 21


# C++ mutex

In [None]:
%%file pi.cpp

#include <cmath>
#include <iomanip>
#include <iostream>
#include <limits>
#include <mutex>
#include <thread>
#include <vector>

using namespace std;

// Floating-point type used for every pi computation in this file.
typedef long double flt_t;

// Shared accumulator for the series sum, and the mutex the worker threads
// take before adding to it.
flt_t pi;
mutex pi_mutex;

// Small constants already converted to flt_t, so the divisions in
// increment() are carried out in flt_t arithmetic.
const flt_t one = 1;
const flt_t two = 2;
const flt_t four = 4;

// Term k of the Bailey–Borwein–Plouffe series for pi:
//   (4/(8k+1) - 2/(8k+4) - 1/(8k+5) - 1/(8k+6)) / 16^k
flt_t increment(const int k) {
  const int base = 8 * k;
  const flt_t bracket =
      four / (base + 1) - two / (base + 4) - one / (base + 5) - one / (base + 6);
  return flt_t(bracket / pow(16, k));
}

void BBP_range(const int k_start, const int k_chunk) {
  flt_t local_pi = 0;
  for (int k = k_start + k_chunk - 1; k >= k_start; --k) {
    local_pi += increment(k);
  }
  lock_guard<mutex> guard(pi_mutex);
  // Protect the update on the next line
  pi += local_pi;
}

// Computes pi from the first n_threads * k_chunk terms of the BBP series,
// giving each worker thread one contiguous chunk of terms, then prints the
// result next to the precision limits of flt_t.
int main(void) {
  const int n_threads = 8;
  const int k_chunk = 2;  // number of consecutive series terms per thread

  pi = 0;  // no worker thread exists yet, so no lock is needed here

  // Hold the thread objects by value: the vector owns them, so there is no
  // manual new/delete and nothing is leaked.
  vector<thread> th_pool;
  th_pool.reserve(n_threads);
  for (int i = 0; i < n_threads; ++i) {
    th_pool.emplace_back(BBP_range, i * k_chunk, k_chunk);
  }

  // Wait for every worker; once all joins return, pi holds the full sum.
  for (thread &th : th_pool) {
    th.join();
  }

  cout << "pi exact: 3.14159265358979323846264338327950288419716939937510\n";
  cout << setprecision(19);
  cout << "pi:       " << pi << endl;

  cout << setprecision(5);
  cout << "precision of type: " << numeric_limits<flt_t>::digits10 << endl;
  cout << "epsilon: " << numeric_limits<flt_t>::epsilon() << endl;
  // Rough lower bound on the achievable absolute error: pi times epsilon.
  cout << "minimum error: " << 3.141592 * numeric_limits<flt_t>::epsilon()
       << endl;

  return 0;
}

Overwriting pi.cpp


In [None]:
!g++ -std=c++11 -o pi pi.cpp -lpthread && ./pi

pi exact: 3.14159265358979323846264338327950288419716939937510
pi:       3.141592653589793238
precision of type: 18
epsilon: 1.0842e-19
minimum error: 3.4061e-19


# Hello World, OpenMP

In [None]:
%%file pi_omp_1.cpp

#include <omp.h>

#include <cmath>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;
// Floating-point type used for the pi computation.
typedef long double flt_t;

// Small constants already converted to flt_t, so the divisions in
// increment() are carried out in flt_t arithmetic.
const flt_t one = 1;
const flt_t two = 2;
const flt_t four = 4;

// Term k of the Bailey–Borwein–Plouffe series for pi:
//   (4/(8k+1) - 2/(8k+4) - 1/(8k+5) - 1/(8k+6)) / 16^k
flt_t increment(const int k) {
  const int base = 8 * k;
  const flt_t bracket =
      four / (base + 1) - two / (base + 4) - one / (base + 5) - one / (base + 6);
  const flt_t scale = pow(16, k);
  return flt_t(bracket / scale);
}

// Sums the first 64 BBP terms in an OpenMP parallel loop and prints the total.
// This version deliberately keeps the data race on pi to illustrate the
// problem; adding reduction(+:pi) to the pragma is the fix.
int main(void) {
  // The loop only ever adds to pi, so it has to start from zero. Leaving it
  // uninitialized would make the result garbage even without the race.
  flt_t pi = 0;

#pragma omp parallel for
  for (int k = 0; k < 64; ++k) {
    // BUG (deliberate): pi is shared, and this read-modify-write is not
    // synchronized, so updates from different threads can be lost.
    pi += increment(k);
  }

  cout << "pi exact: 3.14159265358979323846264338327950288419716939937510\n";
  cout << "pi omp:   " << setprecision(19) << pi << endl;

  return 0;
}

Overwriting pi_omp_1.cpp


In [None]:
! g++ -std=c++11 -o pi_omp_1 pi_omp_1.cpp -fopenmp && ./pi_omp_1

pi exact: 3.14159265358979323846264338327950288419716939937510
pi omp:   3.141592653589793238


# OpenMP reduction clause

In [None]:
%%file pi_omp_2.cpp

#include <omp.h>

#include <cmath>
#include <iomanip>
#include <iostream>
#include <vector>

using namespace std;
// Floating-point type used for the pi computation.
typedef long double flt_t;

// Small constants already converted to flt_t, so the divisions in
// increment() are carried out in flt_t arithmetic.
const flt_t one = 1;
const flt_t two = 2;
const flt_t four = 4;

// Term k of the Bailey–Borwein–Plouffe series for pi:
//   (4/(8k+1) - 2/(8k+4) - 1/(8k+5) - 1/(8k+6)) / 16^k
flt_t increment(const int k) {
  const int base = 8 * k;
  const flt_t bracket =
      four / (base + 1) - two / (base + 4) - one / (base + 5) - one / (base + 6);
  const flt_t scale = pow(16, k);
  return flt_t(bracket / scale);
}

// Sums the first 64 BBP terms in an OpenMP parallel loop, using a reduction
// to avoid the data race on pi, and prints the total.
int main(void) {
  // pi must be initialized: the reduction combines the threads' private
  // partial sums with the value pi had before the loop.
  flt_t pi = 0;

  // reduction(+:pi) gives every thread its own private pi to add into and
  // merges the private copies into the shared pi when the loop ends.
#pragma omp parallel for reduction(+:pi)
  for (int k = 0; k < 64; ++k) {
    pi += increment(k);  // race-free: each thread updates its private copy
  }

  cout << "pi exact: 3.14159265358979323846264338327950288419716939937510\n";
  cout << "pi omp:   " << setprecision(19) << pi << endl;

  return 0;
}

Overwriting pi_omp_2.cpp


In [None]:
! g++ -std=c++11 -o pi_omp_2 pi_omp_2.cpp -fopenmp && ./pi_omp_2

pi exact: 3.14159265358979323846264338327950288419716939937510
pi omp:   3.141592653589793238


# Exercise: Computing Pi

Modify the example below to make the code multi-threaded using OpenMP.

We use the formula:

$$\pi = \int_0^1 \frac{4}{1+x^2} \, dx $$

The integral is approximated using a quadrature rule.

In [None]:
%%file pi_openmp.cpp

#include <omp.h>
#include <cstdio>

// Exercise skeleton (sequential): approximates
//   pi = integral from 0 to 1 of 4 / (1 + x^2) dx
// with the midpoint rule on n equal sub-intervals, then prints the estimate
// above a reference value of pi.
int main(){
    double sum = 0;
    // n is the number of sub-intervals of [0, 1].
    const int n = 10000;
    // Each iteration evaluates the integrand at the midpoint of sub-interval i.
    for(int i = 0; i<n; ++i){
        const double x = (i+0.5) / n;
        sum += 4. / (1. + x*x); 
    }
    // Multiply the accumulated integrand values by the sub-interval width 1/n.
    sum /= n;
    printf("Pi is %17.15f\n      3.141592653589793\n",sum);
}

Overwriting pi_openmp.cpp


In [None]:
!g++ -o pi_openmp -fopenmp pi_openmp.cpp && ./pi_openmp

Pi is 3.141592654423134
      3.141592653589793


# OpenMP tasks

In [None]:
%%file tree_postorder.cpp

#include <omp.h>

#include <cstdio>
#include <cstdlib>

// Probability with which FillTree keeps growing the tree below a new node.
#define P 0.8

// Draws one value from rand() and returns true when it falls below the
// fraction P of RAND_MAX.
bool InsertCond() {
  const int threshold = static_cast<int>(P * RAND_MAX);
  return rand() < threshold;
}

// Binary tree node. The default member initializers make every newly created
// node a leaf with a zero payload, so a node is safe to traverse even if no
// code has assigned its child pointers yet.
struct Node {
  int data = 0;  // set by the traversals to the node count of both subtrees
  Node *left = nullptr, *right = nullptr;  // children; null when absent
};

void FillTree(const int max_level, int level, Node *curr_node) {
  if (level < max_level) {
    curr_node->left = new Node;
    curr_node->left->left = curr_node->left->right = NULL;

    if (InsertCond()) FillTree(max_level, level + 1, curr_node->left);

    curr_node->right = new Node;
    curr_node->right->left = curr_node->right->right = NULL;

    if (InsertCond()) FillTree(max_level, level + 1, curr_node->right);
  }
}

// Sequential post-order traversal. Stores in each node the number of nodes
// contained in its two subtrees and returns the size of the subtree rooted
// at curr_node (the node itself included).
int PostOrderTraverseSequential(struct Node *curr_node) {
  const int n_left =
      curr_node->left ? PostOrderTraverseSequential(curr_node->left) : 0;
  const int n_right =
      curr_node->right ? PostOrderTraverseSequential(curr_node->right) : 0;

  const int below = n_left + n_right;
  curr_node->data = below;
  return below + 1;
}

// Parallel post-order traversal using OpenMP tasks.
// Same contract as PostOrderTraverseSequential: stores in each node the
// number of nodes in its two subtrees and returns the size of the subtree
// rooted at curr_node. Each existing child is handed to its own task.
// In this file it is called from a single thread inside a parallel region,
// so the other threads of the team can pick up the generated tasks.
int PostOrderTraverse(struct Node *curr_node) {
  int left = 0, right = 0;

  if (curr_node->left)
    // shared(left): the task must write into this function's variable.
#pragma omp task shared(left)
    left = PostOrderTraverse(curr_node->left);
  // Default attribute for task constructs is firstprivate, i.e. without the
  // shared clause the task would update its own copy and the result would
  // never reach this function.
  if (curr_node->right)
#pragma omp task shared(right)
    right = PostOrderTraverse(curr_node->right);

  // Wait for the child tasks created above before reading left and right.
#pragma omp taskwait
  curr_node->data = left + right;  // Number of nodes in both subtrees

  return 1 + left + right;
}

int main() {
  int n_level = 32;  // Maximum number of levels in the tree
  Node *root = new Node;

  // Create a random tree
  FillTree(n_level, 1, root);

  printf("Post-order traversal:              %d\n", PostOrderTraverseSequential(root));

#pragma omp parallel
#pragma omp single  // Only a single thread should execute this
  printf("Multi-threaded omp implementation: %d\n", PostOrderTraverse(root));
}

Overwriting tree_postorder.cpp


In [None]:
! g++ -std=c++11 -o tree_postorder tree_postorder.cpp -fopenmp && ./tree_postorder

Post-order traversal:              2934467
Multi-threaded omp implementation: 2934467


# Exercise: Entropy

In [None]:
%%file entropy.cpp

#include <vector>
#include <cmath>
#include <cassert>
#include <iostream>
#include <omp.h>

using namespace std;

// Helper function: prints vec on one line as "(v0,v1,...,vn)", each value
// formatted with %6.4f. An empty vector prints "()"; computing size() - 1 on
// an unsigned size would wrap around and read out of bounds.
void print_vector(vector<float> &vec)
{
    printf("(");
    const char *separator = "";
    for (size_t i = 0; i < vec.size(); ++i)
    {
        printf("%s%6.4f", separator, vec[i]);
        separator = ",";  // every value after the first is preceded by a comma
    }
    printf(")\n");
}

// Exercise skeleton (sequential loops): builds two discrete distributions
// over `size` points, normalizes them, prints them, and checks the difference
// between their entropies against a hard-coded reference value.
int main(void)
{
    const unsigned size = 16;
    // Despite their names, entropy0 and entropy1 hold the two probability
    // vectors; the entropies themselves are ent1 and ent2 further down.
    vector<float> entropy0(size), entropy1(size);

    // Open a parallel region only to report the team size; `single` makes
    // exactly one thread print.
    #pragma omp parallel
    #pragma omp single
    printf("number of threads = %d\n", omp_get_num_threads());
    // omp_get_num_threads returns the number of threads used
    // when running a parallel region

    // Computing two probabilities
    // (unnormalized weights exp(-d^2 / 0.5) and exp(-d^2 / 10), where d is
    // the distance of index i from the middle of the index range)
    for (unsigned i = 0; i < size; i++)
    {
        float d = i - float(size - 1) / 2.;
        entropy0[i] = exp(-d * d / 0.5);
        entropy1[i] = exp(-d * d / 10.);
    }

    // We sum up all the entries to normalize the vectors
    // entropy0 and entropy1.
    // (sum1 belongs to entropy0 and sum2 to entropy1)
    float sum1 = 0, sum2 = 0;
    for (unsigned i = 0; i < size; i++)
    {
        sum1 += entropy0[i];
        sum2 += entropy1[i];
    }

    // Now normalize entropy0 and entropy1.
    for (unsigned i = 0; i < size; i++)
    {
        entropy0[i] /= sum1;
        entropy1[i] /= sum2;
    }

    print_vector(entropy0);
    print_vector(entropy1);

    // Computing the entropy of the two probabilities
    // (accumulates sum of p * log(p); entries equal to 0 are skipped so that
    // log(0) is never evaluated)
    float ent1 = 0, ent2 = 0;
    for (unsigned i = 0; i < size; i++)
    {
        if (entropy0[i] > 0)
            ent1 += entropy0[i] * log(entropy0[i]);
        if (entropy1[i] > 0)
            ent2 += entropy1[i] * log(entropy1[i]);
    }

    // Entropy is minus the accumulated sum.
    ent1 = -ent1;
    ent2 = -ent2;

    printf("Change in entropy: %43.40f\n", ent2 - ent1);
    // The reference value is the result printed by this sequential version.
    assert(fabs(ent2 - ent1 - 1.4378435611724853515625) < 1e-6);

    return 0;
}

Overwriting entropy.cpp


In [None]:
! g++ -std=c++11 -o entropy entropy.cpp -fopenmp && ./entropy

number of threads = 2
(0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0090,0.4910,0.4910,0.0090,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000)
(0.0006,0.0026,0.0087,0.0236,0.0524,0.0955,0.1425,0.1741,0.1741,0.1425,0.0955,0.0524,0.0236,0.0087,0.0026,0.0006)
Change in entropy:  1.4378435611724853515625000000000000000000


# Exercise: N-body problem

In [None]:
%%file nbody.cpp

#include <omp.h>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

// Pseudo-random number generator
int rand_r(unsigned int *seed) {
  unsigned int next = *seed;
  int result;

  next *= 1103515245;
  next += 12345;
  result = (unsigned int)(next / 65536) % 2048;

  next *= 1103515245;
  next += 12345;
  result <<= 10;
  result ^= (unsigned int)(next / 65536) % 1024;

  next *= 1103515245;
  next += 12345;
  result <<= 10;
  result ^= (unsigned int)(next / 65536) % 1024;

  *seed = next;

  return result;
}

// Interaction force as a function of the separation x:
//   -2 * atan(x) / (x^2 + 1)
float force(const float x) {
  const double numerator = -2. * atan(x);
  const double denominator = x * x + 1.;
  return numerator / denominator;
}

// Exercise skeleton (sequential): generates n random points, computes the
// pairwise interaction forces, and checks the result against a second
// sequential computation of the same sums. The blocks marked "Parallelize"
// are the parts to convert to OpenMP.
int main(void) {
  const unsigned n_threads = 1;
  omp_set_num_threads(n_threads);  
  // set the number of threads for this example
#pragma omp parallel
#pragma omp single
  printf("number of threads = %d\n", omp_get_num_threads());

  const int n = 64;
  vector<float> x(n);

  // Generate random points on the unit interval
  // Todo: parallelize this block
  {
    // NOTE(review): this block is not inside a parallel region yet, where
    // omp_get_num_threads() is documented to return 1; the assert therefore
    // appears to hold only because n_threads == 1 here — confirm when
    // parallelizing.
    assert(omp_get_num_threads() == n_threads);
    // Thread safe generation of random numbers
    const long tid = omp_get_thread_num();
    // Seed is unique to thread
    unsigned int seed = 4357U + unsigned(tid) * 1103515245;
    // Parallelize this loop
    for (int i = 0; i < n; ++i) {
      // NOTE(review): assumes RAND_MAX matches the 31-bit range of the
      // rand_r defined in this file — verify on the target platform.
      x[i] = float(rand_r(&seed)) / RAND_MAX;
      assert(x[i] > 0 && x[i] < 1);
    }
  }

  /* Compute interaction forces between particles.
   In this skeleton the loops are still sequential; once the outer loop is
   parallelized, the updates of f must be protected (e.g. with atomic). */
  vector<float> f(n);
  // Parallelize this loop
  for (int i = 0; i < n; ++i) f[i] = 0.;

  // Parallelize this loop
  // Each pair (i, j) with j > i is visited once; the force is added to
  // particle i and subtracted from particle j.
  for (int i = 0; i < n; ++i)
    for (int j = i + 1; j < n; ++j) {
      const float x_ = x[i] - x[j];
      const float f_ = force(x_);
      // Avoid the race condition
      // (f[j] belongs to a different outer iteration than i)
      f[i] += f_;
      f[j] -= f_;
    }

  // Test
  {
    // Reference result: the same pairwise sums computed sequentially.
    vector<float> f0(n, 0.);

    for (int i = 0; i < n; ++i)
      for (int j = i + 1; j < n; ++j) {
        const float x_ = x[i] - x[j];
        const float f_ = force(x_);
        f0[i] += f_;
        f0[j] -= f_;
      }

    for (int i = 0; i < n; ++i) assert(fabs(f0[i] - f[i]) < 1e-4);

    // Largest absolute difference between the two computations.
    float max = 0;
    for (int i = 0; i < n; ++i)
      max = max < fabs(f0[i] - f[i]) ? fabs(f0[i] - f[i]) : max;
    printf("largest error %g\n", max);

    cout << "PASS\n";
  }

  return 0;
}

Overwriting nbody.cpp


In [None]:
!g++ -fopenmp -o nbody nbody.cpp && ./nbody

number of threads = 1
largest error 0
PASS


In [None]:
!OMP_NUM_THREADS=16 ./entropy

number of threads = 16
(0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0090,0.4910,0.4910,0.0090,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000)
(0.0006,0.0026,0.0087,0.0236,0.0524,0.0955,0.1425,0.1741,0.1741,0.1425,0.0955,0.0524,0.0236,0.0087,0.0026,0.0006)
Change in entropy:  1.4378435611724853515625000000000000000000


Attempt the exercises before reading the solutions!

# Solutions

## Computing Pi solution

In [None]:
%%file pi_openmp.cpp

#include <omp.h>
#include <cstdio>

// Approximates pi = integral from 0 to 1 of 4 / (1 + x^2) dx with the
// midpoint rule on n sub-intervals, sharing the loop among OpenMP threads.
int main(){
    const int n = 10000;
    double sum = 0;
    // reduction(+:sum): each thread adds into a private partial sum, and the
    // partial sums are combined into sum when the loop ends.
    #pragma omp parallel for reduction(+:sum)
    for(int i = 0; i<n; ++i){
        const double x = (i+0.5) / n;
        const double integrand = 4. / (1. + x*x);
        sum += integrand;
    }
    const double estimate = sum / n;
    printf("Pi is %17.15f\n      3.141592653589793\n", estimate);
}

Overwriting pi_openmp.cpp


In [None]:
!g++ -o pi_openmp -fopenmp pi_openmp.cpp && ./pi_openmp

Pi is 3.141592654423129
      3.141592653589793


## Entropy homework solution

In [None]:
%%file entropy.cpp

#include <omp.h>

#include <cassert>
#include <cmath>
#include <iostream>
#include <vector>

using namespace std;

// Helper function: prints vec on one line as "(v0,v1,...,vn)", each value
// formatted with %6.4f. An empty vector prints "()"; computing size() - 1 on
// an unsigned size would wrap around and read out of bounds.
void print_vector(vector<float> &vec) {
  printf("(");
  const char *separator = "";
  for (size_t i = 0; i < vec.size(); ++i) {
    printf("%s%6.4f", separator, vec[i]);
    separator = ",";  // every value after the first is preceded by a comma
  }
  printf(")\n");
}

// Builds two discrete distributions over `size` points, normalizes them,
// prints them, and checks the difference between their entropies against a
// reference value. Every loop is shared among OpenMP threads; the sums use
// reduction clauses.
int main(void) {
  const unsigned size = 16;
  vector<float> p_narrow(size), p_wide(size);

  // Open a parallel region only to report the team size; `single` makes
  // exactly one thread print.
#pragma omp parallel
#pragma omp single
  printf("number of threads = %d\n", omp_get_num_threads());

  // Unnormalized weights exp(-d^2 / 0.5) and exp(-d^2 / 10), where d is the
  // distance of index i from the middle of the index range. Iterations are
  // independent, so a plain parallel for is enough.
  const double center = float(size - 1) / 2.;
#pragma omp parallel for
  for (unsigned i = 0; i < size; i++) {
    const float d = i - center;
    p_narrow[i] = exp(-d * d / 0.5);
    p_wide[i] = exp(-d * d / 10.);
  }

  // Normalization constants: both sums are accumulated with a reduction.
  float norm_narrow = 0, norm_wide = 0;
#pragma omp parallel for reduction(+ : norm_narrow, norm_wide)
  for (unsigned i = 0; i < size; i++) {
    norm_narrow += p_narrow[i];
    norm_wide += p_wide[i];
  }

  // Scale both vectors so that their entries sum to one.
#pragma omp parallel for
  for (unsigned i = 0; i < size; i++) {
    p_narrow[i] /= norm_narrow;
    p_wide[i] /= norm_wide;
  }

  print_vector(p_narrow);
  print_vector(p_wide);

  // Accumulate sum of p * log(p) for each distribution; entries equal to 0
  // are skipped so that log(0) is never evaluated.
  float h_narrow = 0, h_wide = 0;
#pragma omp parallel for reduction(+ : h_narrow, h_wide)
  for (unsigned i = 0; i < size; i++) {
    if (p_narrow[i] > 0) h_narrow += p_narrow[i] * log(p_narrow[i]);
    if (p_wide[i] > 0) h_wide += p_wide[i] * log(p_wide[i]);
  }

  // Entropy is minus the accumulated sum.
  h_narrow = -h_narrow;
  h_wide = -h_wide;

  printf("Change in entropy: %43.40f\n", h_wide - h_narrow);
  // The tolerance absorbs the different summation order of the reductions.
  assert(fabs(h_wide - h_narrow - 1.4378435611724853515625) < 1e-6);

  return 0;
}

Overwriting entropy.cpp


In [None]:
! g++ -std=c++11 -o entropy entropy.cpp -fopenmp && ./entropy

number of threads = 2
(0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0090,0.4910,0.4910,0.0090,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000)
(0.0006,0.0026,0.0087,0.0236,0.0524,0.0955,0.1425,0.1741,0.1741,0.1425,0.0955,0.0524,0.0236,0.0087,0.0026,0.0006)
Change in entropy:  1.4378437995910644531250000000000000000000


## N-body problem solution

In [None]:
%%file nbody_omp.cpp

#include <omp.h>

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

// Pseudo-random number generator
int rand_r(unsigned int *seed) {
  unsigned int next = *seed;
  int result;

  next *= 1103515245;
  next += 12345;
  result = (unsigned int)(next / 65536) % 2048;

  next *= 1103515245;
  next += 12345;
  result <<= 10;
  result ^= (unsigned int)(next / 65536) % 1024;

  next *= 1103515245;
  next += 12345;
  result <<= 10;
  result ^= (unsigned int)(next / 65536) % 1024;

  *seed = next;

  return result;
}

// Interaction force as a function of the separation x:
//   -2 * atan(x) / (x^2 + 1)
float force(const float x) {
  const double numerator = -2. * atan(x);
  const double denominator = x * x + 1.;
  return numerator / denominator;
}

// OpenMP solution: generates n random points in parallel, computes the
// pairwise interaction forces in parallel with atomic updates, and checks
// the result against a sequential computation of the same sums.
int main(void) {
  const unsigned n_threads = 4;
  omp_set_num_threads(n_threads);  
  // set the number of threads for this example
  // (the recorded run with OMP_NUM_THREADS=16 still reports 4 threads, so
  // this call takes precedence over the environment variable)
#pragma omp parallel
#pragma omp single
  printf("number of threads = %d\n", omp_get_num_threads());

  const int n = 64;
  vector<float> x(n);

// Generate random points on the unit interval
// Create a parallel region
#pragma omp parallel
  {
    // Checked inside the parallel region, where the team size is visible.
    assert(omp_get_num_threads() == n_threads);
    // Thread safe generation of random numbers
    const long tid = omp_get_thread_num();
    // Seed is unique to thread
    // (declared inside the region, so every thread has its own copy)
    unsigned int seed = 4357U + unsigned(tid) * 1103515245;
   // Parallel for loop
   // The iterations are divided among the threads of the existing region;
   // each x[i] is drawn from the seed of whichever thread executes i.
#pragma omp for
    for (int i = 0; i < n; ++i) {
      // NOTE(review): assumes RAND_MAX matches the 31-bit range of the
      // rand_r defined in this file — verify on the target platform.
      x[i] = float(rand_r(&seed)) / RAND_MAX;
      assert(x[i] > 0 && x[i] < 1);
    }
  }

  /* Compute interaction forces between particles.
   Atomic is used. */
  vector<float> f(n);
#pragma omp parallel for
  for (int i = 0; i < n; ++i) f[i] = 0.;

  // Only the outer loop is divided among threads. Each pair (i, j) with
  // j > i is visited once; the force is added to particle i and subtracted
  // from particle j.
#pragma omp parallel for
  for (int i = 0; i < n; ++i)
    for (int j = i + 1; j < n; ++j) {
      const float x_ = x[i] - x[j];
      const float f_ = force(x_);
      // Use atomic to avoid a race condition
      // (the thread handling row i writes f[j], while another thread may be
      // updating that same element as its own f[i] or f[j])
#pragma omp atomic
      f[i] += f_;
#pragma omp atomic
      f[j] -= f_;
    }

  // Test
  {
    // Reference result: the same pairwise sums computed sequentially.
    vector<float> f0(n, 0.);

    for (int i = 0; i < n; ++i)
      for (int j = i + 1; j < n; ++j) {
        const float x_ = x[i] - x[j];
        const float f_ = force(x_);
        f0[i] += f_;
        f0[j] -= f_;
      }

    // Small differences are tolerated; presumably they come from the
    // different order in which the parallel version adds the terms.
    for (int i = 0; i < n; ++i) assert(fabs(f0[i] - f[i]) < 1e-4);

    // Largest absolute difference between the two computations.
    float max = 0;
    for (int i = 0; i < n; ++i)
      max = max < fabs(f0[i] - f[i]) ? fabs(f0[i] - f[i]) : max;
    printf("largest error %g\n", max);

    cout << "PASS\n";
  }

  return 0;
}

Overwriting nbody_omp.cpp


In [None]:
!g++ -o nbody_omp -fopenmp nbody_omp.cpp && ./nbody_omp

number of threads = 4
largest error 7.62939e-06
PASS


In [None]:
!OMP_NUM_THREADS=16 ./nbody_omp

number of threads = 4
largest error 1.14441e-05
PASS
