<a href="https://colab.research.google.com/github/ronglu-stanford/RL_reference_public/blob/main/4_mpi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MPI Hello World

In [None]:
%%file mpi_hello_world.cpp

#include <cassert>
#include <cstdio>

#include "mpi.h"

// Minimal MPI example: every rank prints a greeting, then rank 0 sends one
// integer to rank 1, which receives it and checks the value.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int rank = 0;
  int size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // The message below travels from rank 0 to rank 1, so both must exist.
  assert(size >= 2);

  printf("Hello from processor %2d out of %2d\n", rank, size);

  // Payload of the point-to-point message.
  int i = 0;

  switch (rank) {
    case 0:
      // Sender side.
      i = 123;
      printf("Processor 0 has set i = 123\n");
      MPI_Send(&i, 1, MPI_INT, /*dest=*/1, /*tag=*/0, MPI_COMM_WORLD);
      printf("Processor 0 has sent i to processor 1\n");
      break;

    case 1:
      // Receiver side: accepts any tag from rank 0 and ignores the status.
      MPI_Recv(&i, 1, MPI_INT, /*source=*/0, MPI_ANY_TAG, MPI_COMM_WORLD,
               MPI_STATUS_IGNORE);
      printf("Processor 1 has received i from processor 0; i = %2d\n", i);
      assert(i == 123);
      break;

    default:
      // All other ranks only print the greeting.
      break;
  }

  MPI_Finalize();
}

Overwriting mpi_hello_world.cpp


In [None]:
!name=mpi_hello_world && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

Hello from processor  0 out of  4
Hello from processor  2 out of  4
Processor 0 has set i = 123
Processor 0 has sent i to processor 1
Hello from processor  3 out of  4
Hello from processor  1 out of  4
Processor 1 has received i from processor 0; i = 123


# Laplacian smoothing using MPI

In [None]:
%%file laplacian.cpp

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "mpi.h"

using namespace std;

// Initial value of the field at global grid node (i, j) of an N1 x N2 grid.
// The indices are scaled to x = i / N1 and y = j / N2; one formula is used
// for x <= 0.5 and a different one for x > 0.5.
double init_field(const int N1, const int N2, const int i, const int j) {
  const double x = static_cast<double>(i) / N1;
  const double y = static_cast<double>(j) / N2;
  if (x <= 0.5) {
    return x * x + y;
  }
  return 1 + x + y * y;
}

// Serial reference check for the distributed solver.
// Recomputes the whole N1 x N2 problem on the calling rank (same initial
// field, ITER smoothing sweeps with weight EPS) and asserts that the n1 local
// rows stored in a_, which start at global row i0, are exactly equal to the
// corresponding rows of the reference. n2 is not used: the local row length
// is taken to be N2.
void test(const unsigned N1, const unsigned N2, const unsigned ITER,
          const double EPS, const unsigned n1, const unsigned n2,
          const unsigned i0, vector<double>& a_) {
  // Reference field, row-major with N2 entries per row.
  vector<double> a(N1 * N2);
  for (unsigned r = 0; r < N1; ++r) {
    const unsigned base = r * N2;
    for (unsigned c = 0; c < N2; ++c) {
      a[base + c] = init_field(N1, N2, r, c);
    }
  }

  vector<double> smoothed(N1 * N2);
  for (unsigned sweep = 0; sweep < ITER; ++sweep) {
    // One sweep over the interior nodes. The terms are added in the same
    // order as in main() because the final comparison uses ==.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        const unsigned up = mid - N2;
        const unsigned down = mid + N2;
        smoothed[mid] =
            a[mid] +
            EPS * (a[mid - 1] + a[mid + 1] - 8.0 * a[mid] +
                   a[up] + a[up - 1] + a[up + 1] +
                   a[down] + a[down + 1] + a[down - 1]);
      }
    }

    // Write the interior back; border rows and columns are never modified.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        a[mid] = smoothed[mid];
      }
    }
  }

  // Compare every local row (including the two extra rows) with the
  // reference row at the matching global index.
  for (unsigned i = 0; i < n1; ++i) {
    for (unsigned j = 0; j < N2; ++j) {
      assert(a_[i * N2 + j] == a[(i0 + i) * N2 + j]);
    }
  }
}

// Distributed Laplacian smoothing of an N1 x N2 field using blocking
// MPI_Send / MPI_Recv.
// The interior rows are split into contiguous slabs, one per rank. Each rank
// stores its slab plus one extra row on each side, runs ITER smoothing sweeps
// with a row exchange between neighbouring ranks after every sweep, prints
// local (and on rank 0, global) averages, and finally checks its rows against
// the serial recomputation in test().
int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);

  // Start of the timed region; the elapsed time is taken after the reduction
  // near the end of main().
  double time = MPI_Wtime();

  // Problem parameters: global grid of N1 rows by N2 columns, ITER smoothing
  // sweeps, and EPS as the weight applied to the stencil sum.
  const unsigned N1 = 8;
  const unsigned N2 = 4;
  const unsigned ITER = 4;
  const double EPS = 0.1;

  // Get identification wrt other ranks
  int i_rank, n_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &i_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);

  // Compute number of elements to process per rank: every rank gets the
  // integer share of the interior rows and the first u_rem ranks take one
  // extra row each.
  unsigned N1_inner = N1 - 2;  // Remove the borders
  unsigned n1 = N1_inner / n_ranks;
  const unsigned u_rem = N1_inner - n_ranks * n1;
  n1 += i_rank < u_rem ? 1 : 0;
  // Required if n_ranks does not divide N1_inner

  // Starting x index: global index of local row 0. For ranks below u_rem the
  // extra row is already included in n1; the remaining ranks add u_rem to
  // skip the extra rows held by the lower ranks.
  unsigned i0 = i_rank * n1;
  i0 += i_rank < u_rem ? 0 : u_rem;

  // Add boundary nodes: one extra row above and one below the owned rows.
  n1 += 2;

  // The column direction is not split, so the local row length is N2.
  unsigned n2 = N2;

  // Define processors to be used in communications. p_prev is -1 on rank 0
  // and p_next equals n_ranks on the last rank; both cases are checked before
  // any communication call.
  const int p_prev = i_rank - 1;
  const int p_next = i_rank + 1;

  printf(
      "rank = %2d, rank above %2d, rank below %2d, starting index i0 %2d, "
      "domain size n1 %2d\n",
      i_rank, p_prev, p_next, i0, n1);

  // 2D field array, row-major with n2 entries per row.
  vector<double> a(n1 * n2);

  // Initialize a: every local row, including the two extra rows, is filled
  // from the initial field at its global row index i + i0.
  for (int i = 0; i < n1; ++i) {
    for (int j = 0; j < n2; ++j) {
      a[i * n2 + j] = init_field(N1, N2, i + i0, j);
    }
  }

  // Scratch array receiving the updated interior values of one sweep.
  vector<double> b(n1 * n2);
  for (int niter = 0; niter < ITER; ++niter) {
    // Laplacian smoothing:
    //   new = old + EPS * (sum of the 8 neighbours - 8 * old).
    // The terms are added in the same order as in test(), which compares the
    // results with ==.
    for (int i = 1; i < n1 - 1; ++i) {
      const unsigned row = i * n2;
      for (int j = 1; j < n2 - 1; ++j) {
        b[row + j] =
            a[row + j] +
            EPS * (a[row + j - 1] + a[row + j + 1] - 8.0 * a[row + j] +
                   a[row + j - n2] + a[row + j - n2 - 1] + a[row + j - n2 + 1] +
                   a[row + j + n2] + a[row + j + n2 + 1] + a[row + j + n2 - 1]);
      }
    }

    // Copy b into a (interior nodes only; the first/last row and column are
    // left untouched).
    for (int i = 1; i < n1 - 1; ++i) {
      for (int j = 1; j < n2 - 1; ++j) {
        a[i * n2 + j] = b[i * n2 + j];
      }
    }

    // Copy from other processors
    // For this example, we could also have used MPI_Sendrecv to make the code
    // simpler.
    // Row n1 - 2 is sent to p_next and row 1 to p_prev; the neighbour's row
    // is received into row n1 - 1 and row 0 respectively.
    // Even ranks receive first and odd ranks send first, so the blocking
    // calls on neighbouring ranks pair up.
    if (i_rank % 2 == 0) {
      if (p_next < n_ranks) {
        MPI_Recv(&a[(n1 - 1) * n2], n2, MPI_DOUBLE, p_next, MPI_ANY_TAG,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Send(&a[(n1 - 2) * n2], n2, MPI_DOUBLE, p_next, 0, MPI_COMM_WORLD);
      }
      if (p_prev >= 0) {
        MPI_Recv(&a[0], n2, MPI_DOUBLE, p_prev, MPI_ANY_TAG, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
        MPI_Send(&a[n2], n2, MPI_DOUBLE, p_prev, 0, MPI_COMM_WORLD);
      }
    } else {
      if (p_prev >= 0) {
        MPI_Send(&a[n2], n2, MPI_DOUBLE, p_prev, 0, MPI_COMM_WORLD);
        MPI_Recv(&a[0], n2, MPI_DOUBLE, p_prev, MPI_ANY_TAG, MPI_COMM_WORLD,
                 MPI_STATUS_IGNORE);
      }
      if (p_next < n_ranks) {
        MPI_Send(&a[(n1 - 2) * n2], n2, MPI_DOUBLE, p_next, 0, MPI_COMM_WORLD);
        MPI_Recv(&a[(n1 - 1) * n2], n2, MPI_DOUBLE, p_next, MPI_ANY_TAG,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      }
    }
  }

  // Get average temperature: first the sum over the interior nodes held by
  // this rank.
  double sum_temperature = 0;
  for (int i = 1; i < (n1 - 1); ++i) {
    for (int j = 1; j < (n2 - 1); ++j) sum_temperature += a[i * n2 + j];
  }

  // Add up the per-rank sums; the result is delivered to rank 0 only.
  double total_temperature;
  MPI_Reduce(&sum_temperature, &total_temperature, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  // NOTE(review): the divisor is zero for a rank that holds no interior rows
  // (n1 == 2), which happens when there are more ranks than interior rows.
  double local_temperature = sum_temperature / (double)((n2 - 2) * (n1 - 2));

  time = MPI_Wtime() - time;

  if (i_rank == 0) {
    // Global average over all (N1 - 2) * (N2 - 2) interior nodes.
    double average_temperature =
        total_temperature / (double)((N1 - 2) * (N2 - 2));
    printf("rank = %2d, time = %f, local temp = %.3f, global temp = %.3f\n",
           i_rank, time, local_temperature, average_temperature);
  } else {
    printf("rank = %2d, time = %f, local temp = %.3f\n", i_rank, time,
           local_temperature);
  }

  // Test: compare the local rows with a serial recomputation. A mismatch
  // trips an assert inside test() (when asserts are enabled), so the PASS
  // line is printed only after the check went through.
  test(N1, N2, ITER, EPS, n1, n2, i0, a);
  printf("rank = %2d PASS\n", i_rank);

  MPI_Finalize();
}

Overwriting laplacian.cpp


In [None]:
!name=laplacian && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

rank =  1, rank above  0, rank below  2, starting index i0  2, domain size n1  4
rank =  2, rank above  1, rank below  3, starting index i0  4, domain size n1  3
rank =  0, rank above -1, rank below  1, starting index i0  0, domain size n1  4
rank =  3, rank above  2, rank below  4, starting index i0  5, domain size n1  3
rank =  2, time = 0.003677, local temp = 1.480
rank =  1, time = 0.003818, local temp = 0.838
rank =  1 PASS
rank =  2 PASS
rank =  3, time = 0.000635, local temp = 1.876
rank =  3 PASS
rank =  0, time = 0.003938, local temp = 0.447, global temp = 0.988
rank =  0 PASS


# Exercise: computing $\pi$ using MPI

We want to estimate $\pi$ using the formula:

$$\pi = \int_0^1 \frac{4}{1+x^2} \, dx $$

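The integral is approximated using a quadrature rule. The skeleton below uses the midpoint rule with $n$ equal subintervals: $\pi \approx \frac{4}{n} \sum_{i=0}^{n-1} \frac{1}{1 + x_i^2}$ with $x_i = (i + 0.5)/n$.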

Write an MPI program to estimate $\pi$ using this formula.

Here is a skeleton code to get started:
```
  int n = 10000;
  double s = 0.;
  for (int i = 0; i < n; ++i) {
    double x = double(i + 0.5) / n;
    s += 1. / (1. + x * x);
  }
  s *= 4. / n;
```
Use `MPI_Reduce` to calculate the final sum.

Syntax:
```
MPI_Reduce(&sendbuf,&recvbuf,count,datatype,op,root,comm)
```

In [None]:
%%file pi_mpi.cpp

#include <cstdio>

#include "mpi.h"

// Exercise skeleton: estimate pi = integral of 4 / (1 + x^2) over [0, 1] with
// the midpoint rule, the quadrature points being shared among the MPI ranks.
// The reduction that combines the per-rank partial sums is left to the reader.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // Each rank handles the points rank, rank + size, rank + 2 * size, ...
  int n = 10000;
  double s = 0.;
  for (int i = rank; i < n; i += size) {
    double x = double(i + 0.5) / n;
    s += 1. / (1. + x * x);
  }
  // Scale by the integrand's factor 4 and the interval width 1 / n.
  s *= 4. / n;

  // Initialized to 0 so that, until the reduction is filled in, rank 0 prints
  // a well-defined placeholder instead of reading an uninitialized variable.
  double total = 0.;

  // Fill-in code here to compute the reduction using a collective communication
  // MPI_Reduce(...);

  if (rank == 0) {
    printf("Pi estimate %18.15f\nExact        3.14159265359\n", total);
  }
  MPI_Finalize();
}

Overwriting pi_mpi.cpp


In [None]:
# The output is
# Pi estimate  0.000000000000000
# until the code is implemented correctly.
# The correct output is
# Pi estimate  3.141592654423124
# Exact        3.14159265359

!name=pi_mpi && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

Pi estimate  0.000000000000000
Exact        3.14159265359


# Exercise: `MPI_Sendrecv`

Implement the Laplacian smoothing algorithm using `MPI_Sendrecv`.

Here is the syntax:

```
int MPI_Sendrecv(
  const void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, 
  void *recvbuf, int recvcount, MPI_Datatype recvtype, int source, int recvtag,
  MPI_Comm comm, MPI_Status *status)
```

Try to complete the exercise before reading the solution below.

In [None]:
%%file laplacian_sendrecv.cpp

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "mpi.h"

using namespace std;

// Initial value of the field at global grid node (i, j) of an N1 x N2 grid.
// The indices are scaled to x = i / N1 and y = j / N2; one formula is used
// for x <= 0.5 and a different one for x > 0.5.
double init_field(const int N1, const int N2, const int i, const int j) {
  const double x = static_cast<double>(i) / N1;
  const double y = static_cast<double>(j) / N2;
  if (x <= 0.5) {
    return x * x + y;
  }
  return 1 + x + y * y;
}

// Serial reference check for the distributed solver.
// Recomputes the whole N1 x N2 problem on the calling rank (same initial
// field, ITER smoothing sweeps with weight EPS) and asserts that the n1 local
// rows stored in a_, which start at global row i0, are exactly equal to the
// corresponding rows of the reference. n2 is not used: the local row length
// is taken to be N2.
void test(const unsigned N1, const unsigned N2, const unsigned ITER,
          const double EPS, const unsigned n1, const unsigned n2,
          const unsigned i0, vector<double>& a_) {
  // Reference field, row-major with N2 entries per row.
  vector<double> a(N1 * N2);
  for (unsigned r = 0; r < N1; ++r) {
    const unsigned base = r * N2;
    for (unsigned c = 0; c < N2; ++c) {
      a[base + c] = init_field(N1, N2, r, c);
    }
  }

  vector<double> smoothed(N1 * N2);
  for (unsigned sweep = 0; sweep < ITER; ++sweep) {
    // One sweep over the interior nodes. The terms are added in the same
    // order as in main() because the final comparison uses ==.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        const unsigned up = mid - N2;
        const unsigned down = mid + N2;
        smoothed[mid] =
            a[mid] +
            EPS * (a[mid - 1] + a[mid + 1] - 8.0 * a[mid] +
                   a[up] + a[up - 1] + a[up + 1] +
                   a[down] + a[down + 1] + a[down - 1]);
      }
    }

    // Write the interior back; border rows and columns are never modified.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        a[mid] = smoothed[mid];
      }
    }
  }

  // Compare every local row (including the two extra rows) with the
  // reference row at the matching global index.
  for (unsigned i = 0; i < n1; ++i) {
    for (unsigned j = 0; j < N2; ++j) {
      assert(a_[i * N2 + j] == a[(i0 + i) * N2 + j]);
    }
  }
}

// Exercise skeleton: distributed Laplacian smoothing of an N1 x N2 field in
// which the row exchange between neighbouring ranks is to be written with
// MPI_Sendrecv.
// The interior rows are split into contiguous slabs, one per rank; each rank
// stores its slab plus one extra row on each side and runs ITER smoothing
// sweeps. The exchange after each sweep is intentionally missing, so on more
// than one rank the comparison in test() fails until it is filled in.
int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);

  // Start of the timed region; the elapsed time is taken after the reduction
  // near the end of main().
  double time = MPI_Wtime();

  // Problem parameters: global grid of N1 rows by N2 columns, ITER smoothing
  // sweeps, and EPS as the weight applied to the stencil sum.
  const unsigned N1 = 8;
  const unsigned N2 = 4;
  const unsigned ITER = 4;
  const double EPS = 0.1;

  // Get identification wrt other ranks
  int i_rank, n_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &i_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);

  // Compute number of elements to process per rank: every rank gets the
  // integer share of the interior rows and the first u_rem ranks take one
  // extra row each.
  unsigned N1_inner = N1 - 2;  // Remove the borders
  unsigned n1 = N1_inner / n_ranks;
  const unsigned u_rem = N1_inner - n_ranks * n1;
  n1 += i_rank < u_rem ? 1 : 0;
  // Required if n_ranks does not divide N1_inner

  // Starting x index: global index of local row 0. For ranks below u_rem the
  // extra row is already included in n1; the remaining ranks add u_rem to
  // skip the extra rows held by the lower ranks.
  unsigned i0 = i_rank * n1;
  i0 += i_rank < u_rem ? 0 : u_rem;

  // Add boundary nodes: one extra row above and one below the owned rows.
  n1 += 2;

  // The column direction is not split, so the local row length is N2.
  unsigned n2 = N2;

  // Define processors to be used in communications. p_prev is -1 on rank 0
  // and p_next equals n_ranks on the last rank, i.e. neither is a valid rank
  // there.
  const int p_prev = i_rank - 1;
  const int p_next = i_rank + 1;

  printf(
      "rank = %2d, rank above %2d, rank below %2d, starting index i0 %2d, "
      "domain size n1 %2d\n",
      i_rank, p_prev, p_next, i0, n1);

  // 2D field array, row-major with n2 entries per row.
  vector<double> a(n1 * n2);

  // Initialize a: every local row, including the two extra rows, is filled
  // from the initial field at its global row index i + i0.
  for (int i = 0; i < n1; ++i) {
    for (int j = 0; j < n2; ++j) {
      a[i * n2 + j] = init_field(N1, N2, i + i0, j);
    }
  }

  // Scratch array receiving the updated interior values of one sweep.
  vector<double> b(n1 * n2);
  for (int niter = 0; niter < ITER; ++niter) {
    // Laplacian smoothing:
    //   new = old + EPS * (sum of the 8 neighbours - 8 * old).
    // The terms are added in the same order as in test(), which compares the
    // results with ==.
    for (int i = 1; i < n1 - 1; ++i) {
      const unsigned row = i * n2;
      for (int j = 1; j < n2 - 1; ++j) {
        b[row + j] =
            a[row + j] +
            EPS * (a[row + j - 1] + a[row + j + 1] - 8.0 * a[row + j] +
                   a[row + j - n2] + a[row + j - n2 - 1] + a[row + j - n2 + 1] +
                   a[row + j + n2] + a[row + j + n2 + 1] + a[row + j + n2 - 1]);
      }
    }

    // Copy b into a (interior nodes only; the first/last row and column are
    // left untouched).
    for (int i = 1; i < n1 - 1; ++i) {
      for (int j = 1; j < n2 - 1; ++j) {
        a[i * n2 + j] = b[i * n2 + j];
      }
    }

    // Copy from other processors
    // fill-in your code here
    // You need to call MPI_Sendrecv twice, in order to communicate with
    // p_prev and p_next.
    // MPI_Sendrecv()
    // MPI_Sendrecv()
  }

  // Get average temperature: first the sum over the interior nodes held by
  // this rank.
  double sum_temperature = 0;
  for (int i = 1; i < (n1 - 1); ++i) {
    for (int j = 1; j < (n2 - 1); ++j) sum_temperature += a[i * n2 + j];
  }

  // Add up the per-rank sums; the result is delivered to rank 0 only.
  double total_temperature;
  MPI_Reduce(&sum_temperature, &total_temperature, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  // NOTE(review): the divisor is zero for a rank that holds no interior rows
  // (n1 == 2), which happens when there are more ranks than interior rows.
  double local_temperature = sum_temperature / (double)((n2 - 2) * (n1 - 2));

  time = MPI_Wtime() - time;

  if (i_rank == 0) {
    // Global average over all (N1 - 2) * (N2 - 2) interior nodes.
    double average_temperature =
        total_temperature / (double)((N1 - 2) * (N2 - 2));
    printf("rank = %2d, time = %f, local temp = %.3f, global temp = %.3f\n",
           i_rank, time, local_temperature, average_temperature);
  } else {
    printf("rank = %2d, time = %f, local temp = %.3f\n", i_rank, time,
           local_temperature);
  }

  // Test: compare the local rows with a serial recomputation. A mismatch
  // trips an assert inside test() (when asserts are enabled), so the PASS
  // line is printed only after the check went through.
  test(N1, N2, ITER, EPS, n1, n2, i0, a);
  printf("rank = %2d PASS\n", i_rank);

  MPI_Finalize();
}

Overwriting laplacian_sendrecv.cpp


In [None]:
# You will get error messages until the assert statements in test() pass
!name=laplacian_sendrecv && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

rank =  2, rank above  1, rank below  3, starting index i0  4, domain size n1  3
rank =  3, rank above  2, rank below  4, starting index i0  5, domain size n1  3
rank =  3, time = 0.000070, local temp = 1.959
laplacian_sendrecv: laplacian_sendrecv.cpp:55: void test(unsigned int, unsigned int, unsigned int, double, unsigned int, unsigned int, unsigned int, std::vector<double>&): Assertion `a_[i * N2 + j] == a[(i0 + i) * N2 + j]' failed.
[21bddc4f2ef3:01364] *** Process received signal ***
[21bddc4f2ef3:01364] Signal: Aborted (6)
[21bddc4f2ef3:01364] Signal code:  (-6)
rank =  2, time = 0.000206, local temp = 1.378
[21bddc4f2ef3:01364] [ 0] laplacian_sendrecv: laplacian_sendrecv.cpp:55: void test(unsigned int, unsigned int, unsigned int, double, unsigned int, unsigned int, unsigned int, std::vector<double>&): Assertion `a_[i * N2 + j] == a[(i0 + i) * N2 + j]' failed.
/lib/x86_64-linux-gnu/libc.so.6(+0x3f040)[0x7ff5d0ea0040]
[21bddc4f2ef3:01364] [ 1] /lib/x86_64-linux-gnu/libc.so.6(gsigna

# Exercise: non-blocking communications
Try to complete the exercise before reading the solution below.

Syntax:
```
int MPI_Isend(
  const void *buf, int count, MPI_Datatype datatype, 
  int dest, int tag, 
  MPI_Comm comm, MPI_Request *request)

int MPI_Irecv(
  void *buf, int count, MPI_Datatype datatype,
  int source, int tag, 
  MPI_Comm comm, MPI_Request *request)

int MPI_Waitall(int count, 
    MPI_Request array_of_requests[],
    MPI_Status *array_of_statuses)  
```

In [None]:
%%file laplacian_nb.cpp

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "mpi.h"

using namespace std;

// Initial value of the field at global grid node (i, j) of an N1 x N2 grid.
// The indices are scaled to x = i / N1 and y = j / N2; one formula is used
// for x <= 0.5 and a different one for x > 0.5.
double init_field(const int N1, const int N2, const int i, const int j) {
  const double x = static_cast<double>(i) / N1;
  const double y = static_cast<double>(j) / N2;
  if (x <= 0.5) {
    return x * x + y;
  }
  return 1 + x + y * y;
}

// Serial reference check for the distributed solver.
// Recomputes the whole N1 x N2 problem on the calling rank (same initial
// field, ITER smoothing sweeps with weight EPS) and asserts that the n1 local
// rows stored in a_, which start at global row i0, are exactly equal to the
// corresponding rows of the reference. n2 is not used: the local row length
// is taken to be N2.
void test(const unsigned N1, const unsigned N2, const unsigned ITER,
          const double EPS, const unsigned n1, const unsigned n2,
          const unsigned i0, vector<double>& a_) {
  // Reference field, row-major with N2 entries per row.
  vector<double> a(N1 * N2);
  for (unsigned r = 0; r < N1; ++r) {
    const unsigned base = r * N2;
    for (unsigned c = 0; c < N2; ++c) {
      a[base + c] = init_field(N1, N2, r, c);
    }
  }

  vector<double> smoothed(N1 * N2);
  for (unsigned sweep = 0; sweep < ITER; ++sweep) {
    // One sweep over the interior nodes. The terms are added in the same
    // order as in main() because the final comparison uses ==.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        const unsigned up = mid - N2;
        const unsigned down = mid + N2;
        smoothed[mid] =
            a[mid] +
            EPS * (a[mid - 1] + a[mid + 1] - 8.0 * a[mid] +
                   a[up] + a[up - 1] + a[up + 1] +
                   a[down] + a[down + 1] + a[down - 1]);
      }
    }

    // Write the interior back; border rows and columns are never modified.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        a[mid] = smoothed[mid];
      }
    }
  }

  // Compare every local row (including the two extra rows) with the
  // reference row at the matching global index.
  for (unsigned i = 0; i < n1; ++i) {
    for (unsigned j = 0; j < N2; ++j) {
      assert(a_[i * N2 + j] == a[(i0 + i) * N2 + j]);
    }
  }
}

// Exercise skeleton: distributed Laplacian smoothing of an N1 x N2 field in
// which the row exchange between neighbouring ranks is to be written with
// non-blocking MPI_Irecv / MPI_Isend followed by MPI_Waitall.
// The interior rows are split into contiguous slabs, one per rank; each rank
// stores its slab plus one extra row on each side and runs ITER smoothing
// sweeps. The exchange after each sweep is intentionally missing, so on more
// than one rank the comparison in test() fails until it is filled in.
int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);

  // Start of the timed region; the elapsed time is taken after the reduction
  // near the end of main().
  double time = MPI_Wtime();

  // Problem parameters: global grid of N1 rows by N2 columns, ITER smoothing
  // sweeps, and EPS as the weight applied to the stencil sum.
  const unsigned N1 = 8;
  const unsigned N2 = 4;
  const unsigned ITER = 4;
  const double EPS = 0.1;

  // Get identification wrt other ranks
  int i_rank, n_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &i_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);

  // Compute number of elements to process per rank: every rank gets the
  // integer share of the interior rows and the first u_rem ranks take one
  // extra row each.
  unsigned N1_inner = N1 - 2;  // Remove the borders
  unsigned n1 = N1_inner / n_ranks;
  const unsigned u_rem = N1_inner - n_ranks * n1;
  n1 += i_rank < u_rem ? 1 : 0;
  // Required if n_ranks does not divide N1_inner

  // Starting x index: global index of local row 0. For ranks below u_rem the
  // extra row is already included in n1; the remaining ranks add u_rem to
  // skip the extra rows held by the lower ranks.
  unsigned i0 = i_rank * n1;
  i0 += i_rank < u_rem ? 0 : u_rem;

  // Add boundary nodes: one extra row above and one below the owned rows.
  n1 += 2;

  // The column direction is not split, so the local row length is N2.
  unsigned n2 = N2;

  // Define processors to be used in communications. p_prev is -1 on rank 0
  // and p_next equals n_ranks on the last rank, i.e. neither is a valid rank
  // there.
  const int p_prev = i_rank - 1;
  const int p_next = i_rank + 1;

  printf(
      "rank = %2d, rank above %2d, rank below %2d, starting index i0 %2d, "
      "domain size n1 %2d\n",
      i_rank, p_prev, p_next, i0, n1);

  // 2D field array, row-major with n2 entries per row.
  vector<double> a(n1 * n2);

  // Initialize a: every local row, including the two extra rows, is filled
  // from the initial field at its global row index i + i0.
  for (int i = 0; i < n1; ++i) {
    for (int j = 0; j < n2; ++j) {
      a[i * n2 + j] = init_field(N1, N2, i + i0, j);
    }
  }

  // Scratch array receiving the updated interior values of one sweep.
  vector<double> b(n1 * n2);
  for (int niter = 0; niter < ITER; ++niter) {
    // Laplacian smoothing:
    //   new = old + EPS * (sum of the 8 neighbours - 8 * old).
    // The terms are added in the same order as in test(), which compares the
    // results with ==.
    for (int i = 1; i < n1 - 1; ++i) {
      const unsigned row = i * n2;
      for (int j = 1; j < n2 - 1; ++j) {
        b[row + j] =
            a[row + j] +
            EPS * (a[row + j - 1] + a[row + j + 1] - 8.0 * a[row + j] +
                   a[row + j - n2] + a[row + j - n2 - 1] + a[row + j - n2 + 1] +
                   a[row + j + n2] + a[row + j + n2 + 1] + a[row + j + n2 - 1]);
      }
    }

    // Copy b into a (interior nodes only; the first/last row and column are
    // left untouched).
    for (int i = 1; i < n1 - 1; ++i) {
      for (int j = 1; j < n2 - 1; ++j) {
        a[i * n2 + j] = b[i * n2 + j];
      }
    }

    // Copy from other processors
    // Number of non-blocking operations this rank posts: one receive and one
    // send per existing neighbour.
    int n_requests = 0;
    n_requests += p_next < n_ranks ? 2 : 0;
    n_requests += p_prev >= 0 ? 2 : 0;
    // Fixed capacity (2 neighbours x 2 operations). A variable-length array
    // sized by n_requests is not standard C++ and would have length zero on a
    // single rank.
    MPI_Request request[4];
    // Next free slot in request[]; advance it for every posted operation.
    int i_req = 0;
    // Fill-in code here
    // Use MPI_Irecv(...), MPI_Isend(...), and
    // MPI_Waitall(...)
    // Syntax for MPI_Waitall:
    // MPI_Waitall(n_requests, request, MPI_STATUSES_IGNORE);
    // request should be initialized correctly in MPI_Irecv and MPI_Isend.
  }

  // Get average temperature: first the sum over the interior nodes held by
  // this rank.
  double sum_temperature = 0;
  for (int i = 1; i < (n1 - 1); ++i) {
    for (int j = 1; j < (n2 - 1); ++j) sum_temperature += a[i * n2 + j];
  }

  // Add up the per-rank sums; the result is delivered to rank 0 only.
  double total_temperature;
  MPI_Reduce(&sum_temperature, &total_temperature, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  // NOTE(review): the divisor is zero for a rank that holds no interior rows
  // (n1 == 2), which happens when there are more ranks than interior rows.
  double local_temperature = sum_temperature / (double)((n2 - 2) * (n1 - 2));

  time = MPI_Wtime() - time;

  if (i_rank == 0) {
    // Global average over all (N1 - 2) * (N2 - 2) interior nodes.
    double average_temperature =
        total_temperature / (double)((N1 - 2) * (N2 - 2));
    printf("rank = %2d, time = %f, local temp = %.3f, global temp = %.3f\n",
           i_rank, time, local_temperature, average_temperature);
  } else {
    printf("rank = %2d, time = %f, local temp = %.3f\n", i_rank, time,
           local_temperature);
  }

  // Test: compare the local rows with a serial recomputation. A mismatch
  // trips an assert inside test() (when asserts are enabled), so the PASS
  // line is printed only after the check went through.
  test(N1, N2, ITER, EPS, n1, n2, i0, a);
  printf("rank = %2d PASS\n", i_rank);

  MPI_Finalize();
}

Overwriting laplacian_nb.cpp


In [None]:
# You will get error messages until the assert statements in test() pass
!name=laplacian_nb && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

rank =  1, rank above  0, rank below  2, starting index i0  2, domain size n1  4
rank =  3, rank above  2, rank below  4, starting index i0  5, domain size n1  3
rank =  1, time = 0.000073, local temp = 0.885
laplacian_nb: laplacian_nb.cpp:55: void test(unsigned int, unsigned int, unsigned int, double, unsigned int, unsigned int, unsigned int, std::vector<double>&): Assertion `a_[i * N2 + j] == a[(i0 + i) * N2 + j]' failed.
[21bddc4f2ef3:01384] *** Process received signal ***
[21bddc4f2ef3:01384] Signal: Aborted (6)
[21bddc4f2ef3:01384] Signal code:  (-6)
[21bddc4f2ef3:01384] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x3f040)[0x7fc18a0ff040]
[21bddc4f2ef3:01384] [ 1] rank =  3, time = 0.000275, local temp = 1.959
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7)[0x7fc18a0fefb7]
[21bddc4f2ef3:01384] [ 2] /lib/x86_64-linux-gnu/libc.so.6(abort+0x141)[0x7fc18a100921]
[21bddc4f2ef3:01384] [ 3] /lib/x86_64-linux-gnu/libc.so.6(+0x3048a)[0x7fc18a0f048a]
[21bddc4f2ef3:01384] [ 4] /lib/x86_64-linux-gnu/

# Solutions

## Computing $\pi$ using MPI

In [None]:
%%file pi_mpi.cpp

#include <cstdio>

#include "mpi.h"

// Estimates pi = integral of 4 / (1 + x^2) over [0, 1] with the midpoint
// rule. The quadrature points are dealt out to the ranks round-robin, each
// rank accumulates a partial sum, and MPI_Reduce adds the partial sums on
// rank 0, which prints the estimate.
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int my_rank;
  int n_procs;
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_procs);

  const int n_points = 10000;

  // This rank's share: points my_rank, my_rank + n_procs, ...
  double partial = 0.;
  int k = my_rank;
  while (k < n_points) {
    const double x = double(k + 0.5) / n_points;
    partial += 1. / (1. + x * x);
    k += n_procs;
  }
  // Factor 4 from the integrand times the interval width 1 / n_points.
  partial *= 4. / n_points;

  // Only rank 0 receives the reduced value, and only rank 0 reads it.
  double pi_estimate;
  MPI_Reduce(&partial, &pi_estimate, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  if (my_rank == 0) {
    printf("Pi estimate %18.15f\nExact        3.14159265359\n", pi_estimate);
  }

  MPI_Finalize();
}

Overwriting pi_mpi.cpp


In [None]:
!name=pi_mpi && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

Pi estimate  3.141592654423124
Exact        3.14159265359


## MPI_Sendrecv

In [None]:
%%file laplacian_sendrecv.cpp

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "mpi.h"

using namespace std;

// Initial value of the field at global grid node (i, j) of an N1 x N2 grid.
// The indices are scaled to x = i / N1 and y = j / N2; one formula is used
// for x <= 0.5 and a different one for x > 0.5.
double init_field(const int N1, const int N2, const int i, const int j) {
  const double x = static_cast<double>(i) / N1;
  const double y = static_cast<double>(j) / N2;
  if (x <= 0.5) {
    return x * x + y;
  }
  return 1 + x + y * y;
}

// Serial reference check for the distributed solver.
// Recomputes the whole N1 x N2 problem on the calling rank (same initial
// field, ITER smoothing sweeps with weight EPS) and asserts that the n1 local
// rows stored in a_, which start at global row i0, are exactly equal to the
// corresponding rows of the reference. n2 is not used: the local row length
// is taken to be N2.
void test(const unsigned N1, const unsigned N2, const unsigned ITER,
          const double EPS, const unsigned n1, const unsigned n2,
          const unsigned i0, vector<double>& a_) {
  // Reference field, row-major with N2 entries per row.
  vector<double> a(N1 * N2);
  for (unsigned r = 0; r < N1; ++r) {
    const unsigned base = r * N2;
    for (unsigned c = 0; c < N2; ++c) {
      a[base + c] = init_field(N1, N2, r, c);
    }
  }

  vector<double> smoothed(N1 * N2);
  for (unsigned sweep = 0; sweep < ITER; ++sweep) {
    // One sweep over the interior nodes. The terms are added in the same
    // order as in main() because the final comparison uses ==.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        const unsigned up = mid - N2;
        const unsigned down = mid + N2;
        smoothed[mid] =
            a[mid] +
            EPS * (a[mid - 1] + a[mid + 1] - 8.0 * a[mid] +
                   a[up] + a[up - 1] + a[up + 1] +
                   a[down] + a[down + 1] + a[down - 1]);
      }
    }

    // Write the interior back; border rows and columns are never modified.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        a[mid] = smoothed[mid];
      }
    }
  }

  // Compare every local row (including the two extra rows) with the
  // reference row at the matching global index.
  for (unsigned i = 0; i < n1; ++i) {
    for (unsigned j = 0; j < N2; ++j) {
      assert(a_[i * N2 + j] == a[(i0 + i) * N2 + j]);
    }
  }
}

// Solution: distributed Laplacian smoothing of an N1 x N2 field with the row
// exchange between neighbouring ranks done by MPI_Sendrecv.
// The interior rows are split into contiguous slabs, one per rank. Each rank
// stores its slab plus one extra row on each side, runs ITER smoothing sweeps
// with a row exchange after every sweep, prints local (and on rank 0, global)
// averages, and finally checks its rows against the serial recomputation in
// test().
int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);

  // Start of the timed region; the elapsed time is taken after the reduction
  // near the end of main().
  double time = MPI_Wtime();

  // Problem parameters: global grid of N1 rows by N2 columns, ITER smoothing
  // sweeps, and EPS as the weight applied to the stencil sum.
  const unsigned N1 = 8;
  const unsigned N2 = 4;
  const unsigned ITER = 4;
  const double EPS = 0.1;

  // Get identification wrt other ranks
  int i_rank, n_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &i_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);

  // Compute number of elements to process per rank: every rank gets the
  // integer share of the interior rows and the first u_rem ranks take one
  // extra row each.
  unsigned N1_inner = N1 - 2;  // Remove the borders
  unsigned n1 = N1_inner / n_ranks;
  const unsigned u_rem = N1_inner - n_ranks * n1;
  n1 += i_rank < u_rem ? 1 : 0;
  // Required if n_ranks does not divide N1_inner

  // Starting x index: global index of local row 0. For ranks below u_rem the
  // extra row is already included in n1; the remaining ranks add u_rem to
  // skip the extra rows held by the lower ranks.
  unsigned i0 = i_rank * n1;
  i0 += i_rank < u_rem ? 0 : u_rem;

  // Add boundary nodes: one extra row above and one below the owned rows.
  n1 += 2;

  // The column direction is not split, so the local row length is N2.
  unsigned n2 = N2;

  // Define processors to be used in communications. p_prev is -1 on rank 0
  // and p_next equals n_ranks on the last rank; both cases are checked before
  // any communication call.
  const int p_prev = i_rank - 1;
  const int p_next = i_rank + 1;

  printf(
      "rank = %2d, rank above %2d, rank below %2d, starting index i0 %2d, "
      "domain size n1 %2d\n",
      i_rank, p_prev, p_next, i0, n1);

  // 2D field array, row-major with n2 entries per row.
  vector<double> a(n1 * n2);

  // Initialize a: every local row, including the two extra rows, is filled
  // from the initial field at its global row index i + i0.
  for (int i = 0; i < n1; ++i) {
    for (int j = 0; j < n2; ++j) {
      a[i * n2 + j] = init_field(N1, N2, i + i0, j);
    }
  }

  // Scratch array receiving the updated interior values of one sweep.
  vector<double> b(n1 * n2);
  for (int niter = 0; niter < ITER; ++niter) {
    // Laplacian smoothing:
    //   new = old + EPS * (sum of the 8 neighbours - 8 * old).
    // The terms are added in the same order as in test(), which compares the
    // results with ==.
    for (int i = 1; i < n1 - 1; ++i) {
      const unsigned row = i * n2;
      for (int j = 1; j < n2 - 1; ++j) {
        b[row + j] =
            a[row + j] +
            EPS * (a[row + j - 1] + a[row + j + 1] - 8.0 * a[row + j] +
                   a[row + j - n2] + a[row + j - n2 - 1] + a[row + j - n2 + 1] +
                   a[row + j + n2] + a[row + j + n2 + 1] + a[row + j + n2 - 1]);
      }
    }

    // Copy b into a (interior nodes only; the first/last row and column are
    // left untouched).
    for (int i = 1; i < n1 - 1; ++i) {
      for (int j = 1; j < n2 - 1; ++j) {
        a[i * n2 + j] = b[i * n2 + j];
      }
    }

    // Copy from other processors
    // Each MPI_Sendrecv sends one row to a neighbour and receives that
    // neighbour's row in the same call: row n1 - 2 goes to p_next and the
    // reply lands in row n1 - 1; row 1 goes to p_prev and the reply lands in
    // row 0. Even ranks talk to p_next first, odd ranks to p_prev first.
    if (i_rank % 2 == 0) {
      if (p_next < n_ranks)
        MPI_Sendrecv(&a[(n1 - 2) * n2], n2, MPI_DOUBLE, p_next, 0,
                     &a[(n1 - 1) * n2], n2, MPI_DOUBLE, p_next, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      if (p_prev >= 0)
        MPI_Sendrecv(&a[n2], n2, MPI_DOUBLE, p_prev, 0, &a[0], n2, MPI_DOUBLE,
                     p_prev, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else {
      if (p_prev >= 0)
        MPI_Sendrecv(&a[n2], n2, MPI_DOUBLE, p_prev, 0, &a[0], n2, MPI_DOUBLE,
                     p_prev, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      if (p_next < n_ranks)
        MPI_Sendrecv(&a[(n1 - 2) * n2], n2, MPI_DOUBLE, p_next, 0,
                     &a[(n1 - 1) * n2], n2, MPI_DOUBLE, p_next, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
  }

  // Get average temperature: first the sum over the interior nodes held by
  // this rank.
  double sum_temperature = 0;
  for (int i = 1; i < (n1 - 1); ++i) {
    for (int j = 1; j < (n2 - 1); ++j) sum_temperature += a[i * n2 + j];
  }

  // Add up the per-rank sums; the result is delivered to rank 0 only.
  double total_temperature;
  MPI_Reduce(&sum_temperature, &total_temperature, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  // NOTE(review): the divisor is zero for a rank that holds no interior rows
  // (n1 == 2), which happens when there are more ranks than interior rows.
  double local_temperature = sum_temperature / (double)((n2 - 2) * (n1 - 2));

  time = MPI_Wtime() - time;

  if (i_rank == 0) {
    // Global average over all (N1 - 2) * (N2 - 2) interior nodes.
    double average_temperature =
        total_temperature / (double)((N1 - 2) * (N2 - 2));
    printf("rank = %2d, time = %f, local temp = %.3f, global temp = %.3f\n",
           i_rank, time, local_temperature, average_temperature);
  } else {
    printf("rank = %2d, time = %f, local temp = %.3f\n", i_rank, time,
           local_temperature);
  }

  // Test: compare the local rows with a serial recomputation. A mismatch
  // trips an assert inside test() (when asserts are enabled), so the PASS
  // line is printed only after the check went through.
  test(N1, N2, ITER, EPS, n1, n2, i0, a);
  printf("rank = %2d PASS\n", i_rank);

  MPI_Finalize();
}

Overwriting laplacian_sendrecv.cpp


In [None]:
!name=laplacian_sendrecv && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

rank =  1, rank above  0, rank below  2, starting index i0  2, domain size n1  4
rank =  3, rank above  2, rank below  4, starting index i0  5, domain size n1  3
rank =  0, rank above -1, rank below  1, starting index i0  0, domain size n1  4
rank =  2, rank above  1, rank below  3, starting index i0  4, domain size n1  3
rank =  3, time = 0.000520, local temp = 1.876
rank =  2, time = 0.000150, local temp = 1.480
rank =  2 PASS
rank =  3 PASS
rank =  1, time = 0.000680, local temp = 0.838
rank =  1 PASS
rank =  0, time = 0.000566, local temp = 0.447, global temp = 0.988
rank =  0 PASS


## Non-blocking communications

In [None]:
%%file laplacian_nb.cpp

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "mpi.h"

using namespace std;

// Initial value of the field at global grid node (i, j) of an N1 x N2 grid.
// The indices are scaled to x = i / N1 and y = j / N2; one formula is used
// for x <= 0.5 and a different one for x > 0.5.
double init_field(const int N1, const int N2, const int i, const int j) {
  const double x = static_cast<double>(i) / N1;
  const double y = static_cast<double>(j) / N2;
  if (x <= 0.5) {
    return x * x + y;
  }
  return 1 + x + y * y;
}

// Serial reference check for the distributed solver.
// Recomputes the whole N1 x N2 problem on the calling rank (same initial
// field, ITER smoothing sweeps with weight EPS) and asserts that the n1 local
// rows stored in a_, which start at global row i0, are exactly equal to the
// corresponding rows of the reference. n2 is not used: the local row length
// is taken to be N2.
void test(const unsigned N1, const unsigned N2, const unsigned ITER,
          const double EPS, const unsigned n1, const unsigned n2,
          const unsigned i0, vector<double>& a_) {
  // Reference field, row-major with N2 entries per row.
  vector<double> a(N1 * N2);
  for (unsigned r = 0; r < N1; ++r) {
    const unsigned base = r * N2;
    for (unsigned c = 0; c < N2; ++c) {
      a[base + c] = init_field(N1, N2, r, c);
    }
  }

  vector<double> smoothed(N1 * N2);
  for (unsigned sweep = 0; sweep < ITER; ++sweep) {
    // One sweep over the interior nodes. The terms are added in the same
    // order as in main() because the final comparison uses ==.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        const unsigned up = mid - N2;
        const unsigned down = mid + N2;
        smoothed[mid] =
            a[mid] +
            EPS * (a[mid - 1] + a[mid + 1] - 8.0 * a[mid] +
                   a[up] + a[up - 1] + a[up + 1] +
                   a[down] + a[down + 1] + a[down - 1]);
      }
    }

    // Write the interior back; border rows and columns are never modified.
    for (unsigned r = 1; r < (N1 - 1); ++r) {
      for (unsigned c = 1; c < (N2 - 1); ++c) {
        const unsigned mid = r * N2 + c;
        a[mid] = smoothed[mid];
      }
    }
  }

  // Compare every local row (including the two extra rows) with the
  // reference row at the matching global index.
  for (unsigned i = 0; i < n1; ++i) {
    for (unsigned j = 0; j < N2; ++j) {
      assert(a_[i * N2 + j] == a[(i0 + i) * N2 + j]);
    }
  }
}

// Solution: distributed Laplacian smoothing of an N1 x N2 field with the row
// exchange between neighbouring ranks done by non-blocking MPI_Irecv /
// MPI_Isend followed by MPI_Waitall.
// The interior rows are split into contiguous slabs, one per rank. Each rank
// stores its slab plus one extra row on each side, runs ITER smoothing sweeps
// with a row exchange after every sweep, prints local (and on rank 0, global)
// averages, and finally checks its rows against the serial recomputation in
// test().
int main(int argc, char* argv[]) {
  MPI_Init(&argc, &argv);

  // Start of the timed region; the elapsed time is taken after the reduction
  // near the end of main().
  double time = MPI_Wtime();

  // Problem parameters: global grid of N1 rows by N2 columns, ITER smoothing
  // sweeps, and EPS as the weight applied to the stencil sum.
  const unsigned N1 = 8;
  const unsigned N2 = 4;
  const unsigned ITER = 4;
  const double EPS = 0.1;

  // Get identification wrt other ranks
  int i_rank, n_ranks;
  MPI_Comm_rank(MPI_COMM_WORLD, &i_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &n_ranks);

  // Compute number of elements to process per rank: every rank gets the
  // integer share of the interior rows and the first u_rem ranks take one
  // extra row each.
  unsigned N1_inner = N1 - 2;  // Remove the borders
  unsigned n1 = N1_inner / n_ranks;
  const unsigned u_rem = N1_inner - n_ranks * n1;
  n1 += i_rank < u_rem ? 1 : 0;
  // Required if n_ranks does not divide N1_inner

  // Starting x index: global index of local row 0. For ranks below u_rem the
  // extra row is already included in n1; the remaining ranks add u_rem to
  // skip the extra rows held by the lower ranks.
  unsigned i0 = i_rank * n1;
  i0 += i_rank < u_rem ? 0 : u_rem;

  // Add boundary nodes: one extra row above and one below the owned rows.
  n1 += 2;

  // The column direction is not split, so the local row length is N2.
  unsigned n2 = N2;

  // Define processors to be used in communications. p_prev is -1 on rank 0
  // and p_next equals n_ranks on the last rank; both cases are checked before
  // any communication call.
  const int p_prev = i_rank - 1;
  const int p_next = i_rank + 1;

  printf(
      "rank = %2d, rank above %2d, rank below %2d, starting index i0 %2d, "
      "domain size n1 %2d\n",
      i_rank, p_prev, p_next, i0, n1);

  // 2D field array, row-major with n2 entries per row.
  vector<double> a(n1 * n2);

  // Initialize a: every local row, including the two extra rows, is filled
  // from the initial field at its global row index i + i0.
  for (int i = 0; i < n1; ++i) {
    for (int j = 0; j < n2; ++j) {
      a[i * n2 + j] = init_field(N1, N2, i + i0, j);
    }
  }

  // Scratch array receiving the updated interior values of one sweep.
  vector<double> b(n1 * n2);
  for (int niter = 0; niter < ITER; ++niter) {
    // Laplacian smoothing:
    //   new = old + EPS * (sum of the 8 neighbours - 8 * old).
    // The terms are added in the same order as in test(), which compares the
    // results with ==.
    for (int i = 1; i < n1 - 1; ++i) {
      const unsigned row = i * n2;
      for (int j = 1; j < n2 - 1; ++j) {
        b[row + j] =
            a[row + j] +
            EPS * (a[row + j - 1] + a[row + j + 1] - 8.0 * a[row + j] +
                   a[row + j - n2] + a[row + j - n2 - 1] + a[row + j - n2 + 1] +
                   a[row + j + n2] + a[row + j + n2 + 1] + a[row + j + n2 - 1]);
      }
    }

    // Copy b into a (interior nodes only; the first/last row and column are
    // left untouched).
    for (int i = 1; i < n1 - 1; ++i) {
      for (int j = 1; j < n2 - 1; ++j) {
        a[i * n2 + j] = b[i * n2 + j];
      }
    }

    // Copy from other processors
    // Number of non-blocking operations this rank posts: one receive and one
    // send per existing neighbour.
    int n_requests = 0;
    n_requests += p_next < n_ranks ? 2 : 0;
    n_requests += p_prev >= 0 ? 2 : 0;
    // Fixed capacity (2 neighbours x 2 operations). A variable-length array
    // sized by n_requests is not standard C++ and would have length zero on a
    // single rank.
    MPI_Request request[4];
    // Next free slot in request[]; signed so it compares cleanly with
    // n_requests below.
    int i_req = 0;
    if (p_next < n_ranks) {
      // Row n1 - 2 goes to p_next; its row is received into row n1 - 1.
      MPI_Irecv(&a[(n1 - 1) * n2], n2, MPI_DOUBLE, p_next, MPI_ANY_TAG,
                MPI_COMM_WORLD, &request[i_req++]);
      MPI_Isend(&a[(n1 - 2) * n2], n2, MPI_DOUBLE, p_next, 0, MPI_COMM_WORLD,
                &request[i_req++]);
    }
    if (p_prev >= 0) {
      // Row 1 goes to p_prev; its row is received into row 0.
      MPI_Irecv(&a[0], n2, MPI_DOUBLE, p_prev, MPI_ANY_TAG, MPI_COMM_WORLD,
                &request[i_req++]);
      MPI_Isend(&a[n2], n2, MPI_DOUBLE, p_prev, 0, MPI_COMM_WORLD,
                &request[i_req++]);
    }
    // Every counted request must have been posted exactly once.
    assert(i_req == n_requests);
    // Block until all posted sends and receives have completed, so the next
    // sweep reads up-to-date neighbour rows.
    MPI_Waitall(n_requests, request, MPI_STATUSES_IGNORE);
  }

  // Get average temperature: first the sum over the interior nodes held by
  // this rank.
  double sum_temperature = 0;
  for (int i = 1; i < (n1 - 1); ++i) {
    for (int j = 1; j < (n2 - 1); ++j) sum_temperature += a[i * n2 + j];
  }

  // Add up the per-rank sums; the result is delivered to rank 0 only.
  double total_temperature;
  MPI_Reduce(&sum_temperature, &total_temperature, 1, MPI_DOUBLE, MPI_SUM, 0,
             MPI_COMM_WORLD);

  // NOTE(review): the divisor is zero for a rank that holds no interior rows
  // (n1 == 2), which happens when there are more ranks than interior rows.
  double local_temperature = sum_temperature / (double)((n2 - 2) * (n1 - 2));

  time = MPI_Wtime() - time;

  if (i_rank == 0) {
    // Global average over all (N1 - 2) * (N2 - 2) interior nodes.
    double average_temperature =
        total_temperature / (double)((N1 - 2) * (N2 - 2));
    printf("rank = %2d, time = %f, local temp = %.3f, global temp = %.3f\n",
           i_rank, time, local_temperature, average_temperature);
  } else {
    printf("rank = %2d, time = %f, local temp = %.3f\n", i_rank, time,
           local_temperature);
  }

  // Test: compare the local rows with a serial recomputation. A mismatch
  // trips an assert inside test() (when asserts are enabled), so the PASS
  // line is printed only after the check went through.
  test(N1, N2, ITER, EPS, n1, n2, i0, a);
  printf("rank = %2d PASS\n", i_rank);

  MPI_Finalize();
}

Overwriting laplacian_nb.cpp


In [None]:
!name=laplacian_nb && mpic++ -o $name $name.cpp && mpirun -np 4 --allow-run-as-root ./$name

rank =  0, rank above -1, rank below  1, starting index i0  0, domain size n1  4
rank =  2, rank above  1, rank below  3, starting index i0  4, domain size n1  3
rank =  3, rank above  2, rank below  4, starting index i0  5, domain size n1  3
rank =  1, rank above  0, rank below  2, starting index i0  2, domain size n1  4
rank =  2, time = 0.002823, local temp = 1.480
rank =  2 PASS
rank =  1, time = 0.000583, local temp = 0.838
rank =  1 PASS
rank =  3, time = 0.002090, local temp = 1.876
rank =  3 PASS
rank =  0, time = 0.003018, local temp = 0.447, global temp = 0.988
rank =  0 PASS
