[ignore] MPI Code / Design Review #2
base: develop
@@ -0,0 +1,40 @@
WARNING: Boost MPI is BROKEN in 1.64.0; I used 1.65.1, but 1.65.0
seems to be OK as well, see here:
https://svn.boost.org/trac10/ticket/12723

Build Boost Serialization and Boost MPI as shared libraries:

./b2 --prefix=$HOME/local --with-mpi --with-serialization -j4 install

In make/local you need to set

CC=mpicxx

and ensure that LDFLAGS finds the Boost installation of MPI and
Serialization.

... and of course, an OpenMPI implementation needs to be
available. The MacPorts OpenMPI works great.

The use of mpicxx is recommended by OpenMPI, but not required. That is,
for OpenMPI you can find out the extra compiler flags needed for
linking against OpenMPI with the command

mpicxx --showme

The mpicxx man page has more details on this.

The minimal tests are in

test/unit/math/rev/arr/functor

map_rect_mpi_test.cpp
map_rect_lpdf_mpi_test.cpp

To compile these, there is a makefile in their directory. After
compilation they can be started with

mpirun -np 4 ./map_rect_mpi_test

which runs the test on 4 CPUs.
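Before running the Stan tests it can help to check that mpicxx, the Boost
libraries and mpirun are wired up correctly. The following stand-alone
program is only a sanity-check sketch (it is not part of this pull request);
the install prefix $HOME/local matches the ./b2 command above.

// Minimal Boost.MPI check (not part of the PR). Compile with, e.g.:
//   mpicxx -I$HOME/local/include -L$HOME/local/lib mpi_hello.cpp \
//     -lboost_mpi -lboost_serialization -o mpi_hello
// Run with: mpirun -np 4 ./mpi_hello
#include <boost/mpi/environment.hpp>
#include <boost/mpi/communicator.hpp>
#include <iostream>

int main(int argc, char** argv) {
  boost::mpi::environment env(argc, argv);
  boost::mpi::communicator world;
  // every process reports its rank; with -np 4 you should see ranks 0..3
  std::cout << "rank " << world.rank() << " of " << world.size() << std::endl;
  return 0;
}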
@@ -0,0 +1,31 @@
#pragma once

#ifdef STAN_HAS_MPI
#include <stan/math/prim/arr/functor/map_rect_mpi.hpp>
#endif

#include <cstddef>
#include <vector>

namespace stan {
  namespace math {

    template <typename F>
    std::vector<double>
    map_rect(const std::vector<double>& eta,
             const std::vector<std::vector<double> >& theta,
             const std::vector<std::vector<double> >& x_r,
             const std::vector<std::vector<int> >& x_i,
             const std::size_t uid) {
#ifdef STAN_HAS_MPI
      // MPI build: hand the jobs off to the distributed implementation
      return map_rect_mpi<F>(eta, theta, x_r, x_i, uid);
#else
      // serial fallback: apply F to each parameter set and concatenate
      // the per-job outputs in order
      std::vector<double> res;
      const std::size_t N = theta.size();

      for (std::size_t i = 0; i != N; ++i) {
        const std::vector<double> f = F::apply(eta, theta[i], x_r[i], x_i[i]);
        res.insert(res.end(), f.begin(), f.end());
      }
      return res;
#endif
    }

  }
}
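For reference, here is a small self-contained sketch of how a user-side
functor plugs into map_rect. The functor and its toy computation are made up
for illustration; only the static apply() signature is taken from the header
above, which is assumed to be included (with STAN_HAS_MPI undefined, so the
serial path runs).

// hypothetical functor: only the static apply() signature matters
#include <iostream>
#include <vector>

struct toy_functor {
  static std::vector<double> apply(const std::vector<double>& eta,
                                   const std::vector<double>& theta,
                                   const std::vector<double>& x_r,
                                   const std::vector<int>& x_i) {
    // one output per job: shared parameter plus job-specific parameter
    return std::vector<double>(1, eta[0] + theta[0]);
  }
};

int main() {
  std::vector<double> eta(1, 1.0);
  std::vector<std::vector<double> > theta(3, std::vector<double>(1, 2.0));
  std::vector<std::vector<double> > x_r(3);
  std::vector<std::vector<int> > x_i(3);

  // uid tags the data block so workers could cache it across calls
  const std::vector<double> res
      = stan::math::map_rect<toy_functor>(eta, theta, x_r, x_i, 1);

  std::cout << "number of outputs: " << res.size() << std::endl;  // prints 3
  return 0;
}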
@@ -0,0 +1,31 @@
#pragma once

#include <cstddef>
#include <set>
#include <vector>

#include <stan/math/prim/mat/fun/to_array_1d.hpp>
#include <stan/math/prim/arr/functor/mpi_command.hpp>

namespace stan {
  namespace math {

    template <typename F>
    double
    map_rect_lpdf_mpi(const std::vector<double>& eta,
                      const std::vector<std::vector<double> >& theta,
                      const std::vector<std::vector<double> >& x_r,
                      const std::vector<std::vector<int> >& x_i,
                      const std::size_t uid) {
      // serial evaluation: accumulate the per-job log densities
      double res = 0;
      const std::size_t N = theta.size();

      for (std::size_t i = 0; i != N; ++i) {
        res += F::apply(eta, theta[i], x_r[i], x_i[i]);
      }

      return res;
    }

  }
}

Review comment (on the map_rect_lpdf_mpi signature): Why don't you want the primitive version to parallelize?

Reply: Oh, we do. I just did not see an elegant way to reuse the MPI code in a clever way. As I only cared about getting my model to fit faster, I did not bother with this. So we either copy the rev code and simplify it, or we find a clever way to generalize the rev code to handle both cases in a smart way.

Reply: Gotcha. Maybe we can figure something out... I'll think more on it.
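One possible way to address the reuse question from the thread above, sketched
purely as an illustration (neither lpdf_as_vector nor map_rect_lpdf_via_map_rect
exists in the PR): wrap a scalar-returning lpdf functor so each job yields a
length-1 vector, push it through map_rect, and sum the gathered outputs.
Whether this interacts well with the MPI overloads is exactly the open question.

// sketch only: lets a scalar-returning lpdf functor ride on map_rect
#include <cstddef>
#include <numeric>
#include <vector>

template <typename F>
struct lpdf_as_vector {
  static std::vector<double> apply(const std::vector<double>& eta,
                                   const std::vector<double>& theta,
                                   const std::vector<double>& x_r,
                                   const std::vector<int>& x_i) {
    // each job contributes a single log-density value
    return std::vector<double>(1, F::apply(eta, theta, x_r, x_i));
  }
};

template <typename F>
double map_rect_lpdf_via_map_rect(const std::vector<double>& eta,
                                  const std::vector<std::vector<double> >& theta,
                                  const std::vector<std::vector<double> >& x_r,
                                  const std::vector<std::vector<int> >& x_i,
                                  const std::size_t uid) {
  // the heavy per-job work happens inside map_rect; only the sum is local
  const std::vector<double> per_job
      = stan::math::map_rect<lpdf_as_vector<F> >(eta, theta, x_r, x_i, uid);
  return std::accumulate(per_job.begin(), per_job.end(), 0.0);
}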
@@ -0,0 +1,147 @@
#pragma once

#include <map>
#include <set>
#include <vector>

#include <stan/math/prim/mat/fun/to_array_1d.hpp>
#include <stan/math/prim/arr/functor/mpi_command.hpp>
#include <stan/math/prim/arr/functor/mpi_cluster.hpp>

namespace stan {
  namespace math {

    namespace internal {
      // forward declare
      void
      distribute_map_rect_data(const std::vector<std::vector<double> >& x_r,
                               const std::vector<std::vector<int> >& x_i,
                               const std::size_t uid);

      // data distribution needs to go to prim
Review comment: What does this mean?

Reply: I think the data distribution should be inside prim. That mechanism should be independent of rev/fwd/whatever. It's prim, not more.

Reply: Ah, yes. I read it as a yet-to-be-accomplished directive, oops.
      struct distributed_map_rect_data {
        distributed_map_rect_data(std::size_t uid) : uid_(uid), N_(-1) {}
        std::size_t uid_;
        std::size_t N_;
        std::vector<std::vector<double> > x_r_;
        std::vector<std::vector<int> > x_i_;
        static void distributed_apply() {
          // called on workers to register data
          distribute_map_rect_data(std::vector<std::vector<double> >(),
                                   std::vector<std::vector<int> >(), 0);
        }
      };

      // per-process cache of distributed data, keyed by uid
      typedef boost::serialization::singleton<
          std::map<std::size_t, const distributed_map_rect_data> >
          distributed_data;
Review comment: Why is uid a size_t?

Reply: size_t made sense to me, but it can be anything we agree on. I think using integers would be sensible. I think Bob wanted to have longs here. Whatever works.
      void
      distribute_map_rect_data(const std::vector<std::vector<double> >& x_r,
                               const std::vector<std::vector<int> >& x_i,
                               const std::size_t uid) {
        boost::mpi::communicator world;

        const std::size_t W = world.size();
        const std::size_t R = world.rank();

        // first check if uid is already registered
        if (distributed_data::get_const_instance().find(uid)
            == distributed_data::get_const_instance().cend()) {
          //std::cout << "Job " << R << " registers data..." << std::endl;
Review comment: Is this stubbed out for now or old debug code?

Reply: Yep, this is old debug code. Not meant to stay.
        } else {
          //std::cout << "UID = " << uid << " is ALREADY distributed." << std::endl;
          return;
        }

        // meta info: uid, number of jobs, reals per job, ints per job
        std::vector<int> meta(4, 0);

        if (R == 0) {
          // on the root, broadcast the command which makes the workers
          // call this function as well
          mpi_broadcast_command<stan::math::mpi_distributed_apply<distributed_map_rect_data> >();

          meta[0] = uid;
          meta[1] = x_r.size();
          meta[2] = x_r[0].size();
          meta[3] = x_i[0].size();
        }

        boost::mpi::broadcast(world, meta.data(), 4, 0);

        distributed_map_rect_data data(meta[0]);

        const std::size_t N = meta[1];
        const std::size_t X_r = meta[2];
        const std::size_t X_i = meta[3];

        data.N_ = N;

        //std::cout << "worker " << R << " / " << W << " registers shapes " << N << ", " << X_r << ", " << X_i << std::endl;

        const std::vector<int> chunks = mpi_map_chunks(N, 1);

        data.x_r_.resize(chunks[R]);
        data.x_i_.resize(chunks[R]);

        // flatten data and send out/receive using scatterv
        if (X_r > 0) {
          const std::vector<double> world_x_r = to_array_1d(x_r);
          const std::vector<int> chunks_x_r = mpi_map_chunks(N, X_r);
          std::vector<double> flat_x_r_local(chunks_x_r[R]);

          boost::mpi::scatterv(world, world_x_r.data(), chunks_x_r, flat_x_r_local.data(), 0);

          // now register data
          for (std::size_t i = 0; i != chunks[R]; ++i)
            data.x_r_[i] = std::vector<double>(flat_x_r_local.begin() + i * X_r,
                                               flat_x_r_local.begin() + (i + 1) * X_r);

          //std::cout << "Job " << R << " got " << flat_x_r_local.size() << " real data " << std::endl;
        }
        if (X_i > 0) {
          const std::vector<int> world_x_i = to_array_1d(x_i);
          const std::vector<int> chunks_x_i = mpi_map_chunks(N, X_i);
          std::vector<int> flat_x_i_local(chunks_x_i[R]);

          boost::mpi::scatterv(world, world_x_i.data(), chunks_x_i, flat_x_i_local.data(), 0);

          // now register data
          for (std::size_t i = 0; i != chunks[R]; ++i)
            data.x_i_[i] = std::vector<int>(flat_x_i_local.begin() + i * X_i,
                                            flat_x_i_local.begin() + (i + 1) * X_i);

          //std::cout << "Job " << R << " got " << flat_x_i_local.size() << " int data " << std::endl;
        }

        distributed_data::get_mutable_instance().insert(std::make_pair(uid, data));

        //std::cout << "Job " << R << " done caching data for uid = " << meta[0] << "." << std::endl;

        return;
      }
    }
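distribute_map_rect_data sizes its scatterv calls with mpi_map_chunks, whose
definition lives in mpi_cluster.hpp and is not part of this diff. The helper
below is only a guess at its behavior, inferred from the call sites above and
named differently to make that clear: split the N jobs as evenly as possible
over the processes, then scale each share by the per-job width.

// hypothetical stand-in for mpi_map_chunks, inferred from its call sites;
// the real implementation in mpi_cluster.hpp may differ
#include <boost/mpi/communicator.hpp>
#include <cstddef>
#include <vector>

std::vector<int> guess_mpi_map_chunks(std::size_t num_jobs,
                                      std::size_t chunk_size) {
  boost::mpi::communicator world;
  const std::size_t W = world.size();

  // hand out num_jobs jobs as evenly as possible over W processes
  std::vector<int> chunks(W, num_jobs / W);
  for (std::size_t r = 0; r != num_jobs % W; ++r)
    ++chunks[r];

  // scale by the per-job width so the counts can feed scatterv directly
  for (std::size_t r = 0; r != W; ++r)
    chunks[r] *= chunk_size;

  return chunks;
}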
Review comment (on the map_rect_mpi signature): This primitive version also shouldn't parallelize?

Reply: It should... but I would be fine with making it non-parallel for the very first pull. The thing is, I wanted this stuff to work in an actual Stan program; until I figure out the right MPI code I am very reluctant to copy & paste the code which is right now in rev. Once we think the rev code is good to copy (and simplify), we should do that. Even better would be to figure out code which handles all cases at the same time.
    template <typename F>
    std::vector<double>
    map_rect_mpi(const std::vector<double>& eta,
                 const std::vector<std::vector<double> >& theta,
                 const std::vector<std::vector<double> >& x_r,
                 const std::vector<std::vector<int> >& x_i,
                 const std::size_t uid) {
      // serial evaluation for now: apply F to each job and concatenate
      // the per-job outputs (see the review discussion above)
      std::vector<double> res;
      const std::size_t N = theta.size();

      for (std::size_t i = 0; i != N; ++i) {
        const std::vector<double> f = F::apply(eta, theta[i], x_r[i], x_i[i]);
        res.insert(res.end(), f.begin(), f.end());

        //for (std::size_t j = 0; j != f.size(); j++)
        //  std::cout << "res[" << i << ", " << j << "] = " << f[j] << std::endl;
      }

      return res;
    }

  }
}

BOOST_CLASS_EXPORT(stan::math::mpi_distributed_apply<stan::math::internal::distributed_map_rect_data>);
BOOST_CLASS_TRACKING(stan::math::mpi_distributed_apply<stan::math::internal::distributed_map_rect_data>, track_never);
BOOST_SERIALIZATION_FACTORY_0(stan::math::mpi_distributed_apply<stan::math::internal::distributed_map_rect_data>)
Review comment: Can you not unify this with the one in rev by using a template instead of double here (and instead of var in the rev version)? They seem to be identical otherwise.

Reply: Hmm... I think you are right in that we can template this and have a single version. Not sure why I did it this way (I added the map_rect quite late). However, I hope this does not interact badly with the function overloading which we do for the MPI version.
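A sketch of the templating idea from this thread, written only as an
illustration (the name map_rect_serial and the single scalar template
parameter T are assumptions, not code from the PR): one serial loop that
could be instantiated with double in prim and with the reverse-mode autodiff
type in rev, leaving the MPI specializations as overloads on top.

// illustration of the proposed unification; T would be double in prim
// and the autodiff scalar type in rev
#include <cstddef>
#include <vector>

template <typename F, typename T>
std::vector<T> map_rect_serial(const std::vector<T>& eta,
                               const std::vector<std::vector<T> >& theta,
                               const std::vector<std::vector<double> >& x_r,
                               const std::vector<std::vector<int> >& x_i) {
  std::vector<T> res;
  const std::size_t N = theta.size();

  for (std::size_t i = 0; i != N; ++i) {
    // F::apply has to be written against the scalar type T
    const std::vector<T> f = F::apply(eta, theta[i], x_r[i], x_i[i]);
    res.insert(res.end(), f.begin(), f.end());
  }

  return res;
}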