diff --git a/.github/Dockerfile b/.github/Dockerfile new file mode 100644 index 0000000..75e9f40 --- /dev/null +++ b/.github/Dockerfile @@ -0,0 +1,29 @@ +#Built for testing, not designed for application use. + +FROM ubuntu:20.04 +#="open-mpi/ompi" for github.com/open-mpi/ompi +ARG OPENMPI_REPO="open-mpi/ompi" +#="tags" or ="heads", for tag or branch name +ARG OPENMPI_VERS_PREFIX="tags" +#="v5.0.0rc10" or ="v5.0.x", ie tag name or branch name. +ARG OPENMPI_VERS="v5.0.0rc10" +run echo Using https://github.com/${OPENMPI_REPO}/git/refs/${OPENMPI_VERS_PREFIX}/${OPENMPI_VERS} + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential python3 m4 autoconf automake libtool flex git zlib1g-dev + +#Add files listing latest commit for this branch/tag, which invalidates the clone +#when a change has been pushed. +ADD https://api.github.com/repos/${OPENMPI_REPO}/git/refs/${OPENMPI_VERS_PREFIX}/${OPENMPI_VERS} commit_info +RUN git clone --recursive --branch ${OPENMPI_VERS} --depth 1 https://github.com/${OPENMPI_REPO}.git ompi_src && \ + mkdir ompi_build ompi_install && cd ompi_src && export AUTOMAKE_JOBS=8 && ./autogen.pl && cd ../ompi_build && ../ompi_src/configure --prefix=/ompi_install --disable-man-pages --with-ft=ulfm && make install -j8 && cd .. + + +#New build stage, tosses out src/build trees from openmpi +FROM ubuntu:20.04 +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential cmake ssh zlib1g-dev +COPY . 
./fenix_src +COPY --from=0 ompi_install/ /ompi_install/ +ENV PATH="$PATH:/ompi_install/bin" +RUN mkdir fenix_build fenix_install && cd fenix_build && cmake ../fenix_src -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/ompi_install/bin/mpicc \ + -DFENIX_EXAMPLES=ON -DFENIX_TESTS=ON -DCMAKE_INSTALL_PREFIX=../fenix_install -DMPIEXEC_PREFLAGS="--allow-run-as-root;--map-by;:OVERSUBSCRIBE" && make install -j8 +CMD ["sh", "-c", "cd fenix_build && ctest --verbose --timeout 60"] diff --git a/.github/docker-compose.yml b/.github/docker-compose.yml new file mode 100644 index 0000000..b29e083 --- /dev/null +++ b/.github/docker-compose.yml @@ -0,0 +1,81 @@ +version: "3.9" + +x-fenix: &fenix + build: &fenix-build + context: ./ + dockerfile: .github/Dockerfile + args: + OPENMPI_REPO: open-mpi/ompi + OPENMPI_VERS_PREFIX: tags + OPENMPI_VERS: v5.0.0rc10 + #Caches should be manually scoped, or they'll conflict. + x-bake: + cache-from: + - type=gha,scope=default + cache-to: + - type=gha,scope=default,mode=max + +services: + #fenix_ompi_5rc10: + # <<: *fenix + # image: "fenix:ompi_5rc10" + # build: + # <<: *fenix-build + # x-bake: + # cache-from: + # - type=gha,scope=ompi_5rc10 + # cache-to: + # - type=gha,scope=ompi_5rc10,mode=max + + fenix_ompi_5: + <<: *fenix + image: "fenix:ompi_5" + build: + <<: *fenix-build + args: + - OPENMPI_VERS_PREFIX=heads + - OPENMPI_VERS=v5.0.x + x-bake: + cache-from: + - type=gha,scope=ompi_5 + cache-to: + - type=gha,scope=ompi_5,mode=max + + fenix_ompi_main: + <<: *fenix + image: "fenix:ompi_main" + build: + <<: *fenix-build + args: + - OPENMPI_VERS_PREFIX=heads + - OPENMPI_VERS=main + x-bake: + cache-from: + - type=gha,scope=ompi_main + cache-to: + - type=gha,scope=ompi_main,mode=max + + fenix_icldisco_latest: + <<: *fenix + image: "fenix:icldisco_latest" + build: + <<: *fenix-build + args: + - OPENMPI_REPO=icldisco/ompi + - OPENMPI_VERS_PREFIX=heads + - OPENMPI_VERS=ulfm/latest + x-bake: + cache-from: + - type=gha,scope=icldisco_latest + cache-to: + - 
type=gha,scope=icldisco_latest,mode=max + + #fenix_icldisco_experimental: + # <<: *fenix + # image: fenix/icldisco + # build: + # <<: *fenix-build + # args: + # - OPENMPI_REPO=icldisco/ompi + # - OPENMPI_VERS_PREFIX=heads + # - OPENMPI_VERS=ulfm/experimental diff --git a/.github/workflows/ci_checks.yaml b/.github/workflows/ci_checks.yaml new file mode 100644 index 0000000..ebeeef8 --- /dev/null +++ b/.github/workflows/ci_checks.yaml @@ -0,0 +1,31 @@ +name: Build & Test + +on: + push: + pull_request_target: + types: + - opened + - synchronized + - edited + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: docker/setup-buildx-action@v2 + - name: Build + uses: docker/bake-action@master + with: + files: | + .github/docker-compose.yml + load: true + - name: Test open-mpi v5.0.x + if: success() || failure() + run: docker run fenix:ompi_5 + - name: Test open-mpi main + if: success() || failure() + run: docker run fenix:ompi_main + - name: Test icldisco latest + if: success() || failure() + run: docker run fenix:icldisco_latest diff --git a/.gitignore b/.gitignore index 20f1a05..3e3dd51 100644 --- a/.gitignore +++ b/.gitignore @@ -39,6 +39,8 @@ examples/05_subset_create/subset_create examples/06_subset_createv/subset_createv test/request_tracking/fenix_request_tracking_test test/request_tracking/fenix_request_tracking_test_nofenix +build/ +install/ # Other *~ diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e292727..0000000 --- a/.travis.yml +++ /dev/null @@ -1,62 +0,0 @@ -language: c -addons: - apt: - packages: - - cmake - - autoconf - - automake - - libtool - - valgrind -cache: - directories: - - ulfm-install -before_install: - - echo "Configuring ULFM" - - if [ -f ulfm-install/lib/libmpi.so ]; then - echo "libmpi.so found -- nothing to build."; - cd ulfm-install; - else - ROOT=`pwd`; - mkdir ulfm-install; - echo "Downloading ULFM from repo"; - git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git 
ulfm-src/; - echo " - Configuring and building ULFM."; - cd ulfm-src; - echo " - Running autogen.pl"; - ./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running configure"; - ./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running make"; - make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running make install"; - make install >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Finished installing ULFM"; - cd ../ulfm-install/; - fi - - #Expect that any changes to the above still puts me in the install's home dir - - export MPI_HOME=`pwd` - - export PATH=$MPI_HOME/bin/:$PATH - - export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH - - export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH - - export MANPATH=$MPI_HOME/share/man:$MANPATH - - - export MPICC="`which mpicc`" - - export MPICXX="`which mpic++`" - - #Allow oversubscription for tests, since we're potentially single core - - export OMPI_MCA_rmaps_base_oversubscribe=1 - - - tail -n50 ./ulfm_build_output.txt - - cd ../ #End back at root -install: - - mkdir build && cd build - - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 -script: - - make test -after_success: - - echo "Success, printing run logs:" - - cat Testing/Temporary/LastTest.log -after_failure: - - echo "Failure occured, printing run logs:" - - cat Testing/Temporary/LastTest.log diff --git a/CMakeLists.txt b/CMakeLists.txt index b866e11..7b8b20c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,108 +8,66 @@ # directory. # -cmake_minimum_required(VERSION 3.0.2) +cmake_minimum_required(VERSION 3.10.2) project(Fenix C) # The version number. 
set(FENIX_VERSION_MAJOR 1) set(FENIX_VERSION_MINOR 0) -option(BUILD_EXAMPLES "Builds example programs from the examples directory" OFF) -option(BUILD_TESTING "Builds tests and test modes of files" ON) +option(BUILD_EXAMPLES "Builds example programs from the examples directory" OFF) +option(BUILD_TESTING "Builds tests and test modes of files" ON) -# Set empty string for shared linking (we use static library only at this moment) -set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) +#Solves an issue with some system environments putting their MPI headers before +#the headers CMake includes. Forces non-system MPI headers when incorrect headers +#detected in include path. +option(FENIX_SYSTEM_INC_FIX "Attempts to force overriding any system MPI headers" ON) +option(FENIX_PROPAGATE_INC_FIX "Attempt overriding system MPI headers in linking projects" ON) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +find_package(MPI REQUIRED) -#set(CMAKE_BUILD_TYPE Release) -set(CMAKE_BUILD_TYPE Debug) -#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb") +if(${FENIX_SYSTEM_INC_FIX}) + include(cmake/systemMPIOverride.cmake) +endif() -#ENABLE_TESTING -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) -#include(testref/TestAgainstReference) -configure_file( - ${CMAKE_CURRENT_SOURCE_DIR}/include/fenix-config.h.in - ${CMAKE_CURRENT_BINARY_DIR}/include/fenix-config.h @ONLY -) +add_subdirectory(src) -#Check for MPICC definition, if not try to find MPI -if(NOT "a$ENV{MPICC}" STREQUAL "a") - #set(CMAKE_C_COMPILER ${MPI_C_COMPILER} CACHE STRING "The compiler CMake should use - often set to mpicc" FORCE) - set(MPI_C_COMPILER $ENV{MPICC}) - set(CMAKE_C_COMPILER ${MPI_C_COMPILER}) - - message("[fenix] MPICC has been passed: $ENV{MPICC}") -else() - message("[fenix] MPICC was not passed, searching for MPI") - find_package(MPI REQUIRED) - if(${MPI_C_FOUND}) - message("[fenix] Found MPICC: ${MPI_C_COMPILER}") - else() 
- message( FATAL_ERROR "[fenix] MPI not found :( Aborting!") - endif() +include(CTest) +list(APPEND MPIEXEC_PREFLAGS "--with-ft;mpi") + +if(BUILD_EXAMPLES) + add_subdirectory(examples) endif() -#Helper function for linking with MPI only if needed -function(linkMPI TOLINK) - #We only want to try to find MPI outrselves if it wasn't provided in MPICC by user - if("a$ENV{MPICC}" STREQUAL "a") - #find_package(MPI REQUIRED) - target_link_libraries(${TOLINK} MPI::MPI_C) - endif() -endfunction(linkMPI) +if(BUILD_TESTING) + add_subdirectory(test) +endif() + + + +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/include/fenix-config.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/fenix-config.h @ONLY +) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/systemMPIOverride.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/systemMPIOverride.cmake COPYONLY +) -add_subdirectory(src) include(CMakePackageConfigHelpers) -configure_package_config_file(fenixConfig.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/fenixConfig.cmake +configure_package_config_file(cmake/fenixConfig.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/cmake/fenixConfig.cmake INSTALL_DESTINATION cmake) -write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/fenixConfigVersion.cmake +write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/cmake/fenixConfigVersion.cmake VERSION "${FENIX_VERSION_MAJOR}.${FENIX_VERSION_MINOR}" COMPATIBILITY SameMajorVersion) install( FILES - ${CMAKE_CURRENT_BINARY_DIR}/fenixConfig.cmake - ${CMAKE_CURRENT_BINARY_DIR}/fenixConfigVersion.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/fenixConfig.cmake + ${CMAKE_CURRENT_BINARY_DIR}/cmake/systemMPIOverride.cmake DESTINATION cmake ) - - -include(CTest) - -if(BUILD_EXAMPLES) - add_subdirectory(examples/01_hello_world/fenix) - add_subdirectory(examples/01_hello_world/mpi) - add_subdirectory(examples/02_send_recv/fenix) - add_subdirectory(examples/02_send_recv/mpi) - add_subdirectory(examples/03_reduce/fenix) - #add_subdirectory(examples/03_reduce/mpi) - 
add_subdirectory(examples/04_Isend_Irecv/fenix) - add_subdirectory(examples/04_Isend_Irecv/mpi) - add_subdirectory(examples/05_subset_create) - add_subdirectory(examples/06_subset_createv) - -elseif(BUILD_TESTING) - #Some examples are useful tests as well. - add_subdirectory(examples/01_hello_world/fenix) - add_subdirectory(examples/02_send_recv/fenix) - add_subdirectory(examples/03_reduce/fenix) - add_subdirectory(examples/05_subset_create) - add_subdirectory(examples/06_subset_createv) -endif() - -if(BUILD_TESTING) - add_subdirectory(test/subset_internal) - add_subdirectory(test/subset_merging) - add_subdirectory(test/request_tracking) - add_subdirectory(test/request_cancelled) - add_subdirectory(test/no_jump) - add_subdirectory(test/issend) -endif() diff --git a/README.md b/README.md index 09efb60..b7f4c97 100644 --- a/README.md +++ b/README.md @@ -17,17 +17,16 @@ These instructions assume you are in your home directory. 1. Checkout Fenix sources - * For example: ` git clone
` + * For example: ` git clone && cd Fenix` 2. Create a build directory. - * For example: ` mkdir -p ~/build/fenix/ && cd ~/build/fenix/ ` 3. Specify the MPI C compiler to use. [Open MPI 5+](https://github.com/open-mpi/ompi/tree/v5.0.x) is the required version. - * To manually indicate which compiler `cmake` should use, set the `MPICC` variable to point to it. - * For example: ` export MPICC=~/install/mpi-ulfm/bin/mpicc ` - * If the `MPICC` environment variable is not there, `cmake` will try to guess where the MPI implementation is. To help, make sure you include the installation directory of MPI in your `PATH`. - * For example: ` export PATH=~/install/mpi-ulfm/bin:$PATH ` -4. Run ` cmake
diff --git a/cmake/fenixConfig.cmake.in b/cmake/fenixConfig.cmake.in
new file mode 100644
index 0000000..464e150
--- /dev/null
+++ b/cmake/fenixConfig.cmake.in
@@ -0,0 +1,13 @@
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+
+include("${CMAKE_CURRENT_LIST_DIR}/fenixTargets.cmake")
+
+set(FENIX_SYSTEM_INC_FIX @FENIX_SYSTEM_INC_FIX@)
+if(${FENIX_SYSTEM_INC_FIX})
+ option(FENIX_PROPAGATE_INC_FIX "Attempt overriding system MPI headers in linking projects" @FENIX_PROPAGATE_INC_FIX@)
+ if(${FENIX_PROPAGATE_INC_FIX})
+ include("${CMAKE_CURRENT_LIST_DIR}/systemMPIOverride.cmake")
+ endif()
+endif()
diff --git a/cmake/systemMPIOverride.cmake b/cmake/systemMPIOverride.cmake
new file mode 100644
index 0000000..95b2619
--- /dev/null
+++ b/cmake/systemMPIOverride.cmake
@@ -0,0 +1,51 @@
+#If we're using mpicc, we don't need to worry about the includes.
+if("${CMAKE_C_COMPILER}" MATCHES ".*/?mpic")
+ return()
+endif()
+
+include(CheckIncludeFile)
+set(CMAKE_REQUIRED_QUIET ON)
+check_include_file("mpi.h" MPI_HEADER_CLASH)
+set(CMAKE_REQUIRED_QUIET OFF)
+
+if(${MPI_HEADER_CLASH})
+ if(TARGET fenix)
+ message(WARNING "Fenix detected system MPI headers, attempting to force use of ${MPI_C_INCLUDE_DIRS}. Disable FENIX_PROPAGATE_INC_FIX to stop this behavior.")
+ else()
+ message(WARNING "Detected system MPI headers, attempting to force use of ${MPI_C_INCLUDE_DIRS}. Disable FENIX_SYSTEM_INC_FIX to stop this behavior.")
+ endif()
+
+ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.25")
+
+ if(TARGET MPI::MPI_C)
+ set_target_properties(MPI::MPI_C PROPERTIES SYSTEM "FALSE")
+ endif()
+ if(TARGET MPI::MPI_CXX)
+ set_target_properties(MPI::MPI_CXX PROPERTIES SYSTEM "FALSE")
+ endif()
+
+ else()
+
+ if(TARGET MPI::MPI_C)
+ set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY INCLUDE_DIRECTORIES "${MPI_C_INCLUDE_DIRS}")
+ endif()
+ if(TARGET MPI::MPI_CXX)
+ set_property(DIRECTORY ${CMAKE_SOURCE_DIR} APPEND PROPERTY INCLUDE_DIRECTORIES "${MPI_CXX_INCLUDE_DIRS}")
+ endif()
+
+ if(TARGET fenix)
+ get_target_property(FENIX_INCLUDES fenix INTERFACE_INCLUDE_DIRECTORIES)
+ list(REMOVE_ITEM FENIX_INCLUDES ${MPI_C_INCLUDE_DIRS})
+ list(REMOVE_ITEM FENIX_INCLUDES ${MPI_CXX_INCLUDE_DIRS})
+ set_target_properties(fenix PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${FENIX_INCLUDES}")
+ endif()
+
+ if(TARGET MPI::MPI_C)
+ set_target_properties(MPI::MPI_C PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
+ endif()
+ if(TARGET MPI::MPI_CXX)
+ set_target_properties(MPI::MPI_CXX PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "")
+ endif()
+
+ endif()
+endif()
diff --git a/examples/01_hello_world/fenix/CMakeLists.txt b/examples/01_hello_world/fenix/CMakeLists.txt
index 2dad662..6a344f4 100644
--- a/examples/01_hello_world/fenix/CMakeLists.txt
+++ b/examples/01_hello_world/fenix/CMakeLists.txt
@@ -12,9 +12,6 @@ add_executable(fenix_hello_world fenix_hello_world.c)
target_link_libraries(fenix_hello_world fenix ${MPI_C_LIBRARIES})
if(BUILD_TESTING)
- #set(CMAKE_BUILD_TYPE Debug)
- add_executable(fenix_hello_world-debug fenix_hello_world.c)
- target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME hello_world
- COMMAND mpirun --with-ft mpi -n 3 fenix_hello_world-debug "1")
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 3 ${MPIEXEC_PREFLAGS} fenix_hello_world ${MPIEXEC_POSTFLAGS} "1")
endif()
diff --git a/examples/02_send_recv/fenix/CMakeLists.txt b/examples/02_send_recv/fenix/CMakeLists.txt
index aa5dc65..bf40679 100644
--- a/examples/02_send_recv/fenix/CMakeLists.txt
+++ b/examples/02_send_recv/fenix/CMakeLists.txt
@@ -12,11 +12,8 @@ add_executable(fenix_ring fenix_ring.c)
target_link_libraries(fenix_ring fenix ${MPI_C_LIBRARIES} m )
if(BUILD_TESTING)
- set(CMAKE_BUILD_TYPE Debug)
- add_executable(fenix_ring-debug fenix_ring.c)
- target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME ring
- COMMAND mpirun --with-ft mpi -np 5 fenix_ring-debug 1 2)
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_ring ${MPIEXEC_POSTFLAGS} 1 2)
set_tests_properties(ring PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
diff --git a/examples/05_subset_create/CMakeLists.txt b/examples/05_subset_create/CMakeLists.txt
index bf2da45..7f1efcd 100644
--- a/examples/05_subset_create/CMakeLists.txt
+++ b/examples/05_subset_create/CMakeLists.txt
@@ -12,11 +12,8 @@ add_executable(subset_create subset_create.c)
target_link_libraries(subset_create fenix ${MPI_C_LIBRARIES})
if(BUILD_TESTING)
- set(CMAKE_BUILD_TYPE Debug)
- add_executable(fenix_subset_create-debug subset_create.c)
- target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_create
- COMMAND mpirun --with-ft mpi -np 5 fenix_subset_create-debug 1)
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} subset_create ${MPIEXEC_POSTFLAGS} 1)
set_tests_properties(subset_create PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
diff --git a/examples/06_subset_createv/CMakeLists.txt b/examples/06_subset_createv/CMakeLists.txt
index 3a935a7..c242648 100644
--- a/examples/06_subset_createv/CMakeLists.txt
+++ b/examples/06_subset_createv/CMakeLists.txt
@@ -12,11 +12,8 @@ add_executable(subset_createv subset_createv.c)
target_link_libraries(subset_createv fenix ${MPI_C_LIBRARIES})
if(BUILD_TESTING)
- set(CMAKE_BUILD_TYPE Debug)
- add_executable(fenix_subset_createv-debug subset_createv.c)
- target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_createv
- COMMAND mpirun --with-ft mpi -np 5 fenix_subset_createv-debug 1)
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} subset_createv ${MPIEXEC_POSTFLAGS} 1)
set_tests_properties(subset_createv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..b1f7321
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_subdirectory(01_hello_world/fenix)
+add_subdirectory(02_send_recv/fenix)
+add_subdirectory(03_reduce/fenix)
+add_subdirectory(04_Isend_Irecv/fenix)
+add_subdirectory(05_subset_create)
+add_subdirectory(06_subset_createv)
diff --git a/fenixConfig.cmake.in b/fenixConfig.cmake.in
deleted file mode 100644
index 6f59550..0000000
--- a/fenixConfig.cmake.in
+++ /dev/null
@@ -1,5 +0,0 @@
-@PACKAGE_INIT@
-
-include(CMakeFindDependencyMacro)
-
-include("${CMAKE_CURRENT_LIST_DIR}/fenixTargets.cmake")
diff --git a/include/fenix.h b/include/fenix.h
index 4d7ca67..1a283bf 100644
--- a/include/fenix.h
+++ b/include/fenix.h
@@ -104,6 +104,10 @@ extern "C" {
#define FENIX_DATA_SNAPSHOT_ALL 16
#define FENIX_DATA_SUBSET_CREATED 2
+#define FENIX_ERRHANDLER_LOC 1
+#define FENIX_DATA_COMMIT_BARRIER_LOC 2
+
+
#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
typedef enum {
diff --git a/include/fenix_data_member.h b/include/fenix_data_member.h
index b37c652..391142b 100644
--- a/include/fenix_data_member.h
+++ b/include/fenix_data_member.h
@@ -67,7 +67,6 @@ typedef struct __fenix_member_entry {
int memberid;
enum states state;
void *user_data;
- MPI_Datatype current_datatype;
int datatype_size;
int current_count;
} fenix_member_entry_t;
@@ -80,7 +79,6 @@ typedef struct __fenix_member {
typedef struct __member_entry_packet {
int memberid;
- MPI_Datatype current_datatype;
int datatype_size;
int current_count;
} fenix_member_entry_packet_t;
@@ -92,7 +90,7 @@ void __fenix_ensure_member_capacity( fenix_member_t *m );
void __fenix_ensure_version_capacity_from_member( fenix_member_t *m );
fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
- int memberid, void* data, int count, MPI_Datatype datatype);
+ int memberid, void* data, int count, int datatype_size);
int __fenix_data_member_send_metadata(int groupid, int memberid, int dest_rank);
int __fenix_data_member_recv_metadata(int groupid, int src_rank,
diff --git a/include/fenix_data_recovery.h b/include/fenix_data_recovery.h
index 856dbe5..4580cb9 100644
--- a/include/fenix_data_recovery.h
+++ b/include/fenix_data_recovery.h
@@ -101,7 +101,6 @@
typedef struct __data_entry_packet {
- MPI_Datatype datatype;
int count;
int datatype_size;
} fenix_data_entry_packet_t;
@@ -109,7 +108,7 @@ typedef struct __data_entry_packet {
int __fenix_group_create(int, MPI_Comm, int, int, int, void*, int*);
int __fenix_group_get_redundancy_policy(int, int*, int*, int*);
-int __fenix_member_create(int, int, void *, int, MPI_Datatype);
+int __fenix_member_create(int, int, void *, int, int);
int __fenix_data_wait(Fenix_Request);
int __fenix_data_test(Fenix_Request, int *);
int __fenix_member_store(int, int, Fenix_Data_subset);
diff --git a/include/fenix_ext.h b/include/fenix_ext.h
index 785a108..fd4b1a6 100644
--- a/include/fenix_ext.h
+++ b/include/fenix_ext.h
@@ -90,9 +90,13 @@ typedef struct {
//fenix_communicator_list_t* communicator_list; // singly linked list for Fenix resilient communicators
fenix_debug_opt_t options; // This is reserved to store the user options
- MPI_Comm world; // Duplicate of the MPI communicator provided by user
+ MPI_Comm *world; // Duplicate of the MPI communicator provided by user
MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks
MPI_Comm *user_world; // MPI communicator with repaired ranks
+ //Manage state of the comms. Necessary when failures happen rapidly, messing up state
+ int new_world_exists, user_world_exists;
+
+
MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7d413a1..7c823fd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -11,12 +11,6 @@
configure_file (${CMAKE_SOURCE_DIR}/include/fenix-config.h.in
"${CMAKE_CURRENT_BINARY_DIR}/fenix-config.h" @ONLY)
-#configure_file(${CMAKE_SOURCE_DIR}/include/fenix.h
-# "${CMAKE_BINARY_DIR}/include/fenix.h" COPYONLY)
-
-#configure_file(${CMAKE_SOURCE_DIR}/include/fenix_process_recovery.h
-# "${CMAKE_BINARY_DIR}/include/fenix_process_recovery.h" COPYONLY)
-
#include_directories(${CMAKE_CURRENT_BINARY_DIR})
FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h)
@@ -39,25 +33,7 @@ globals.c
add_library( fenix STATIC ${Fenix_SOURCES})
-#if("a$ENV{MPICC}" STREQUAL "a")
-# message("[fenix] MPICC (MPI compiler) environment variable is not defined. Trying to find MPI compiler...")
-# find_package(MPI REQUIRED)
-# target_link_libraries(fenix MPI::MPI_C)
-#else()
-# message("[fenix] MPICC has been passed: $ENV{MPICC}")
-# set(MPI_C_COMPILER $ENV{MPICC})
-# SET(CMAKE_C_COMPILER ${MPI_C_COMPILER})
-#endif()
-
-linkMPI(fenix)
-
-target_link_libraries(fenix ${MPI_C_LIBRARIES})
-if(MPI_COMPILE_FLAGS)
- set_target_properties(fenix PROPERTIES COMPILE_FLAGS "${MPI_COMPILE_FLAGS}")
-endif()
-if(MPI_LINK_FLAGS)
- set_target_properties(fenix PROPERTIES LINK_FLAGS "${MPI_LINK_FLAGS}")
-endif()
+target_link_libraries(fenix PUBLIC MPI::MPI_C)
target_include_directories(fenix
PUBLIC
@@ -76,5 +52,3 @@ install(EXPORT fenix
FILE fenixTargets.cmake
DESTINATION cmake)
install(FILES ${Fenix_HEADERS} DESTINATION include)
-
-#target_link_libraries( mpi )
diff --git a/src/fenix.c b/src/fenix.c
index 93f29f9..6be875f 100644
--- a/src/fenix.c
+++ b/src/fenix.c
@@ -83,7 +83,7 @@ int Fenix_Data_group_create( int group_id, MPI_Comm comm, int start_time_stamp,
}
int Fenix_Data_member_create( int group_id, int member_id, void *buffer, int count, MPI_Datatype datatype ) {
- return __fenix_member_create(group_id, member_id, buffer, count, datatype);
+ return __fenix_member_create(group_id, member_id, buffer, count, __fenix_get_size(datatype));
}
int Fenix_Data_group_get_redundancy_policy( int group_id, int* policy_name, void *policy_value, int *flag ) {
diff --git a/src/fenix_data_group.c b/src/fenix_data_group.c
index 7fec469..ad453aa 100644
--- a/src/fenix_data_group.c
+++ b/src/fenix_data_group.c
@@ -77,7 +77,7 @@ fenix_data_recovery_t * __fenix_data_recovery_init() {
if (fenix.options.verbose == 41) {
verbose_print("c-rank: %d, role: %d, g-count: %zu, g-size: %zu\n",
- __fenix_get_current_rank(fenix.world), fenix.role, data_recovery->count,
+ __fenix_get_current_rank(fenix.new_world), fenix.role, data_recovery->count,
data_recovery->total_size);
}
diff --git a/src/fenix_data_member.c b/src/fenix_data_member.c
index 5cf604a..3d9d60d 100644
--- a/src/fenix_data_member.c
+++ b/src/fenix_data_member.c
@@ -75,7 +75,7 @@ fenix_member_t *__fenix_data_member_init() {
if (fenix.options.verbose == 42) {
verbose_print("c-rank: %d, role: %d, m-count: %zu, m-size: %zu\n",
- __fenix_get_current_rank(fenix.world), fenix.role, member->count,
+ __fenix_get_current_rank(fenix.new_world), fenix.role, member->count,
member->total_size);
}
@@ -88,7 +88,7 @@ fenix_member_t *__fenix_data_member_init() {
if (fenix.options.verbose == 42) {
verbose_print("c-rank: %d, role: %d, m-memberid: %d, m-state: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ __fenix_get_current_rank(fenix.new_world), fenix.role,
mentry->memberid, mentry->state);
}
}
@@ -141,7 +141,7 @@ int __fenix_find_next_member_position(fenix_member_t *member) {
}
fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
- int memberid, void* data, int count, MPI_Datatype datatype){
+ int memberid, void* data, int count, int datatype_size){
int member_index = __fenix_find_next_member_position(member);
fenix_member_entry_t* mentry = member->member_entry + member_index;
@@ -150,11 +150,7 @@ fenix_member_entry_t* __fenix_data_member_add_entry(fenix_member_t* member,
mentry->state = OCCUPIED;
mentry->user_data = data;
mentry->current_count = count;
- mentry->current_datatype = datatype;
-
- int dsize;
- MPI_Type_size(datatype, &dsize);
- mentry->datatype_size = dsize;
+ mentry->datatype_size = datatype_size;
member->count++;
@@ -222,7 +218,6 @@ int __fenix_data_member_send_metadata(int groupid, int memberid, int dest_rank){
fenix_member_entry_packet_t packet;
packet.memberid = mentry.memberid;
- packet.current_datatype = mentry.current_datatype;
packet.datatype_size = mentry.datatype_size;
packet.current_count = mentry.current_count;
diff --git a/src/fenix_data_policy_in_memory_raid.c b/src/fenix_data_policy_in_memory_raid.c
index 40b265d..19341e2 100644
--- a/src/fenix_data_policy_in_memory_raid.c
+++ b/src/fenix_data_policy_in_memory_raid.c
@@ -703,8 +703,11 @@ int __imr_member_restore(fenix_group_t* g, int member_id,
//find_mentry returns the error status. We found the member (and corresponding data) if there are no errors.
int found_member = !(__imr_find_mentry(group, member_id, &mentry));
- int member_data_index = __fenix_search_memberid(group->base.member, member_id);
- fenix_member_entry_t member_data = group->base.member->member_entry[member_data_index];
+ fenix_member_entry_t member_data;
+ if(found_member){
+ int member_data_index = __fenix_search_memberid(group->base.member, member_id);
+ member_data = group->base.member->member_entry[member_data_index];
+ }
int recovery_locally_possible;
@@ -783,12 +786,11 @@ int __imr_member_restore(fenix_group_t* g, int member_id,
//We remake the new member just like the user would.
__fenix_member_create(group->base.groupid, packet.memberid, NULL, packet.current_count,
- packet.current_datatype);
+ packet.datatype_size);
__imr_find_mentry(group, member_id, &mentry);
int member_data_index = __fenix_search_memberid(group->base.member, member_id);
member_data = group->base.member->member_entry[member_data_index];
-
MPI_Recv((void*)&(group->num_snapshots), 1, MPI_INT, group->partners[1],
RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm, NULL);
@@ -886,7 +888,7 @@ int __imr_member_restore(fenix_group_t* g, int member_id,
//We remake the new member just like the user would.
__fenix_member_create(group->base.groupid, packet.memberid, NULL, packet.current_count,
- packet.current_datatype);
+ packet.datatype_size);
__imr_find_mentry(group, member_id, &mentry);
int member_data_index = __fenix_search_memberid(group->base.member, member_id);
diff --git a/src/fenix_data_recovery.c b/src/fenix_data_recovery.c
index da87c30..e052eb9 100644
--- a/src/fenix_data_recovery.c
+++ b/src/fenix_data_recovery.c
@@ -190,8 +190,7 @@ int __fenix_group_get_redundancy_policy(int groupid, int* policy_name, int* poli
* @param count
* @param data_type
*/
-int __fenix_member_create(int groupid, int memberid, void *data, int count, MPI_Datatype datatype ) {
-
+int __fenix_member_create(int groupid, int memberid, void *data, int count, int datatype_size ) {
int retval = -1;
int group_index = __fenix_search_groupid( groupid, fenix.data_recovery );
int member_index = -1;
@@ -219,9 +218,8 @@ int __fenix_member_create(int groupid, int memberid, void *data, int count, MPI_
//First, we'll make a fenix-core member entry, then pass that info to
//the specific data policy.
- int member_index = __fenix_find_next_member_position(member);
fenix_member_entry_t* mentry;
- mentry = __fenix_data_member_add_entry(member, memberid, data, count, datatype);
+ mentry = __fenix_data_member_add_entry(member, memberid, data, count, datatype_size);
//Pass the info along to the policy
retval = group->vtbl.member_create(group, mentry);
@@ -585,39 +583,33 @@ int __fenix_data_commit_barrier(int groupid, int *timestamp) {
} else {
fenix_group_t *group = (fenix.data_recovery->group[group_index]);
-
- //We want to make sure there aren't any revocations and also do a barrier.
- //Start by disabling Fenix error handling so we don't generate any new revokations here.
+ //We want to make sure there aren't any failed MPI operations (i.e., unfinished stores)
+ //But we don't want to fail to commit if a failure has happened since a successful store.
int old_failure_handling = fenix.ignore_errs;
fenix.ignore_errs = 1;
- //We'll use comm_agree as a resilient barrier, which should also give time for
- //any revocations to propogate
- int tmp_throwaway = 1;
- MPIX_Comm_agree(group->comm, &tmp_throwaway);
- //Now use iprobe to check for revocations.
- MPI_Status status;
- int ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, group->comm,
- &tmp_throwaway, &status);
+ int can_commit = 0;
- fenix.ignore_errs = old_failure_handling;
+ //We'll use comm_agree as a resilient barrier
+ //Our error handler also enters an agree, with a unique location bit set.
+ //So if we aren't all here, we've hit an error already.
+ int location = FENIX_DATA_COMMIT_BARRIER_LOC;
+ int ret = MPIX_Comm_agree(*fenix.user_world, &location);
+ if(location == FENIX_DATA_COMMIT_BARRIER_LOC) can_commit = 1;
- if(ret != MPI_ERR_REVOKED){
+ fenix.ignore_errs = old_failure_handling;
+
+ if(can_commit == 1){
retval = group->vtbl.commit(group);
}
-
- //Now that we've (hopefully) commited, we want to handle any errors we've
- //learned about w.r.t failures or revocations. No reason to put handling those off.
- if(ret != MPI_SUCCESS){
- retval = ret;
- //Just re-calling should have Fenix handle things according to whatever method
- //has been assigned.
- MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, group->comm,
- &tmp_throwaway, &status);
+ if(can_commit != 1 || ret != MPI_SUCCESS) {
+ //A rank failure has happened, let's trigger error handling if enabled.
+ int throwaway = 1;
+ MPI_Allreduce(MPI_IN_PLACE, &throwaway, 1, MPI_INT, MPI_SUM, *fenix.user_world);
}
-
+
if (timestamp != NULL) {
*timestamp = group->timestamp;
@@ -930,7 +922,6 @@ int __fenix_member_set_attribute(int groupid, int memberid, int attributename,
retval = FENIX_ERROR_INVALID_ATTRIBUTE_NAME;
}
- mentry->current_datatype = *((MPI_Datatype *)(attributevalue));
mentry->datatype_size = my_datatype_size;
retval = FENIX_SUCCESS;
break;
diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c
index 5609326..b845fa6 100644
--- a/src/fenix_process_recovery.c
+++ b/src/fenix_process_recovery.c
@@ -82,9 +82,10 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
fenix.user_world = new_comm;
MPI_Comm_create_errhandler(__fenix_test_MPI, &fenix.mpi_errhandler);
-
- MPI_Comm_dup(comm, &fenix.world);
- PMPI_Comm_set_errhandler(fenix.world, fenix.mpi_errhandler);
+
+ fenix.world = malloc(sizeof(MPI_Comm));
+ MPI_Comm_dup(comm, fenix.world);
+ PMPI_Comm_set_errhandler(*fenix.world, fenix.mpi_errhandler);
fenix.finalized = 0;
fenix.spare_ranks = spare_ranks;
@@ -123,13 +124,13 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
fenix.resume_mode = __FENIX_RESUME_AT_INIT;
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, value: %s\n",
- __fenix_get_current_rank(fenix.world), fenix.role, value);
+ __fenix_get_current_rank(*fenix.world), fenix.role, value);
}
} else if (strcmp(value, "NO_JUMP") == 0) {
fenix.resume_mode = __FENIX_RESUME_NO_JUMP;
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, value: %s\n",
- __fenix_get_current_rank(fenix.world), fenix.role, value);
+ __fenix_get_current_rank(*fenix.world), fenix.role, value);
}
} else {
@@ -145,13 +146,13 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
fenix.print_unhandled = 0;
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n",
- __fenix_get_current_rank(fenix.world), fenix.role, value);
+ __fenix_get_current_rank(*fenix.world), fenix.role, value);
}
} else if (strcmp(value, "NO_JUMP") == 0) {
fenix.print_unhandled = 1;
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n",
- __fenix_get_current_rank(fenix.world), fenix.role, value);
+ __fenix_get_current_rank(*fenix.world), fenix.role, value);
}
} else {
@@ -188,7 +189,7 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
fenix.num_inital_ranks = __fenix_get_world_size(fenix.new_world);
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, number_initial_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ __fenix_get_current_rank(*fenix.world), fenix.role,
fenix.num_inital_ranks);
}
@@ -197,7 +198,7 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
if (fenix.options.verbose == 0) {
verbose_print("rank: %d, role: %d, number_initial_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ __fenix_get_current_rank(*fenix.world), fenix.role,
fenix.num_inital_ranks);
}
}
@@ -209,33 +210,53 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha
int a;
int myrank;
MPI_Status mpi_status;
- ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, fenix.world,
+ fenix.ignore_errs = 1;
+ ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix.world,
&mpi_status); // listen for a failure
+ fenix.ignore_errs = 0;
if (ret == MPI_SUCCESS) {
if (fenix.options.verbose == 0) {
verbose_print("Finalize the program; rank: %d, role: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role);
+ __fenix_get_current_rank(*fenix.world), fenix.role);
}
__fenix_finalize_spare();
} else {
fenix.repair_result = __fenix_repair_ranks();
if (fenix.options.verbose == 0) {
verbose_print("spare rank exiting from MPI_Recv - repair ranks; rank: %d, role: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role);
+ __fenix_get_current_rank(*fenix.world), fenix.role);
}
}
fenix.role = FENIX_ROLE_RECOVERED_RANK;
}
+
+ if(fenix.role != FENIX_ROLE_RECOVERED_RANK) MPI_Comm_dup(fenix.new_world, fenix.user_world);
+ fenix.user_world_exists = 1;
+
return fenix.role;
}
-int __fenix_create_new_world()
+int __fenix_spare_rank_within(MPI_Comm refcomm)
+{
+ int result = -1;
+ int current_rank = __fenix_get_current_rank(refcomm);
+ int new_world_size = __fenix_get_world_size(refcomm) - fenix.spare_ranks;
+ if (current_rank >= new_world_size) {
+ if (fenix.options.verbose == 6) {
+ verbose_print("current_rank: %d, new_world_size: %d\n", current_rank, new_world_size);
+ }
+ result = 1;
+ }
+ return result;
+}
+
+int __fenix_create_new_world_from(MPI_Comm from_comm)
{
int ret;
- if ( __fenix_spare_rank() == 1) {
- int current_rank = __fenix_get_current_rank(fenix.world);
+ if ( __fenix_spare_rank_within(from_comm) == 1) {
+ int current_rank = __fenix_get_current_rank(from_comm);
/*************************************************************************/
/** MPI_UNDEFINED makes the new communicator "undefined" at spare ranks **/
@@ -244,41 +265,44 @@ int __fenix_create_new_world()
/*************************************************************************/
if (fenix.options.verbose == 1) {
- verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(fenix.world),
+ verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(from_comm),
fenix.role);
}
- ret = PMPI_Comm_split(fenix.world, MPI_UNDEFINED, current_rank,
+ ret = PMPI_Comm_split(from_comm, MPI_UNDEFINED, current_rank,
&fenix.new_world);
- if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); }
+ //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); }
+ fenix.new_world_exists = 0; //Should already be this
} else {
- int current_rank = __fenix_get_current_rank(fenix.world);
+ int current_rank = __fenix_get_current_rank(from_comm);
if (fenix.options.verbose == 1) {
- verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(fenix.world),
+ verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(from_comm),
fenix.role);
}
- ret = PMPI_Comm_split(fenix.world, 0, current_rank, &fenix.new_world);
+ ret = PMPI_Comm_split(from_comm, 0, current_rank, &fenix.new_world);
+ fenix.new_world_exists = 1;
if (ret != MPI_SUCCESS){
- int len;
- char errstr[MPI_MAX_ERROR_STRING];
- MPI_Error_string(ret, errstr, &len);
- debug_print("MPI_Comm_split: %s\n", errstr);
+ fenix.new_world_exists = 0;
}
}
return ret;
}
+int __fenix_create_new_world(){
+ return __fenix_create_new_world_from(*fenix.world);
+}
+
int __fenix_repair_ranks()
{
/*********************************************************/
/* Do not forget comm_free for broken communicators */
/*********************************************************/
-
+ fenix.ignore_errs = 1;
int ret;
int survived_flag;
@@ -292,11 +316,27 @@ int __fenix_repair_ranks()
int repair_success = 0;
int num_try = 0;
int flag_g_world_freed = 0;
- MPI_Comm world_without_failures;
+ MPI_Comm world_without_failures, fixed_world;
+
+ /* current_rank means the global MPI rank before failure */
+ current_rank = __fenix_get_current_rank(*fenix.world);
+ world_size = __fenix_get_world_size(*fenix.world);
+
+ //Double check that every process is here, not in some local error handling elsewhere.
+ //Assume that other locations will converge here.
+ if(__fenix_spare_rank() != 1){
+ int location = FENIX_ERRHANDLER_LOC;
+ do {
+ location = FENIX_ERRHANDLER_LOC;
+ MPIX_Comm_agree(*fenix.user_world, &location);
+ } while(location != FENIX_ERRHANDLER_LOC);
+ }
+
while (!repair_success) {
+
repair_success = 1;
- ret = MPIX_Comm_shrink(fenix.world, &world_without_failures);
+ ret = MPIX_Comm_shrink(*fenix.world, &world_without_failures);
//if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_shrink. repair_ranks\n"); }
if (ret != MPI_SUCCESS) {
repair_success = 0;
@@ -307,23 +347,22 @@ int __fenix_repair_ranks()
/* Free up the storage for active process communicator */
/*********************************************************/
if ( __fenix_spare_rank() != 1) {
- PMPI_Comm_free(&fenix.new_world);
- PMPI_Comm_free(fenix.user_world);
+ if(fenix.new_world_exists) PMPI_Comm_free(&fenix.new_world);
+ if(fenix.user_world_exists) PMPI_Comm_free(fenix.user_world);
+ fenix.user_world_exists = 0;
+ fenix.new_world_exists = 0;
}
/*********************************************************/
/* Need closer look above */
/*********************************************************/
- /* current_rank means the global MPI rank before failure */
- current_rank = __fenix_get_current_rank(fenix.world);
survivor_world_size = __fenix_get_world_size(world_without_failures);
- world_size = __fenix_get_world_size(fenix.world);
fenix.fail_world_size = world_size - survivor_world_size;
if (fenix.options.verbose == 2) {
verbose_print(
"current_rank: %d, role: %d, world_size: %d, fail_world_size: %d, survivor_world_size: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role, world_size,
+ current_rank, fenix.role, world_size,
fenix.fail_world_size, survivor_world_size);
}
@@ -333,7 +372,7 @@ int __fenix_repair_ranks()
if (fenix.options.verbose == 2) {
verbose_print(
"current_rank: %d, role: %d, spare_ranks: %d, fail_world_size: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role, fenix.spare_ranks,
+ current_rank, fenix.role, fenix.spare_ranks,
fenix.fail_world_size);
}
@@ -360,7 +399,7 @@ int __fenix_repair_ranks()
int index;
for (index = 0; index < survivor_world_size; index++) {
verbose_print("current_rank: %d, role: %d, survivor_world[%d]: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role, index,
+ current_rank, fenix.role, index,
survivor_world[index]);
}
}
@@ -402,7 +441,7 @@ int __fenix_repair_ranks()
if (fenix.options.verbose == 2) {
verbose_print("current_rank: %d, role: %d, recovered_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ current_rank, fenix.role,
fenix.num_recovered_ranks);
}
@@ -425,7 +464,7 @@ int __fenix_repair_ranks()
if (fenix.options.verbose == 2) {
verbose_print("current_rank: %d, role: %d, active_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ current_rank, fenix.role,
active_ranks);
}
@@ -461,7 +500,6 @@ int __fenix_repair_ranks()
}
} else {
-
int active_ranks;
survivor_world = (int *) s_malloc(survivor_world_size * sizeof(int));
@@ -497,6 +535,7 @@ int __fenix_repair_ranks()
goto END_LOOP;
}
+
fenix.num_inital_ranks = 0;
fenix.num_recovered_ranks = fenix.fail_world_size;
@@ -519,7 +558,7 @@ int __fenix_repair_ranks()
if (fenix.options.verbose == 2) {
verbose_print("current_rank: %d, role: %d, active_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role, active_ranks);
+ current_rank, fenix.role, active_ranks);
}
if (current_rank >= active_ranks) { // reorder ranks
@@ -544,7 +583,7 @@ int __fenix_repair_ranks()
fenix.spare_ranks = fenix.spare_ranks - fenix.fail_world_size;
if (fenix.options.verbose == 2) {
verbose_print("current_rank: %d, role: %d, spare_ranks: %d\n",
- __fenix_get_current_rank(fenix.world), fenix.role,
+ current_rank, fenix.role,
fenix.spare_ranks);
}
}
@@ -553,13 +592,8 @@ int __fenix_repair_ranks()
/* Done with the global communicator */
/*********************************************************/
- if (!flag_g_world_freed) {
- ret = PMPI_Comm_free(&fenix.world);
- if (ret != MPI_SUCCESS) { flag_g_world_freed = 1; }
- }
- ret = PMPI_Comm_split(world_without_failures, 0, current_rank, &fenix.world);
-
- /* if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split. repair_ranks\n"); } */
+ ret = PMPI_Comm_split(world_without_failures, 0, current_rank, &fixed_world);
+
if (ret != MPI_SUCCESS) {
repair_success = 0;
if (ret != MPI_ERR_PROC_FAILED) {
@@ -568,19 +602,35 @@ int __fenix_repair_ranks()
MPI_Comm_free(&world_without_failures);
goto END_LOOP;
}
- ret = PMPI_Comm_free(&world_without_failures);
- /* As of 8/8/2016 */
- /* Need special treatment for error handling */
- __fenix_create_new_world();
+ MPI_Comm_free(&world_without_failures);
+
+ ret = __fenix_create_new_world_from(fixed_world);
+ if(ret != MPI_SUCCESS){
+ repair_success = 0;
+ MPIX_Comm_revoke(fixed_world);
+ MPI_Comm_free(&fixed_world);
+ goto END_LOOP;
+ }
- ret = PMPI_Barrier(fenix.world);
+ if(__fenix_spare_rank_within(fixed_world) == -1){
+ ret = MPI_Comm_dup(fenix.new_world, fenix.user_world);
+ if (ret != MPI_SUCCESS){
+ repair_success = 0;
+ MPIX_Comm_revoke(fixed_world);
+ MPI_Comm_free(&fixed_world);
+ goto END_LOOP;
+ }
+ fenix.user_world_exists = 1;
+ }
+
+ ret = PMPI_Barrier(fixed_world);
/* if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier. repair_ranks\n"); } */
if (ret != MPI_SUCCESS) {
repair_success = 0;
- if (ret != MPI_ERR_PROC_FAILED) {
- MPIX_Comm_revoke(fenix.world);
- }
+ MPIX_Comm_revoke(fixed_world);
+ MPI_Comm_free(&fixed_world);
+ goto END_LOOP;
}
END_LOOP:
@@ -591,11 +641,14 @@ int __fenix_repair_ranks()
/*******************************************************/
/*
- if (__fenix_get_current_rank(fenix.world) == FENIX_ROOT) {
+ if (current_rank == FENIX_ROOT) {
LDEBUG("Fenix: communicators repaired\n");
}
*/
}
+
+ *fenix.world = fixed_world;
+ fenix.ignore_errs=0;
return rt_code;
}
@@ -619,18 +672,8 @@ int* __fenix_get_fail_ranks(int *survivor_world, int survivor_world_size, int fa
return fail_ranks;
}
-int __fenix_spare_rank()
-{
- int result = -1;
- int current_rank = __fenix_get_current_rank(fenix.world);
- int new_world_size = __fenix_get_world_size(fenix.world) - fenix.spare_ranks;
- if (current_rank >= new_world_size) {
- if (fenix.options.verbose == 6) {
- verbose_print("current_rank: %d, new_world_size: %d\n", current_rank, new_world_size);
- }
- result = 1;
- }
- return result;
+int __fenix_spare_rank(){
+ return __fenix_spare_rank_within(*fenix.world);
}
void __fenix_postinit(int *error)
@@ -641,9 +684,6 @@ void __fenix_postinit(int *error)
// fenix.role);
//}
- PMPI_Barrier(fenix.new_world);
-
- PMPI_Comm_dup(fenix.new_world, fenix.user_world);
if (fenix.repair_result != 0) {
*error = fenix.repair_result;
@@ -673,23 +713,20 @@ void __fenix_finalize()
// after recovery.
fenix.finalized = 1;
- //We don't want to handle failures in here as normally, we just want to continue trying to finalize.
- fenix.ignore_errs = 1;
-
int ret = MPI_Barrier( fenix.new_world );
if (ret != MPI_SUCCESS) {
__fenix_finalize();
return;
}
- if (__fenix_get_current_rank(fenix.world) == 0) {
+ if (__fenix_get_current_rank(*fenix.world) == 0) {
int spare_rank;
- MPI_Comm_size(fenix.world, &spare_rank);
+ MPI_Comm_size(*fenix.world, &spare_rank);
spare_rank--;
int a;
int i;
for (i = 0; i < fenix.spare_ranks; i++) {
- int ret = MPI_Send(&a, 1, MPI_INT, spare_rank, 1, fenix.world);
+ int ret = MPI_Send(&a, 1, MPI_INT, spare_rank, 1, *fenix.world);
if (ret != MPI_SUCCESS) {
__fenix_finalize();
return;
@@ -698,16 +735,17 @@ void __fenix_finalize()
}
}
- ret = MPI_Barrier(fenix.world);
+ ret = MPI_Barrier(*fenix.world);
if (ret != MPI_SUCCESS) {
__fenix_finalize();
return;
}
MPI_Op_free( &fenix.agree_op );
- MPI_Comm_set_errhandler( fenix.world, MPI_ERRORS_ARE_FATAL );
- MPI_Comm_free( &fenix.world );
- MPI_Comm_free( &fenix.new_world );
+ MPI_Comm_set_errhandler( *fenix.world, MPI_ERRORS_ARE_FATAL );
+ MPI_Comm_free( fenix.world );
+ free(fenix.world);
+ if(fenix.new_world_exists) MPI_Comm_free( &fenix.new_world ); //It should, but just in case. Won't update because trying to free it again ought to generate an error anyway.
if(fenix.role != FENIX_ROLE_INITIAL_RANK){
free(fenix.fail_world);
@@ -725,12 +763,12 @@ void __fenix_finalize()
void __fenix_finalize_spare()
{
fenix.fenix_init_flag = 0;
- int ret = PMPI_Barrier(fenix.world);
+ int ret = PMPI_Barrier(*fenix.world);
if (ret != MPI_SUCCESS) { debug_print("MPI_Barrier: %d\n", ret); }
MPI_Op_free(&fenix.agree_op);
- MPI_Comm_set_errhandler(fenix.world, MPI_ERRORS_ARE_FATAL);
- MPI_Comm_free(&fenix.world);
+ MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL);
+ MPI_Comm_free(fenix.world);
/* Free callbacks */
__fenix_callback_destroy( fenix.callback_list );
@@ -747,7 +785,6 @@ void __fenix_finalize_spare()
void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...)
{
-
int ret_repair;
int index;
int ret = *pret;
@@ -758,10 +795,10 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...)
switch (ret) {
case MPI_ERR_PROC_FAILED_PENDING:
case MPI_ERR_PROC_FAILED:
- MPIX_Comm_revoke(fenix.world);
+ MPIX_Comm_revoke(*fenix.world);
MPIX_Comm_revoke(fenix.new_world);
-
- MPIX_Comm_revoke(*fenix.user_world);
+
+ if(fenix.user_world_exists) MPIX_Comm_revoke(*fenix.user_world);
__fenix_comm_list_destroy();
@@ -785,7 +822,7 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...)
return;
break;
#ifdef MPICH
- MPIX_Comm_revoke(fenix.world);
+ MPIX_Comm_revoke(*fenix.world);
MPIX_Comm_revoke(fenix.new_world);
//MPIX_Comm_revoke(*fenix.user_world);
fenix.repair_result = __fenix_repair_ranks();
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..c4f2e92
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_subdirectory(subset_internal)
+add_subdirectory(subset_merging)
+add_subdirectory(request_tracking)
+add_subdirectory(request_cancelled)
+add_subdirectory(no_jump)
+add_subdirectory(issend)
+add_subdirectory(failed_spares)
diff --git a/test/failed_spares/CMakeLists.txt b/test/failed_spares/CMakeLists.txt
index 96827f3..8fd95b3 100644
--- a/test/failed_spares/CMakeLists.txt
+++ b/test/failed_spares/CMakeLists.txt
@@ -8,8 +8,8 @@
# directory.
#
-#set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_failed_spares fenix_failed_spares.c)
-target_link_libraries(fenix_failed_spares fenix ${MPI_C_LIBRARIES})
+target_link_libraries(fenix_failed_spares fenix MPI::MPI_C)
+
add_test(NAME failed_spares
- COMMAND mpirun --with-ft mpi -n 6 fenix_failed_spares 3 1 3 4 )
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 6 ${MPIEXEC_PREFLAGS} fenix_failed_spares ${MPIEXEC_POSTFLAGS} 3 1 3 4 )
diff --git a/test/issend/CMakeLists.txt b/test/issend/CMakeLists.txt
index c4f6918..f141d40 100644
--- a/test/issend/CMakeLists.txt
+++ b/test/issend/CMakeLists.txt
@@ -8,8 +8,7 @@
# directory.
#
-set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_issend_test fenix_issend_test.c)
-target_link_libraries(fenix_issend_test fenix ${MPI_C_LIBRARIES})
+target_link_libraries(fenix_issend_test fenix MPI::MPI_C)
-add_test(NAME issend COMMAND mpirun --with-ft mpi -np 5 fenix_issend_test "1")
+add_test(NAME issend COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_issend_test ${MPIEXEC_POSTFLAGS} "1")
diff --git a/test/no_jump/CMakeLists.txt b/test/no_jump/CMakeLists.txt
index b3258dd..dfc9311 100644
--- a/test/no_jump/CMakeLists.txt
+++ b/test/no_jump/CMakeLists.txt
@@ -8,8 +8,7 @@
# directory.
#
-set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_no_jump_test fenix_no_jump_test.c)
-target_link_libraries(fenix_no_jump_test fenix ${MPI_C_LIBRARIES})
+target_link_libraries(fenix_no_jump_test fenix MPI::MPI_C)
-add_test(NAME no_jump COMMAND mpirun --with-ft mpi -np 5 fenix_no_jump_test "1")
+add_test(NAME no_jump COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_no_jump_test ${MPIEXEC_POSTFLAGS} "1")
diff --git a/test/request_cancelled/CMakeLists.txt b/test/request_cancelled/CMakeLists.txt
index a59af59..97dd331 100644
--- a/test/request_cancelled/CMakeLists.txt
+++ b/test/request_cancelled/CMakeLists.txt
@@ -8,8 +8,7 @@
# directory.
#
-set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_request_cancelled_test fenix_req_cancelled_test.c)
-target_link_libraries(fenix_request_cancelled_test fenix ${MPI_C_LIBRARIES})
+target_link_libraries(fenix_request_cancelled_test fenix MPI::MPI_C)
-add_test(NAME request_cancelled COMMAND mpirun --with-ft mpi -np 5 fenix_request_cancelled_test "1")
+add_test(NAME request_cancelled COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 5 ${MPIEXEC_PREFLAGS} fenix_request_cancelled_test ${MPIEXEC_POSTFLAGS} "1")
diff --git a/test/request_tracking/CMakeLists.txt b/test/request_tracking/CMakeLists.txt
index c8269b2..8d008ed 100644
--- a/test/request_tracking/CMakeLists.txt
+++ b/test/request_tracking/CMakeLists.txt
@@ -8,9 +8,8 @@
# directory.
#
-set (CMAKE_BUILD_TYPE Debug)
add_executable(fenix_request_tracking_test fenix_request_tracking_test.c)
-target_link_libraries(fenix_request_tracking_test fenix ${MPI_C_LIBRARIES})
+target_link_libraries(fenix_request_tracking_test fenix MPI::MPI_C)
add_test(NAME request_tracking
- COMMAND mpirun -np 3 fenix_request_tracking_test)
+ COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 3 ${MPIEXEC_PREFLAGS} fenix_request_tracking_test ${MPIEXEC_POSTFLAGS})
diff --git a/test/subset_internal/CMakeLists.txt b/test/subset_internal/CMakeLists.txt
index 24b6190..4dcfc28 100644
--- a/test/subset_internal/CMakeLists.txt
+++ b/test/subset_internal/CMakeLists.txt
@@ -7,7 +7,6 @@
# For more information, see the LICENSE file in the top Fenix
# directory.
#
-set (CMAKE_BUILD_TYPE Debug)
add_executable(fenix_subset_internal_test fenix_subset_internal_test.c)
target_link_libraries(fenix_subset_internal_test fenix)
diff --git a/test/subset_merging/CMakeLists.txt b/test/subset_merging/CMakeLists.txt
index c6d5e46..603686e 100644
--- a/test/subset_merging/CMakeLists.txt
+++ b/test/subset_merging/CMakeLists.txt
@@ -8,7 +8,6 @@
# directory.
#
-set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_subset_merging_test fenix_subset_merging_test.c)
target_link_libraries(fenix_subset_merging_test fenix)