From 0798fd38ddbe27876922918b539ff8c860781b3b Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 12 Feb 2020 10:22:48 -0800 Subject: [PATCH 01/32] Add ability to query which processes failed --- examples/01_hello_world/fenix/CMakeLists.txt | 2 +- .../01_hello_world/fenix/fenix_hello_world.c | 13 ++++ examples/02_send_recv/fenix/CMakeLists.txt | 2 +- include/fenix.h | 2 + include/fenix_ext.h | 3 + src/fenix.c | 5 ++ src/fenix_process_recovery.c | 62 +++++++++++-------- 7 files changed, 60 insertions(+), 29 deletions(-) diff --git a/examples/01_hello_world/fenix/CMakeLists.txt b/examples/01_hello_world/fenix/CMakeLists.txt index 22658d2..74bae8c 100644 --- a/examples/01_hello_world/fenix/CMakeLists.txt +++ b/examples/01_hello_world/fenix/CMakeLists.txt @@ -16,5 +16,5 @@ if(BUILD_TESTING) add_executable(fenix_hello_world-debug fenix_hello_world.c) target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME hello_world - COMMAND mpirun --oversubscribe -np 3 fenix_hello_world-debug "1") + COMMAND mpirun -np 3 fenix_hello_world-debug "1") endif() diff --git a/examples/01_hello_world/fenix/fenix_hello_world.c b/examples/01_hello_world/fenix/fenix_hello_world.c index 374a80f..cd6378b 100644 --- a/examples/01_hello_world/fenix/fenix_hello_world.c +++ b/examples/01_hello_world/fenix/fenix_hello_world.c @@ -108,6 +108,19 @@ int main(int argc, char **argv) { printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n", processor_name, old_rank, new_rank, new_world_size, old_world_size); + + int *fails, num_fails; + num_fails = Fenix_Process_fail_list(&fails); + + char fails_str[100]; + sprintf(fails_str, "Rank %d sees failed processes [", new_rank); + for(int i = 0; i < num_fails; i++){ + sprintf(fails_str, "%s%s%d", fails_str, (i==0 ? "" : ", "), fails[i]); + } + sprintf(fails_str, "%s]\n", fails_str); + printf(fails_str); + + Fenix_Finalize(); MPI_Finalize(); diff --git a/examples/02_send_recv/fenix/CMakeLists.txt b/examples/02_send_recv/fenix/CMakeLists.txt index f3f197f..a664f82 100644 --- a/examples/02_send_recv/fenix/CMakeLists.txt +++ b/examples/02_send_recv/fenix/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_ring-debug fenix_ring.c) target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME ring - COMMAND mpirun --oversubscribe -np 5 fenix_ring-debug 1 2) + COMMAND mpirun -np 5 fenix_ring-debug 1 2) set_tests_properties(ring PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/include/fenix.h b/include/fenix.h index 94d1130..549c63f 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -216,6 +216,8 @@ int Fenix_Data_group_delete(int group_id); int Fenix_Data_member_delete(int group_id, int member_id); +int Fenix_Process_fail_list(int** fail_list); + #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/include/fenix_ext.h b/include/fenix_ext.h index 9e92454..ec0e3cb 100644 --- a/include/fenix_ext.h +++ b/include/fenix_ext.h @@ -81,6 +81,9 @@ typedef struct { int role; // Role of rank: initial, survivor or repair int fenix_init_flag; + int fail_world_size; + int* fail_world; + fenix_request_store_t request_store; fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions diff --git a/src/fenix.c b/src/fenix.c index 022ec1d..b5865eb 100644 --- a/src/fenix.c +++ b/src/fenix.c @@ -181,3 +181,8 @@ int Fenix_Data_group_delete(int group_id) { int Fenix_Data_member_delete(int group_id, int member_id) { return __fenix_member_delete(group_id, member_id); } + +int Fenix_Process_fail_list(int** fail_list){ + *fail_list = fenix.fail_world; + return fenix.fail_world_size; +} diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c index 0f8b1fb..6888081 100644 --- a/src/fenix_process_recovery.c +++ b/src/fenix_process_recovery.c @@ -96,6 +96,7 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha fenix.spawn_policy = spawn; fenix.recover_environment = jump_environment; fenix.role = FENIX_ROLE_INITIAL_RANK; + fenix.fail_world_size = 0; fenix.resume_mode = __FENIX_RESUME_AT_INIT; fenix.repair_result = 0; @@ -313,23 +314,23 @@ int __fenix_repair_ranks() current_rank = __fenix_get_current_rank(*fenix.world); survivor_world_size = __fenix_get_world_size(world_without_failures); world_size = __fenix_get_world_size(*fenix.world); - fail_world_size = world_size - survivor_world_size; + fenix.fail_world_size = world_size - survivor_world_size; if (fenix.options.verbose == 2) { verbose_print( "current_rank: %d, role: %d, world_size: %d, fail_world_size: %d, survivor_world_size: %d\n", __fenix_get_current_rank(*fenix.world), fenix.role, world_size, - fail_world_size, survivor_world_size); + fenix.fail_world_size, survivor_world_size); } - if (fenix.spare_ranks < fail_world_size) { + if (fenix.spare_ranks < fenix.fail_world_size) { /* Not enough spare ranks */ if (fenix.options.verbose == 2) { verbose_print( "current_rank: %d, role: %d, spare_ranks: %d, fail_world_size: %d\n", __fenix_get_current_rank(*fenix.world), fenix.role, fenix.spare_ranks, - fail_world_size); + fenix.fail_world_size); } if (fenix.spawn_policy == 1) { @@ -393,22 +394,25 @@ int __fenix_repair_ranks() fenix.num_inital_ranks = 0; /* recovered ranks must be the number of spare ranks */ - fenix.num_recovered_ranks = fail_world_size; + fenix.num_recovered_ranks = fenix.fail_world_size; if (fenix.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, recovered_ranks: %d\n", __fenix_get_current_rank(*fenix.world), fenix.role, fenix.num_recovered_ranks); } - - fail_world = (int *) s_malloc(fail_world_size * sizeof(int)); - fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, - fail_world_size); + + if(fenix.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix.fail_world); + } + fenix.fail_world = (int *) s_malloc(fenix.fail_world_size * sizeof(int)); + fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, + fenix.fail_world_size); if (fenix.options.verbose == 2) { int index; - for (index = 0; index < fail_world_size; index++) { - verbose_print("fail_world[%d]: %d\n", index, fail_world[index]); + for (index = 0; index < fenix.fail_world_size; index++) { + verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]); } } @@ -425,17 +429,15 @@ int __fenix_repair_ranks() /* Assign new rank for reordering */ if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - if (rank_offset < fail_world_size) { + if (rank_offset < fenix.fail_world_size) { if (fenix.options.verbose == 11) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fail_world[rank_offset]); + current_rank, fenix.fail_world[rank_offset]); } - current_rank = fail_world[rank_offset]; + current_rank = fenix.fail_world[rank_offset]; } } - free(fail_world); - /************************************/ /* Update the number of spare ranks */ /************************************/ @@ -488,16 +490,20 @@ int __fenix_repair_ranks() } fenix.num_inital_ranks = 0; - fenix.num_recovered_ranks = fail_world_size; + fenix.num_recovered_ranks = fenix.fail_world_size; + + if(fenix.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix.fail_world); + } - fail_world = (int *) s_malloc(fail_world_size * sizeof(int)); - fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fail_world_size); + fenix.fail_world = (int *) s_malloc(fenix.fail_world_size * sizeof(int)); + fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fenix.fail_world_size); free(survivor_world); if (fenix.options.verbose == 2) { int index; - for (index = 0; index < fail_world_size; index++) { - verbose_print("fail_world[%d]: %d\n", index, fail_world[index]); + for (index = 0; index < fenix.fail_world_size; index++) { + verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]); } } @@ -510,21 +516,19 @@ int __fenix_repair_ranks() if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - if (rank_offset < fail_world_size) { + if (rank_offset < fenix.fail_world_size) { if (fenix.options.verbose == 2) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fail_world[rank_offset]); + current_rank, fenix.fail_world[rank_offset]); } - current_rank = fail_world[rank_offset]; + current_rank = fenix.fail_world[rank_offset]; } } - free(fail_world); - /************************************/ /* Update the number of spare ranks */ /************************************/ - fenix.spare_ranks = fenix.spare_ranks - fail_world_size; + fenix.spare_ranks = fenix.spare_ranks - fenix.fail_world_size; if (fenix.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, spare_ranks: %d\n", __fenix_get_current_rank(*fenix.world), fenix.role, @@ -695,6 +699,10 @@ void __fenix_finalize() free( fenix.world ); free( fenix.new_world ); + if(fenix.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix.fail_world); + } + /* Free Callbacks */ __fenix_callback_destroy( fenix.callback_list ); From 3582c7e9d9b63840acaca66c58ddc628d7367cb0 Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Tue, 18 Feb 2020 19:04:52 -0800 Subject: [PATCH 02/32] Add support for MPI_Test --- src/fenix_mpi_override.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index 559230c..f6b47b3 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -274,13 +274,14 @@ int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status) { int ret; MPI_Request request = MPI_REQUEST_NULL; - if(*fenix_request != MPI_REQUEST_NULL) + if(*fenix_request != MPI_REQUEST_NULL){ __fenix_request_store_get(&fenix.request_store, *((int *) fenix_request), &request); + } ret = PMPI_Wait(&request, status); - if(ret == MPI_SUCCESS) { + if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL)) { __fenix_request_store_remove(&fenix.request_store, *((int *) fenix_request)); assert(request == MPI_REQUEST_NULL); @@ -297,11 +298,13 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[], { // The list (array_of_requests) may contain null or inactive handles. int ret, i; - for(i=0 ; i Date: Thu, 21 May 2020 06:18:27 -0700 Subject: [PATCH 03/32] Add support for testing pre-failure requests --- CMakeLists.txt | 21 +-- include/fenix.h | 1 + src/fenix_mpi_override.c | 31 +++- src/fenix_process_recovery.c | 1 + src/fenix_request_store.c | 25 ++-- src/fenix_request_store.h | 57 +++++++- test/request_cancelled/CMakeLists.txt | 15 ++ .../fenix_req_cancelled_test.c | 138 ++++++++++++++++++ 8 files changed, 263 insertions(+), 26 deletions(-) create mode 100644 test/request_cancelled/CMakeLists.txt create mode 100644 test/request_cancelled/fenix_req_cancelled_test.c diff --git a/CMakeLists.txt b/CMakeLists.txt index e90822f..aab81f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_BUILD_TYPE Release) -#set(CMAKE_BUILD_TYPE Debug) +#set(CMAKE_BUILD_TYPE Release) +set(CMAKE_BUILD_TYPE Debug) #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb") #ENABLE_TESTING @@ -98,15 +98,16 @@ if(BUILD_EXAMPLES) elseif(BUILD_TESTING) #Some examples are useful tests as well. - add_subdirectory(examples/01_hello_world/fenix) - add_subdirectory(examples/02_send_recv/fenix) - add_subdirectory(examples/03_reduce/fenix) - add_subdirectory(examples/05_subset_create) - add_subdirectory(examples/06_subset_createv) + #add_subdirectory(examples/01_hello_world/fenix) + #add_subdirectory(examples/02_send_recv/fenix) + #add_subdirectory(examples/03_reduce/fenix) + #add_subdirectory(examples/05_subset_create) + #add_subdirectory(examples/06_subset_createv) endif() if(BUILD_TESTING) - add_subdirectory(test/subset_internal) - add_subdirectory(test/subset_merging) - add_subdirectory(test/request_tracking) + #add_subdirectory(test/subset_internal) + #add_subdirectory(test/subset_merging) + #add_subdirectory(test/request_tracking) + add_subdirectory(test/request_cancelled) endif() diff --git a/include/fenix.h b/include/fenix.h index 549c63f..81cb8cd 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -88,6 +88,7 @@ extern "C" { #define FENIX_ERROR_SUBSET_STRIDE -25 #define FENIX_ERROR_NODATA_FOUND -30 #define FENIX_ERROR_INTERN -40 +#define FENIX_ERROR_CANCELLED -50 #define FENIX_WARNING_SPARE_RANKS_DEPLETED 100 #define FENIX_WARNING_PARTIAL_RESTORE 101 diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index f6b47b3..b75dbde 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -337,23 +337,48 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[], int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) { int ret; + int is_cancelled = 0; MPI_Request real_req = MPI_REQUEST_NULL; + if(*request != MPI_REQUEST_NULL){ - __fenix_request_store_get(&fenix.request_store, *((int*)request), &real_req); + if(*request == FENIX_REQUEST_CANCELLED){ + is_cancelled = 1; + } else { + int retval = + __fenix_request_store_get(&fenix.request_store, *((int*)request), &real_req); + + if(retval == FENIX_ERROR_CANCELLED) { + is_cancelled = 1; + } + + if(retval == FENIX_REQUEST_COMPLETED){ + *flag = 1; + if(status != MPI_STATUS_IGNORE) + __fenix_request_store_get_status(&fenix.request_store, *((int*)request), status); + *request = MPI_REQUEST_NULL; + return; + } + } } else { fprintf(stderr, "Found null request!\n"); } + ret = PMPI_Test(&real_req, flag, status); __fenix_test_MPI_inline(ret, "MPI_Test"); - if(*flag && *request != MPI_REQUEST_NULL && ret == MPI_SUCCESS){ + if(*flag && *request != MPI_REQUEST_NULL && *request != FENIX_REQUEST_CANCELLED && ret == MPI_SUCCESS){ //This request is done, it can be removed from the store. __fenix_request_store_remove(&fenix.request_store, *((int*)request)); *request = MPI_REQUEST_NULL; } - return ret; + if(is_cancelled){ + *request = FENIX_REQUEST_CANCELLED; + return FENIX_ERROR_CANCELLED; + } + + else return ret; } int MPI_Cancel(MPI_Request *request) diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c index 6888081..cfe528e 100644 --- a/src/fenix_process_recovery.c +++ b/src/fenix_process_recovery.c @@ -761,6 +761,7 @@ void __fenix_test_MPI(int ret, const char *msg) } __fenix_request_store_waitall_removeall(&fenix.request_store); + __fenix_comm_list_destroy(); diff --git a/src/fenix_request_store.c b/src/fenix_request_store.c index fb222be..0038bf0 100644 --- a/src/fenix_request_store.c +++ b/src/fenix_request_store.c @@ -56,23 +56,28 @@ #include #include "fenix_request_store.h" +#include "fenix_ext.h" void __fenix_request_store_waitall_removeall(fenix_request_store_t *s) { int i; for(i=0 ; ifirst_unused_position ; i++) { __fenix_request_t *f = &(s->reqs.elements[i]); - if(f->valid) { + if(f->valid && !f->cancelled) { #warning "What to do with requests upon failure? Wait or Cancel?" - PMPI_Cancel(&(f->r)); - if(i == MPI_REQUEST_NULL) // This may look ugly and - // produce a warning, but it is - // necessary to make sure an - // MPI_Request NULL does not - // collide in the request store - __fenix_request_store_remove(s, -123); - else - __fenix_request_store_remove(s, i); + int rank; + MPI_Comm_rank(*fenix.new_world, &rank); + int flag; + int ret = PMPI_Test(&(f->r), &flag, &(f->status)); + + if(!flag || ret != MPI_SUCCESS){ + //This request wasn't able to finish before the failure + //We cancel it, and notify the user that it was cancelled + //PMPI_Cancel(&(f->r)); + f->cancelled = 1; + } else { + f->completed = 1; + } } } diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index 2c85a9a..56050b5 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -64,6 +64,7 @@ #include #include "fenix_stack.h" +#include "fenix.h" /* @@ -84,12 +85,18 @@ */ - typedef struct { char valid; + char cancelled; + char completed; //This should only be set if the request was completed + //prior to a failure, before the user could MPI_Test the call + MPI_Status status; //This is as "completed" MPI_Request r; } __fenix_request_t; +#define FENIX_REQUEST_CANCELLED ((MPI_Request)((int)MPI_REQUEST_NULL+1)) +#define FENIX_REQUEST_COMPLETED 180 + #define __fenix_dynamic_array_type __fenix_request_t #define __fenix_dynamic_array_typename req #include "fenix_dynamic_array.h" @@ -141,6 +148,7 @@ int __fenix_request_store_add(fenix_request_store_t *s, assert(!f->valid); memcpy(&(f->r), r, sizeof(MPI_Request)); f->valid = 1; + f->cancelled = 0; // Cannot return a position that is equivalent to MPI_REQUEST_NULL MPI_Request r_test; @@ -152,11 +160,18 @@ int __fenix_request_store_add(fenix_request_store_t *s, assert(r_test != MPI_REQUEST_NULL); } } + if(r_test == FENIX_REQUEST_CANCELLED) { + position = -124; + { + *((int *)&r_test) = position; + assert(r_test != FENIX_REQUEST_CANCELLED); + } + } return position; } static inline -void __fenix_request_store_get(fenix_request_store_t *s, +int __fenix_request_store_get(fenix_request_store_t *s, int request_id, MPI_Request *r) { @@ -169,10 +184,46 @@ void __fenix_request_store_get(fenix_request_store_t *s, MPI_Request r_test = MPI_REQUEST_NULL; request_id = *((int*) &r_test); } + if(request_id == -124) { + MPI_Request r_test = FENIX_REQUEST_CANCELLED; + request_id = *((int*) &r_test); + } + MPI_Request r_test; + *((int *)&r_test) = request_id; + if(r_test == FENIX_REQUEST_CANCELLED){ + *r = MPI_REQUEST_NULL; + return FENIX_ERROR_CANCELLED; + } __fenix_request_t *f = &(s->reqs.elements[request_id]); assert(f->valid); memcpy(r, &(f->r), sizeof(MPI_Request)); - assert(*r != MPI_REQUEST_NULL); + + if(f->cancelled) return FENIX_ERROR_CANCELLED; + if(f->completed) return FENIX_REQUEST_COMPLETED; + else return FENIX_SUCCESS; +} + +static inline +void __fenix_request_store_get_status(fenix_request_store_t *s, + int request_id, + MPI_Status *status){ + { + MPI_Request r_test; + *((int *)&r_test) = request_id; + assert(r_test != MPI_REQUEST_NULL); + } + if(request_id == -123) { + MPI_Request r_test = MPI_REQUEST_NULL; + request_id = *((int*) &r_test); + } + if(request_id == -124) { + MPI_Request r_test = FENIX_REQUEST_CANCELLED; + request_id = *((int*) &r_test); + } + + __fenix_request_t *f = &(s->reqs.elements[request_id]); + assert(f->completed); + memcpy(status, &(f->status), sizeof(MPI_Status)); } static inline diff --git a/test/request_cancelled/CMakeLists.txt b/test/request_cancelled/CMakeLists.txt new file mode 100644 index 0000000..b106e97 --- /dev/null +++ b/test/request_cancelled/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# This file is part of Fenix +# Copyright (c) 2016 Rutgers University and Sandia Corporation. +# This software is distributed under the BSD License. +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# For more information, see the LICENSE file in the top Fenix +# directory. +# + +set(CMAKE_BUILD_TYPE Debug) +add_executable(fenix_request_cancelled_test fenix_req_cancelled_test.c) +target_link_libraries(fenix_request_cancelled_test fenix ${MPI_C_LIBRARIES}) + +add_test(NAME request_cancelled COMMAND mpirun -np 5 fenix_request_cancelled_test "1") diff --git a/test/request_cancelled/fenix_req_cancelled_test.c b/test/request_cancelled/fenix_req_cancelled_test.c new file mode 100644 index 0000000..480497b --- /dev/null +++ b/test/request_cancelled/fenix_req_cancelled_test.c @@ -0,0 +1,138 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include +#include + +const int kKillID = 1; + +int main(int argc, char **argv) { + +#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too." + if (argc < 2) { + printf("Usage: %s <# spare ranks> \n", *argv); + exit(0); + } + + int old_world_size, new_world_size = - 1; + int old_rank = 1, new_rank = - 1; + int spare_ranks = atoi(argv[1]); + int buffer; + + MPI_Init(&argc, &argv); + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Comm world_comm; + MPI_Comm_dup(MPI_COMM_WORLD, &world_comm); + MPI_Comm_size(world_comm, &old_world_size); + MPI_Comm_rank(world_comm, &old_rank); + + int fenix_status; + int recovered = 0; + MPI_Comm new_comm; + int error; + MPI_Request req = MPI_REQUEST_NULL; + Fenix_Init(&fenix_status, world_comm, &new_comm, &argc, &argv, spare_ranks, 0, MPI_INFO_NULL, &error); + + MPI_Comm_size(new_comm, &new_world_size); + MPI_Comm_rank(new_comm, &new_rank); + + if (fenix_status != FENIX_ROLE_INITIAL_RANK) { + recovered = 1; + } else { + MPI_Irecv(&buffer, 1, MPI_INT, (new_rank+1)%new_world_size, 1, new_comm, &req); + //Kill rank dies before being able to send + if(new_rank == 0 || new_rank == 2) MPI_Send(&buffer, 1, MPI_INT, old_rank==0 ? new_world_size-1 : new_rank-1, 1, new_comm); + MPI_Barrier(new_comm); + } + + + if (old_rank == kKillID && recovered == 0) { + pid_t pid = getpid(); + kill(pid, SIGTERM); + } + + + MPI_Barrier(new_comm); + + //After recovery, the slow ranks send + if(new_rank == 1 || new_rank == 3 ) MPI_Send(&buffer, 1, MPI_INT, new_rank==0 ? new_world_size-1 : new_rank-1, 1, new_comm); + + MPI_Barrier(new_comm); //Lots of barriers to demonstrate a specific ordering of events. + + //Check result of old requests - cannot wait, must MPI_Test only on old pre-failure requests for now + if(new_rank != kKillID){ + int flag; + int ret = MPI_Test(&req, &flag, MPI_STATUS_IGNORE); + if(!flag || ret == FENIX_ERROR_CANCELLED){ + printf("Rank %d's request was NOT satisfied before the failure\n", new_rank); + MPI_Irecv(&buffer, 1, MPI_INT, (new_rank+1)%new_world_size, 1, new_comm, &req); //We can re-launch the IRecv if we know the + //other ranks are going to send now + } else { + printf("Rank %d's request was satisfied before the failure\n", new_rank); + } + + } + + Fenix_Finalize(); + MPI_Finalize(); + + return 0; +} From cee633d88b87b489181f596b5b60d8b2405634d1 Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 27 May 2020 11:43:18 -0700 Subject: [PATCH 04/32] Fix bug when ERR_PROC_FAILED/ERR_REVOKED discovered in MPI_Test --- CMakeLists.txt | 16 ++++++++-------- src/fenix_data_recovery.c | 4 ++-- src/fenix_mpi_override.c | 7 ++++++- src/fenix_request_store.c | 1 - src/fenix_request_store.h | 30 ++++++++++++++++++++++++++++++ 5 files changed, 46 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aab81f8..d142bb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,16 +98,16 @@ if(BUILD_EXAMPLES) elseif(BUILD_TESTING) #Some examples are useful tests as well. - #add_subdirectory(examples/01_hello_world/fenix) - #add_subdirectory(examples/02_send_recv/fenix) - #add_subdirectory(examples/03_reduce/fenix) - #add_subdirectory(examples/05_subset_create) - #add_subdirectory(examples/06_subset_createv) + add_subdirectory(examples/01_hello_world/fenix) + add_subdirectory(examples/02_send_recv/fenix) + add_subdirectory(examples/03_reduce/fenix) + add_subdirectory(examples/05_subset_create) + add_subdirectory(examples/06_subset_createv) endif() if(BUILD_TESTING) - #add_subdirectory(test/subset_internal) - #add_subdirectory(test/subset_merging) - #add_subdirectory(test/request_tracking) + add_subdirectory(test/subset_internal) + add_subdirectory(test/subset_merging) + add_subdirectory(test/request_tracking) add_subdirectory(test/request_cancelled) endif() diff --git a/src/fenix_data_recovery.c b/src/fenix_data_recovery.c index 56778b9..ab6c473 100644 --- a/src/fenix_data_recovery.c +++ b/src/fenix_data_recovery.c @@ -103,7 +103,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, /* If so, recover the data and set the recovery */ /* for member recovery. */ - int i, group_position; + int i; int remote_need_recovery; fenix_group_t *group; MPI_Status status; @@ -149,7 +149,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, } else { /* Already created. Renew the MPI communicator */ - group = ( data_recovery->group[group_position] ); + group = ( data_recovery->group[group_index] ); group->comm = comm; /* Renew communicator */ MPI_Comm_rank(comm, &(group->current_rank)); diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index b75dbde..9b0b035 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -238,7 +238,9 @@ int MPI_Sendrecv(MPI_CONST_TYPE void* sendbuf, int sendcount, static inline void __fenix_override_request(int ret, MPI_Request *request) { - if(ret != MPI_SUCCESS) return; + if(ret != MPI_SUCCESS) { + return; + } assert(*request != MPI_REQUEST_NULL); @@ -365,6 +367,9 @@ int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) ret = PMPI_Test(&real_req, flag, status); + if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ + __fenix_request_store_cancel(&fenix.request_store, *((int*)request)); + } __fenix_test_MPI_inline(ret, "MPI_Test"); if(*flag && *request != MPI_REQUEST_NULL && *request != FENIX_REQUEST_CANCELLED && ret == MPI_SUCCESS){ diff --git a/src/fenix_request_store.c b/src/fenix_request_store.c index 0038bf0..2878845 100644 --- a/src/fenix_request_store.c +++ b/src/fenix_request_store.c @@ -81,6 +81,5 @@ void __fenix_request_store_waitall_removeall(fenix_request_store_t *s) } } - s->first_unused_position = 0; __fenix_int_stack_clear(&(s->freed_list)); } diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index 56050b5..972e358 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -203,6 +203,36 @@ int __fenix_request_store_get(fenix_request_store_t *s, else return FENIX_SUCCESS; } +static inline +int __fenix_request_store_cancel(fenix_request_store_t *s, + int request_id) +{ + { + MPI_Request r_test; + *((int *)&r_test) = request_id; + assert(r_test != MPI_REQUEST_NULL); + } + if(request_id == -123) { + MPI_Request r_test = MPI_REQUEST_NULL; + request_id = *((int*) &r_test); + } + if(request_id == -124) { + MPI_Request r_test = FENIX_REQUEST_CANCELLED; + request_id = *((int*) &r_test); + } + MPI_Request r_test; + *((int *)&r_test) = request_id; + if(r_test == FENIX_REQUEST_CANCELLED){ + return FENIX_ERROR_CANCELLED; + } + + __fenix_request_t *f = &(s->reqs.elements[request_id]); + assert(f->valid); + f->cancelled = 1; + + return FENIX_SUCCESS; +} + static inline void __fenix_request_store_get_status(fenix_request_store_t *s, int request_id, From e4c6a3f2ea2d208d72c7f406c8d070b267fc060d Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 27 May 2020 12:45:14 -0700 Subject: [PATCH 05/32] Fix MPI_Wait w/ cancelled requests --- src/fenix_mpi_override.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index 9b0b035..e7a7506 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -274,22 +274,48 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status) { - int ret; + int ret, is_cancelled = 1; MPI_Request request = MPI_REQUEST_NULL; if(*fenix_request != MPI_REQUEST_NULL){ - __fenix_request_store_get(&fenix.request_store, - *((int *) fenix_request), - &request); + if(*fenix_request == FENIX_REQUEST_CANCELLED){ + is_cancelled = 1; + } else { + int retval = + __fenix_request_store_get(&fenix.request_store, *((int*)fenix_request), &request); + + if(retval == FENIX_ERROR_CANCELLED) { + is_cancelled = 1; + } + + if(retval == FENIX_REQUEST_COMPLETED){ + if(status != MPI_STATUS_IGNORE) + __fenix_request_store_get_status(&fenix.request_store, *((int*)fenix_request), status); + *fenix_request = MPI_REQUEST_NULL; + return; + } + } } ret = PMPI_Wait(&request, status); - if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL)) { + + if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL) && (*fenix_request != FENIX_REQUEST_CANCELLED)) { __fenix_request_store_remove(&fenix.request_store, *((int *) fenix_request)); assert(request == MPI_REQUEST_NULL); - *fenix_request = MPI_REQUEST_NULL; + *fenix_request = MPI_REQUEST_NULL; + } + if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ +fprintf(stderr, "Cancelling request from MPI_Wait\n"); + __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), *status); + *fenix_request = FENIX_REQUEST_CANCELLED; } __fenix_test_MPI_inline(ret, "MPI_Wait"); + + + if(is_cancelled){ + *fenix_request = FENIX_REQUEST_CANCELLED; + return FENIX_ERROR_CANCELLED; + } return ret; } @@ -368,7 +394,7 @@ int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) ret = PMPI_Test(&real_req, flag, status); if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ - __fenix_request_store_cancel(&fenix.request_store, *((int*)request)); + __fenix_request_store_cancel(&fenix.request_store, *((int*)request), *status); } __fenix_test_MPI_inline(ret, "MPI_Test"); From 42814f62f140626bf7e677a3959cf376f16a1784 Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 27 May 2020 15:37:20 -0700 Subject: [PATCH 06/32] Add missing file to commit --- src/fenix_request_store.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index 972e358..074e18b 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -205,7 +205,8 @@ int __fenix_request_store_get(fenix_request_store_t *s, static inline int __fenix_request_store_cancel(fenix_request_store_t *s, - int request_id) + int request_id, + MPI_Status status) { { MPI_Request r_test; @@ -229,6 +230,7 @@ int __fenix_request_store_cancel(fenix_request_store_t *s, __fenix_request_t *f = &(s->reqs.elements[request_id]); assert(f->valid); f->cancelled = 1; + f->status = status; return FENIX_SUCCESS; } From 58e02c86d70101506d531202f304fb58bd6e5720 Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 27 May 2020 16:40:46 -0700 Subject: [PATCH 07/32] Fix bug with MPI_STATUS_IGNORE --- src/fenix_mpi_override.c | 7 ++----- src/fenix_request_store.h | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index e7a7506..cfeaefe 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -305,8 +305,7 @@ int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status) *fenix_request = MPI_REQUEST_NULL; } if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ -fprintf(stderr, "Cancelling request from MPI_Wait\n"); - __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), *status); + __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), status); *fenix_request = FENIX_REQUEST_CANCELLED; } __fenix_test_MPI_inline(ret, "MPI_Wait"); @@ -387,14 +386,12 @@ int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) return; } } - } else { - fprintf(stderr, "Found null request!\n"); } ret = PMPI_Test(&real_req, flag, status); if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ - __fenix_request_store_cancel(&fenix.request_store, *((int*)request), *status); + __fenix_request_store_cancel(&fenix.request_store, *((int*)request), status); } __fenix_test_MPI_inline(ret, "MPI_Test"); diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index 074e18b..a6dbd00 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -206,7 +206,7 @@ int __fenix_request_store_get(fenix_request_store_t *s, static inline int __fenix_request_store_cancel(fenix_request_store_t *s, int request_id, - MPI_Status status) + MPI_Status *status) { { MPI_Request r_test; @@ -230,7 +230,7 @@ int __fenix_request_store_cancel(fenix_request_store_t *s, __fenix_request_t *f = &(s->reqs.elements[request_id]); assert(f->valid); f->cancelled = 1; - f->status = status; + if(status != NULL && status != MPI_STATUS_IGNORE) f->status = *status; return FENIX_SUCCESS; } From 3071f296392030cae765b9406285264476a04d9d Mon Sep 17 00:00:00 2001 From: "mwhitlo@sandia.gov" Date: Wed, 24 Jun 2020 08:15:28 -0700 Subject: [PATCH 08/32] Fix another bug with MPI_Test --- src/fenix_mpi_override.c | 3 +++ src/fenix_request_store.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index cfeaefe..7e686e5 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -267,6 +267,7 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int ret; ret = PMPI_Irecv(buf, count, datatype, source, tag, __fenix_replace_comm(comm), request); + __fenix_override_request(ret, request); __fenix_test_MPI_inline(ret, "MPI_Irecv"); return ret; @@ -392,7 +393,9 @@ int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status) ret = PMPI_Test(&real_req, flag, status); if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ __fenix_request_store_cancel(&fenix.request_store, *((int*)request), status); + *request = FENIX_REQUEST_CANCELLED; } + __fenix_test_MPI_inline(ret, "MPI_Test"); if(*flag && *request != MPI_REQUEST_NULL && *request != FENIX_REQUEST_CANCELLED && ret == MPI_SUCCESS){ diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index a6dbd00..4d62db7 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -122,7 +122,8 @@ void __fenix_request_store_destroy(fenix_request_store_t *s) { int valid_count = 0, i; for(i=0 ; ifirst_unused_position ; i++) - if(s->reqs.elements[i].valid) valid_count++; + if(s->reqs.elements[i].valid && + !(s->reqs.elements[i].completed || s->reqs.elements[i].cancelled) ) valid_count++; if(valid_count > 0) printf("[Fenix warning] __fenix_request_store_destroy. store contains valid elements (valid elems %d, first_unused_pos %d)\n", valid_count, s->first_unused_position); __fenix_req_dynamic_array_destroy(&(s->reqs)); From 813540a1b797659227819dd2d43afec068cad83e Mon Sep 17 00:00:00 2001 From: sriraj Date: Fri, 26 Jun 2020 21:15:21 -0700 Subject: [PATCH 09/32] Add no-jump recovery option --- CMakeLists.txt | 1 + include/fenix_ext.h | 4 + src/fenix_process_recovery.c | 19 +++- test/no_jump/CMakeLists.txt | 15 ++++ test/no_jump/fenix_no_jump_test.c | 139 ++++++++++++++++++++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 test/no_jump/CMakeLists.txt create mode 100644 test/no_jump/fenix_no_jump_test.c diff --git a/CMakeLists.txt b/CMakeLists.txt index d142bb4..38fc5d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,4 +110,5 @@ if(BUILD_TESTING) add_subdirectory(test/subset_merging) add_subdirectory(test/request_tracking) add_subdirectory(test/request_cancelled) + add_subdirectory(test/no_jump) endif() diff --git a/include/fenix_ext.h b/include/fenix_ext.h index ec0e3cb..0956b81 100644 --- a/include/fenix_ext.h +++ b/include/fenix_ext.h @@ -84,6 +84,10 @@ typedef struct { int fail_world_size; int* fail_world; + //Save the pointer to role and error of Fenix_Init + int *ret_role; + int *ret_error; + fenix_request_store_t request_store; fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c index cfe528e..92e1929 100644 --- a/src/fenix_process_recovery.c +++ b/src/fenix_process_recovery.c @@ -99,6 +99,8 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha fenix.fail_world_size = 0; fenix.resume_mode = __FENIX_RESUME_AT_INIT; fenix.repair_result = 0; + fenix.ret_role = role; + fenix.ret_error = error; fenix.options.verbose = -1; // __fenix_init_opt(*argc, *argv); @@ -788,6 +790,19 @@ void __fenix_test_MPI(int ret, const char *msg) } fenix.role = FENIX_ROLE_SURVIVOR_RANK; - if(!fenix.finalized) - longjmp(*fenix.recover_environment, 1); + if(!fenix.finalized) { + switch(fenix.resume_mode) { + case __FENIX_RESUME_AT_INIT: + longjmp(*fenix.recover_environment, 1); + break; + case __FENIX_RESUME_NO_JUMP: + *(fenix.ret_role) = FENIX_ROLE_SURVIVOR_RANK; + __fenix_postinit(fenix.ret_error); + break; + default: + printf("Fenix detected error: Unknown resume mode\n"); + assert(false); + break; + } + } } diff --git a/test/no_jump/CMakeLists.txt b/test/no_jump/CMakeLists.txt new file mode 100644 index 0000000..c31efc0 --- /dev/null +++ b/test/no_jump/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# This file is part of Fenix +# Copyright (c) 2016 Rutgers University and Sandia Corporation. +# This software is distributed under the BSD License. +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# For more information, see the LICENSE file in the top Fenix +# directory. +# + +set(CMAKE_BUILD_TYPE Debug) +add_executable(fenix_no_jump_test fenix_no_jump_test.c) +target_link_libraries(fenix_no_jump_test fenix ${MPI_C_LIBRARIES}) + +add_test(NAME no_jump COMMAND mpirun -np 5 fenix_no_jump_test "1") diff --git a/test/no_jump/fenix_no_jump_test.c b/test/no_jump/fenix_no_jump_test.c new file mode 100644 index 0000000..a0fea08 --- /dev/null +++ b/test/no_jump/fenix_no_jump_test.c @@ -0,0 +1,139 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include +#include +#include + +const int kKillID = 1; + +int main(int argc, char **argv) { + +#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too." + if (argc < 2) { + printf("Usage: %s <# spare ranks> \n", *argv); + exit(0); + } + + int old_world_size, new_world_size = - 1; + int old_rank = 1, new_rank = - 1; + int spare_ranks = atoi(argv[1]); + int buffer; + + MPI_Init(&argc, &argv); + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Comm world_comm; + MPI_Comm_dup(MPI_COMM_WORLD, &world_comm); + MPI_Comm_size(world_comm, &old_world_size); + MPI_Comm_rank(world_comm, &old_rank); + + MPI_Info info; + MPI_Info_create(&info); + MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); + + int fenix_status; + int recovered = 0; + MPI_Comm new_comm; + int error; + Fenix_Init(&fenix_status, world_comm, &new_comm, &argc, &argv, spare_ranks, 0, info, &error); + + MPI_Comm_size(new_comm, &new_world_size); + MPI_Comm_rank(new_comm, &new_rank); + + if (old_rank == kKillID) { + assert(fenix_status == FENIX_ROLE_INITIAL_RANK); + pid_t pid = getpid(); + kill(pid, SIGTERM); + } + + if(new_rank == kKillID) { + assert(fenix_status == FENIX_ROLE_RECOVERED_RANK); + int sval = 33; + MPI_Send(&sval, 1, MPI_INT, kKillID-1, 1, new_comm); + } + else if(new_rank == kKillID-1) { + assert(fenix_status == FENIX_ROLE_INITIAL_RANK); + int rval = 44; + MPI_Status status; + MPI_Recv(&rval, 1, MPI_INT, kKillID, 1, new_comm, &status); + + assert(fenix_status == FENIX_ROLE_SURVIVOR_RANK); + assert(rval == 44); + printf("Rank %d did not receive new value. old value is %d\n", new_rank, rval); + + MPI_Recv(&rval, 1, MPI_INT, kKillID, 1, new_comm, &status); + assert(rval == 33); + printf("Rank %d received new value %d\n", new_rank, rval); + } + else { + assert(fenix_status == FENIX_ROLE_INITIAL_RANK); + MPI_Barrier(new_comm); + assert(fenix_status == FENIX_ROLE_SURVIVOR_RANK); + } + + MPI_Barrier(new_comm); + + Fenix_Finalize(); + MPI_Finalize(); + + return 0; +} + From 59eec3cce5aa7137e1ddc6ff6dccea57da04d472 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 06:45:20 -0700 Subject: [PATCH 10/32] small test --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index ec3ddc5..63fa11b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,14 +14,12 @@ before_install: - cd .travis_helpers - source ./fetchULFMmpi.sh - cd ../ #Always end back at the root directory +install: + - mkdir build && cd build + - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 + - cd ../ #Always end back at the root directory script: - - cd .travis_helpers - - source fetchULFMmpi.sh #Just updates path if ULFM was built properly in before_install - - cd ../ - - mkdir build - cd build - - cmake ../ -DBUILD_TESTING=ON - - make -j4 VERBOSE=1 - make test - cd ../ #Always end back at the root directory. after_failure: From ce88c104ac5a4cdc52dc67b757427af7408cf83e Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:10:49 -0700 Subject: [PATCH 11/32] trying to simplify --- .travis.yml | 45 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 63fa11b..34c1e18 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,11 +9,46 @@ addons: - valgrind cache: directories: - - .travis_helpers/ulfm-install -before_install: - - cd .travis_helpers - - source ./fetchULFMmpi.sh - - cd ../ #Always end back at the root directory + - ./ulfm-install +setup_ulfm: + - echo "Configuring ULFM" + - if [ -f ulfm-install/lib/libmpi.so ]; then + echo "libmpich.so found -- nothing to build."; + cd ulfm-install; + else + ROOT=`pwd`; + echo "Downloading ULFM from repo"; + wget https://bitbucket.org/icldistcomp/ulfm2/get/ulfm2.0rc.tar.bz2; + tar -xjf ulfm2.0rc.tar.bz2; + mv icldist* ulfm-src/; + echo " - Configuring and building ULFM."; + cd ulfm-src; + echo " - Running autogen.pl"; + ./autogen.pl >./ulfm_build_output.txt 2>&1; + echo " - Running configure"; + ./configure --prefix=$ROOT/ulfm-install >>./ulfm_build_output.txt 2>&1; + echo " - Running make"; + make -j4 >>./ulfm_build_output.txt 2>&1; + echo " - Running make install"; + make install >>./ulfm_build_output.txt 2>&1; + echo " - Finished installing ULFM"; + cd ../ulfm-install/; + fi + + #Expect that any changes to the above still puts me in the install's home dir + - export MPI_HOME=`pwd` + - export PATH=$MPI_HOME/bin/:$PATH + - export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH + - export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH + - export MANPATH=$MPI_HOME/share/man:$MANPATH + + - export MPICC="`which mpicc`" + - export MPICXX="`which mpic++`" + + - export OMPI_MCA_rmaps_base_oversubscribe "1" + + #Assuming the install's home dir is one above current. + - cd ../ install: - mkdir build && cd build - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 From 428a4d8c50a7dd4a3cbb466ebd1d2ee5218cf522 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:16:53 -0700 Subject: [PATCH 12/32] small fix --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 34c1e18..f4f9d1c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,7 +10,7 @@ addons: cache: directories: - ./ulfm-install -setup_ulfm: +before_install: - echo "Configuring ULFM" - if [ -f ulfm-install/lib/libmpi.so ]; then echo "libmpich.so found -- nothing to build."; From 1c24eed2ed93beb3eb790b684dd0f009d2d7bfc4 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:28:19 -0700 Subject: [PATCH 13/32] Fix oversubscribe export --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index f4f9d1c..4a5bd62 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,8 +45,8 @@ before_install: - export MPICC="`which mpicc`" - export MPICXX="`which mpic++`" - - export OMPI_MCA_rmaps_base_oversubscribe "1" - + - export OMPI_MCA_rmaps_base_oversubscribe=1 + #Assuming the install's home dir is one above current. - cd ../ install: From 52f380b96d5eba120708caecf33e9991a45ade98 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:40:38 -0700 Subject: [PATCH 14/32] Test working dir change requirements --- .travis.yml | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 4a5bd62..59e587e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,11 +9,11 @@ addons: - valgrind cache: directories: - - ./ulfm-install + - ulfm-install before_install: - echo "Configuring ULFM" - if [ -f ulfm-install/lib/libmpi.so ]; then - echo "libmpich.so found -- nothing to build."; + echo "libmpi.so found -- nothing to build."; cd ulfm-install; else ROOT=`pwd`; @@ -46,17 +46,11 @@ before_install: - export MPICXX="`which mpic++`" - export OMPI_MCA_rmaps_base_oversubscribe=1 - - #Assuming the install's home dir is one above current. - - cd ../ install: - mkdir build && cd build - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 - - cd ../ #Always end back at the root directory script: - - cd build - - make test - - cd ../ #Always end back at the root directory. + - cd build && make test after_failure: - echo "Failure occured, printing run logs:" - pwd From 2ea13a72e1f07350678a3aa55cf6bb01eb8af92e Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:44:58 -0700 Subject: [PATCH 15/32] test failure --- .travis.yml | 5 ++++- .travis_helpers/fetchULFMmpi.sh | 37 --------------------------------- 2 files changed, 4 insertions(+), 38 deletions(-) delete mode 100644 .travis_helpers/fetchULFMmpi.sh diff --git a/.travis.yml b/.travis.yml index 59e587e..3ca035c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,12 +45,15 @@ before_install: - export MPICC="`which mpicc`" - export MPICXX="`which mpic++`" - - export OMPI_MCA_rmaps_base_oversubscribe=1 + # export OMPI_MCA_rmaps_base_oversubscribe=1 + - cd ../ #Always end back at root install: - mkdir build && cd build - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 + - cd ../ #Always end back at root script: - cd build && make test + - cd ../ #Always end back at root after_failure: - echo "Failure occured, printing run logs:" - pwd diff --git a/.travis_helpers/fetchULFMmpi.sh b/.travis_helpers/fetchULFMmpi.sh deleted file mode 100644 index a5611cc..0000000 --- a/.travis_helpers/fetchULFMmpi.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -if [ -f ulfm-install/lib/libmpi.so ]; then - echo "libmpich.so found -- nothing to build." - cd ulfm-install -else - ROOT=`pwd` - echo "Downloading ULFM from repo" - wget https://bitbucket.org/icldistcomp/ulfm2/get/ulfm2.0rc.tar.bz2 - tar -xjf ulfm2.0rc.tar.bz2 - mv icldist* ulfm-src/ - echo " - Configuring and building ULFM." - cd ulfm-src - echo " - Running autogen.pl" - ./autogen.pl > ../build_output.txt - echo " - Running configure" - ./configure --prefix=$ROOT/ulfm-install >> ../build_output.txt - echo " - Running make" - make -j4 >> ../build_output.txt - echo " - Running make install" - make install >> ../build_output.txt - echo " - Finished installing ULFM" - cd ../ulfm-install/ -fi - -#Expect that any changes to the above still puts me in the install's home dir -export MPI_HOME=`pwd` -export PATH=$MPI_HOME/bin/:$PATH -export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH -export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH -export MANPATH=$MPI_HOME/share/man:$MANPATH - -export MPICC="`which mpicc`" -export MPICXX="`which mpic++`" - -#Assuming the install's home dir is one above current. -cd ../ From 0911534b21e2e77c50a0c171b966c1789f02b737 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 07:50:59 -0700 Subject: [PATCH 16/32] Finalize script for now --- .travis.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3ca035c..e48b9ec 100644 --- a/.travis.yml +++ b/.travis.yml @@ -44,8 +44,9 @@ before_install: - export MPICC="`which mpicc`" - export MPICXX="`which mpic++`" - - # export OMPI_MCA_rmaps_base_oversubscribe=1 + + #Allow oversubscription for tests, since we're potentially single core + - export OMPI_MCA_rmaps_base_oversubscribe=1 - cd ../ #Always end back at root install: - mkdir build && cd build @@ -59,4 +60,4 @@ after_failure: - pwd - cat build/Testing/Temporary/LastTest.log - echo "Printing ULFM build log tail. If no output, ULFM was built before this test run" - - tail -n100 .travis_helpers/build_output.txt + - tail -n100 /build_output.txt From 8e74165ab3bafb983a33a47b1a003a9bc064ffab Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:01:27 -0700 Subject: [PATCH 17/32] Print test logs on success, for verifying --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e48b9ec..d7df086 100644 --- a/.travis.yml +++ b/.travis.yml @@ -55,9 +55,11 @@ install: script: - cd build && make test - cd ../ #Always end back at root +after_success: + - echo "Success, printing run logs:" + - cat build/Testing/Temporary/LastTest.log after_failure: - echo "Failure occured, printing run logs:" - - pwd - cat build/Testing/Temporary/LastTest.log - echo "Printing ULFM build log tail. If no output, ULFM was built before this test run" - tail -n100 /build_output.txt From 3b586ae33a8464a4847bc9342228e4c52f520f4d Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:03:17 -0700 Subject: [PATCH 18/32] Switch to pulling from ULFM master branch when rebuilding ULFM --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index d7df086..d1fbc2f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,9 +18,7 @@ before_install: else ROOT=`pwd`; echo "Downloading ULFM from repo"; - wget https://bitbucket.org/icldistcomp/ulfm2/get/ulfm2.0rc.tar.bz2; - tar -xjf ulfm2.0rc.tar.bz2; - mv icldist* ulfm-src/; + git clone https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/; echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; From 59c05f070f91a723ad4e493bbafff11d1b4ffa85 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:13:24 -0700 Subject: [PATCH 19/32] print ULFM install info if building fails --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index d1fbc2f..c86339e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,7 @@ before_install: - cd ../ #Always end back at root install: - mkdir build && cd build - - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 + - cmake ../ -DBUILD_TESTING=ON && {make -j4 VERBOSE=1 || {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt}} - cd ../ #Always end back at root script: - cd build && make test @@ -59,5 +59,3 @@ after_success: after_failure: - echo "Failure occured, printing run logs:" - cat build/Testing/Temporary/LastTest.log - - echo "Printing ULFM build log tail. If no output, ULFM was built before this test run" - - tail -n100 /build_output.txt From 5002bed837e11b7ee18eb1eea523239f67efde3d Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:19:23 -0700 Subject: [PATCH 20/32] Fix brackets in build --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c86339e..b1b9486 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,8 @@ before_install: - cd ../ #Always end back at root install: - mkdir build && cd build - - cmake ../ -DBUILD_TESTING=ON && {make -j4 VERBOSE=1 || {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt}} + - ((cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1)) || + ((echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt)) - cd ../ #Always end back at root script: - cd build && make test From bced4047c6959b4dd6f4f2de13a43a38a21d45b7 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:25:03 -0700 Subject: [PATCH 21/32] small test --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index b1b9486..61e696f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ before_install: echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; - ./autogen.pl >./ulfm_build_output.txt 2>&1; + ./autogen.pl >./ulfm_build_output.txt; echo " - Running configure"; ./configure --prefix=$ROOT/ulfm-install >>./ulfm_build_output.txt 2>&1; echo " - Running make"; From c581110879668b1ed9472175ff1971f41e17148a Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:31:56 -0700 Subject: [PATCH 22/32] Try another fix --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 61e696f..67a106c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,7 @@ before_install: - cd ../ #Always end back at root install: - mkdir build && cd build - - ((cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1)) || + - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 || ((echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt)) - cd ../ #Always end back at root script: From 1a43a8d5f044cdb980fde594d96d93a20c42d717 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:38:31 -0700 Subject: [PATCH 23/32] Trying a new OOP fix --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 67a106c..3bce706 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ before_install: echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; - ./autogen.pl >./ulfm_build_output.txt; + ./autogen.pl; echo " - Running configure"; ./configure --prefix=$ROOT/ulfm-install >>./ulfm_build_output.txt 2>&1; echo " - Running make"; @@ -48,8 +48,8 @@ before_install: - cd ../ #Always end back at root install: - mkdir build && cd build - - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 || - ((echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt)) + - {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1} || + {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt} - cd ../ #Always end back at root script: - cd build && make test From 9b32adae598eb0539890110562d5ea9b1245c55c Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:41:55 -0700 Subject: [PATCH 24/32] another attempt --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3bce706..8fb9a7a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -47,8 +47,8 @@ before_install: - export OMPI_MCA_rmaps_base_oversubscribe=1 - cd ../ #Always end back at root install: - - mkdir build && cd build - - {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1} || + - mkdir build + - cd build; {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1} || {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt} - cd ../ #Always end back at root script: From df19c0c290897575098bdc8d3ecd803c8a0a22ae Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:43:25 -0700 Subject: [PATCH 25/32] Add semicolon to {} sections --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8fb9a7a..8f55579 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,8 +48,8 @@ before_install: - cd ../ #Always end back at root install: - mkdir build - - cd build; {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1} || - {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt} + - cd build; {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1;} || + {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt;} - cd ../ #Always end back at root script: - cd build && make test From 7410b9a4416467ba4fd7e798883231448fd2bf6d Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:44:39 -0700 Subject: [PATCH 26/32] Braces must be separate from commands by spaces --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8f55579..f746ee4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,8 +48,8 @@ before_install: - cd ../ #Always end back at root install: - mkdir build - - cd build; {cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1;} || - {echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt;} + - cd build; { cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1; } || + { echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt; } - cd ../ #Always end back at root script: - cd build && make test From b096f630e9b83a4e62aae994761888842286acd6 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:52:21 -0700 Subject: [PATCH 27/32] Try to fix travis not showing logs --- .travis.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index f746ee4..f132a21 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,7 @@ before_install: else ROOT=`pwd`; echo "Downloading ULFM from repo"; - git clone https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/; + git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/; echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; @@ -44,12 +44,13 @@ before_install: - export MPICXX="`which mpic++`" #Allow oversubscription for tests, since we're potentially single core - - export OMPI_MCA_rmaps_base_oversubscribe=1 + - export OMPI_MCA_rmaps_base_oversubscribe=1i + + - tail -n100 ./ulfm_build_output.txt - cd ../ #Always end back at root install: - - mkdir build - - cd build; { cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1; } || - { echo "Printing ULFM build log tail. If no output, ULFM was built before this test run."; tail -n100 ../build_output.txt; } + - mkdir build && cd build + - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 - cd ../ #Always end back at root script: - cd build && make test From 74a18bd97652a8a63e561484c0811ccab5a9235a Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 08:55:09 -0700 Subject: [PATCH 28/32] Fix autogen.pl output to file --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f132a21..3d25701 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,7 @@ before_install: echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; - ./autogen.pl; + ./autogen.pl >./ulfm_build_output.txt 2>&1; echo " - Running configure"; ./configure --prefix=$ROOT/ulfm-install >>./ulfm_build_output.txt 2>&1; echo " - Running make"; From 2f5829b968fd88547e6bc48d711b9e8c79aa41b0 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 09:08:42 -0700 Subject: [PATCH 29/32] Fix ulfm install log --- .travis.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3d25701..d14401b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,18 +17,19 @@ before_install: cd ulfm-install; else ROOT=`pwd`; + mkdir ulfm-install; echo "Downloading ULFM from repo"; git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/; echo " - Configuring and building ULFM."; cd ulfm-src; echo " - Running autogen.pl"; - ./autogen.pl >./ulfm_build_output.txt 2>&1; + ./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1; echo " - Running configure"; - ./configure --prefix=$ROOT/ulfm-install >>./ulfm_build_output.txt 2>&1; + ./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1; echo " - Running make"; - make -j4 >>./ulfm_build_output.txt 2>&1; + make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1; echo " - Running make install"; - make install >>./ulfm_build_output.txt 2>&1; + make install >>../ulfm-install/ulfm_build_output.txt 2>&1; echo " - Finished installing ULFM"; cd ../ulfm-install/; fi @@ -46,7 +47,7 @@ before_install: #Allow oversubscription for tests, since we're potentially single core - export OMPI_MCA_rmaps_base_oversubscribe=1i - - tail -n100 ./ulfm_build_output.txt + - tail -n50 ./ulfm_build_output.txt - cd ../ #Always end back at root install: - mkdir build && cd build From 37ea2545ab53c0472aa82387402d806e5b3854a8 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 09:23:32 -0700 Subject: [PATCH 30/32] Fix trailing i from vim --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d14401b..ed20ef1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ before_install: - export MPICXX="`which mpic++`" #Allow oversubscription for tests, since we're potentially single core - - export OMPI_MCA_rmaps_base_oversubscribe=1i + - export OMPI_MCA_rmaps_base_oversubscribe=1 - tail -n50 ./ulfm_build_output.txt - cd ../ #Always end back at root From f84d62d2503112525970c1e158de6a8ec205fe71 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 09:40:08 -0700 Subject: [PATCH 31/32] Update tests for speed, and remove hard-set --oversubscribe --- examples/01_hello_world/fenix/CMakeLists.txt | 2 +- examples/02_send_recv/fenix/CMakeLists.txt | 2 +- examples/05_subset_create/CMakeLists.txt | 2 +- examples/06_subset_createv/CMakeLists.txt | 2 +- test/no_jump/CMakeLists.txt | 2 +- test/request_cancelled/CMakeLists.txt | 2 +- test/request_tracking/CMakeLists.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/01_hello_world/fenix/CMakeLists.txt b/examples/01_hello_world/fenix/CMakeLists.txt index 74bae8c..df8d7a1 100644 --- a/examples/01_hello_world/fenix/CMakeLists.txt +++ b/examples/01_hello_world/fenix/CMakeLists.txt @@ -16,5 +16,5 @@ if(BUILD_TESTING) add_executable(fenix_hello_world-debug fenix_hello_world.c) target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME hello_world - COMMAND mpirun -np 3 fenix_hello_world-debug "1") + COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1") endif() diff --git a/examples/02_send_recv/fenix/CMakeLists.txt b/examples/02_send_recv/fenix/CMakeLists.txt index a664f82..78b07d5 100644 --- a/examples/02_send_recv/fenix/CMakeLists.txt +++ b/examples/02_send_recv/fenix/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_ring-debug fenix_ring.c) target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME ring - COMMAND mpirun -np 5 fenix_ring-debug 1 2) + COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2) set_tests_properties(ring PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/05_subset_create/CMakeLists.txt b/examples/05_subset_create/CMakeLists.txt index c8d37ee..10d9864 100644 --- a/examples/05_subset_create/CMakeLists.txt +++ b/examples/05_subset_create/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_create-debug subset_create.c) target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_create - COMMAND mpirun -np 5 --oversubscribe fenix_subset_create-debug 1) + COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1) set_tests_properties(subset_create PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/06_subset_createv/CMakeLists.txt b/examples/06_subset_createv/CMakeLists.txt index 0cc4a5a..72112eb 100644 --- a/examples/06_subset_createv/CMakeLists.txt +++ b/examples/06_subset_createv/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_createv-debug subset_createv.c) target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_createv - COMMAND mpirun -np 5 --oversubscribe fenix_subset_createv-debug 1) + COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1) set_tests_properties(subset_createv PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/test/no_jump/CMakeLists.txt b/test/no_jump/CMakeLists.txt index c31efc0..fb830f5 100644 --- a/test/no_jump/CMakeLists.txt +++ b/test/no_jump/CMakeLists.txt @@ -12,4 +12,4 @@ set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_no_jump_test fenix_no_jump_test.c) target_link_libraries(fenix_no_jump_test fenix ${MPI_C_LIBRARIES}) -add_test(NAME no_jump COMMAND mpirun -np 5 fenix_no_jump_test "1") +add_test(NAME no_jump COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_no_jump_test "1") diff --git a/test/request_cancelled/CMakeLists.txt b/test/request_cancelled/CMakeLists.txt index b106e97..88af22b 100644 --- a/test/request_cancelled/CMakeLists.txt +++ b/test/request_cancelled/CMakeLists.txt @@ -12,4 +12,4 @@ set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_request_cancelled_test fenix_req_cancelled_test.c) target_link_libraries(fenix_request_cancelled_test fenix ${MPI_C_LIBRARIES}) -add_test(NAME request_cancelled COMMAND mpirun -np 5 fenix_request_cancelled_test "1") +add_test(NAME request_cancelled COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_request_cancelled_test "1") diff --git a/test/request_tracking/CMakeLists.txt b/test/request_tracking/CMakeLists.txt index 9dc93df..c8269b2 100644 --- a/test/request_tracking/CMakeLists.txt +++ b/test/request_tracking/CMakeLists.txt @@ -13,4 +13,4 @@ add_executable(fenix_request_tracking_test fenix_request_tracking_test.c) target_link_libraries(fenix_request_tracking_test fenix ${MPI_C_LIBRARIES}) add_test(NAME request_tracking - COMMAND mpirun --oversubscribe -np 3 fenix_request_tracking_test) + COMMAND mpirun -np 3 fenix_request_tracking_test) From d55030b1b8e7c3860071903063ae3b442c50b843 Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Mon, 29 Jun 2020 09:43:24 -0700 Subject: [PATCH 32/32] Simplify travis a bit more --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index ed20ef1..e292727 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,17 +48,15 @@ before_install: - export OMPI_MCA_rmaps_base_oversubscribe=1 - tail -n50 ./ulfm_build_output.txt - - cd ../ #Always end back at root + - cd ../ #End back at root install: - mkdir build && cd build - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 - - cd ../ #Always end back at root script: - - cd build && make test - - cd ../ #Always end back at root + - make test after_success: - echo "Success, printing run logs:" - - cat build/Testing/Temporary/LastTest.log + - cat Testing/Temporary/LastTest.log after_failure: - echo "Failure occured, printing run logs:" - - cat build/Testing/Temporary/LastTest.log + - cat Testing/Temporary/LastTest.log