diff --git a/.travis.yml b/.travis.yml index e292727..ec3ddc5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,54 +9,24 @@ addons: - valgrind cache: directories: - - ulfm-install + - .travis_helpers/ulfm-install before_install: - - echo "Configuring ULFM" - - if [ -f ulfm-install/lib/libmpi.so ]; then - echo "libmpi.so found -- nothing to build."; - cd ulfm-install; - else - ROOT=`pwd`; - mkdir ulfm-install; - echo "Downloading ULFM from repo"; - git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/; - echo " - Configuring and building ULFM."; - cd ulfm-src; - echo " - Running autogen.pl"; - ./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running configure"; - ./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running make"; - make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Running make install"; - make install >>../ulfm-install/ulfm_build_output.txt 2>&1; - echo " - Finished installing ULFM"; - cd ../ulfm-install/; - fi - - #Expect that any changes to the above still puts me in the install's home dir - - export MPI_HOME=`pwd` - - export PATH=$MPI_HOME/bin/:$PATH - - export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH - - export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH - - export MANPATH=$MPI_HOME/share/man:$MANPATH - - - export MPICC="`which mpicc`" - - export MPICXX="`which mpic++`" - - #Allow oversubscription for tests, since we're potentially single core - - export OMPI_MCA_rmaps_base_oversubscribe=1 - - - tail -n50 ./ulfm_build_output.txt - - cd ../ #End back at root -install: - - mkdir build && cd build - - cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1 + - cd .travis_helpers + - source ./fetchULFMmpi.sh + - cd ../ #Always end back at the root directory script: + - cd .travis_helpers + - source fetchULFMmpi.sh #Just updates path if ULFM was built properly in before_install + - cd ../ + - mkdir build + - cd build + - cmake ../ -DBUILD_TESTING=ON + - make -j4 VERBOSE=1 - make test -after_success: - - echo "Success, printing run logs:" - - cat Testing/Temporary/LastTest.log + - cd ../ #Always end back at the root directory. after_failure: - echo "Failure occured, printing run logs:" - - cat Testing/Temporary/LastTest.log + - pwd + - cat build/Testing/Temporary/LastTest.log + - echo "Printing ULFM build log tail. If no output, ULFM was built before this test run" + - tail -n100 .travis_helpers/build_output.txt diff --git a/.travis_helpers/fetchULFMmpi.sh b/.travis_helpers/fetchULFMmpi.sh new file mode 100644 index 0000000..a5611cc --- /dev/null +++ b/.travis_helpers/fetchULFMmpi.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ -f ulfm-install/lib/libmpi.so ]; then + echo "libmpich.so found -- nothing to build." + cd ulfm-install +else + ROOT=`pwd` + echo "Downloading ULFM from repo" + wget https://bitbucket.org/icldistcomp/ulfm2/get/ulfm2.0rc.tar.bz2 + tar -xjf ulfm2.0rc.tar.bz2 + mv icldist* ulfm-src/ + echo " - Configuring and building ULFM." + cd ulfm-src + echo " - Running autogen.pl" + ./autogen.pl > ../build_output.txt + echo " - Running configure" + ./configure --prefix=$ROOT/ulfm-install >> ../build_output.txt + echo " - Running make" + make -j4 >> ../build_output.txt + echo " - Running make install" + make install >> ../build_output.txt + echo " - Finished installing ULFM" + cd ../ulfm-install/ +fi + +#Expect that any changes to the above still puts me in the install's home dir +export MPI_HOME=`pwd` +export PATH=$MPI_HOME/bin/:$PATH +export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH +export MANPATH=$MPI_HOME/share/man:$MANPATH + +export MPICC="`which mpicc`" +export MPICXX="`which mpic++`" + +#Assuming the install's home dir is one above current. +cd ../ diff --git a/CMakeLists.txt b/CMakeLists.txt index 38fc5d9..e90822f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,8 +25,8 @@ set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -#set(CMAKE_BUILD_TYPE Release) -set(CMAKE_BUILD_TYPE Debug) +set(CMAKE_BUILD_TYPE Release) +#set(CMAKE_BUILD_TYPE Debug) #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb") #ENABLE_TESTING @@ -109,6 +109,4 @@ if(BUILD_TESTING) add_subdirectory(test/subset_internal) add_subdirectory(test/subset_merging) add_subdirectory(test/request_tracking) - add_subdirectory(test/request_cancelled) - add_subdirectory(test/no_jump) endif() diff --git a/examples/01_hello_world/fenix/CMakeLists.txt b/examples/01_hello_world/fenix/CMakeLists.txt index df8d7a1..22658d2 100644 --- a/examples/01_hello_world/fenix/CMakeLists.txt +++ b/examples/01_hello_world/fenix/CMakeLists.txt @@ -16,5 +16,5 @@ if(BUILD_TESTING) add_executable(fenix_hello_world-debug fenix_hello_world.c) target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME hello_world - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1") + COMMAND mpirun --oversubscribe -np 3 fenix_hello_world-debug "1") endif() diff --git a/examples/01_hello_world/fenix/fenix_hello_world.c b/examples/01_hello_world/fenix/fenix_hello_world.c index cd6378b..374a80f 100644 --- a/examples/01_hello_world/fenix/fenix_hello_world.c +++ b/examples/01_hello_world/fenix/fenix_hello_world.c @@ -108,19 +108,6 @@ int main(int argc, char **argv) { printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n", processor_name, old_rank, new_rank, new_world_size, old_world_size); - - int *fails, num_fails; - num_fails = Fenix_Process_fail_list(&fails); - - char fails_str[100]; - sprintf(fails_str, "Rank %d sees failed processes [", new_rank); - for(int i = 0; i < num_fails; i++){ - sprintf(fails_str, "%s%s%d", fails_str, (i==0 ? "" : ", "), fails[i]); - } - sprintf(fails_str, "%s]\n", fails_str); - printf(fails_str); - - Fenix_Finalize(); MPI_Finalize(); diff --git a/examples/02_send_recv/fenix/CMakeLists.txt b/examples/02_send_recv/fenix/CMakeLists.txt index 78b07d5..f3f197f 100644 --- a/examples/02_send_recv/fenix/CMakeLists.txt +++ b/examples/02_send_recv/fenix/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_ring-debug fenix_ring.c) target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME ring - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2) + COMMAND mpirun --oversubscribe -np 5 fenix_ring-debug 1 2) set_tests_properties(ring PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/05_subset_create/CMakeLists.txt b/examples/05_subset_create/CMakeLists.txt index 10d9864..c8d37ee 100644 --- a/examples/05_subset_create/CMakeLists.txt +++ b/examples/05_subset_create/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_create-debug subset_create.c) target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_create - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1) + COMMAND mpirun -np 5 --oversubscribe fenix_subset_create-debug 1) set_tests_properties(subset_create PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/06_subset_createv/CMakeLists.txt b/examples/06_subset_createv/CMakeLists.txt index 72112eb..0cc4a5a 100644 --- a/examples/06_subset_createv/CMakeLists.txt +++ b/examples/06_subset_createv/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_createv-debug subset_createv.c) target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_createv - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1) + COMMAND mpirun -np 5 --oversubscribe fenix_subset_createv-debug 1) set_tests_properties(subset_createv PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/include/fenix.h b/include/fenix.h index 81cb8cd..94d1130 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -88,7 +88,6 @@ extern "C" { #define FENIX_ERROR_SUBSET_STRIDE -25 #define FENIX_ERROR_NODATA_FOUND -30 #define FENIX_ERROR_INTERN -40 -#define FENIX_ERROR_CANCELLED -50 #define FENIX_WARNING_SPARE_RANKS_DEPLETED 100 #define FENIX_WARNING_PARTIAL_RESTORE 101 @@ -217,8 +216,6 @@ int Fenix_Data_group_delete(int group_id); int Fenix_Data_member_delete(int group_id, int member_id); -int Fenix_Process_fail_list(int** fail_list); - #if defined(c_plusplus) || defined(__cplusplus) } #endif diff --git a/include/fenix_ext.h b/include/fenix_ext.h index 0956b81..9e92454 100644 --- a/include/fenix_ext.h +++ b/include/fenix_ext.h @@ -81,13 +81,6 @@ typedef struct { int role; // Role of rank: initial, survivor or repair int fenix_init_flag; - int fail_world_size; - int* fail_world; - - //Save the pointer to role and error of Fenix_Init - int *ret_role; - int *ret_error; - fenix_request_store_t request_store; fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions diff --git a/src/fenix.c b/src/fenix.c index b5865eb..022ec1d 100644 --- a/src/fenix.c +++ b/src/fenix.c @@ -181,8 +181,3 @@ int Fenix_Data_group_delete(int group_id) { int Fenix_Data_member_delete(int group_id, int member_id) { return __fenix_member_delete(group_id, member_id); } - -int Fenix_Process_fail_list(int** fail_list){ - *fail_list = fenix.fail_world; - return fenix.fail_world_size; -} diff --git a/src/fenix_data_recovery.c b/src/fenix_data_recovery.c index ab6c473..56778b9 100644 --- a/src/fenix_data_recovery.c +++ b/src/fenix_data_recovery.c @@ -103,7 +103,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, /* If so, recover the data and set the recovery */ /* for member recovery. */ - int i; + int i, group_position; int remote_need_recovery; fenix_group_t *group; MPI_Status status; @@ -149,7 +149,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, } else { /* Already created. Renew the MPI communicator */ - group = ( data_recovery->group[group_index] ); + group = ( data_recovery->group[group_position] ); group->comm = comm; /* Renew communicator */ MPI_Comm_rank(comm, &(group->current_rank)); diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index 7e686e5..559230c 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -238,9 +238,7 @@ int MPI_Sendrecv(MPI_CONST_TYPE void* sendbuf, int sendcount, static inline void __fenix_override_request(int ret, MPI_Request *request) { - if(ret != MPI_SUCCESS) { - return; - } + if(ret != MPI_SUCCESS) return; assert(*request != MPI_REQUEST_NULL); @@ -267,7 +265,6 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int ret; ret = PMPI_Irecv(buf, count, datatype, source, tag, __fenix_replace_comm(comm), request); - __fenix_override_request(ret, request); __fenix_test_MPI_inline(ret, "MPI_Irecv"); return ret; @@ -275,47 +272,21 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status) { - int ret, is_cancelled = 1; + int ret; MPI_Request request = MPI_REQUEST_NULL; - if(*fenix_request != MPI_REQUEST_NULL){ - if(*fenix_request == FENIX_REQUEST_CANCELLED){ - is_cancelled = 1; - } else { - int retval = - __fenix_request_store_get(&fenix.request_store, *((int*)fenix_request), &request); - - if(retval == FENIX_ERROR_CANCELLED) { - is_cancelled = 1; - } - - if(retval == FENIX_REQUEST_COMPLETED){ - if(status != MPI_STATUS_IGNORE) - __fenix_request_store_get_status(&fenix.request_store, *((int*)fenix_request), status); - *fenix_request = MPI_REQUEST_NULL; - return; - } - } - } + if(*fenix_request != MPI_REQUEST_NULL) + __fenix_request_store_get(&fenix.request_store, + *((int *) fenix_request), + &request); ret = PMPI_Wait(&request, status); - - if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL) && (*fenix_request != FENIX_REQUEST_CANCELLED)) { + if(ret == MPI_SUCCESS) { __fenix_request_store_remove(&fenix.request_store, *((int *) fenix_request)); assert(request == MPI_REQUEST_NULL); - *fenix_request = MPI_REQUEST_NULL; - } - if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){ - __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), status); - *fenix_request = FENIX_REQUEST_CANCELLED; + *fenix_request = MPI_REQUEST_NULL; } __fenix_test_MPI_inline(ret, "MPI_Wait"); - - - if(is_cancelled){ - *fenix_request = FENIX_REQUEST_CANCELLED; - return FENIX_ERROR_CANCELLED; - } return ret; } @@ -326,13 +297,11 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[], { // The list (array_of_requests) may contain null or inactive handles. int ret, i; - for(i=0 ; i= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - if (rank_offset < fenix.fail_world_size) { + if (rank_offset < fail_world_size) { if (fenix.options.verbose == 11) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fenix.fail_world[rank_offset]); + current_rank, fail_world[rank_offset]); } - current_rank = fenix.fail_world[rank_offset]; + current_rank = fail_world[rank_offset]; } } + free(fail_world); + /************************************/ /* Update the number of spare ranks */ /************************************/ @@ -492,20 +488,16 @@ int __fenix_repair_ranks() } fenix.num_inital_ranks = 0; - fenix.num_recovered_ranks = fenix.fail_world_size; - - if(fenix.role != FENIX_ROLE_INITIAL_RANK){ - free(fenix.fail_world); - } + fenix.num_recovered_ranks = fail_world_size; - fenix.fail_world = (int *) s_malloc(fenix.fail_world_size * sizeof(int)); - fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fenix.fail_world_size); + fail_world = (int *) s_malloc(fail_world_size * sizeof(int)); + fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fail_world_size); free(survivor_world); if (fenix.options.verbose == 2) { int index; - for (index = 0; index < fenix.fail_world_size; index++) { - verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]); + for (index = 0; index < fail_world_size; index++) { + verbose_print("fail_world[%d]: %d\n", index, fail_world[index]); } } @@ -518,19 +510,21 @@ int __fenix_repair_ranks() if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - if (rank_offset < fenix.fail_world_size) { + if (rank_offset < fail_world_size) { if (fenix.options.verbose == 2) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fenix.fail_world[rank_offset]); + current_rank, fail_world[rank_offset]); } - current_rank = fenix.fail_world[rank_offset]; + current_rank = fail_world[rank_offset]; } } + free(fail_world); + /************************************/ /* Update the number of spare ranks */ /************************************/ - fenix.spare_ranks = fenix.spare_ranks - fenix.fail_world_size; + fenix.spare_ranks = fenix.spare_ranks - fail_world_size; if (fenix.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, spare_ranks: %d\n", __fenix_get_current_rank(*fenix.world), fenix.role, @@ -701,10 +695,6 @@ void __fenix_finalize() free( fenix.world ); free( fenix.new_world ); - if(fenix.role != FENIX_ROLE_INITIAL_RANK){ - free(fenix.fail_world); - } - /* Free Callbacks */ __fenix_callback_destroy( fenix.callback_list ); @@ -763,7 +753,6 @@ void __fenix_test_MPI(int ret, const char *msg) } __fenix_request_store_waitall_removeall(&fenix.request_store); - __fenix_comm_list_destroy(); @@ -790,19 +779,6 @@ void __fenix_test_MPI(int ret, const char *msg) } fenix.role = FENIX_ROLE_SURVIVOR_RANK; - if(!fenix.finalized) { - switch(fenix.resume_mode) { - case __FENIX_RESUME_AT_INIT: - longjmp(*fenix.recover_environment, 1); - break; - case __FENIX_RESUME_NO_JUMP: - *(fenix.ret_role) = FENIX_ROLE_SURVIVOR_RANK; - __fenix_postinit(fenix.ret_error); - break; - default: - printf("Fenix detected error: Unknown resume mode\n"); - assert(false); - break; - } - } + if(!fenix.finalized) + longjmp(*fenix.recover_environment, 1); } diff --git a/src/fenix_request_store.c b/src/fenix_request_store.c index 2878845..fb222be 100644 --- a/src/fenix_request_store.c +++ b/src/fenix_request_store.c @@ -56,30 +56,26 @@ #include #include "fenix_request_store.h" -#include "fenix_ext.h" void __fenix_request_store_waitall_removeall(fenix_request_store_t *s) { int i; for(i=0 ; ifirst_unused_position ; i++) { __fenix_request_t *f = &(s->reqs.elements[i]); - if(f->valid && !f->cancelled) { + if(f->valid) { #warning "What to do with requests upon failure? Wait or Cancel?" - int rank; - MPI_Comm_rank(*fenix.new_world, &rank); - int flag; - int ret = PMPI_Test(&(f->r), &flag, &(f->status)); - - if(!flag || ret != MPI_SUCCESS){ - //This request wasn't able to finish before the failure - //We cancel it, and notify the user that it was cancelled - //PMPI_Cancel(&(f->r)); - f->cancelled = 1; - } else { - f->completed = 1; - } + PMPI_Cancel(&(f->r)); + if(i == MPI_REQUEST_NULL) // This may look ugly and + // produce a warning, but it is + // necessary to make sure an + // MPI_Request NULL does not + // collide in the request store + __fenix_request_store_remove(s, -123); + else + __fenix_request_store_remove(s, i); } } + s->first_unused_position = 0; __fenix_int_stack_clear(&(s->freed_list)); } diff --git a/src/fenix_request_store.h b/src/fenix_request_store.h index 4d62db7..2c85a9a 100644 --- a/src/fenix_request_store.h +++ b/src/fenix_request_store.h @@ -64,7 +64,6 @@ #include #include "fenix_stack.h" -#include "fenix.h" /* @@ -85,18 +84,12 @@ */ + typedef struct { char valid; - char cancelled; - char completed; //This should only be set if the request was completed - //prior to a failure, before the user could MPI_Test the call - MPI_Status status; //This is as "completed" MPI_Request r; } __fenix_request_t; -#define FENIX_REQUEST_CANCELLED ((MPI_Request)((int)MPI_REQUEST_NULL+1)) -#define FENIX_REQUEST_COMPLETED 180 - #define __fenix_dynamic_array_type __fenix_request_t #define __fenix_dynamic_array_typename req #include "fenix_dynamic_array.h" @@ -122,8 +115,7 @@ void __fenix_request_store_destroy(fenix_request_store_t *s) { int valid_count = 0, i; for(i=0 ; ifirst_unused_position ; i++) - if(s->reqs.elements[i].valid && - !(s->reqs.elements[i].completed || s->reqs.elements[i].cancelled) ) valid_count++; + if(s->reqs.elements[i].valid) valid_count++; if(valid_count > 0) printf("[Fenix warning] __fenix_request_store_destroy. store contains valid elements (valid elems %d, first_unused_pos %d)\n", valid_count, s->first_unused_position); __fenix_req_dynamic_array_destroy(&(s->reqs)); @@ -149,7 +141,6 @@ int __fenix_request_store_add(fenix_request_store_t *s, assert(!f->valid); memcpy(&(f->r), r, sizeof(MPI_Request)); f->valid = 1; - f->cancelled = 0; // Cannot return a position that is equivalent to MPI_REQUEST_NULL MPI_Request r_test; @@ -161,18 +152,11 @@ int __fenix_request_store_add(fenix_request_store_t *s, assert(r_test != MPI_REQUEST_NULL); } } - if(r_test == FENIX_REQUEST_CANCELLED) { - position = -124; - { - *((int *)&r_test) = position; - assert(r_test != FENIX_REQUEST_CANCELLED); - } - } return position; } static inline -int __fenix_request_store_get(fenix_request_store_t *s, +void __fenix_request_store_get(fenix_request_store_t *s, int request_id, MPI_Request *r) { @@ -185,78 +169,10 @@ int __fenix_request_store_get(fenix_request_store_t *s, MPI_Request r_test = MPI_REQUEST_NULL; request_id = *((int*) &r_test); } - if(request_id == -124) { - MPI_Request r_test = FENIX_REQUEST_CANCELLED; - request_id = *((int*) &r_test); - } - MPI_Request r_test; - *((int *)&r_test) = request_id; - if(r_test == FENIX_REQUEST_CANCELLED){ - *r = MPI_REQUEST_NULL; - return FENIX_ERROR_CANCELLED; - } __fenix_request_t *f = &(s->reqs.elements[request_id]); assert(f->valid); memcpy(r, &(f->r), sizeof(MPI_Request)); - - if(f->cancelled) return FENIX_ERROR_CANCELLED; - if(f->completed) return FENIX_REQUEST_COMPLETED; - else return FENIX_SUCCESS; -} - -static inline -int __fenix_request_store_cancel(fenix_request_store_t *s, - int request_id, - MPI_Status *status) -{ - { - MPI_Request r_test; - *((int *)&r_test) = request_id; - assert(r_test != MPI_REQUEST_NULL); - } - if(request_id == -123) { - MPI_Request r_test = MPI_REQUEST_NULL; - request_id = *((int*) &r_test); - } - if(request_id == -124) { - MPI_Request r_test = FENIX_REQUEST_CANCELLED; - request_id = *((int*) &r_test); - } - MPI_Request r_test; - *((int *)&r_test) = request_id; - if(r_test == FENIX_REQUEST_CANCELLED){ - return FENIX_ERROR_CANCELLED; - } - - __fenix_request_t *f = &(s->reqs.elements[request_id]); - assert(f->valid); - f->cancelled = 1; - if(status != NULL && status != MPI_STATUS_IGNORE) f->status = *status; - - return FENIX_SUCCESS; -} - -static inline -void __fenix_request_store_get_status(fenix_request_store_t *s, - int request_id, - MPI_Status *status){ - { - MPI_Request r_test; - *((int *)&r_test) = request_id; - assert(r_test != MPI_REQUEST_NULL); - } - if(request_id == -123) { - MPI_Request r_test = MPI_REQUEST_NULL; - request_id = *((int*) &r_test); - } - if(request_id == -124) { - MPI_Request r_test = FENIX_REQUEST_CANCELLED; - request_id = *((int*) &r_test); - } - - __fenix_request_t *f = &(s->reqs.elements[request_id]); - assert(f->completed); - memcpy(status, &(f->status), sizeof(MPI_Status)); + assert(*r != MPI_REQUEST_NULL); } static inline diff --git a/test/no_jump/CMakeLists.txt b/test/no_jump/CMakeLists.txt deleted file mode 100644 index fb830f5..0000000 --- a/test/no_jump/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# -# This file is part of Fenix -# Copyright (c) 2016 Rutgers University and Sandia Corporation. -# This software is distributed under the BSD License. -# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -# the U.S. Government retains certain rights in this software. -# For more information, see the LICENSE file in the top Fenix -# directory. -# - -set(CMAKE_BUILD_TYPE Debug) -add_executable(fenix_no_jump_test fenix_no_jump_test.c) -target_link_libraries(fenix_no_jump_test fenix ${MPI_C_LIBRARIES}) - -add_test(NAME no_jump COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_no_jump_test "1") diff --git a/test/no_jump/fenix_no_jump_test.c b/test/no_jump/fenix_no_jump_test.c deleted file mode 100644 index a0fea08..0000000 --- a/test/no_jump/fenix_no_jump_test.c +++ /dev/null @@ -1,139 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// -// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| -// _| _| _|_| _| _| _| _| -// _|_|_| _|_|_| _| _| _| _| _| -// _| _| _| _|_| _| _| _| -// _| _|_|_|_| _| _| _|_|_| _| _| -// -// -// -// -// Copyright (C) 2016 Rutgers University and Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Michael Heroux, and Matthew Whitlock -// -// Questions? Contact Keita Teranishi (knteran@sandia.gov) and -// Marc Gamell (mgamell@cac.rutgers.edu) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include -#include -#include - -const int kKillID = 1; - -int main(int argc, char **argv) { - -#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too." - if (argc < 2) { - printf("Usage: %s <# spare ranks> \n", *argv); - exit(0); - } - - int old_world_size, new_world_size = - 1; - int old_rank = 1, new_rank = - 1; - int spare_ranks = atoi(argv[1]); - int buffer; - - MPI_Init(&argc, &argv); - - MPI_Barrier(MPI_COMM_WORLD); - MPI_Comm world_comm; - MPI_Comm_dup(MPI_COMM_WORLD, &world_comm); - MPI_Comm_size(world_comm, &old_world_size); - MPI_Comm_rank(world_comm, &old_rank); - - MPI_Info info; - MPI_Info_create(&info); - MPI_Info_set(info, "FENIX_RESUME_MODE", "NO_JUMP"); - - int fenix_status; - int recovered = 0; - MPI_Comm new_comm; - int error; - Fenix_Init(&fenix_status, world_comm, &new_comm, &argc, &argv, spare_ranks, 0, info, &error); - - MPI_Comm_size(new_comm, &new_world_size); - MPI_Comm_rank(new_comm, &new_rank); - - if (old_rank == kKillID) { - assert(fenix_status == FENIX_ROLE_INITIAL_RANK); - pid_t pid = getpid(); - kill(pid, SIGTERM); - } - - if(new_rank == kKillID) { - assert(fenix_status == FENIX_ROLE_RECOVERED_RANK); - int sval = 33; - MPI_Send(&sval, 1, MPI_INT, kKillID-1, 1, new_comm); - } - else if(new_rank == kKillID-1) { - assert(fenix_status == FENIX_ROLE_INITIAL_RANK); - int rval = 44; - MPI_Status status; - MPI_Recv(&rval, 1, MPI_INT, kKillID, 1, new_comm, &status); - - assert(fenix_status == FENIX_ROLE_SURVIVOR_RANK); - assert(rval == 44); - printf("Rank %d did not receive new value. old value is %d\n", new_rank, rval); - - MPI_Recv(&rval, 1, MPI_INT, kKillID, 1, new_comm, &status); - assert(rval == 33); - printf("Rank %d received new value %d\n", new_rank, rval); - } - else { - assert(fenix_status == FENIX_ROLE_INITIAL_RANK); - MPI_Barrier(new_comm); - assert(fenix_status == FENIX_ROLE_SURVIVOR_RANK); - } - - MPI_Barrier(new_comm); - - Fenix_Finalize(); - MPI_Finalize(); - - return 0; -} - diff --git a/test/request_cancelled/CMakeLists.txt b/test/request_cancelled/CMakeLists.txt deleted file mode 100644 index 88af22b..0000000 --- a/test/request_cancelled/CMakeLists.txt +++ /dev/null @@ -1,15 +0,0 @@ -# -# This file is part of Fenix -# Copyright (c) 2016 Rutgers University and Sandia Corporation. -# This software is distributed under the BSD License. -# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -# the U.S. Government retains certain rights in this software. -# For more information, see the LICENSE file in the top Fenix -# directory. -# - -set(CMAKE_BUILD_TYPE Debug) -add_executable(fenix_request_cancelled_test fenix_req_cancelled_test.c) -target_link_libraries(fenix_request_cancelled_test fenix ${MPI_C_LIBRARIES}) - -add_test(NAME request_cancelled COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_request_cancelled_test "1") diff --git a/test/request_cancelled/fenix_req_cancelled_test.c b/test/request_cancelled/fenix_req_cancelled_test.c deleted file mode 100644 index 480497b..0000000 --- a/test/request_cancelled/fenix_req_cancelled_test.c +++ /dev/null @@ -1,138 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// -// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| -// _| _| _|_| _| _| _| _| -// _|_|_| _|_|_| _| _| _| _| _| -// _| _| _| _|_| _| _| _| -// _| _|_|_|_| _| _| _|_|_| _| _| -// -// -// -// -// Copyright (C) 2016 Rutgers University and Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Michael Heroux, and Matthew Whitlock -// -// Questions? Contact Keita Teranishi (knteran@sandia.gov) and -// Marc Gamell (mgamell@cac.rutgers.edu) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include -#include - -const int kKillID = 1; - -int main(int argc, char **argv) { - -#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too." - if (argc < 2) { - printf("Usage: %s <# spare ranks> \n", *argv); - exit(0); - } - - int old_world_size, new_world_size = - 1; - int old_rank = 1, new_rank = - 1; - int spare_ranks = atoi(argv[1]); - int buffer; - - MPI_Init(&argc, &argv); - - MPI_Barrier(MPI_COMM_WORLD); - MPI_Comm world_comm; - MPI_Comm_dup(MPI_COMM_WORLD, &world_comm); - MPI_Comm_size(world_comm, &old_world_size); - MPI_Comm_rank(world_comm, &old_rank); - - int fenix_status; - int recovered = 0; - MPI_Comm new_comm; - int error; - MPI_Request req = MPI_REQUEST_NULL; - Fenix_Init(&fenix_status, world_comm, &new_comm, &argc, &argv, spare_ranks, 0, MPI_INFO_NULL, &error); - - MPI_Comm_size(new_comm, &new_world_size); - MPI_Comm_rank(new_comm, &new_rank); - - if (fenix_status != FENIX_ROLE_INITIAL_RANK) { - recovered = 1; - } else { - MPI_Irecv(&buffer, 1, MPI_INT, (new_rank+1)%new_world_size, 1, new_comm, &req); - //Kill rank dies before being able to send - if(new_rank == 0 || new_rank == 2) MPI_Send(&buffer, 1, MPI_INT, old_rank==0 ? new_world_size-1 : new_rank-1, 1, new_comm); - MPI_Barrier(new_comm); - } - - - if (old_rank == kKillID && recovered == 0) { - pid_t pid = getpid(); - kill(pid, SIGTERM); - } - - - MPI_Barrier(new_comm); - - //After recovery, the slow ranks send - if(new_rank == 1 || new_rank == 3 ) MPI_Send(&buffer, 1, MPI_INT, new_rank==0 ? new_world_size-1 : new_rank-1, 1, new_comm); - - MPI_Barrier(new_comm); //Lots of barriers to demonstrate a specific ordering of events. - - //Check result of old requests - cannot wait, must MPI_Test only on old pre-failure requests for now - if(new_rank != kKillID){ - int flag; - int ret = MPI_Test(&req, &flag, MPI_STATUS_IGNORE); - if(!flag || ret == FENIX_ERROR_CANCELLED){ - printf("Rank %d's request was NOT satisfied before the failure\n", new_rank); - MPI_Irecv(&buffer, 1, MPI_INT, (new_rank+1)%new_world_size, 1, new_comm, &req); //We can re-launch the IRecv if we know the - //other ranks are going to send now - } else { - printf("Rank %d's request was satisfied before the failure\n", new_rank); - } - - } - - Fenix_Finalize(); - MPI_Finalize(); - - return 0; -} diff --git a/test/request_tracking/CMakeLists.txt b/test/request_tracking/CMakeLists.txt index c8269b2..9dc93df 100644 --- a/test/request_tracking/CMakeLists.txt +++ b/test/request_tracking/CMakeLists.txt @@ -13,4 +13,4 @@ add_executable(fenix_request_tracking_test fenix_request_tracking_test.c) target_link_libraries(fenix_request_tracking_test fenix ${MPI_C_LIBRARIES}) add_test(NAME request_tracking - COMMAND mpirun -np 3 fenix_request_tracking_test) + COMMAND mpirun --oversubscribe -np 3 fenix_request_tracking_test)