Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 46 additions & 16 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,54 @@ addons:
- valgrind
cache:
directories:
- .travis_helpers/ulfm-install
- ulfm-install
before_install:
- cd .travis_helpers
- source ./fetchULFMmpi.sh
- cd ../ #Always end back at the root directory
- echo "Configuring ULFM"
- if [ -f ulfm-install/lib/libmpi.so ]; then
echo "libmpi.so found -- nothing to build.";
cd ulfm-install;
else
ROOT=`pwd`;
mkdir ulfm-install;
echo "Downloading ULFM from repo";
git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/;
echo " - Configuring and building ULFM.";
cd ulfm-src;
echo " - Running autogen.pl";
./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running configure";
./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make";
make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make install";
make install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Finished installing ULFM";
cd ../ulfm-install/;
fi

#Expect that any changes to the above still puts me in the install's home dir
- export MPI_HOME=`pwd`
- export PATH=$MPI_HOME/bin/:$PATH
- export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH
- export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH
- export MANPATH=$MPI_HOME/share/man:$MANPATH

- export MPICC="`which mpicc`"
- export MPICXX="`which mpic++`"

#Allow oversubscription for tests, since we're potentially single core
- export OMPI_MCA_rmaps_base_oversubscribe=1

- tail -n50 ./ulfm_build_output.txt
- cd ../ #End back at root
install:
- mkdir build && cd build
- cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1
script:
- cd .travis_helpers
- source fetchULFMmpi.sh #Just updates path if ULFM was built properly in before_install
- cd ../
- mkdir build
- cd build
- cmake ../ -DBUILD_TESTING=ON
- make -j4 VERBOSE=1
- make test
- cd ../ #Always end back at the root directory.
after_success:
- echo "Success, printing run logs:"
- cat Testing/Temporary/LastTest.log
after_failure:
- echo "Failure occured, printing run logs:"
- pwd
- cat build/Testing/Temporary/LastTest.log
- echo "Printing ULFM build log tail. If no output, ULFM was built before this test run"
- tail -n100 .travis_helpers/build_output.txt
- cat Testing/Temporary/LastTest.log
37 changes: 0 additions & 37 deletions .travis_helpers/fetchULFMmpi.sh

This file was deleted.

7 changes: 5 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
#set(CMAKE_BUILD_TYPE Release)
set(CMAKE_BUILD_TYPE Debug)
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb")

#ENABLE_TESTING
Expand Down Expand Up @@ -109,4 +109,7 @@ if(BUILD_TESTING)
add_subdirectory(test/subset_internal)
add_subdirectory(test/subset_merging)
add_subdirectory(test/request_tracking)
add_subdirectory(test/request_cancelled)
add_subdirectory(test/no_jump)
add_subdirectory(test/issend)
endif()
2 changes: 1 addition & 1 deletion examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ if(BUILD_TESTING)
add_executable(fenix_hello_world-debug fenix_hello_world.c)
target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME hello_world
COMMAND mpirun --oversubscribe -np 3 fenix_hello_world-debug "1")
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1")
endif()
14 changes: 13 additions & 1 deletion examples/01_hello_world/fenix/fenix_hello_world.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ const int kKillID = 1;

int main(int argc, char **argv) {

#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too."
if (argc < 2) {
printf("Usage: %s <# spare ranks> \n", *argv);
exit(0);
Expand Down Expand Up @@ -108,6 +107,19 @@ int main(int argc, char **argv) {

printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n",
processor_name, old_rank, new_rank, new_world_size, old_world_size);

int *fails, num_fails;
num_fails = Fenix_Process_fail_list(&fails);

char fails_str[100];
sprintf(fails_str, "Rank %d sees failed processes [", new_rank);
for(int i = 0; i < num_fails; i++){
sprintf(fails_str, "%s%s%d", fails_str, (i==0 ? "" : ", "), fails[i]);
}
sprintf(fails_str, "%s]\n", fails_str);
printf(fails_str);



Fenix_Finalize();
MPI_Finalize();
Expand Down
2 changes: 1 addition & 1 deletion examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_ring-debug fenix_ring.c)
target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME ring
COMMAND mpirun --oversubscribe -np 5 fenix_ring-debug 1 2)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2)
set_tests_properties(ring PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/05_subset_create/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_create-debug subset_create.c)
target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_create
COMMAND mpirun -np 5 --oversubscribe fenix_subset_create-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1)
set_tests_properties(subset_create PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
5 changes: 5 additions & 0 deletions examples/05_subset_create/subset_create.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ fprintf(stderr, "Started\n");
int subset[500];
MPI_Status status;

if (argc < 2) {
printf("Usage: %s <# spare ranks> \n", *argv);
exit(0);
}

int fenix_role;
MPI_Comm world_comm;
MPI_Comm new_comm;
Expand Down
2 changes: 1 addition & 1 deletion examples/06_subset_createv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_createv-debug subset_createv.c)
target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_createv
COMMAND mpirun -np 5 --oversubscribe fenix_subset_createv-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1)
set_tests_properties(subset_createv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
5 changes: 5 additions & 0 deletions examples/06_subset_createv/subset_createv.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,11 @@ int main(int argc, char **argv) {
int subset[1000];
MPI_Status status;

if (argc < 2) {
printf("Usage: %s <# spare ranks> \n", *argv);
exit(0);
}

int fenix_role;
MPI_Comm world_comm;
MPI_Comm new_comm;
Expand Down
5 changes: 5 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ extern "C" {
#define FENIX_ERROR_SUBSET_STRIDE -25
#define FENIX_ERROR_NODATA_FOUND -30
#define FENIX_ERROR_INTERN -40
#define FENIX_ERROR_CANCELLED -50
#define FENIX_WARNING_SPARE_RANKS_DEPLETED 100
#define FENIX_WARNING_PARTIAL_RESTORE 101

Expand Down Expand Up @@ -216,6 +217,10 @@ int Fenix_Data_group_delete(int group_id);

int Fenix_Data_member_delete(int group_id, int member_id);

/*
 * Retrieve the list of failed processes known to Fenix.
 *
 * fail_list: output; *fail_list is set to point at the list.
 * Returns the number of entries in that list.
 *
 * NOTE(review): the implementation hands back Fenix's own array rather than
 * a copy, so presumably callers must not free or modify it -- confirm.
 */
int Fenix_Process_fail_list(int** fail_list);

/*
 * Test a request and report whether it was (potentially) cancelled by a
 * failure.
 *
 * request: the MPI request to test.
 * status:  passed through to the underlying MPI test call.
 * Returns nonzero if the test reported MPI_ERR_PROC_FAILED or
 * MPI_ERR_REVOKED, and 0 otherwise.
 */
int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
Expand Down
21 changes: 15 additions & 6 deletions include/fenix_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@
#include "fenix_opt.h"
#include "fenix_data_group.h"
#include "fenix_process_recovery.h"
#include "fenix_request_store.h"

typedef struct {
int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
Expand All @@ -71,7 +70,6 @@ typedef struct {
int resume_mode; // Defines how program resumes after process recovery
int spawn_policy; // Indicate dynamic process spawning
int spare_ranks; // Spare ranks entered by user to repair failed ranks
int replace_comm_flag; // Internal global variable to describe the status of MPI communicator
int repair_result; // Internal global variable to store the result of MPI communicator repair
int finalized;
jmp_buf *recover_environment; // Calling environment to fill the jmp_buf structure
Expand All @@ -81,17 +79,28 @@ typedef struct {
int role; // Role of rank: initial, survivor or repair
int fenix_init_flag;

fenix_request_store_t request_store;
int fail_world_size;
int* fail_world;

//Save the pointer to role and error of Fenix_Init
int *ret_role;
int *ret_error;

fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions
//fenix_communicator_list_t* communicator_list; // singly linked list for Fenix resilient communicators
fenix_debug_opt_t options; // This is reserved to store the user options

MPI_Comm *world; // Duplicate of the MPI communicator provided by user
MPI_Comm *new_world; // Global MPI communicator identical to g_world but without spare ranks
MPI_Comm world; // Duplicate of the MPI communicator provided by user
MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks
MPI_Comm *user_world; // MPI communicator with repaired ranks
MPI_Comm original_comm; // Keep the information of the original global MPI Communicator (this will be umodified until Fenix_finalize)
MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API


MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler
int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!)
int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type.



fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure
} fenix_t;
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_process_recovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,6 @@ void __fenix_finalize();

void __fenix_finalize_spare();

void __fenix_test_MPI(int, const char *);
void __fenix_test_MPI(MPI_Comm*, int*, ...);

#endif
1 change: 0 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ fenix_data_member.c
fenix_data_subset.c
fenix_comm_list.c
fenix_callbacks.c
fenix_request_store.c
globals.c
)

Expand Down
20 changes: 20 additions & 0 deletions src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,23 @@ int Fenix_Data_group_delete(int group_id) {
int Fenix_Data_member_delete(int group_id, int member_id) {
return __fenix_member_delete(group_id, member_id);
}

int Fenix_Process_fail_list(int** fail_list){
  // Hand back a pointer to Fenix's own fail_world array (no copy is made)
  // and report how many entries it currently holds.
  const int num_failed = fenix.fail_world_size;
  *fail_list = fenix.fail_world;
  return num_failed;
}

int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status){
  // The test below may come back as "COMM_REVOKED", but that error has
  // already been handled. Ask for errors to be returned to us for the
  // duration of the call, remembering the caller's setting for later.
  const int saved_ignore_errs = fenix.ignore_errs;
  fenix.ignore_errs = 1;

  int completed;  // required by PMPI_Test; only the return code is used
  const int test_result = PMPI_Test(request, &completed, status);

  // Put the error-handling mode back exactly as we found it.
  fenix.ignore_errs = saved_ignore_errs;

  // The request was (potentially) cancelled if the test reported either a
  // process failure or a revoked communicator.
  if(test_result == MPI_ERR_PROC_FAILED){
    return 1;
  }
  if(test_result == MPI_ERR_REVOKED){
    return 1;
  }
  return 0;
}
2 changes: 1 addition & 1 deletion src/fenix_callbacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ void __fenix_callback_invoke_all(int error)
{
fenix_callback_list_t *current = fenix.callback_list;
while (current != NULL) {
(current->callback->x)((MPI_Comm) * fenix.new_world, error,
(current->callback->x)((MPI_Comm) fenix.new_world, error,
(void *) current->callback->y);
current = current->next;
}
Expand Down
3 changes: 2 additions & 1 deletion src/fenix_comm_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@
#include <stdio.h>
#include <fenix.h>
#include <fenix_process_recovery.h>

#include <mpi-ext.h>

fenix_comm_list_t my_list = {NULL, NULL};

int __fenix_comm_push(MPI_Comm *comm) {
Expand Down
10 changes: 5 additions & 5 deletions src/fenix_data_group.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ fenix_data_recovery_t * __fenix_data_recovery_init() {

if (fenix.options.verbose == 41) {
verbose_print("c-rank: %d, role: %d, g-count: %zu, g-size: %zu\n",
__fenix_get_current_rank(*fenix.world), fenix.role, data_recovery->count,
__fenix_get_current_rank(fenix.world), fenix.role, data_recovery->count,
data_recovery->total_size);
}

Expand All @@ -94,7 +94,7 @@ int __fenix_member_delete(int groupid, int memberid) {

if (fenix.options.verbose == 38) {
verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n",
__fenix_get_current_rank(*fenix.new_world), fenix.role, group_index,
__fenix_get_current_rank(fenix.new_world), fenix.role, group_index,
member_index);
}

Expand Down Expand Up @@ -124,7 +124,7 @@ int __fenix_member_delete(int groupid, int memberid) {
fenix_member_entry_t *mentry = &(member->member_entry[member_index]);

verbose_print("c-rank: %d, role: %d, m-count: %zu, m-state: %d",
__fenix_get_current_rank(*fenix.new_world), fenix.role,
__fenix_get_current_rank(fenix.new_world), fenix.role,
member->count, mentry->state);
}

Expand Down Expand Up @@ -172,7 +172,7 @@ int __fenix_group_delete(int groupid) {

if (fenix.options.verbose == 37) {
verbose_print("c-rank: %d, group_index: %d\n",
__fenix_get_current_rank(*fenix.new_world), group_index);
__fenix_get_current_rank(fenix.new_world), group_index);
}

if (group_index == -1) {
Expand Down Expand Up @@ -221,7 +221,7 @@ void __fenix_data_recovery_reinit(fenix_data_recovery_t *data_recovery,

if (fenix.options.verbose == 48) {
verbose_print("c-rank: %d, role: %d, g-size: %zu\n",
__fenix_get_current_rank(*fenix.new_world), fenix.role,
__fenix_get_current_rank(fenix.new_world), fenix.role,
data_recovery->total_size);
}
}
Expand Down
Loading