Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
0798fd3
Add ability to query which processes failed
Matthew-Whitlock Feb 12, 2020
3582c7e
Add support for MPI_Test
Matthew-Whitlock Feb 19, 2020
dfc7d58
Add support for testing pre-failure requests
Matthew-Whitlock May 21, 2020
cee633d
Fix bug when ERR_PROC_FAILED/ERR_REVOKED discovered in MPI_Test
Matthew-Whitlock May 27, 2020
e4c6a3f
Fix MPI_Wait w/ cancelled requests
Matthew-Whitlock May 27, 2020
42814f6
Add missing file to commit
Matthew-Whitlock May 27, 2020
58e02c8
Fix bug with MPI_STATUS_IGNORE
Matthew-Whitlock May 27, 2020
3071f29
Fix another bug with MPI_Test
Matthew-Whitlock Jun 24, 2020
813540a
Add no-jump recovery option
srirajpaul Jun 27, 2020
59eec3c
small test
Matthew-Whitlock Jun 29, 2020
ce88c10
trying to simplify
Matthew-Whitlock Jun 29, 2020
428a4d8
small fix
Matthew-Whitlock Jun 29, 2020
1c24eed
Fix oversubscribe export
Matthew-Whitlock Jun 29, 2020
52f380b
Test working dir change requirements
Matthew-Whitlock Jun 29, 2020
2ea13a7
test failure
Matthew-Whitlock Jun 29, 2020
0911534
Finalize script for now
Matthew-Whitlock Jun 29, 2020
8e74165
Print test logs on success, for verifying
Matthew-Whitlock Jun 29, 2020
3b586ae
Switch to pulling from ULFM master branch when rebuilding ULFM
Matthew-Whitlock Jun 29, 2020
59c05f0
print ULFM install info if building fails
Matthew-Whitlock Jun 29, 2020
5002bed
Fix brackets in build
Matthew-Whitlock Jun 29, 2020
bced404
small test
Matthew-Whitlock Jun 29, 2020
c581110
Try another fix
Matthew-Whitlock Jun 29, 2020
1a43a8d
Trying a new OOP fix
Matthew-Whitlock Jun 29, 2020
9b32ada
another attempt
Matthew-Whitlock Jun 29, 2020
df19c0c
Add semicolon to {} sections
Matthew-Whitlock Jun 29, 2020
7410b9a
Braces must be separate from commands by spaces
Matthew-Whitlock Jun 29, 2020
b096f63
Try to fix travis not showing logs
Matthew-Whitlock Jun 29, 2020
74a18bd
Fix autogen.pl output to file
Matthew-Whitlock Jun 29, 2020
2f5829b
Fix ulfm install log
Matthew-Whitlock Jun 29, 2020
37ea254
Fix trailing i from vim
Matthew-Whitlock Jun 29, 2020
f84d62d
Update tests for speed, and remove hard-set --oversubscribe
Matthew-Whitlock Jun 29, 2020
d55030b
Simplify travis a bit more
Matthew-Whitlock Jun 29, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 46 additions & 16 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,54 @@ addons:
- valgrind
cache:
directories:
- .travis_helpers/ulfm-install
- ulfm-install
before_install:
- cd .travis_helpers
- source ./fetchULFMmpi.sh
- cd ../ #Always end back at the root directory
- echo "Configuring ULFM"
- if [ -f ulfm-install/lib/libmpi.so ]; then
echo "libmpi.so found -- nothing to build.";
cd ulfm-install;
else
ROOT=`pwd`;
mkdir ulfm-install;
echo "Downloading ULFM from repo";
git clone --recursive https://bitbucket.org/icldistcomp/ulfm2.git ulfm-src/;
echo " - Configuring and building ULFM.";
cd ulfm-src;
echo " - Running autogen.pl";
./autogen.pl >../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running configure";
./configure --prefix=$ROOT/ulfm-install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make";
make -j4 >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Running make install";
make install >>../ulfm-install/ulfm_build_output.txt 2>&1;
echo " - Finished installing ULFM";
cd ../ulfm-install/;
fi

#Expect that any change to the steps above still leaves us in the install's home dir
- export MPI_HOME=`pwd`
- export PATH=$MPI_HOME/bin/:$PATH
- export LD_LIBRARY_PATH=$MPI_HOME/lib:$LD_LIBRARY_PATH
- export DYLD_LIBRARY_PATH=$MPI_HOME/lib:$DYLD_LIBRARY_PATH
- export MANPATH=$MPI_HOME/share/man:$MANPATH

- export MPICC="`which mpicc`"
- export MPICXX="`which mpic++`"

#Allow oversubscription for tests, since we're potentially single core
- export OMPI_MCA_rmaps_base_oversubscribe=1

- tail -n50 ./ulfm_build_output.txt
- cd ../ #End back at root
install:
- mkdir build && cd build
- cmake ../ -DBUILD_TESTING=ON && make -j4 VERBOSE=1
script:
- cd .travis_helpers
- source fetchULFMmpi.sh #Just updates path if ULFM was built properly in before_install
- cd ../
- mkdir build
- cd build
- cmake ../ -DBUILD_TESTING=ON
- make -j4 VERBOSE=1
- make test
- cd ../ #Always end back at the root directory.
after_success:
- echo "Success, printing run logs:"
- cat Testing/Temporary/LastTest.log
after_failure:
- echo "Failure occured, printing run logs:"
- pwd
- cat build/Testing/Temporary/LastTest.log
- echo "Printing ULFM build log tail. If no output, ULFM was built before this test run"
- tail -n100 .travis_helpers/build_output.txt
- cat Testing/Temporary/LastTest.log
37 changes: 0 additions & 37 deletions .travis_helpers/fetchULFMmpi.sh

This file was deleted.

6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

set(CMAKE_BUILD_TYPE Release)
#set(CMAKE_BUILD_TYPE Debug)
#set(CMAKE_BUILD_TYPE Release)
set(CMAKE_BUILD_TYPE Debug)
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -O0 -ggdb")

#ENABLE_TESTING
Expand Down Expand Up @@ -109,4 +109,6 @@ if(BUILD_TESTING)
add_subdirectory(test/subset_internal)
add_subdirectory(test/subset_merging)
add_subdirectory(test/request_tracking)
add_subdirectory(test/request_cancelled)
add_subdirectory(test/no_jump)
endif()
2 changes: 1 addition & 1 deletion examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ if(BUILD_TESTING)
add_executable(fenix_hello_world-debug fenix_hello_world.c)
target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME hello_world
COMMAND mpirun --oversubscribe -np 3 fenix_hello_world-debug "1")
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1")
endif()
13 changes: 13 additions & 0 deletions examples/01_hello_world/fenix/fenix_hello_world.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,19 @@ int main(int argc, char **argv) {

printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n",
processor_name, old_rank, new_rank, new_world_size, old_world_size);

int *fails, num_fails;
num_fails = Fenix_Process_fail_list(&fails);

char fails_str[100];
sprintf(fails_str, "Rank %d sees failed processes [", new_rank);
for(int i = 0; i < num_fails; i++){
sprintf(fails_str, "%s%s%d", fails_str, (i==0 ? "" : ", "), fails[i]);
}
sprintf(fails_str, "%s]\n", fails_str);
printf(fails_str);



Fenix_Finalize();
MPI_Finalize();
Expand Down
2 changes: 1 addition & 1 deletion examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_ring-debug fenix_ring.c)
target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME ring
COMMAND mpirun --oversubscribe -np 5 fenix_ring-debug 1 2)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2)
set_tests_properties(ring PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/05_subset_create/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_create-debug subset_create.c)
target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_create
COMMAND mpirun -np 5 --oversubscribe fenix_subset_create-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1)
set_tests_properties(subset_create PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/06_subset_createv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_createv-debug subset_createv.c)
target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_createv
COMMAND mpirun -np 5 --oversubscribe fenix_subset_createv-debug 1)
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1)
set_tests_properties(subset_createv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
3 changes: 3 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ extern "C" {
#define FENIX_ERROR_SUBSET_STRIDE -25
#define FENIX_ERROR_NODATA_FOUND -30
#define FENIX_ERROR_INTERN -40
#define FENIX_ERROR_CANCELLED -50
#define FENIX_WARNING_SPARE_RANKS_DEPLETED 100
#define FENIX_WARNING_PARTIAL_RESTORE 101

Expand Down Expand Up @@ -216,6 +217,8 @@ int Fenix_Data_group_delete(int group_id);

int Fenix_Data_member_delete(int group_id, int member_id);

int Fenix_Process_fail_list(int** fail_list);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
Expand Down
7 changes: 7 additions & 0 deletions include/fenix_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ typedef struct {
int role; // Role of rank: initial, survivor or repair
int fenix_init_flag;

int fail_world_size;
int* fail_world;

//Save the pointer to role and error of Fenix_Init
int *ret_role;
int *ret_error;

fenix_request_store_t request_store;

fenix_callback_list_t* callback_list; // singly linked list for user-defined Fenix callback functions
Expand Down
5 changes: 5 additions & 0 deletions src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,8 @@ int Fenix_Data_group_delete(int group_id) {
int Fenix_Data_member_delete(int group_id, int member_id) {
return __fenix_member_delete(group_id, member_id);
}

/*
 * Report the failed-process list currently held in the global fenix state.
 *
 * fail_list: out-parameter.  When non-NULL it receives fenix.fail_world, i.e.
 *            the library's own pointer rather than a copy (presumably the
 *            caller must not free it -- confirm against where fail_world is
 *            allocated).  May be NULL when only the count is wanted.
 *
 * Returns fenix.fail_world_size.
 */
int Fenix_Process_fail_list(int** fail_list){
    if(fail_list){
        *fail_list = fenix.fail_world;
    }
    return fenix.fail_world_size;
}
4 changes: 2 additions & 2 deletions src/fenix_data_recovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth,
/* If so, recover the data and set the recovery */
/* for member recovery. */

int i, group_position;
int i;
int remote_need_recovery;
fenix_group_t *group;
MPI_Status status;
Expand Down Expand Up @@ -149,7 +149,7 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth,

} else { /* Already created. Renew the MPI communicator */

group = ( data_recovery->group[group_position] );
group = ( data_recovery->group[group_index] );
group->comm = comm; /* Renew communicator */
MPI_Comm_rank(comm, &(group->current_rank));

Expand Down
101 changes: 88 additions & 13 deletions src/fenix_mpi_override.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,9 @@ int MPI_Sendrecv(MPI_CONST_TYPE void* sendbuf, int sendcount,
static inline
void __fenix_override_request(int ret, MPI_Request *request)
{
if(ret != MPI_SUCCESS) return;
if(ret != MPI_SUCCESS) {
return;
}

assert(*request != MPI_REQUEST_NULL);

Expand All @@ -265,28 +267,55 @@ int MPI_Irecv(void *buf, int count, MPI_Datatype datatype,
int ret;
ret = PMPI_Irecv(buf, count, datatype, source, tag,
__fenix_replace_comm(comm), request);

__fenix_override_request(ret, request);
__fenix_test_MPI_inline(ret, "MPI_Irecv");
return ret;
}

/*
 * Fenix override of MPI_Wait.
 *
 * *fenix_request is either MPI_REQUEST_NULL, the FENIX_REQUEST_CANCELLED
 * sentinel, or a handle whose bits are reinterpreted as an int id into
 * fenix.request_store.  The id is resolved to the underlying MPI request,
 * PMPI_Wait is called on it, and the handle/store are updated afterwards.
 *
 * Returns:
 *   - MPI_SUCCESS immediately when the store reports FENIX_REQUEST_COMPLETED
 *     (the saved status is copied out unless status is MPI_STATUS_IGNORE);
 *   - FENIX_ERROR_CANCELLED when the handle is the cancelled sentinel or the
 *     store reports the request as cancelled;
 *   - otherwise the return code of PMPI_Wait.
 */
int MPI_Wait(MPI_Request *fenix_request, MPI_Status *status)
{
    int ret;
    /* Must start at 0: it is only raised when a cancelled request is actually
     * observed below.  Starting at 1 made every wait report cancellation. */
    int is_cancelled = 0;
    MPI_Request request = MPI_REQUEST_NULL;

    if(*fenix_request != MPI_REQUEST_NULL){
        if(*fenix_request == FENIX_REQUEST_CANCELLED){
            is_cancelled = 1;
        } else {
            int retval =
                __fenix_request_store_get(&fenix.request_store, *((int*)fenix_request), &request);

            if(retval == FENIX_ERROR_CANCELLED) {
                is_cancelled = 1;
            }

            if(retval == FENIX_REQUEST_COMPLETED){
                /* Nothing left to wait for: hand back the stored status. */
                if(status != MPI_STATUS_IGNORE)
                    __fenix_request_store_get_status(&fenix.request_store, *((int*)fenix_request), status);
                *fenix_request = MPI_REQUEST_NULL;
                return MPI_SUCCESS;
            }
        }
    }

    /* For NULL or cancelled handles `request` is still MPI_REQUEST_NULL here. */
    ret = PMPI_Wait(&request, status);

    if(ret == MPI_SUCCESS && (*fenix_request != MPI_REQUEST_NULL) && (*fenix_request != FENIX_REQUEST_CANCELLED)) {
        /* Completed normally: drop the store entry and null the user handle. */
        __fenix_request_store_remove(&fenix.request_store,
                                     *((int *) fenix_request));
        assert(request == MPI_REQUEST_NULL);
        *fenix_request = MPI_REQUEST_NULL;
    }
    if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){
        /* Mark the request cancelled in the store before the error is handed
         * to __fenix_test_MPI_inline below. */
        __fenix_request_store_cancel(&fenix.request_store, *((int*)fenix_request), status);
        *fenix_request = FENIX_REQUEST_CANCELLED;
    }
    __fenix_test_MPI_inline(ret, "MPI_Wait");

    if(is_cancelled){
        *fenix_request = FENIX_REQUEST_CANCELLED;
        return FENIX_ERROR_CANCELLED;
    }
    return ret;
}

Expand All @@ -297,11 +326,13 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[],
{
// The list (array_of_requests) may contain null or inactive handles.
int ret, i;
for(i=0 ; i<count ; i++)
if(array_of_fenix_requests[i] != MPI_REQUEST_NULL)
__fenix_request_store_getremove(&fenix.request_store,
for(i=0 ; i<count ; i++){
if(array_of_fenix_requests[i] != MPI_REQUEST_NULL){
__fenix_request_store_getremove(&fenix.request_store,
*((int *)&(array_of_fenix_requests[i])),
&(array_of_fenix_requests[i]));
}
}

ret = PMPI_Waitall(count, array_of_fenix_requests, array_of_statuses);
__fenix_test_MPI_inline(ret, "MPI_Waitall");
Expand Down Expand Up @@ -333,8 +364,52 @@ int MPI_Waitall(int count, MPI_Request array_of_fenix_requests[],

/*
 * Fenix override of MPI_Test.
 *
 * *request is either MPI_REQUEST_NULL, the FENIX_REQUEST_CANCELLED sentinel,
 * or a handle whose bits are reinterpreted as an int id into
 * fenix.request_store.  The id is resolved to the underlying MPI request and
 * PMPI_Test is called on it.
 *
 * Returns:
 *   - MPI_SUCCESS with *flag = 1 when the store reports
 *     FENIX_REQUEST_COMPLETED (the saved status is copied out unless status
 *     is MPI_STATUS_IGNORE);
 *   - FENIX_ERROR_CANCELLED when the handle is the cancelled sentinel or the
 *     store reports the request as cancelled;
 *   - otherwise the return code of PMPI_Test.
 */
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status)
{
    int ret;
    int is_cancelled = 0;
    MPI_Request real_req = MPI_REQUEST_NULL;

    if(*request != MPI_REQUEST_NULL){
        if(*request == FENIX_REQUEST_CANCELLED){
            is_cancelled = 1;
        } else {
            int retval =
                __fenix_request_store_get(&fenix.request_store, *((int*)request), &real_req);

            if(retval == FENIX_ERROR_CANCELLED) {
                is_cancelled = 1;
            }

            if(retval == FENIX_REQUEST_COMPLETED){
                /* Already finished: report completion from the stored status. */
                *flag = 1;
                if(status != MPI_STATUS_IGNORE)
                    __fenix_request_store_get_status(&fenix.request_store, *((int*)request), status);
                *request = MPI_REQUEST_NULL;
                return MPI_SUCCESS;
            }
        }
    }

    /* For NULL or cancelled handles real_req is still MPI_REQUEST_NULL here. */
    ret = PMPI_Test(&real_req, flag, status);
    if(ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED){
        /* Mark the request cancelled in the store before the error is handed
         * to __fenix_test_MPI_inline below. */
        __fenix_request_store_cancel(&fenix.request_store, *((int*)request), status);
        *request = FENIX_REQUEST_CANCELLED;
    }

    __fenix_test_MPI_inline(ret, "MPI_Test");

    /* Check ret first so *flag is only read after a successful PMPI_Test. */
    if(ret == MPI_SUCCESS && *flag && *request != MPI_REQUEST_NULL && *request != FENIX_REQUEST_CANCELLED){
        //This request is done, it can be removed from the store.
        __fenix_request_store_remove(&fenix.request_store, *((int*)request));
        *request = MPI_REQUEST_NULL;
    }

    if(is_cancelled){
        *request = FENIX_REQUEST_CANCELLED;
        return FENIX_ERROR_CANCELLED;
    }

    return ret;
}

int MPI_Cancel(MPI_Request *request)
Expand Down
Loading