diff --git a/include/fenix.hpp b/include/fenix.hpp index 7112c71..1549247 100644 --- a/include/fenix.hpp +++ b/include/fenix.hpp @@ -72,7 +72,7 @@ */ int Fenix_Callback_register(std::function callback); -namespace Fenix { +namespace fenix { /** * @brief Registers a callback that throws a CommException @@ -84,6 +84,6 @@ namespace Fenix { */ int register_exception_callback(); -} // namespace Fenix +} // namespace fenix #endif diff --git a/include/fenix_exception.hpp b/include/fenix_exception.hpp index c2fc081..8958a02 100644 --- a/include/fenix_exception.hpp +++ b/include/fenix_exception.hpp @@ -60,7 +60,7 @@ #include #include -namespace Fenix { +namespace fenix { struct CommException : public std::exception { MPI_Comm repaired_comm; @@ -69,6 +69,6 @@ struct CommException : public std::exception { repaired_comm(comm), fenix_err(err) { }; }; -} // namespace Fenix +} // namespace fenix #endif diff --git a/include/fenix_ext.hpp b/include/fenix_ext.hpp index 0026325..39a03d8 100644 --- a/include/fenix_ext.hpp +++ b/include/fenix_ext.hpp @@ -107,11 +107,9 @@ typedef struct { int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!) int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type. - - fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure } fenix_t; -extern fenix_t fenix; +inline fenix_t fenix_rt; #endif // __FENIX_EXT_H__ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5a8b7b0..50a0233 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -27,7 +27,6 @@ fenix_data_policy_in_memory_raid.cpp fenix_data_member.cpp fenix_data_subset.cpp fenix_callbacks.cpp -globals.cpp ) add_library( fenix STATIC ${Fenix_SOURCES}) diff --git a/src/fenix.cpp b/src/fenix.cpp index a383ad6..4126b19 100644 --- a/src/fenix.cpp +++ b/src/fenix.cpp @@ -78,7 +78,7 @@ int Fenix_Callback_pop() { } int Fenix_Initialized(int *flag) { - *flag = (fenix.fenix_init_flag) ? 1 : 0; + *flag = (fenix_rt.fenix_init_flag) ? 1 : 0; return FENIX_SUCCESS; } @@ -197,20 +197,20 @@ int Fenix_Data_member_delete(int group_id, int member_id) { } int Fenix_Process_fail_list(int** fail_list){ - *fail_list = fenix.fail_world; - return fenix.fail_world_size; + *fail_list = fenix_rt.fail_world; + return fenix_rt.fail_world_size; } int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status){ //We know this may return as "COMM_REVOKED", but we know the error was already handled - int old_ignore_setting = fenix.ignore_errs; - fenix.ignore_errs = 1; + int old_ignore_setting = fenix_rt.ignore_errs; + fenix_rt.ignore_errs = 1; int flag; int ret = PMPI_Test(request, &flag, status); - fenix.ignore_errs = old_ignore_setting; + fenix_rt.ignore_errs = old_ignore_setting; //Request was (potentially) cancelled if ret is MPI_ERR_PROC_FAILED return ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED; diff --git a/src/fenix_callbacks.cpp b/src/fenix_callbacks.cpp index 5f981ba..3a6b3ee 100644 --- a/src/fenix_callbacks.cpp +++ b/src/fenix_callbacks.cpp @@ -67,25 +67,25 @@ int __fenix_callback_register(fenix_callback_func& recover) { - if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; + if(!fenix_rt.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; - fenix.callbacks.push_back(recover); + fenix_rt.callbacks.push_back(recover); return FENIX_SUCCESS; } int __fenix_callback_pop(){ - if(!fenix.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; - if(fenix.callbacks.empty()) return FENIX_ERROR_CALLBACK_NOT_REGISTERED; + if(!fenix_rt.fenix_init_flag) return FENIX_ERROR_UNINITIALIZED; + if(fenix_rt.callbacks.empty()) return FENIX_ERROR_CALLBACK_NOT_REGISTERED; - fenix.callbacks.pop_back(); + fenix_rt.callbacks.pop_back(); return FENIX_SUCCESS; } void __fenix_callback_invoke_all(int error) { - for(auto it = fenix.callbacks.rbegin(); it != fenix.callbacks.rend(); it++){ - (*it)(*fenix.user_world, error); + for(auto it = fenix_rt.callbacks.rbegin(); it != fenix_rt.callbacks.rend(); it++){ + (*it)(*fenix_rt.user_world, error); } } diff --git a/src/fenix_data_group.cpp b/src/fenix_data_group.cpp index 1d9e5ff..3dbb108 100644 --- a/src/fenix_data_group.cpp +++ b/src/fenix_data_group.cpp @@ -75,9 +75,9 @@ fenix_data_recovery_t * __fenix_data_recovery_init() { data_recovery->group = (fenix_group_t **) s_malloc( __FENIX_DEFAULT_GROUP_SIZE * sizeof(fenix_group_t *)); - if (fenix.options.verbose == 41) { + if (fenix_rt.options.verbose == 41) { verbose_print("c-rank: %d, role: %d, g-count: %zu, g-size: %zu\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, data_recovery->count, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, data_recovery->count, data_recovery->total_size); } @@ -86,15 +86,15 @@ fenix_data_recovery_t * __fenix_data_recovery_init() { int __fenix_member_delete(int groupid, int memberid) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); int member_index = -1; if(group_index !=-1){ - member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); } - if (fenix.options.verbose == 38) { + if (fenix_rt.options.verbose == 38) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } @@ -107,7 +107,7 @@ int __fenix_member_delete(int groupid, int memberid) { memberid); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_data_recovery_t *data_recovery = fenix.data_recovery; + fenix_data_recovery_t *data_recovery = fenix_rt.data_recovery; fenix_group_t *group = (data_recovery->group[group_index]); retval = group->vtbl.member_delete(group, memberid); @@ -119,12 +119,12 @@ int __fenix_member_delete(int groupid, int memberid) { mentry->state = DELETED; } - if (fenix.options.verbose == 38) { + if (fenix_rt.options.verbose == 38) { fenix_member_t *member = group->member; fenix_member_entry_t *mentry = &(member->member_entry[member_index]); verbose_print("c-rank: %d, role: %d, m-count: %zu, m-state: %d", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, member->count, mentry->state); } @@ -168,11 +168,11 @@ int __fenix_data_recovery_remove_group(fenix_data_recovery_t* data_recovery, int */ int __fenix_group_delete(int groupid) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); - if (fenix.options.verbose == 37) { + if (fenix_rt.options.verbose == 37) { verbose_print("c-rank: %d, group_index: %d\n", - __fenix_get_current_rank(fenix.new_world), group_index); + __fenix_get_current_rank(fenix_rt.new_world), group_index); } if (group_index == -1) { @@ -180,7 +180,7 @@ int __fenix_group_delete(int groupid) { retval = FENIX_ERROR_INVALID_GROUPID; } else { /* Delete Process */ - fenix_data_recovery_t *data_recovery = fenix.data_recovery; + fenix_data_recovery_t *data_recovery = fenix_rt.data_recovery; fenix_group_t *group = (data_recovery->group[group_index]); retval = __fenix_group_delete_direct(group); @@ -219,9 +219,9 @@ void __fenix_data_recovery_reinit(fenix_data_recovery_t *data_recovery, (data_recovery->total_size) * sizeof(fenix_group_t *)); - if (fenix.options.verbose == 48) { + if (fenix_rt.options.verbose == 48) { verbose_print("c-rank: %d, role: %d, g-size: %zu\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, data_recovery->total_size); } } @@ -239,7 +239,7 @@ void __fenix_ensure_data_recovery_capacity(fenix_data_recovery_t* data_recovery) sizeof(fenix_group_t *)); data_recovery->total_size = data_recovery->total_size * 2; - if (fenix.options.verbose == 51) { + if (fenix_rt.options.verbose == 51) { verbose_print("g-count: %zu, g-size: %zu\n", data_recovery->count, data_recovery->total_size); } } diff --git a/src/fenix_data_member.cpp b/src/fenix_data_member.cpp index 7971d85..7ac1338 100644 --- a/src/fenix_data_member.cpp +++ b/src/fenix_data_member.cpp @@ -73,9 +73,9 @@ fenix_member_t *__fenix_data_member_init() { member->member_entry = (fenix_member_entry_t *) s_malloc( __FENIX_DEFAULT_MEMBER_SIZE * sizeof(fenix_member_entry_t)); - if (fenix.options.verbose == 42) { + if (fenix_rt.options.verbose == 42) { verbose_print("c-rank: %d, role: %d, m-count: %zu, m-size: %zu\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, member->count, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, member->count, member->total_size); } @@ -86,9 +86,9 @@ fenix_member_t *__fenix_data_member_init() { mentry->memberid = -1; mentry->state = EMPTY; - if (fenix.options.verbose == 42) { + if (fenix_rt.options.verbose == 42) { verbose_print("c-rank: %d, role: %d, m-memberid: %d, m-state: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, mentry->memberid, mentry->state); } } @@ -106,7 +106,7 @@ void __fenix_data_member_destroy( fenix_member_t *member ) { * @param */ int __fenix_search_memberid(fenix_member_t* member, int key) { - fenix_data_recovery_t *data_recovery = fenix.data_recovery; + fenix_data_recovery_t *data_recovery = fenix_rt.data_recovery; int member_index, found = -1, index = -1; for (member_index = 0; (found != 1) && (member_index < member->total_size); member_index++) { @@ -170,9 +170,9 @@ void __fenix_ensure_member_capacity(fenix_member_t *m) { sizeof(fenix_member_entry_t)); member->total_size = member->total_size * 2; - if (fenix.options.verbose == 52) { + if (fenix_rt.options.verbose == 52) { verbose_print("c-rank: %d, role: %d, m-count: %zu, m-size: %zu\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, member->count, member->total_size); } @@ -182,10 +182,10 @@ void __fenix_ensure_member_capacity(fenix_member_t *m) { mentry->memberid = -1; mentry->state = EMPTY; - if (fenix.options.verbose == 52) { + if (fenix_rt.options.verbose == 52) { verbose_print( "c-rank: %d, role: %d, member[%d] m-memberid: %d, m-state: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, member_index, mentry->memberid, mentry->state); } } @@ -196,7 +196,7 @@ void __fenix_ensure_member_capacity(fenix_member_t *m) { int __fenix_data_member_send_metadata(int groupid, int memberid, int dest_rank){ int retval = -1; - fenix_data_recovery_t* data_recovery = fenix.data_recovery; + fenix_data_recovery_t* data_recovery = fenix_rt.data_recovery; int group_index = __fenix_search_groupid(groupid, data_recovery); int member_index; if(group_index != -1){ @@ -234,7 +234,7 @@ int __fenix_data_member_recv_metadata(int groupid, int src_rank, fenix_member_entry_packet_t* packet){ int retval = -1; - fenix_data_recovery_t* data_recovery = fenix.data_recovery; + fenix_data_recovery_t* data_recovery = fenix_rt.data_recovery; int group_index = __fenix_search_groupid(groupid, data_recovery); if(group_index == -1){ @@ -269,9 +269,9 @@ void __fenix_data_member_reinit(fenix_member_t *m, fenix_two_container_packet_t member->member_entry = (fenix_member_entry_t *) s_realloc(member->member_entry, (member->total_size) * sizeof(fenix_member_entry_t)); - if (fenix.options.verbose == 50) { + if (fenix_rt.options.verbose == 50) { verbose_print("c-rank: %d, role: %d, m-count: %zu, m-size: %zu\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, member->count, member->total_size); } @@ -282,9 +282,9 @@ void __fenix_data_member_reinit(fenix_member_t *m, fenix_two_container_packet_t fenix_member_entry_t *mentry = &(member->member_entry[member_index]); mentry->memberid = -1; mentry->state = mystatus; - if (fenix.options.verbose == 50) { + if (fenix_rt.options.verbose == 50) { verbose_print("c-rank: %d, role: %d, m-memberid: %d, m-state: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, mentry->memberid, mentry->state); } } diff --git a/src/fenix_data_policy_in_memory_raid.cpp b/src/fenix_data_policy_in_memory_raid.cpp index 70e0d1b..e4d188d 100644 --- a/src/fenix_data_policy_in_memory_raid.cpp +++ b/src/fenix_data_policy_in_memory_raid.cpp @@ -1212,7 +1212,7 @@ void __imr_sync_timestamps(fenix_imr_group_t* group){ //Now fix members if(need_reset && group->entries_count > 0) { - if(fenix.options.verbose == 1){ + if(fenix_rt.options.verbose == 1){ verbose_print("Outdated timestamps on rank %d. All members will require full recovery.\n", group->base.current_rank); } diff --git a/src/fenix_data_recovery.cpp b/src/fenix_data_recovery.cpp index 914d1b0..9ff7d60 100644 --- a/src/fenix_data_recovery.cpp +++ b/src/fenix_data_recovery.cpp @@ -77,11 +77,11 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, int retval = -1; /* Retrieve the array index of the group maintained under the cover. */ - int group_index = __fenix_search_groupid( groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid( groupid, fenix_rt.data_recovery ); - if (fenix.options.verbose == 12) { + if (fenix_rt.options.verbose == 12) { - verbose_print("c-rank: %d, group_index: %d\n", __fenix_get_current_rank(fenix.new_world), group_index); + verbose_print("c-rank: %d, group_index: %d\n", __fenix_get_current_rank(fenix_rt.new_world), group_index); } @@ -107,13 +107,13 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, fenix_group_t *group; MPI_Status status; - fenix_data_recovery_t *data_recovery = fenix.data_recovery; + fenix_data_recovery_t *data_recovery = fenix_rt.data_recovery; /* Initialize Group. The group hasn't been created. */ /* I am either a brand-new process or a recovered process. */ if (group_index == -1 ) { - if (fenix.options.verbose == 12 && __fenix_get_current_rank(comm) == 0) { + if (fenix_rt.options.verbose == 12 && __fenix_get_current_rank(comm) == 0) { printf("this is a new group!\n"); } @@ -138,10 +138,10 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, //Update the count AFTER finding next group position. data_recovery->count++; - if ( fenix.options.verbose == 12) { + if ( fenix_rt.options.verbose == 12) { verbose_print( "c-rank: %d, g-groupid: %d, g-timestart: %d, g-depth: %d\n", - __fenix_get_current_rank(fenix.new_world), group->groupid, + __fenix_get_current_rank(fenix_rt.new_world), group->groupid, group->timestart, group->depth); } @@ -166,14 +166,14 @@ int __fenix_group_create( int groupid, MPI_Comm comm, int timestart, int depth, int __fenix_group_get_redundancy_policy(int groupid, int* policy_name, int* policy_value, int* flag){ int retval = -1; - int group_index = __fenix_search_groupid( groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid( groupid, fenix_rt.data_recovery ); if(group_index == -1){ debug_print("ERROR Fenix_Data_member_create: group_id <%d> does not exist\n", groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t* group = fenix.data_recovery->group[group_index]; + fenix_group_t* group = fenix_rt.data_recovery->group[group_index]; retval = group->vtbl.get_redundant_policy(group, policy_name, policy_value, flag); } @@ -191,13 +191,13 @@ int __fenix_group_get_redundancy_policy(int groupid, int* policy_name, int* poli */ int __fenix_member_create(int groupid, int memberid, void *data, int count, int datatype_size ) { int retval = -1; - int group_index = __fenix_search_groupid( groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid( groupid, fenix_rt.data_recovery ); int member_index = -1; - if(group_index != -1) member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid ); + if(group_index != -1) member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid ); - if (fenix.options.verbose == 13) { + if (fenix_rt.options.verbose == 13) { verbose_print("c-rank: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), + __fenix_get_current_rank(fenix_rt.new_world), group_index, member_index); } @@ -212,7 +212,7 @@ int __fenix_member_create(int groupid, int memberid, void *data, int count, int } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); fenix_member_t *member = group->member; //First, we'll make a fenix-core member entry, then pass that info to @@ -284,18 +284,18 @@ int __fenix_data_test(Fenix_Request request, int *flag) { int __fenix_member_store(int groupid, int memberid, Fenix_Data_subset specifier) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); int member_index = -1; /* Check if the member id already exists. If so, the index of the storage space is assigned */ if (group_index !=-1 && memberid != FENIX_DATA_MEMBER_ALL) { - member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid ); + member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid ); } - if (fenix.options.verbose == 18 && fenix.data_recovery->group[group_index]->current_rank== 0 ) { + if (fenix_rt.options.verbose == 18 && fenix_rt.data_recovery->group[group_index]->current_rank== 0 ) { verbose_print( "c-rank: %d, role: %d, group_index: %d, member_index: %d memberid: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index, memberid); } @@ -307,7 +307,7 @@ int __fenix_member_store(int groupid, int memberid, Fenix_Data_subset specifier) memberid); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.member_store(group, memberid, specifier); } return retval; @@ -324,18 +324,18 @@ int __fenix_member_istore(int groupid, int memberid, Fenix_Data_subset specifier Fenix_Request *request) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); int member_index = -1; /* Check if the member id already exists. If so, the index of the storage space is assigned */ if (group_index !=-1 && memberid != FENIX_DATA_MEMBER_ALL) { - member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid ); + member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid ); } - if (fenix.options.verbose == 18 && fenix.data_recovery->group[group_index]->current_rank== 0 ) { + if (fenix_rt.options.verbose == 18 && fenix_rt.data_recovery->group[group_index]->current_rank== 0 ) { verbose_print( "c-rank: %d, role: %d, group_index: %d, member_index: %d memberid: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index, memberid); } @@ -347,7 +347,7 @@ int __fenix_member_istore(int groupid, int memberid, Fenix_Data_subset specifier memberid); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.member_istore(group, memberid, specifier, request); } return retval; @@ -392,8 +392,8 @@ void __fenix_subset(fenix_group_t *group, fenix_member_entry_t *me, Fenix_Data_s lentry_packet.entry_count = lentry->count; lentry_packet.entry_size = subset_total_size; - int current_rank = __fenix_get_current_rank(fenix.new_world); - int current_role = fenix.role; + int current_rank = __fenix_get_current_rank(fenix_rt.new_world); + int current_role = fenix_rt.role; MPI_Sendrecv(&lentry_packet, sizeof(member_store_packet_t), MPI_BYTE, ge->out_rank, STORE_SIZE_TAG, &rentry_packet, sizeof(member_store_packet_t), MPI_BYTE, @@ -467,7 +467,7 @@ int __fenix_member_storev(int group_id, int member_id, Fenix_Data_subset subset_ * Using the same routine for v and non-v routine. */ int retval = -1; - int group_index = __fenix_search_groupid( group_id, fenix.data_recovery ); + int group_index = __fenix_search_groupid( group_id, fenix_rt.data_recovery ); int member_index = __fenix_search_memberid(group_index, member_id); if (group_index == -1) { debug_print("ERROR Fenix_Data_member_storev: group_id <%d> does not exist\n", @@ -478,7 +478,7 @@ int __fenix_member_storev(int group_id, int member_id, Fenix_Data_subset subset_ member_id); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_group_t *group = fenix.data_recovery; + fenix_group_t *group = fenix_rt.data_recovery; fenix_group_entry_t *gentry = &(group->group_entry[group_index]); fenix_member_t *member = &(gentry->member); __fenix_ensure_version_capacity(member); @@ -516,7 +516,7 @@ int __fenix_member_istorev(int group_id, int member_id, Fenix_Data_subset subset member_id); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_group_t *group = fenix.data_recovery; + fenix_group_t *group = fenix_rt.data_recovery; fenix_group_entry_t *gentry = &(group->group_entry[group_index]); fenix_member_t *member = &(gentry->member); __fenix_ensure_version_capacity(member); @@ -540,15 +540,15 @@ int __fenix_data_commit(int groupid, int *timestamp) { /* No communication is performed */ /* Return the new timestamp */ int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); - if (fenix.options.verbose == 22) { - verbose_print("c-rank: %d, role: %d, group_index: %d\n", __fenix_get_current_rank(fenix.new_world), fenix.role, group_index); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); + if (fenix_rt.options.verbose == 22) { + verbose_print("c-rank: %d, role: %d, group_index: %d\n", __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index); } if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); if (group->timestamp != -1) group->timestamp++; else group->timestamp = group->timestart; @@ -571,21 +571,21 @@ int __fenix_data_commit(int groupid, int *timestamp) { */ int __fenix_data_commit_barrier(int groupid, int *timestamp) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); - if (fenix.options.verbose == 23) { + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); + if (fenix_rt.options.verbose == 23) { verbose_print("c-rank: %d, role: %d, group_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index); + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index); } if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); //We want to make sure there aren't any failed MPI operations (IE unfinished stores) //But we don't want to fail to commit if a failure has happened since a successful store. - int old_failure_handling = fenix.ignore_errs; - fenix.ignore_errs = 1; + int old_failure_handling = fenix_rt.ignore_errs; + fenix_rt.ignore_errs = 1; int can_commit = 0; @@ -594,10 +594,10 @@ int __fenix_data_commit_barrier(int groupid, int *timestamp) { //So if we aren't all here, we've hit an error already. int location = FENIX_DATA_COMMIT_BARRIER_LOC; - int ret = MPIX_Comm_agree(*fenix.user_world, &location); + int ret = MPIX_Comm_agree(*fenix_rt.user_world, &location); if(location == FENIX_DATA_COMMIT_BARRIER_LOC) can_commit = 1; - fenix.ignore_errs = old_failure_handling; + fenix_rt.ignore_errs = old_failure_handling; if(can_commit == 1){ if (group->timestamp != -1) group->timestamp++; @@ -609,7 +609,7 @@ int __fenix_data_commit_barrier(int groupid, int *timestamp) { if(can_commit != 1 || ret != MPI_SUCCESS) { //A rank failure has happened, lets trigger error handling if enabled. int throwaway = 1; - MPI_Allreduce(MPI_IN_PLACE, &throwaway, 1, MPI_INT, MPI_SUM, *fenix.user_world); + MPI_Allreduce(MPI_IN_PLACE, &throwaway, 1, MPI_INT, MPI_SUM, *fenix_rt.user_world); } @@ -632,15 +632,15 @@ int __fenix_data_commit_barrier(int groupid, int *timestamp) { int __fenix_member_restore(int groupid, int memberid, void *data, int maxcount, int timestamp, Fenix_Data_subset* data_found) { int retval = FENIX_SUCCESS; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery); int member_index = -1; - if(group_index != -1) member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + if(group_index != -1) member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); - if (fenix.options.verbose == 25) { + if (fenix_rt.options.verbose == 25) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } @@ -649,7 +649,7 @@ int __fenix_member_restore(int groupid, int memberid, void *data, int maxcount, groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.member_restore(group, memberid, data, maxcount, timestamp, data_found); } return retval; @@ -666,15 +666,15 @@ int __fenix_member_restore(int groupid, int memberid, void *data, int maxcount, int __fenix_member_lrestore(int groupid, int memberid, void *data, int maxcount, int timestamp, Fenix_Data_subset* data_found) { int retval = FENIX_SUCCESS; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery); int member_index = -1; - if(group_index != -1) member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + if(group_index != -1) member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); - if (fenix.options.verbose == 25) { + if (fenix_rt.options.verbose == 25) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } @@ -683,7 +683,7 @@ int __fenix_member_lrestore(int groupid, int memberid, void *data, int maxcount, groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.member_lrestore(group, memberid, data, maxcount, timestamp, data_found); } return retval; @@ -701,14 +701,14 @@ int __fenix_member_lrestore(int groupid, int memberid, void *data, int maxcount, int __fenix_member_restore_from_rank(int groupid, int memberid, void *target_buffer, int max_count, int time_stamp, int source_rank) { int retval = FENIX_SUCCESS; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery); int member_index = -1; - if(group_index != -1) member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + if(group_index != -1) member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); - if (fenix.options.verbose == 25) { + if (fenix_rt.options.verbose == 25) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } @@ -717,7 +717,7 @@ int __fenix_member_restore_from_rank(int groupid, int memberid, void *target_buf groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.member_restore_from_rank(group, memberid, target_buffer, max_count, time_stamp, source_rank); } @@ -733,12 +733,12 @@ int __fenix_member_restore_from_rank(int groupid, int memberid, void *target_buf */ int __fenix_get_number_of_members(int group_id, int *num_members) { int retval = -1; - int group_index = __fenix_search_groupid(group_id, fenix.data_recovery ); + int group_index = __fenix_search_groupid(group_id, fenix_rt.data_recovery ); if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", group_id); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); *num_members = group->member->count; retval = FENIX_SUCCESS; } @@ -753,12 +753,12 @@ int __fenix_get_number_of_members(int group_id, int *num_members) { */ int __fenix_get_member_at_position(int group_id, int *member_id, int position) { int retval = -1; - int group_index = __fenix_search_groupid(group_id, fenix.data_recovery); + int group_index = __fenix_search_groupid(group_id, fenix_rt.data_recovery); if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", group_id); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); fenix_member_t *member = group->member; if (position < 0 || position > (member->total_size) - 1) { debug_print( @@ -782,12 +782,12 @@ int __fenix_get_member_at_position(int group_id, int *member_id, int position) { */ int __fenix_get_number_of_snapshots(int group_id, int *num_snapshots) { int retval = -1; - int group_index = __fenix_search_groupid(group_id, fenix.data_recovery ); + int group_index = __fenix_search_groupid(group_id, fenix_rt.data_recovery ); if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", group_id); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.get_number_of_snapshots(group, num_snapshots); } return retval; @@ -801,16 +801,16 @@ int __fenix_get_number_of_snapshots(int group_id, int *num_snapshots) { */ int __fenix_get_snapshot_at_position(int groupid, int position, int *timestamp) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); - if (fenix.options.verbose == 33) { + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); + if (fenix_rt.options.verbose == 33) { verbose_print("c-rank: %d, role: %d, group_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index); + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index); } if (group_index == -1) { debug_print("ERROR Fenix_Data_commit: group_id <%d> does not exist\n", groupid); retval = FENIX_ERROR_INVALID_GROUPID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); *timestamp = group->timestamp - position; } return retval; @@ -828,16 +828,16 @@ int __fenix_get_snapshot_at_position(int groupid, int position, int *timestamp) int __fenix_member_get_attribute(int groupid, int memberid, int attributename, void *attributevalue, int *flag, int sourcerank) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); int member_index = -1; if(group_index != -1){ - member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); } - if (fenix.options.verbose == 34) { + if (fenix_rt.options.verbose == 34) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } if (group_index == -1) { @@ -849,7 +849,7 @@ int __fenix_member_get_attribute(int groupid, int memberid, int attributename, memberid); retval = FENIX_ERROR_INVALID_MEMBERID; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); fenix_member_t *member = group->member; fenix_member_entry_t *mentry = &(member->member_entry[member_index]); @@ -871,16 +871,16 @@ int __fenix_member_get_attribute(int groupid, int memberid, int attributename, int __fenix_member_set_attribute(int groupid, int memberid, int attributename, void *attributevalue, int *flag) { int retval = -1; - int group_index = __fenix_search_groupid(groupid, fenix.data_recovery ); + int group_index = __fenix_search_groupid(groupid, fenix_rt.data_recovery ); int member_index = -1; if(group_index != -1){ - member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + member_index = __fenix_search_memberid(fenix_rt.data_recovery->group[group_index]->member, memberid); } - if (fenix.options.verbose == 35) { + if (fenix_rt.options.verbose == 35) { verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", - __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + __fenix_get_current_rank(fenix_rt.new_world), fenix_rt.role, group_index, member_index); } @@ -895,7 +895,7 @@ int __fenix_member_set_attribute(int groupid, int memberid, int attributename, } else { int my_datatype_size; int myerr; - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); fenix_member_t *member = group->member; fenix_member_entry_t *mentry = &(member->member_entry[member_index]); @@ -948,7 +948,7 @@ int __fenix_member_set_attribute(int groupid, int memberid, int attributename, */ int __fenix_snapshot_delete(int group_id, int time_stamp) { int retval = -1; - int group_index = __fenix_search_groupid(group_id, fenix.data_recovery ); + int group_index = __fenix_search_groupid(group_id, fenix_rt.data_recovery ); if (group_index == -1) { debug_print("ERROR Fenix_Data_snapshot_delete: group_id <%d> does not exist\n", group_id); @@ -959,7 +959,7 @@ int __fenix_snapshot_delete(int group_id, int time_stamp) { time_stamp); retval = FENIX_ERROR_INVALID_TIMESTAMP; } else { - fenix_group_t *group = (fenix.data_recovery->group[group_index]); + fenix_group_t *group = (fenix_rt.data_recovery->group[group_index]); retval = group->vtbl.snapshot_delete(group, time_stamp); } return retval; @@ -980,7 +980,7 @@ void __fenix_store_single() { */ void __feninx_dr_print_store() { int group, member, version, local, remote; - fenix_data_recovery_t *current = fenix.data_recovery; + fenix_data_recovery_t *current = fenix_rt.data_recovery; int group_count = current->count; for (group = 0; group < group_count; group++) { int member_count = current->group[group]->member->count; @@ -991,14 +991,14 @@ void __feninx_dr_print_store() { int *local_data = current->group[group]->member->member_entry[member].version->local_entry[version].data; for (local = 0; local < local_data_count; local++) { //printf("*** store rank[%d] group[%d] member[%d] local[%d]: %d\n", - //get_current_rank(fenix.new_world), group, member, local, + //get_current_rank(fenix_rt.new_world), group, member, local, //local_data[local]); } int remote_data_count = current->group[group]->member->member_entry[member].version->remote_entry[version].count; int *remote_data = current->group[group]->member->member_entry[member].version->remote_entry[version].data; for (remote = 0; remote < remote_data_count; remote++) { printf("*** store rank[%d] group[%d] member[%d] remote[%d]: %d\n", - __fenix_get_current_rank(fenix.new_world), group, member, remote, + __fenix_get_current_rank(fenix_rt.new_world), group, member, remote, remote_data[remote]); } } @@ -1011,14 +1011,14 @@ void __feninx_dr_print_store() { */ void __fenix_dr_print_restore() { - fenix_data_recovery_t *current = fenix.data_recovery; + fenix_data_recovery_t *current = fenix_rt.data_recovery; int group_count = current->count; int member_count = current->group[0]->member->count; int version_count = current->group[0]->member->member_entry[0].version->count; int local_data_count = current->group[0]->member->member_entry[0].version->local_entry[0].count; int remote_data_count = current->group[0]->member->member_entry[0].version->remote_entry[0].count; printf("*** restore rank: %d; group: %d; member: %d; local: %d; remote: %d\n", - __fenix_get_current_rank(fenix.new_world), group_count, member_count, + __fenix_get_current_rank(fenix_rt.new_world), group_count, member_count, local_data_count, remote_data_count); } @@ -1028,13 +1028,13 @@ void __fenix_dr_print_restore() { */ void __fenix_dr_print_datastructure() { int group_index, member_index, version_index, remote_data_index, local_data_index; - fenix_data_recovery_t *current = fenix.data_recovery; + fenix_data_recovery_t *current = fenix_rt.data_recovery; if (!current) { return; } - printf("\n\ncurrent_rank: %d\n", __fenix_get_current_rank(fenix.new_world)); + printf("\n\ncurrent_rank: %d\n", __fenix_get_current_rank(fenix_rt.new_world)); int group_size = current->total_size; for (group_index = 0; group_index < group_size; group_index++) { int depth = current->group[group_index]->depth; diff --git a/src/fenix_exception.cpp b/src/fenix_exception.cpp index 6208243..500f433 100644 --- a/src/fenix_exception.cpp +++ b/src/fenix_exception.cpp @@ -1,7 +1,7 @@ #include "fenix_exception.hpp" #include "fenix.h" -namespace Fenix { +namespace fenix { int register_exception_callback(){ return Fenix_Callback_register( @@ -11,4 +11,4 @@ int register_exception_callback(){ ); } -} // namespace Fenix +} // namespace fenix diff --git a/src/fenix_opt.cpp b/src/fenix_opt.cpp index 5a44298..d31cd64 100644 --- a/src/fenix_opt.cpp +++ b/src/fenix_opt.cpp @@ -76,17 +76,15 @@ void __fenix_init_opt(int argc, char **argv) { /* Initialize the value */ - fenix.options.verbose = -1; + fenix_rt.options.verbose = -1; for( i = 0; i < argc; i++ ) { - if( strcmp(argv[i],"--fenix_v") == 0 || strcmp(argv[i],"--FENIX_V") == 0 ) { - printf("Inside if %d\n",i); if( i+1 < argc ) { - fenix.options.verbose = atoi(argv[i+1]); + fenix_rt.options.verbose = atoi(argv[i+1]); } } } diff --git a/src/fenix_process_recovery.cpp b/src/fenix_process_recovery.cpp index f785d15..eb3b6c2 100644 --- a/src/fenix_process_recovery.cpp +++ b/src/fenix_process_recovery.cpp @@ -74,30 +74,30 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha { int ret; - *role = fenix.role; + *role = fenix_rt.role; *error = 0; - fenix.user_world = new_comm; + fenix_rt.user_world = new_comm; - MPI_Comm_create_errhandler(__fenix_test_MPI, &fenix.mpi_errhandler); + MPI_Comm_create_errhandler(__fenix_test_MPI, &fenix_rt.mpi_errhandler); - fenix.world = (MPI_Comm *)malloc(sizeof(MPI_Comm)); - MPI_Comm_dup(comm, fenix.world); - PMPI_Comm_set_errhandler(*fenix.world, fenix.mpi_errhandler); - - fenix.finalized = 0; - fenix.spare_ranks = spare_ranks; - fenix.spawn_policy = spawn; - fenix.recover_environment = jump_environment; - fenix.role = FENIX_ROLE_INITIAL_RANK; - fenix.fail_world_size = 0; - fenix.ignore_errs = 0; - fenix.resume_mode = __FENIX_RESUME_AT_INIT; - fenix.repair_result = 0; - fenix.ret_role = role; - fenix.ret_error = error; - - fenix.options.verbose = -1; + fenix_rt.world = (MPI_Comm *)malloc(sizeof(MPI_Comm)); + MPI_Comm_dup(comm, fenix_rt.world); + PMPI_Comm_set_errhandler(*fenix_rt.world, fenix_rt.mpi_errhandler); + + fenix_rt.finalized = 0; + fenix_rt.spare_ranks = spare_ranks; + fenix_rt.spawn_policy = spawn; + fenix_rt.recover_environment = jump_environment; + fenix_rt.role = FENIX_ROLE_INITIAL_RANK; + fenix_rt.fail_world_size = 0; + fenix_rt.ignore_errs = 0; + fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; + fenix_rt.repair_result = 0; + fenix_rt.ret_role = role; + fenix_rt.ret_error = error; + + fenix_rt.options.verbose = -1; // __fenix_init_opt(*argc, *argv); // For request tracking, make sure we can save at least an integer @@ -108,7 +108,7 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha } - MPI_Op_create((MPI_User_function *) __fenix_ranks_agree, 1, &fenix.agree_op); + MPI_Op_create((MPI_User_function *) __fenix_ranks_agree, 1, &fenix_rt.agree_op); /* Check the values in info */ if (info != MPI_INFO_NULL) { @@ -119,21 +119,21 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha MPI_Info_get(info, "FENIX_RESUME_MODE", vallen, value, &flag); if (flag == 1) { if (strcmp(value, "Fenix_init") == 0) { - fenix.resume_mode = __FENIX_RESUME_AT_INIT; - if (fenix.options.verbose == 0) { + fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, value: %s\n", - __fenix_get_current_rank(*fenix.world), fenix.role, value); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); } } else if (strcmp(value, "NO_JUMP") == 0) { - fenix.resume_mode = __FENIX_RESUME_NO_JUMP; - if (fenix.options.verbose == 0) { + fenix_rt.resume_mode = __FENIX_RESUME_NO_JUMP; + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, value: %s\n", - __fenix_get_current_rank(*fenix.world), fenix.role, value); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); } } else { /* No support. Setting it to Fenix_init */ - fenix.resume_mode = __FENIX_RESUME_AT_INIT; + fenix_rt.resume_mode = __FENIX_RESUME_AT_INIT; } } @@ -141,34 +141,34 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha MPI_Info_get(info, "FENIX_UNHANDLED_MODE", vallen, value, &flag); if (flag == 1) { if (strcmp(value, "SILENT") == 0) { - fenix.print_unhandled = 0; - if (fenix.options.verbose == 0) { + fenix_rt.print_unhandled = 0; + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n", - __fenix_get_current_rank(*fenix.world), fenix.role, value); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); } } else if (strcmp(value, "NO_JUMP") == 0) { - fenix.print_unhandled = 1; - if (fenix.options.verbose == 0) { + fenix_rt.print_unhandled = 1; + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, UNHANDLED_MODE: %s\n", - __fenix_get_current_rank(*fenix.world), fenix.role, value); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, value); } } else { /* No support. Setting it to silent */ - fenix.print_unhandled = 0; + fenix_rt.print_unhandled = 0; } } } - if (fenix.spare_ranks >= __fenix_get_world_size(comm)) { + if (fenix_rt.spare_ranks >= __fenix_get_world_size(comm)) { debug_print("Fenix: <%d> spare ranks requested are unavailable\n", - fenix.spare_ranks); + fenix_rt.spare_ranks); } - fenix.data_recovery = __fenix_data_recovery_init(); + fenix_rt.data_recovery = __fenix_data_recovery_init(); /*****************************************************/ - /* Note: fenix.new_world is only valid for the */ + /* Note: fenix_rt.new_world is only valid for the */ /* active MPI ranks. Spare ranks do not */ /* allocate any communicator content with this.*/ /* Any MPI calls in spare ranks with new_world */ @@ -184,66 +184,66 @@ int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, cha } if ( __fenix_spare_rank() != 1) { - fenix.num_inital_ranks = __fenix_get_world_size(fenix.new_world); - if (fenix.options.verbose == 0) { + fenix_rt.num_inital_ranks = __fenix_get_world_size(fenix_rt.new_world); + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, number_initial_ranks: %d\n", - __fenix_get_current_rank(*fenix.world), fenix.role, - fenix.num_inital_ranks); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, + fenix_rt.num_inital_ranks); } } else { - fenix.num_inital_ranks = spare_ranks; + fenix_rt.num_inital_ranks = spare_ranks; - if (fenix.options.verbose == 0) { + if (fenix_rt.options.verbose == 0) { verbose_print("rank: %d, role: %d, number_initial_ranks: %d\n", - __fenix_get_current_rank(*fenix.world), fenix.role, - fenix.num_inital_ranks); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role, + fenix_rt.num_inital_ranks); } } - fenix.num_survivor_ranks = 0; - fenix.num_recovered_ranks = 0; + fenix_rt.num_survivor_ranks = 0; + fenix_rt.num_recovered_ranks = 0; while ( __fenix_spare_rank() == 1) { int a; int myrank; MPI_Status mpi_status; - fenix.ignore_errs = 1; - ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix.world, + fenix_rt.ignore_errs = 1; + ret = PMPI_Recv(&a, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix_rt.world, &mpi_status); // listen for a failure - fenix.ignore_errs = 0; + fenix_rt.ignore_errs = 0; if (ret == MPI_SUCCESS) { - if (fenix.options.verbose == 0) { + if (fenix_rt.options.verbose == 0) { verbose_print("Finalize the program; rank: %d, role: %d\n", - __fenix_get_current_rank(*fenix.world), fenix.role); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role); } __fenix_finalize_spare(); } else if(ret == MPI_ERR_REVOKED){ - fenix.repair_result = __fenix_repair_ranks(); - if (fenix.options.verbose == 0) { + fenix_rt.repair_result = __fenix_repair_ranks(); + if (fenix_rt.options.verbose == 0) { verbose_print("spare rank exiting from MPI_Recv - repair ranks; rank: %d, role: %d\n", - __fenix_get_current_rank(*fenix.world), fenix.role); + __fenix_get_current_rank(*fenix_rt.world), fenix_rt.role); } } else { - MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &a); + MPIX_Comm_ack_failed(*fenix_rt.world, __fenix_get_world_size(*fenix_rt.world), &a); } - fenix.role = FENIX_ROLE_RECOVERED_RANK; + fenix_rt.role = FENIX_ROLE_RECOVERED_RANK; } - if(fenix.role != FENIX_ROLE_RECOVERED_RANK) MPI_Comm_dup(fenix.new_world, fenix.user_world); - fenix.user_world_exists = 1; + if(fenix_rt.role != FENIX_ROLE_RECOVERED_RANK) MPI_Comm_dup(fenix_rt.new_world, fenix_rt.user_world); + fenix_rt.user_world_exists = 1; - return fenix.role; + return fenix_rt.role; } int __fenix_spare_rank_within(MPI_Comm refcomm) { int result = -1; int current_rank = __fenix_get_current_rank(refcomm); - int new_world_size = __fenix_get_world_size(refcomm) - fenix.spare_ranks; + int new_world_size = __fenix_get_world_size(refcomm) - fenix_rt.spare_ranks; if (current_rank >= new_world_size) { - if (fenix.options.verbose == 6) { + if (fenix_rt.options.verbose == 6) { verbose_print("current_rank: %d, new_world_size: %d\n", current_rank, new_world_size); } result = 1; @@ -264,29 +264,29 @@ int __fenix_create_new_world_from(MPI_Comm from_comm) /** Use of the new communicator triggers the program to abort . **/ /*************************************************************************/ - if (fenix.options.verbose == 1) { + if (fenix_rt.options.verbose == 1) { verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(from_comm), - fenix.role); + fenix_rt.role); } ret = PMPI_Comm_split(from_comm, MPI_UNDEFINED, current_rank, - &fenix.new_world); + &fenix_rt.new_world); //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); } - fenix.new_world_exists = 0; //Should already be this + fenix_rt.new_world_exists = 0; //Should already be this } else { int current_rank = __fenix_get_current_rank(from_comm); - if (fenix.options.verbose == 1) { + if (fenix_rt.options.verbose == 1) { verbose_print("rank: %d, role: %d\n", __fenix_get_current_rank(from_comm), - fenix.role); + fenix_rt.role); } - ret = PMPI_Comm_split(from_comm, 0, current_rank, &fenix.new_world); - fenix.new_world_exists = 1; + ret = PMPI_Comm_split(from_comm, 0, current_rank, &fenix_rt.new_world); + fenix_rt.new_world_exists = 1; if (ret != MPI_SUCCESS){ - fenix.new_world_exists = 0; + fenix_rt.new_world_exists = 0; } } @@ -294,7 +294,7 @@ int __fenix_create_new_world_from(MPI_Comm from_comm) } int __fenix_create_new_world(){ - return __fenix_create_new_world_from(*fenix.world); + return __fenix_create_new_world_from(*fenix_rt.world); } int __fenix_repair_ranks() @@ -302,7 +302,7 @@ int __fenix_repair_ranks() /*********************************************************/ /* Do not forget comm_free for broken communicators */ /*********************************************************/ - fenix.ignore_errs = 1; + fenix_rt.ignore_errs = 1; int ret; int survived_flag; @@ -320,8 +320,8 @@ int __fenix_repair_ranks() /* current_rank means the global MPI rank before failure */ - current_rank = __fenix_get_current_rank(*fenix.world); - world_size = __fenix_get_world_size(*fenix.world); + current_rank = __fenix_get_current_rank(*fenix_rt.world); + world_size = __fenix_get_world_size(*fenix_rt.world); //Double check that every process is here, not in some local error handling elsewhere. //Assume that other locations will converge here. @@ -329,14 +329,14 @@ int __fenix_repair_ranks() int location = FENIX_ERRHANDLER_LOC; do { location = FENIX_ERRHANDLER_LOC; - MPIX_Comm_agree(*fenix.user_world, &location); + MPIX_Comm_agree(*fenix_rt.user_world, &location); } while(location != FENIX_ERRHANDLER_LOC); } while (!repair_success) { repair_success = 1; - ret = MPIX_Comm_shrink(*fenix.world, &world_without_failures); + ret = MPIX_Comm_shrink(*fenix_rt.world, &world_without_failures); //if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_shrink. repair_ranks\n"); } if (ret != MPI_SUCCESS) { repair_success = 0; @@ -347,42 +347,42 @@ int __fenix_repair_ranks() /* Free up the storage for active process communicator */ /*********************************************************/ if ( __fenix_spare_rank() != 1) { - if(fenix.new_world_exists) PMPI_Comm_free(&fenix.new_world); - if(fenix.user_world_exists) PMPI_Comm_free(fenix.user_world); - fenix.user_world_exists = 0; - fenix.new_world_exists = 0; + if(fenix_rt.new_world_exists) PMPI_Comm_free(&fenix_rt.new_world); + if(fenix_rt.user_world_exists) PMPI_Comm_free(fenix_rt.user_world); + fenix_rt.user_world_exists = 0; + fenix_rt.new_world_exists = 0; } /*********************************************************/ /* Need closer look above */ /*********************************************************/ survivor_world_size = __fenix_get_world_size(world_without_failures); - fenix.fail_world_size = world_size - survivor_world_size; + fenix_rt.fail_world_size = world_size - survivor_world_size; - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { verbose_print( "current_rank: %d, role: %d, world_size: %d, fail_world_size: %d, survivor_world_size: %d\n", - current_rank, fenix.role, world_size, - fenix.fail_world_size, survivor_world_size); + current_rank, fenix_rt.role, world_size, + fenix_rt.fail_world_size, survivor_world_size); } - if (fenix.spare_ranks < fenix.fail_world_size) { + if (fenix_rt.spare_ranks < fenix_rt.fail_world_size) { /* Not enough spare ranks */ - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { verbose_print( "current_rank: %d, role: %d, spare_ranks: %d, fail_world_size: %d\n", - current_rank, fenix.role, fenix.spare_ranks, - fenix.fail_world_size); + current_rank, fenix_rt.role, fenix_rt.spare_ranks, + fenix_rt.fail_world_size); } - if (fenix.spawn_policy == 1) { - debug_print("Spawn policy <%d>is not supported\n", fenix.spawn_policy); + if (fenix_rt.spawn_policy == 1) { + debug_print("Spawn policy <%d>is not supported\n", fenix_rt.spawn_policy); } else { rt_code = FENIX_WARNING_SPARE_RANKS_DEPLETED; - if (fenix.spare_ranks != 0) { + if (fenix_rt.spare_ranks != 0) { /***************************************/ /* Fill the ranks in increasing order */ @@ -395,11 +395,11 @@ int __fenix_repair_ranks() ret = PMPI_Allgather(¤t_rank, 1, MPI_INT, survivor_world, 1, MPI_INT, world_without_failures); - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { int index; for (index = 0; index < survivor_world_size; index++) { verbose_print("current_rank: %d, role: %d, survivor_world[%d]: %d\n", - current_rank, fenix.role, index, + current_rank, fenix_rt.role, index, survivor_world[index]); } } @@ -416,11 +416,11 @@ int __fenix_repair_ranks() } survived_flag = 0; - if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) { + if (fenix_rt.role == FENIX_ROLE_SURVIVOR_RANK) { survived_flag = 1; } - ret = PMPI_Allreduce(&survived_flag, &fenix.num_survivor_ranks, 1, + ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1, MPI_INT, MPI_SUM, world_without_failures); //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); } @@ -434,37 +434,37 @@ int __fenix_repair_ranks() goto END_LOOP; } - fenix.num_inital_ranks = 0; + fenix_rt.num_inital_ranks = 0; /* recovered ranks must be the number of spare ranks */ - fenix.num_recovered_ranks = fenix.fail_world_size; + fenix_rt.num_recovered_ranks = fenix_rt.fail_world_size; - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, recovered_ranks: %d\n", - current_rank, fenix.role, - fenix.num_recovered_ranks); + current_rank, fenix_rt.role, + fenix_rt.num_recovered_ranks); } - if(fenix.role != FENIX_ROLE_INITIAL_RANK){ - free(fenix.fail_world); + if(fenix_rt.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix_rt.fail_world); } - fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, - fenix.fail_world_size); + fenix_rt.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, + fenix_rt.fail_world_size); - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { int index; - for (index = 0; index < fenix.fail_world_size; index++) { - verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]); + for (index = 0; index < fenix_rt.fail_world_size; index++) { + verbose_print("fail_world[%d]: %d\n", index, fenix_rt.fail_world[index]); } } free(survivor_world); - active_ranks = world_size - fenix.spare_ranks; + active_ranks = world_size - fenix_rt.spare_ranks; - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, active_ranks: %d\n", - current_rank, fenix.role, + current_rank, fenix_rt.role, active_ranks); } @@ -472,23 +472,23 @@ int __fenix_repair_ranks() if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){ - if(fenix.fail_world[fail_i] > current_rank) rank_offset--; + for(int fail_i = 0; fail_i < fenix_rt.fail_world_size; fail_i++){ + if(fenix_rt.fail_world[fail_i] > current_rank) rank_offset--; } - if (rank_offset < fenix.fail_world_size) { - if (fenix.options.verbose == 11) { + if (rank_offset < fenix_rt.fail_world_size) { + if (fenix_rt.options.verbose == 11) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fenix.fail_world[rank_offset]); + current_rank, fenix_rt.fail_world[rank_offset]); } - current_rank = fenix.fail_world[rank_offset]; + current_rank = fenix_rt.fail_world[rank_offset]; } } /************************************/ /* Update the number of spare ranks */ /************************************/ - fenix.spare_ranks = 0; + fenix_rt.spare_ranks = 0; //debug_print("not enough spare ranks to repair rank failures. repair_ranks\n"); } @@ -518,11 +518,11 @@ int __fenix_repair_ranks() } survived_flag = 0; - if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) { + if (fenix_rt.role == FENIX_ROLE_SURVIVOR_RANK) { survived_flag = 1; } - ret = PMPI_Allreduce(&survived_flag, &fenix.num_survivor_ranks, 1, + ret = PMPI_Allreduce(&survived_flag, &fenix_rt.num_survivor_ranks, 1, MPI_INT, MPI_SUM, world_without_failures); //if (ret != MPI_SUCCESS) { debug_print("MPI_Allreduce. repair_ranks\n"); } if (ret != MPI_SUCCESS) { @@ -536,55 +536,55 @@ int __fenix_repair_ranks() } - fenix.num_inital_ranks = 0; - fenix.num_recovered_ranks = fenix.fail_world_size; + fenix_rt.num_inital_ranks = 0; + fenix_rt.num_recovered_ranks = fenix_rt.fail_world_size; - if(fenix.role != FENIX_ROLE_INITIAL_RANK){ - free(fenix.fail_world); + if(fenix_rt.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix_rt.fail_world); } - fenix.fail_world = (int *) s_malloc(fenix.fail_world_size * sizeof(int)); - fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fenix.fail_world_size); + fenix_rt.fail_world = (int *) s_malloc(fenix_rt.fail_world_size * sizeof(int)); + fenix_rt.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fenix_rt.fail_world_size); free(survivor_world); - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { int index; - for (index = 0; index < fenix.fail_world_size; index++) { - verbose_print("fail_world[%d]: %d\n", index, fenix.fail_world[index]); + for (index = 0; index < fenix_rt.fail_world_size; index++) { + verbose_print("fail_world[%d]: %d\n", index, fenix_rt.fail_world[index]); } } - active_ranks = world_size - fenix.spare_ranks; + active_ranks = world_size - fenix_rt.spare_ranks; - if (fenix.options.verbose == 2) { + if (fenix_rt.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, active_ranks: %d\n", - current_rank, fenix.role, active_ranks); + current_rank, fenix_rt.role, active_ranks); } if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); - for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){ - if(fenix.fail_world[fail_i] > current_rank) rank_offset--; + for(int fail_i = 0; fail_i < fenix_rt.fail_world_size; fail_i++){ + if(fenix_rt.fail_world[fail_i] > current_rank) rank_offset--; } - if (rank_offset < fenix.fail_world_size) { - if (fenix.options.verbose == 2) { + if (rank_offset < fenix_rt.fail_world_size) { + if (fenix_rt.options.verbose == 2) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d (offset %d)\n", - current_rank, fenix.fail_world[rank_offset], rank_offset); + current_rank, fenix_rt.fail_world[rank_offset], rank_offset); } - current_rank = fenix.fail_world[rank_offset]; + current_rank = fenix_rt.fail_world[rank_offset]; } } /************************************/ /* Update the number of spare ranks */ /************************************/ - fenix.spare_ranks = fenix.spare_ranks - fenix.fail_world_size; - if (fenix.options.verbose == 2) { + fenix_rt.spare_ranks = fenix_rt.spare_ranks - fenix_rt.fail_world_size; + if (fenix_rt.options.verbose == 2) { verbose_print("current_rank: %d, role: %d, spare_ranks: %d\n", - current_rank, fenix.role, - fenix.spare_ranks); + current_rank, fenix_rt.role, + fenix_rt.spare_ranks); } } @@ -614,14 +614,14 @@ int __fenix_repair_ranks() } if(__fenix_spare_rank_within(fixed_world) == -1){ - ret = MPI_Comm_dup(fenix.new_world, fenix.user_world); + ret = MPI_Comm_dup(fenix_rt.new_world, fenix_rt.user_world); if (ret != MPI_SUCCESS){ repair_success = 0; MPIX_Comm_revoke(fixed_world); MPI_Comm_free(&fixed_world); goto END_LOOP; } - fenix.user_world_exists = 1; + fenix_rt.user_world_exists = 1; } ret = PMPI_Barrier(fixed_world); @@ -647,8 +647,8 @@ int __fenix_repair_ranks() */ } - *fenix.world = fixed_world; - fenix.ignore_errs=0; + *fenix_rt.world = fixed_world; + fenix_rt.ignore_errs=0; return rt_code; } @@ -662,7 +662,7 @@ int* __fenix_get_fail_ranks(int *survivor_world, int survivor_world_size, int fa int i; for (i = 0; i < survivor_world_size + fail_world_size; i++) { if (__fenix_binary_search(survivor_world, survivor_world_size, i) != 1) { - if (fenix.options.verbose == 14) { + if (fenix_rt.options.verbose == 14) { verbose_print("fail_rank: %d, fail_ranks[%d]: %d\n", i, failed_pos, fail_ranks[failed_pos++]); } @@ -673,133 +673,133 @@ int* __fenix_get_fail_ranks(int *survivor_world, int survivor_world_size, int fa } int __fenix_spare_rank(){ - return __fenix_spare_rank_within(*fenix.world); + return __fenix_spare_rank_within(*fenix_rt.world); } void __fenix_postinit(int *error) { - //if (fenix.options.verbose == 9) { - // verbose_print(" postinit: current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix.new_world), - // fenix.role); + //if (fenix_rt.options.verbose == 9) { + // verbose_print(" postinit: current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world), + // fenix_rt.role); //} - if(fenix.new_world_exists){ + if(fenix_rt.new_world_exists){ //Set up dummy irecv to use for checking for failures. - MPI_Irecv(&fenix.dummy_recv_buffer, 1, MPI_INT, MPI_ANY_SOURCE, - 34095347, fenix.new_world, &fenix.check_failures_req); + MPI_Irecv(&fenix_rt.dummy_recv_buffer, 1, MPI_INT, MPI_ANY_SOURCE, + 34095347, fenix_rt.new_world, &fenix_rt.check_failures_req); } - if (fenix.repair_result != 0) { - *error = fenix.repair_result; + if (fenix_rt.repair_result != 0) { + *error = fenix_rt.repair_result; } - fenix.fenix_init_flag = 1; + fenix_rt.fenix_init_flag = 1; #if 0 - if (fenix.role != FENIX_ROLE_INITIAL_RANK) { + if (fenix_rt.role != FENIX_ROLE_INITIAL_RANK) { init_data_recovery(); } #endif - if (fenix.role == FENIX_ROLE_SURVIVOR_RANK) { + if (fenix_rt.role == FENIX_ROLE_SURVIVOR_RANK) { __fenix_callback_invoke_all(*error); } - if (fenix.options.verbose == 9) { - verbose_print("After barrier. current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix.new_world), - fenix.role); + if (fenix_rt.options.verbose == 9) { + verbose_print("After barrier. current_rank: %d, role: %d\n", __fenix_get_current_rank(fenix_rt.new_world), + fenix_rt.role); } } int __fenix_detect_failures(int do_recovery){ - if(!fenix.new_world_exists) return FENIX_ERROR_UNINITIALIZED; + if(!fenix_rt.new_world_exists) return FENIX_ERROR_UNINITIALIZED; - int old_ignore_errs = fenix.ignore_errs; - fenix.ignore_errs = !do_recovery; + int old_ignore_errs = fenix_rt.ignore_errs; + fenix_rt.ignore_errs = !do_recovery; int req_completed; - int ret = MPI_Test(&fenix.check_failures_req, &req_completed, MPI_STATUS_IGNORE); + int ret = MPI_Test(&fenix_rt.check_failures_req, &req_completed, MPI_STATUS_IGNORE); if(req_completed) ret = FENIX_ERROR_INTERN; - fenix.ignore_errs = old_ignore_errs; + fenix_rt.ignore_errs = old_ignore_errs; return ret; } void __fenix_finalize() { int location = FENIX_FINALIZE_LOC; - MPIX_Comm_agree(*fenix.user_world, &location); + MPIX_Comm_agree(*fenix_rt.user_world, &location); if(location != FENIX_FINALIZE_LOC){ //Some ranks are in error recovery, so trigger error handling. - MPIX_Comm_revoke(*fenix.user_world); - MPI_Barrier(*fenix.user_world); + MPIX_Comm_revoke(*fenix_rt.user_world); + MPI_Barrier(*fenix_rt.user_world); //In case no-jump enabled after recovery return __fenix_finalize(); } - int first_spare_rank = __fenix_get_world_size(*fenix.user_world); - int last_spare_rank = __fenix_get_world_size(*fenix.world) - 1; + int first_spare_rank = __fenix_get_world_size(*fenix_rt.user_world); + int last_spare_rank = __fenix_get_world_size(*fenix_rt.world) - 1; //If we've reached here, we will finalized regardless of further errors. - fenix.ignore_errs = 1; - while(!fenix.finalized){ - int user_rank = __fenix_get_current_rank(*fenix.user_world); + fenix_rt.ignore_errs = 1; + while(!fenix_rt.finalized){ + int user_rank = __fenix_get_current_rank(*fenix_rt.user_world); if (user_rank == 0) { for (int i = first_spare_rank; i <= last_spare_rank; i++) { //We don't care if a spare failed, ignore return value int unused; - MPI_Send(&unused, 1, MPI_INT, i, 1, *fenix.world); + MPI_Send(&unused, 1, MPI_INT, i, 1, *fenix_rt.world); } } //We need to confirm that rank 0 didn't fail, since it could have //failed before notifying some spares to leave. int need_retry = user_rank == 0 ? 0 : 1; - MPIX_Comm_agree(*fenix.user_world, &need_retry); + MPIX_Comm_agree(*fenix_rt.user_world, &need_retry); if(need_retry == 1){ //Rank 0 didn't contribute, so we need to retry. - MPIX_Comm_shrink(*fenix.user_world, fenix.user_world); + MPIX_Comm_shrink(*fenix_rt.user_world, fenix_rt.user_world); continue; } else { //If rank 0 did contribute, we know sends made it, and regardless //of any other failures we finalize. - fenix.finalized = 1; + fenix_rt.finalized = 1; } } //Now we do one last agree w/ the spares to let them know they can actually //finalize int unused; - MPIX_Comm_agree(*fenix.world, &unused); + MPIX_Comm_agree(*fenix_rt.world, &unused); - MPI_Op_free( &fenix.agree_op ); - MPI_Comm_set_errhandler( *fenix.world, MPI_ERRORS_ARE_FATAL ); - MPI_Comm_free( fenix.world ); - free(fenix.world); - if(fenix.new_world_exists) MPI_Comm_free( &fenix.new_world ); //It should, but just in case. Won't update because trying to free it again ought to generate an error anyway. - - if(fenix.role != FENIX_ROLE_INITIAL_RANK){ - free(fenix.fail_world); + MPI_Op_free( &fenix_rt.agree_op ); + MPI_Comm_set_errhandler( *fenix_rt.world, MPI_ERRORS_ARE_FATAL ); + MPI_Comm_free( fenix_rt.world ); + free(fenix_rt.world); + if(fenix_rt.new_world_exists) MPI_Comm_free( &fenix_rt.new_world ); //It should, but just in case. Won't update because trying to free it again ought to generate an error anyway. + + if(fenix_rt.role != FENIX_ROLE_INITIAL_RANK){ + free(fenix_rt.fail_world); } /* Free data recovery interface */ - __fenix_data_recovery_destroy( fenix.data_recovery ); + __fenix_data_recovery_destroy( fenix_rt.data_recovery ); /* Free up any C++ data structures, reset default variables */ - fenix = {}; + fenix_rt = {}; } void __fenix_finalize_spare() { - fenix.fenix_init_flag = 0; + fenix_rt.fenix_init_flag = 0; int unused; MPI_Request agree_req, recv_req = MPI_REQUEST_NULL; - MPIX_Comm_iagree(*fenix.world, &unused, &agree_req); + MPIX_Comm_iagree(*fenix_rt.world, &unused, &agree_req); while(true){ int completed = 0; MPI_Test(&agree_req, &completed, MPI_STATUS_IGNORE); @@ -808,24 +808,24 @@ void __fenix_finalize_spare() int ret = MPI_Test(&recv_req, &completed, MPI_STATUS_IGNORE); if(completed){ //We may get duplicate messages informing us to exit - MPI_Irecv(&unused, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix.world, &recv_req); + MPI_Irecv(&unused, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *fenix_rt.world, &recv_req); } if(ret != MPI_SUCCESS){ - MPIX_Comm_ack_failed(*fenix.world, __fenix_get_world_size(*fenix.world), &unused); + MPIX_Comm_ack_failed(*fenix_rt.world, __fenix_get_world_size(*fenix_rt.world), &unused); } } if(recv_req != MPI_REQUEST_NULL) MPI_Cancel(&recv_req); - MPI_Op_free(&fenix.agree_op); - MPI_Comm_set_errhandler(*fenix.world, MPI_ERRORS_ARE_FATAL); - MPI_Comm_free(fenix.world); + MPI_Op_free(&fenix_rt.agree_op); + MPI_Comm_set_errhandler(*fenix_rt.world, MPI_ERRORS_ARE_FATAL); + MPI_Comm_free(fenix_rt.world); /* Free data recovery interface */ - __fenix_data_recovery_destroy( fenix.data_recovery ); + __fenix_data_recovery_destroy( fenix_rt.data_recovery ); /* Free up any C++ data structures, reset default variables */ - fenix = {}; + fenix_rt = {}; /* Future version do not close MPI. Jump to where Fenix_Finalize is called. */ MPI_Finalize(); @@ -837,28 +837,28 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...) int ret_repair; int index; int ret = *pret; - if(!fenix.fenix_init_flag || __fenix_spare_rank() == 1 || fenix.ignore_errs) { + if(!fenix_rt.fenix_init_flag || __fenix_spare_rank() == 1 || fenix_rt.ignore_errs) { return; } switch (ret) { case MPI_ERR_PROC_FAILED_PENDING: case MPI_ERR_PROC_FAILED: - MPIX_Comm_revoke(*fenix.world); - MPIX_Comm_revoke(fenix.new_world); + MPIX_Comm_revoke(*fenix_rt.world); + MPIX_Comm_revoke(fenix_rt.new_world); - if(fenix.user_world_exists) MPIX_Comm_revoke(*fenix.user_world); + if(fenix_rt.user_world_exists) MPIX_Comm_revoke(*fenix_rt.user_world); - fenix.repair_result = __fenix_repair_ranks(); + fenix_rt.repair_result = __fenix_repair_ranks(); break; case MPI_ERR_REVOKED: - fenix.repair_result = __fenix_repair_ranks(); + fenix_rt.repair_result = __fenix_repair_ranks(); break; case MPI_ERR_INTERN: printf("Fenix detected error: MPI_ERR_INTERN\n"); default: - if(fenix.print_unhandled){ + if(fenix_rt.print_unhandled){ int len; char errstr[MPI_MAX_ERROR_STRING]; MPI_Error_string(ret, errstr, &len); @@ -869,15 +869,15 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...) } - fenix.role = FENIX_ROLE_SURVIVOR_RANK; - if(!fenix.finalized) { - switch(fenix.resume_mode) { + fenix_rt.role = FENIX_ROLE_SURVIVOR_RANK; + if(!fenix_rt.finalized) { + switch(fenix_rt.resume_mode) { case __FENIX_RESUME_AT_INIT: - longjmp(*fenix.recover_environment, 1); + longjmp(*fenix_rt.recover_environment, 1); break; case __FENIX_RESUME_NO_JUMP: - *(fenix.ret_role) = FENIX_ROLE_SURVIVOR_RANK; - __fenix_postinit(fenix.ret_error); + *(fenix_rt.ret_role) = FENIX_ROLE_SURVIVOR_RANK; + __fenix_postinit(fenix_rt.ret_error); break; default: printf("Fenix detected error: Unknown resume mode\n"); diff --git a/src/globals.cpp b/src/globals.cpp deleted file mode 100644 index 8285983..0000000 --- a/src/globals.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// -// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| -// _| _| _|_| _| _| _| _| -// _|_|_| _|_|_| _| _| _| _| _| -// _| _| _| _|_| _| _| _| -// _| _|_|_|_| _| _| _|_|_| _| _| -// -// -// -// -// Copyright (C) 2018 Rutgers University and Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// Michael Heroux, and Matthew Whitlock -// -// Questions? Contact Keita Teranishi (knteran@sandia.gov) and -// Marc Gamell (mgamell@cac.rutgers.edu) -// -// ************************************************************************ -//@HEADER -*/ - -#include "fenix_ext.hpp" - -fenix_t fenix = { - .fenix_init_flag = 0 -}; diff --git a/test/exception_throw/fenix_exceptions.cpp b/test/exception_throw/fenix_exceptions.cpp index 92fc9a0..8142b12 100644 --- a/test/exception_throw/fenix_exceptions.cpp +++ b/test/exception_throw/fenix_exceptions.cpp @@ -76,7 +76,7 @@ int main(int argc, char **argv) { MPI_Info_set(info, "FENIX_UNHANDLED_MODE", "NO_JUMP"); Fenix_Init(&fenix_role, MPI_COMM_WORLD, &res_comm, &argc, &argv, 0, 0, info, &error); - Fenix::register_exception_callback(); + fenix::register_exception_callback(); if(fenix_role == FENIX_ROLE_SURVIVOR_RANK){ printf("FAILURE: longjmp instead of exception\n"); @@ -92,7 +92,7 @@ int main(int argc, char **argv) { MPI_Barrier(res_comm); printf("FAILURE: barrier finished without fault\n"); status = 1; - } catch (Fenix::CommException e){ + } catch (fenix::CommException e){ printf("SUCCESS: caught CommException\n"); } }