Skip to content

Commit

Permalink
Fix testing of suicide for daemons
Browse files Browse the repository at this point in the history
We don't support a cmd line option for this as it isn't
something a user should ever do. Instead, we use two
MCA params to specify it:

prte_daemon_fail <N> - specifies the daemon rank that
should commit suicide

prte_daemon_fail_delay <N> - time in seconds the target
rank should wait before dying. A value of zero means
no delay, just die after calling init. This is the
default value.

Signed-off-by: Ralph Castain <rhc@pmix.org>
  • Loading branch information
rhc54 committed Feb 9, 2024
1 parent 781e5d2 commit 618dd0a
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 54 deletions.
5 changes: 1 addition & 4 deletions src/docs/show-help-files/help-prte.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.. -*- rst -*-
Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
$COPYRIGHT$
Expand Down Expand Up @@ -79,9 +79,6 @@ option to the help request as ``--help <option>``.
* - ``--leave-session-attached``
- Do not discard stdout/stderr of remote PRRTE daemons

* - ``--test-suicide <arg0>``
- Direct that the specified daemon suicide after delay

* - ``--display <arg0>``
- Comma-delimited list of options for displaying information

Expand Down
5 changes: 1 addition & 4 deletions src/docs/show-help-files/help-prterun.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
.. -*- rst -*-
Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved.
$COPYRIGHT$
Expand Down Expand Up @@ -107,9 +107,6 @@ option to the help request as ``--help <option>``.
- Direct the specified processes to stop at an
application-controlled location

* - ``--test-suicide <arg0>``
- Direct that the specified daemon suicide after delay

* - ``--do-not-launch``
- Perform all necessary operations to prepare to launch the
application, but do not actually launch it (usually used to
Expand Down
4 changes: 1 addition & 3 deletions src/mca/schizo/prte/schizo_prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
* Copyright (c) 2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018-2022 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting All rights reserved.
* Copyright (c) 2021-2024 Nanook Consulting All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -98,7 +98,6 @@ static struct option prteoptions[] = {
PMIX_OPTION_DEFINE(PRTE_CLI_SET_SID, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_PID, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_URI, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_TEST_SUICIDE, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_DEFAULT_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_SINGLETON, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_KEEPALIVE, PMIX_ARG_REQD),
Expand Down Expand Up @@ -152,7 +151,6 @@ static struct option prterunoptions[] = {
PMIX_OPTION_DEFINE(PRTE_CLI_SET_SID, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_PID, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_REPORT_URI, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_TEST_SUICIDE, PMIX_ARG_NONE),
PMIX_OPTION_DEFINE(PRTE_CLI_DEFAULT_HOSTFILE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_KEEPALIVE, PMIX_ARG_REQD),
PMIX_OPTION_DEFINE(PRTE_CLI_LAUNCH_AGENT, PMIX_ARG_REQD),
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ bool prte_show_resolved_nodenames = false;
bool prte_do_not_resolve = false;
int prte_hostname_cutoff = 1000;

int prted_debug_failure = -1;
pmix_rank_t prted_debug_failure = PMIX_RANK_INVALID;
int prted_debug_failure_delay = -1;
bool prte_never_launched = false;
bool prte_devel_level_output = false;
Expand Down
2 changes: 1 addition & 1 deletion src/runtime/prte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ PRTE_EXPORT extern int prte_hostname_cutoff;
PRTE_EXPORT extern bool prte_do_not_resolve;

/* debug flags */
PRTE_EXPORT extern int prted_debug_failure;
PRTE_EXPORT extern pmix_rank_t prted_debug_failure;
PRTE_EXPORT extern int prted_debug_failure_delay;

PRTE_EXPORT extern bool prte_never_launched;
Expand Down
56 changes: 56 additions & 0 deletions src/tools/prte/prte.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
#include "src/mca/schizo/base/base.h"
#include "src/mca/state/base/base.h"
#include "src/runtime/prte_globals.h"
#include "src/runtime/prte_wait.h"
#include "src/runtime/runtime.h"

#include "include/prte.h"
Expand Down Expand Up @@ -228,6 +229,32 @@ static void setup_sighandler(int signal, prte_event_t *ev, prte_event_cbfunc_t c
prte_event_signal_add(ev, NULL);
}

/**
 * Timer callback used to make this process terminate abnormally on
 * purpose (the delayed "daemon failure" test path).
 *
 * @param fd    unused (event-library callback signature)
 * @param flags unused (event-library callback signature)
 * @param arg   the prte_timer_t that fired, or NULL; released here
 *
 * This function does not return: it always ends the process with
 * PRTE_ERROR_DEFAULT_EXIT_CODE.
 */
static void shutdown_callback(int fd, short flags, void *arg)
{
    prte_timer_t *tm = (prte_timer_t *) arg;
    prte_job_t *jdata;
    PRTE_HIDE_UNUSED_PARAMS(fd, flags);

    if (NULL != tm) {
        /* release the timer */
        PMIX_RELEASE(tm);
    }

    /* announce that we are deliberately terminating */
    pmix_output(0, "%s is executing clean abnormal termination",
                PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
    /* do -not- call finalize as this will send a message to the HNP
     * indicating clean termination! Instead, just forcibly cleanup
     * the local session_dir tree and exit
     */
    prte_odls.kill_local_procs(NULL);
    // mark that we are finalizing so the session directory will cleanup
    prte_finalizing = true;
    jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
    /* the lookup can come back empty; PMIX_RELEASE dereferences its
     * argument, so only release an object we actually obtained.
     * NOTE(review): releasing our own job object is presumably what
     * triggers the session directory cleanup - confirm against the
     * prte_job_t destructor
     */
    if (NULL != jdata) {
        PMIX_RELEASE(jdata);
    }
    exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
}

int main(int argc, char *argv[])
{
int rc = 1, i;
Expand Down Expand Up @@ -850,6 +877,35 @@ int main(int argc, char *argv[])
goto DONE;
}

// see if we are to suicide
if (PMIX_RANK_INVALID != prted_debug_failure) {
/* are we the specified vpid? */
if (PRTE_PROC_MY_NAME->rank == prted_debug_failure ||
prted_debug_failure == PMIX_RANK_WILDCARD) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < prted_debug_failure_delay) {
PRTE_TIMER_EVENT(prted_debug_failure_delay, 0, shutdown_callback);

} else {
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));

/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);

/* return with non-zero status */
ret = PRTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
}
}

opt = pmix_cmd_line_get_param(&results, PRTE_CLI_REPORT_PID);
if (NULL != opt) {
/* if the string is a "-", then output to stdout */
Expand Down
47 changes: 8 additions & 39 deletions src/tools/prted/prted.c
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ static void report_prted(void);
static pmix_data_buffer_t *bucket, *mybucket = NULL;
static int ncollected = 0;
static bool node_regex_waiting = false;
static bool prted_abort = false;
static char *prte_parent_uri = NULL;
static pmix_cli_result_t results;

Expand Down Expand Up @@ -415,27 +414,19 @@ int main(int argc, char *argv[])
}
}

if ((int) PMIX_RANK_INVALID != prted_debug_failure) {
prted_abort = false;
/* some vpid was ordered to fail. The value can be positive
* or negative, depending upon the desired method for failure,
* so need to check both here
*/
if (0 > prted_debug_failure) {
prted_debug_failure = -1 * prted_debug_failure;
prted_abort = true;
}
if (PMIX_RANK_INVALID != prted_debug_failure) {
/* are we the specified vpid? */
if ((int) PRTE_PROC_MY_NAME->rank == prted_debug_failure) {
if (PRTE_PROC_MY_NAME->rank == prted_debug_failure ||
prted_debug_failure == PMIX_RANK_WILDCARD) {
/* if the user specified we delay, then setup a timer
* and have it kill us
*/
if (0 < prted_debug_failure_delay) {
PRTE_TIMER_EVENT(prted_debug_failure_delay, 0, shutdown_callback);

} else {
pmix_output(0, "%s is executing clean %s", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
prted_abort ? "abort" : "abnormal termination");
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));

/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
Expand All @@ -444,12 +435,7 @@ int main(int argc, char *argv[])
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);

/* if we were ordered to abort, do so */
if (prted_abort) {
abort();
}

/* otherwise, return with non-zero status */
/* return with non-zero status */
ret = PRTE_ERROR_DEFAULT_EXIT_CODE;
goto DONE;
}
Expand Down Expand Up @@ -822,7 +808,6 @@ int main(int argc, char *argv[])
static void shutdown_callback(int fd, short flags, void *arg)
{
prte_timer_t *tm = (prte_timer_t *) arg;
bool suicide = false;
prte_job_t *jdata;
PRTE_HIDE_UNUSED_PARAMS(fd, flags);

Expand All @@ -832,31 +817,15 @@ static void shutdown_callback(int fd, short flags, void *arg)
}

/* if we were ordered to abort, do so */
if (prted_abort) {
if (pmix_cmd_line_is_taken(&results, PRTE_CLI_TEST_SUICIDE)) {
suicide = true;
}
pmix_output(0, "%s is executing %s abort", PRTE_NAME_PRINT(PRTE_PROC_MY_NAME),
suicide ? "suicide" : "clean");
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just kill our
* local procs, forcibly cleanup the local session_dir tree, and abort
*/
if (suicide) {
exit(1);
}
prte_odls.kill_local_procs(NULL);
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);
abort();
}
pmix_output(0, "%s is executing clean abnormal termination",
PRTE_NAME_PRINT(PRTE_PROC_MY_NAME));
/* do -not- call finalize as this will send a message to the HNP
* indicating clean termination! Instead, just forcibly cleanup
* the local session_dir tree and exit
*/
prte_odls.kill_local_procs(NULL);
// mark that we are finalizing so the session directory will cleanup
prte_finalizing = true;
jdata = prte_get_job_data_object(PRTE_PROC_MY_NAME->nspace);
PMIX_RELEASE(jdata);
exit(PRTE_ERROR_DEFAULT_EXIT_CODE);
Expand Down
4 changes: 2 additions & 2 deletions src/util/session_dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ void prte_job_session_dir_finalize(prte_job_t *jdata)
if (PMIX_CHECK_NSPACE(PRTE_PROC_MY_NAME->nspace, jdata->nspace)) {
if (prte_finalizing) {
if (NULL != prte_process_info.top_session_dir) {
pmix_os_dirpath_destroy(prte_process_info.top_session_dir, false, _check_file);
pmix_os_dirpath_destroy(prte_process_info.top_session_dir, true, _check_file);
rmdir(prte_process_info.top_session_dir);
free(prte_process_info.top_session_dir);
prte_process_info.top_session_dir = NULL;
Expand All @@ -336,7 +336,7 @@ void prte_job_session_dir_finalize(prte_job_t *jdata)
return;
}

pmix_os_dirpath_destroy(jdata->session_dir, false, _check_file);
pmix_os_dirpath_destroy(jdata->session_dir, true, _check_file);
/* if the job-level session dir is now empty, remove it */
rmdir(jdata->session_dir);
free(jdata->session_dir);
Expand Down

0 comments on commit 618dd0a

Please sign in to comment.