Skip to content

Commit

Permalink
Add a new "pctrl" tool for requesting job control ops
Browse files Browse the repository at this point in the history
Create a new tool for cmd line requests for job control
operations. Add a new attribute to support requests to
define new psets. Add new definitions for pctrl cmd line
options. Add some missing function definitions to pmix.h
for backing of macros.

Cleanup pmix_server_get of pset names - need to return
the data in the form expected by client-get.

Signed-off-by: Ralph Castain <rhc@pmix.org>
  • Loading branch information
rhc54 committed Sep 7, 2023
1 parent 439c0a0 commit dfab3b3
Show file tree
Hide file tree
Showing 10 changed files with 890 additions and 5 deletions.
1 change: 1 addition & 0 deletions config/pmix.m4
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,7 @@ AC_DEFUN([PMIX_SETUP_CORE],[
pmix_config_prefix[src/tools/wrapper/Makefile]
pmix_config_prefix[src/tools/wrapper/pmixcc-wrapper-data.txt]
pmix_config_prefix[src/tools/palloc/Makefile]
pmix_config_prefix[src/tools/pctrl/Makefile]
)

# Success
Expand Down
11 changes: 10 additions & 1 deletion examples/pset.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* Copyright (c) 2013-2020 Intel, Inc. All rights reserved.
* Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
* Copyright (c) 2019 IBM Corporation. All rights reserved.
* Copyright (c) 2021-2022 Nanook Consulting. All rights reserved.
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -129,6 +129,7 @@ int main(int argc, char **argv)
mylock_t mylock;
myrel_t myrel;
pmix_info_t info;
pmix_value_t *val;

EXAMPLES_HIDE_UNUSED_PARAMS(argc, argv);

Expand Down Expand Up @@ -180,6 +181,14 @@ int main(int argc, char **argv)
}
DEBUG_DESTRUCT_MYREL(&myrel);

// check if I can retrieve my new pset membership
if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_PSET_NAMES, NULL, 0, &val))) {
fprintf(stderr, "[%s:%d] PMIx_Get PMIX_PSET_NAMES returned %s\n", myproc.nspace,
myproc.rank, PMIx_Error_string(rc));
goto done;
}
fprintf(stderr, "[%s:%d] belongs to psets %s\n", myproc.nspace, myproc.rank, val->data.string);
PMIX_VALUE_RELEASE(val);

done:
/* finalize us */
Expand Down
6 changes: 6 additions & 0 deletions include/pmix.h
Original file line number Diff line number Diff line change
Expand Up @@ -1720,7 +1720,13 @@ PMIX_EXPORT pmix_value_cmp_t PMIx_Value_compare(pmix_value_t *v1,



PMIX_EXPORT void PMIx_Data_array_init(pmix_data_array_t *p,
pmix_data_type_t type);
PMIX_EXPORT void PMIx_Data_array_construct(pmix_data_array_t *p,
size_t num, pmix_data_type_t type);
PMIX_EXPORT void PMIx_Data_array_destruct(pmix_data_array_t *d);
PMIX_EXPORT pmix_data_array_t* PMIx_Data_array_create(size_t n, pmix_data_type_t type);
PMIX_EXPORT void PMIx_Data_array_free(pmix_data_array_t *p);


/* initialize an info struct */
Expand Down
1 change: 1 addition & 0 deletions include/pmix_common.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -897,6 +897,7 @@ typedef uint32_t pmix_rank_t;
#define PMIX_JOB_CTRL_PROVISION_IMAGE "pmix.jctrl.pvnimg" // (char*) name of the image that is to be provisioned
#define PMIX_JOB_CTRL_PREEMPTIBLE "pmix.jctrl.preempt" // (bool) job can be pre-empted
#define PMIX_JOB_CTRL_TERMINATE "pmix.jctrl.term" // (bool) politely terminate the specified procs
#define PMIX_JOB_CTRL_DEFINE_PSET "pmix.jctrl.defpset" // (char*) Pset name to be assigned to the targets
#define PMIX_REGISTER_CLEANUP "pmix.reg.cleanup" // (char*) comma-delimited list of files to
// be removed upon process termination
#define PMIX_REGISTER_CLEANUP_DIR "pmix.reg.cleanupdir" // (char*) comma-delimited list of directories to
Expand Down
57 changes: 55 additions & 2 deletions src/server/pmix_server_get.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, pmix_modex_cbfunc_t cbfunc, vo
struct timeval tv = {0, 0};
pmix_buffer_t pbkt;
pmix_cb_t cb;
pmix_kval_t *kval;
pmix_byte_object_t bo;
pmix_proc_t proc;
char *data;
size_t sz, n;
Expand Down Expand Up @@ -256,10 +258,61 @@ pmix_status_t pmix_server_get(pmix_buffer_t *buf, pmix_modex_cbfunc_t cbfunc, vo
}
if (NULL != psets) {
data = PMIx_Argv_join(psets, ',');
sz = strlen(data);
PMIx_Argv_free(psets);
// pass it back
// we have to assemble this data into a form that
// the client_get function can properly unpack
PMIX_CONSTRUCT(&pbkt, pmix_buffer_t);
PMIX_CONSTRUCT(&cb, pmix_cb_t);
PMIX_KVAL_NEW(kval, PMIX_PSET_NAMES);
kval->value->data.string = data;
kval->value->type = PMIX_STRING;
pmix_list_append(&cb.kvs, &kval->super);
/* assemble the provided data into a byte object */
PMIX_GDS_ASSEMB_KVS_REQ(rc, pmix_globals.mypeer, &proc, &cb.kvs, &pbkt, cd);
if (rc != PMIX_SUCCESS) {
PMIX_ERROR_LOG(rc);
PMIX_DESTRUCT(&pbkt);
PMIX_DESTRUCT(&cb);
return rc;
}
PMIX_DESTRUCT(&cb);
if (PMIX_PEER_IS_V1(cd->peer)) {
/* if the client is using v1, then it expects the
* data returned to it as the rank followed by a byte object containing
* a buffer - so we have to do a little gyration */
pmix_buffer_t xfer;
PMIX_CONSTRUCT(&xfer, pmix_buffer_t);
PMIX_BFROPS_PACK(rc, cd->peer, &xfer, &pbkt, 1, PMIX_BUFFER);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DESTRUCT(&pbkt);
PMIX_DESTRUCT(&xfer);
PMIX_DESTRUCT(&cb);
return rc;
}
PMIX_UNLOAD_BUFFER(&xfer, bo.bytes, bo.size);
PMIX_DESTRUCT(&xfer);
} else {
PMIX_UNLOAD_BUFFER(&pbkt, bo.bytes, bo.size);
}
PMIX_DESTRUCT(&pbkt);
/* pack it for transmission */
PMIX_CONSTRUCT(&pbkt, pmix_buffer_t);
PMIX_BFROPS_PACK(rc, cd->peer, &pbkt, &bo, 1, PMIX_BYTE_OBJECT);
if (PMIX_SUCCESS != rc) {
PMIX_ERROR_LOG(rc);
PMIX_DESTRUCT(&pbkt);
return rc;
}
/* unload the resulting payload */
PMIX_UNLOAD_BUFFER(&pbkt, data, sz);
PMIX_DESTRUCT(&pbkt);
/* call the internal callback function - it will
* release the cbdata */
cbfunc(PMIX_SUCCESS, data, sz, cbdata, relfn, data);
/* return success so the server doesn't duplicate
* the release of cbdata */
return PMIX_SUCCESS;
} else {
// return not found as this proc doesn't belong to any psets
Expand Down Expand Up @@ -734,7 +787,7 @@ static pmix_status_t get_job_data(char *nspace,
}
if (PMIX_PEER_IS_V1(cd->peer)) {
/* if the client is using v1, then it expects the
* data returned to it as the rank followed by abyte object containing
* data returned to it as the rank followed by a byte object containing
* a buffer - so we have to do a little gyration */
pmix_buffer_t xfer;
PMIX_CONSTRUCT(&xfer, pmix_buffer_t);
Expand Down
6 changes: 4 additions & 2 deletions src/tools/Makefile.include
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ SUBDIRS += \
tools/pattrs \
tools/pquery \
tools/wrapper \
tools/palloc
tools/palloc \
tools/pctrl

DIST_SUBDIRS += \
tools/pevent \
Expand All @@ -43,4 +44,5 @@ DIST_SUBDIRS += \
tools/pattrs \
tools/pquery \
tools/wrapper \
tools/palloc
tools/palloc \
tools/pctrl
33 changes: 33 additions & 0 deletions src/tools/pctrl/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved.
# Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
# Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# The pctrl executable and its help file are only registered as build/install
# targets when the PMIX_INSTALL_BINARIES automake conditional is true.
if PMIX_INSTALL_BINARIES

# Command-line tool for requesting job control operations
bin_PROGRAMS = pctrl

# Help/usage text for the pctrl tool
dist_pmixdata_DATA = help-pctrl.txt

endif # PMIX_INSTALL_BINARIES

# pctrl is built from a single source file and linked against the
# libpmix libtool library located in the build tree
pctrl_SOURCES = pctrl.c
pctrl_LDADD = \
$(top_builddir)/src/libpmix.la
182 changes: 182 additions & 0 deletions src/tools/pctrl/help-pctrl.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
# -*- text -*-
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2012 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
# Copyright (c) 2022-2023 Nanook Consulting. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#
# This is the US/English help file for pctrl
#
[usage]
%s (%s) %s

Usage: %s [OPTION]...
PMIx Job control tool


/***** General Options *****/

-h|--help This help message
-h|--help <arg0> Help for the specified option
-v|--verbose Enable typical debug options
-V|--version Print version and exit

--uri <arg0> Specify the URI of the server to which we are to connect, or
the name of the file (specified as file:filename) that contains that info
--namespace <arg0> Namespace of the daemon to which we should connect
--nspace <arg0> Synonym for "namespace"
--pid <arg0> PID of the daemon to which we should connect (int => PID or file:<file>
for file containing the PID)
--system-server-first First look for a system server and connect to it if found
--system-server-only Connect only to a system-level server
--tmpdir <arg0> Set the root for the session directory tree
--wait-to-connect <arg0> Delay specified number of seconds before trying to connect
--num-connect-retries <arg0> Max number of times to try to connect

--request-id <arg0> String identifier for this job control request
--pause Pause the specified processes
--resume "Un-pause" the specified processes
--cancel <arg0> Cancel the specified request ID ("all" => cancel all requests from this requestor)
--kill Force terminate the specified processes
--terminate Politely terminate the specified processes
--signal <arg0> Provide the specified processes with the given signal
--restart <arg0> Restart the specified processes using the given checkpoint ID
--checkpoint <arg0> Checkpoint the specified processes and assign the given ID to the resulting checkpoint
--pset <arg0> Define a new pset (with the given name) whose membership is
comprised of the specified processes
--targets <arg0> Comma-delimited list of target processes for the requested
job-control operation

#
# CONNECTION OPTIONS
#
#
[uri]
Specify the URI of the server to which we are to connect, or the name of the
file (specified as file:filename) that contains that info
#
[num-connect-retries]
Max number of times to try to connect to the specified server (int)
#
[pid]
PID of the daemon to which we should connect (int => PID or file:<file>
for file containing the PID)
#
[namespace]
Namespace of the daemon we are to connect to (char*)
#
[nspace]
Namespace of the daemon we are to connect to (char*) - synonym for "namespace"
#
[system-server-first]
First look for a system server and connect to it if found
#
[system-server-only]
Connect only to a system-level server - abort if one is not found
#
[tmpdir]
Define the root location for the session directory tree where the
rendezvous files can be found.

The rendezvous files contain connection information for a target
server and are located in the session directory tree. It may be necessary to point the
tool at the location where those files can be found if that location is other than the
expected default.

The root of the session directory defaults to the system temporary directory
as defined in the environment using (in precedence order) the envars TMPDIR, TEMP, and
finally TMP. In the absence of any of those variables, PMIx will default to the "/tmp"
location.
#
[wait-to-connect]
Delay specified number of seconds before trying to connect
#
# PCTRL-SPECIFIC OPTIONS
#
[request-id]
String identifier for this job control request. The request ID can be used for
subsequent query of request status and/or cancellation of the request. Note that
a request ID that matches a currently active request will be rejected - so care should be taken
to ensure that the ID provided is unique and not currently in use.
#
[pause]
Pause the specified processes. This typically takes the form of applying a SIGSTOP
to the specified processes. The pctrl tool will return a status indicating whether
or not the operation succeeded.
#
[resume]
Direct the specified processes to resume execution. This typically takes the form
of applying a SIGCONT signal to the specified processes. The pctrl tool will return
a status indicating whether or not the operation succeeded.
#
[cancel]
Cancel the specified request ID. The provided ID must match the ID provided to
a prior request - if the ID cannot be found, then a PMIX_ERR_NOT_FOUND status
shall be returned by pctrl.
#
[kill]
Force terminate the specified processes. Precise behavior depends upon the
runtime environment. However, typically the specified processes will receive
the following sequence of signals:

* SIGCONT - wakeup a sleeping process
* SIGTERM - provide a trappable signal indicating that the process
should cleanly exit, if possible
* SIGKILL - forcibly terminate the process

The pctrl tool will return a status indicating whether or not the operation
succeeded (i.e., all processes exited).
#
[terminate]
Politely terminate the specified processes. Precise behavior depends upon the
runtime environment. However, typically the specified processes will receive
the following sequence of signals:

* SIGCONT - wakeup a sleeping process
* SIGTERM - provide a trappable signal indicating that the process
should cleanly exit, if possible

The pctrl tool will return a status indicating whether or not the operation
succeeded (i.e., the processes terminated).
#
[signal]
Provide the specified processes with the given signal. Signals are to be
provided via their name (e.g., SIGTERM, SIGKILL) or an integer value
(e.g., -9).
#
[restart]
Restart the specified processes using the given checkpoint ID.
#
[checkpoint]
Checkpoint the specified processes and assign the given ID to the resulting
checkpoint. The checkpoint operation will be conducted according to the method
specified when the processes were originally spawned. Support for this operation
therefore depends both on the capabilities of the runtime environment _and_ the
application being told to checkpoint.
#
[pset]
Define a new pset (with the given name) whose membership is comprised of the
specified processes.
#
[targets]
Comma-delimited list of target processes for the requested job-control
operation. Wildcard ranks (e.g., to apply the request to all processes
in the specified namespace) can be indicated with an asterisk ('*'). Syntax
requires that each process be identified as "nspace:rank". Note that
typical command line restrictions may necessitate the use of special
delimiters - e.g., "my\;weird.nspace:5".

0 comments on commit dfab3b3

Please sign in to comment.