Skip to content

Commit

Permalink
Merge pull request #384 from rhc54/rfc/srvr
Browse files Browse the repository at this point in the history
Add some missing attributes and one API for server support
  • Loading branch information
jjhursey committed Mar 7, 2022
2 parents 7f436bd + 3a95743 commit 3604086
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 1 deletion.
42 changes: 41 additions & 1 deletion Chap_API_Data_Mgmt.tex
Expand Up @@ -601,7 +601,7 @@ \subsection{\code{PMIx_Data_decompress}}
\begin{codepar}
bool
PMIx_Data_decompress(const uint8_t *inbytes, size_t size,
uint8_t **outbytes, size_t *nbytes,);
uint8_t **outbytes, size_t *nbytes);
\end{codepar}
\cspecificend

Expand Down Expand Up @@ -632,3 +632,43 @@ \subsection{\code{PMIx_Data_decompress}}
unexpected and potentially catastrophic results.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{\code{PMIx_Data_embed}}
\declareapi{PMIx_Data_embed}

%%%%
\summary

Embed a data payload into a buffer

%%%%
\format

\versionMarker{4.2}
\cspecificstart
\begin{codepar}
pmix_status_t
PMIx_Data_embed(pmix_data_buffer_t *buffer,
const pmix_byte_object_t *payload);
\end{codepar}
\cspecificend

\begin{arglist}
\argout{buffer}{Address of the buffer where the payload is to be embedded (handle)}
\argin{payload}{Address of the \refstruct{pmix_byte_object_t} structure containing the data to be embedded into the buffer (handle)}
\end{arglist}

Returns one of the following:
\begin{constantdesc}
\item \refconst{PMIX_SUCCESS} The data has been embedded as requested
\item \refconst{PMIX_ERR_BAD_PARAM} The destination and/or source pointer is \code{NULL}
\item \refconst{PMIX_ERR_NOT_SUPPORTED} The \ac{PMIx} implementation does not support this function.
\end{constantdesc}

%%%%
\descr

The embed function is identical in operation to \refapi{PMIx_Data_load}
except that it does \emph{not} clear the payload object upon completion.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
34 changes: 34 additions & 0 deletions Chap_API_Proc_Mgmt.tex
Expand Up @@ -169,6 +169,9 @@ \subsection{\code{PMIx_Spawn}}
\pasteAttributeItem{PMIX_COSPAWN_APP}
\pasteAttributeItem{PMIX_SPAWN_TOOL}
\pasteAttributeItem{PMIX_EVENT_SILENT_TERMINATION}
\pasteAttributeItem{PMIX_ENVARS_HARVESTED}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -294,6 +297,9 @@ \subsection{\code{PMIx_Spawn_nb}}
\pasteAttributeItem{PMIX_COSPAWN_APP}
\pasteAttributeItem{PMIX_SPAWN_TOOL}
\pasteAttributeItem{PMIX_EVENT_SILENT_TERMINATION}
\pasteAttributeItem{PMIX_ENVARS_HARVESTED}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -329,6 +335,18 @@ \subsection{Spawn-specific constants}
\declareconstitemNEW{PMIX_ERR_JOB_FAILED_TO_LAUNCH}
One or more processes in the job request failed to launch
%
\declareconstitemNEW{PMIX_ERR_JOB_EXE_NOT_FOUND}
Specified executable not found
%
\declareconstitemNEW{PMIX_ERR_JOB_INSUFFICIENT_RESOURCES}
Insufficient resources to spawn job
%
\declareconstitemNEW{PMIX_ERR_JOB_SYS_OP_FAILED}
System library operation failed
%
\declareconstitemNEW{PMIX_ERR_JOB_WDIR_NOT_FOUND}
Specified working directory not found
%
\end{constantdesc}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down Expand Up @@ -536,6 +554,22 @@ \subsection{Spawn attributes}
\declareAttribute{PMIX_EVENT_SILENT_TERMINATION}{"pmix.evsilentterm"}{bool}{
Do not generate an event when this job normally terminates.
}
%
\declareAttributeNEW{PMIX_ENVARS_HARVESTED}{"pmix.evar.hvstd"}{bool}{
Environmental parameters have been harvested by the spawn requestor - the server
does not need to harvest them.
}
%
\declareAttributeNEW{PMIX_JOB_TIMEOUT}{"pmix.job.time"}{int}{
Time in seconds before the spawned job should time out and be terminated (0 => infinite), defined as the total runtime of the job (equivalent to the walltime limit of typical batch schedulers).
}
%
\declareAttributeNEW{PMIX_SPAWN_TIMEOUT}{"pmix.sp.time"}{int}{
Time in seconds before the spawn operation should time out (0 => infinite).
Although logically equivalent to passing the \refattr{PMIX_TIMEOUT} attribute to the
\refapi{PMIx_Spawn} \ac{API}, it is provided as a separate attribute to distinguish
it from the \refattr{PMIX_JOB_TIMEOUT} attribute.
}
\vspace{\baselineskip}
Attributes used to adjust remote environment variables prior to spawning the specified application processes.
Expand Down
4 changes: 4 additions & 0 deletions Chap_API_Reserved_Keys.tex
Expand Up @@ -485,6 +485,10 @@ \subsection{Node realm keys}
\declareAttribute{PMIX_LOCAL_SIZE}{"pmix.local.size"}{uint32_t}{
Number of processes in the specified job or application realm on the caller's node. Defaults to job realm unless the \refattr{PMIX_APP_INFO} and the \refattr{PMIX_APPNUM} qualifiers are given.
}
%
\declareAttributeNEW{PMIX_NODE_OVERSUBSCRIBED}{"pmix.ndosub"}{bool}{
True if the number of processes from this job on this node exceeds the number of slots allocated to it.
}

\vspace{\baselineskip}

Expand Down
19 changes: 19 additions & 0 deletions Chap_API_Server.tex
Expand Up @@ -108,6 +108,7 @@ \subsection{\code{PMIx_server_init}}
\pasteAttributeItemEnd{}
\pasteAttributeItem{PMIX_SERVER_ENABLE_MONITORING}
\pasteAttributeItem{PMIX_HOMOGENEOUS_SYSTEM}
\pasteAttributeItem{PMIX_SINGLETON}

\optattrend

Expand Down Expand Up @@ -233,6 +234,10 @@ \subsection{Server Initialization Attributes}
\declareAttributeNEW{PMIX_HOMOGENEOUS_SYSTEM}{"pmix.homo"}{bool}{
The nodes comprising the session are homogeneous - i.e., they each contain the same number of identical packages, fabric interfaces, \acp{GPU}, and other devices.
}
%
\declareAttributeNEW{PMIX_SINGLETON}{"pmix.singleton"}{char*}{
String representation (nspace.rank) of the process ID of the singleton that the server was started to support.
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down Expand Up @@ -473,6 +478,7 @@ \subsection{\code{PMIx_server_register_nspace}}
\item \pasteAttributeItem{PMIX_NODE_SIZE}
\item \pasteAttributeItem{PMIX_LOCALLDR}
\item \pasteAttributeItem{PMIX_LOCAL_PEERS}
\item \pasteAttributeItem{PMIX_NODE_OVERSUBSCRIBED}
\end{itemize}

plus the following information for the server's own node:
Expand Down Expand Up @@ -2388,6 +2394,7 @@ \subsection{\code{pmix_server_fencenb_fn_t}}
The following attributes are required to be supported by all host environments:

\pasteAttributeItem{PMIX_COLLECT_DATA}
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\reqattrend

Expand Down Expand Up @@ -2886,6 +2893,8 @@ \subsection{\code{pmix_server_spawn_fn_t}}
\pasteAttributeItem{PMIX_JOB_CONTINUOUS}
\pasteAttributeItem{PMIX_MAX_RESTARTS}
\pasteAttributeItem{PMIX_TIMEOUT}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -2957,6 +2966,8 @@ \subsection{\code{pmix_server_connect_fn_t}}
\end{itemize}

\reqattrstart
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\ac{PMIx} libraries are required to pass any provided attributes to the host environment for processing.
\reqattrend

Expand Down Expand Up @@ -3025,6 +3036,8 @@ \subsection{\code{pmix_server_disconnect_fn_t}}
\end{itemize}

\reqattrstart
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\ac{PMIx} libraries are required to pass any provided attributes to the host environment for processing.
\reqattrend

Expand Down Expand Up @@ -4310,6 +4323,12 @@ \subsection{\code{pmix_server_grp_fn_t}}
\item a PMIx error constant indicating either an error in the input or that the request was immediately processed and failed - the \refarg{cbfunc} will not be called
\end{itemize}

\reqattrstart
The following attributes are required to be supported by a host environment.

\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}
\reqattrend

\optattrstart
The following attributes may be supported by a host environment.

Expand Down
8 changes: 8 additions & 0 deletions Chap_API_Sync_Access.tex
Expand Up @@ -164,6 +164,14 @@ \subsection{Fence-related attributes}
has been committed via \refapi{PMIx_Commit}, making the collection locally
available to each participant at the end of the operation. By default, this will include all job-level information that was locally generated by \ac{PMIx} servers unless excluded using the \refattr{PMIX_COLLECT_GENERATED_JOB_INFO} attribute.
}
%
\declareAttributeNEW{PMIX_LOCAL_COLLECTIVE_STATUS}{"pmix.loc.col.st"}{pmix_status_t}{
Status code for the local collective operation being reported to the host by the server library. \ac{PMIx} servers may aggregate the participation by local client processes in a collective operation - e.g., instead of passing individual client calls to \refapi{PMIx_Fence} up to the host environment, the server may pass only a single call to the host when all local participants have executed their \refapi{PMIx_Fence} call, thereby reducing the burden placed on the host. However, in cases where the operation fails locally (e.g., if a participating client abnormally terminates prior to calling the operation), the server upcall functions to the host do not include a \refstruct{pmix_status_t} by which the \ac{PMIx} server can alert the host to that failure. This attribute resolves that problem by allowing the server to pass the status information regarding the local collective operation.
}
\advicermstart
The \ac{PMIx} server is allowed to pass \refconst{PMIX_SUCCESS} using this attribute, but is not required to do so. \ac{PMIx} implementations may choose to only report errors in this manner. The lack of an included status shall therefore be taken to indicate that the collective operation succeeded locally.
\advicermend

%
\declareAttributeNEW{PMIX_COLLECT_GENERATED_JOB_INFO}{"pmix.collect.gen"}{bool}{
Collect all job-level information (i.e., reserved keys) that was locally generated by \ac{PMIx} servers. Some job-level information (e.g., distance between processes and fabric devices) is best determined on a distributed basis as it primarily pertains to local processes. Should remote processes need to access the information, it can either be obtained collectively using the \refapi{PMIx_Fence} operation with this directive, or can be retrieved one peer at a time using \refapi{PMIx_Get} without first having performed the job-wide collection.
Expand Down

0 comments on commit 3604086

Please sign in to comment.