
Commit

resolve merge conflicts
andre-merzky committed Jul 2, 2018
2 parents dde5789 + 29c6201 commit 71566f2
Showing 94 changed files with 1,168 additions and 497 deletions.
6 changes: 6 additions & 0 deletions CHANGES.md
@@ -5,6 +5,12 @@
https://github.com/radical-cybertools/radical.pilot/issues?q=is%3Aissue+is%3Aopen+


0.47.14 Release 2018-06-13
--------------------------------------------------------------------------------

- fix recursive output staging


0.47.13 Release 2018-06-02
--------------------------------------------------------------------------------

2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
0.47.13
0.47.14
39 changes: 27 additions & 12 deletions bin/radical-pilot-create-static-ve
@@ -100,18 +100,26 @@ elif test "$arg" = "bwpy"
then
# this is where we end up after the `exec` call in the branch above
echo "create bwpy ve [$prefix]"

PYTHON=python2.7
else
# this is not BW.
echo "create rct ve [$prefix]"

PYTHON=python
fi

# create the ve, install bare necessities
mkdir -p "$prefix"
cd "$prefix"

echo -n "install virtualenv "
VIRTENV_TGZ="virtualenv-1.9.tar.gz"
VIRTENV_TGZ_URL="https://pypi.python.org/packages/source/v/virtualenv/$VIRTENV_TGZ"
curl -k -L -O "$VIRTENV_TGZ_URL" 2>&1 | progress
tar zxmf "$VIRTENV_TGZ"
VIRTENV_CMD="$PYTHON virtualenv-1.9/virtualenv.py"

echo -n "create virtualenv "
stdbuf -oL virtualenv "$prefix" | progress
stdbuf -oL $VIRTENV_CMD "$prefix" | progress
. "$prefix"/bin/activate

echo -n "update setuptools "
@@ -128,8 +136,8 @@ done
# install the radical stack (utils, saga, pilot) into a separate tree
# ($prefix/rp_install), so that any local install can use the ve *w/o*
# the radical stack, by re-routing PYTHONPATH
python_version=`python -c 'import distutils.sysconfig as sc; print sc.get_python_version()'`
ve_mod_prefix=` python -c 'import distutils.sysconfig as sc; print sc.get_python_lib()'`
python_version=`$PYTHON -c 'import distutils.sysconfig as sc; print sc.get_python_version()'`
ve_mod_prefix=` $PYTHON -c 'import distutils.sysconfig as sc; print sc.get_python_lib()'`
rp_mod_prefix=`echo $ve_mod_prefix | sed -e "s|$prefix|$prefix/rp_install/|"`

# BW doesn't like us anymore: after loading the bwpy module, we also need to
@@ -146,17 +154,24 @@ then

# find binary - virtualenv seems to pick the bin name more or less
# arbitrarily...
for p in python python2 python2.7
for p in python*
do
if ! test -h $p
if ! test -e $p; then echo "miss $p"
elif ! test -x $p; then echo "ignore $p"
elif test -h $p
then
echo "patch $p"
mv $p $p-exe
echo "wrap $p"
tgt=$(readlink $p)
rm -f $p
echo "#!/bin/sh" > $p
echo "exec bwpy-environ -- $cwd/$p-exe \"\$@\"" >> $p
echo "exec bwpy-environ -- $tgt \"\$@\"" >> $p
chmod 0755 $p
else
echo "skip $p"
echo "patch $p"
mv $p $p.rp
echo "#!/bin/sh" > $p
echo "exec bwpy-environ -- $cwd/$p.rp \"\$@\"" >> $p
chmod 0755 $p
fi
done
fi
@@ -166,7 +181,7 @@ echo
echo "---------------------------------------------------------------------"
echo
echo "PYTHONPATH: $PYTHONPATH"
echo "python: `which python` (`python --version`)"
echo "python: `which python` (`python -V`)"
echo
echo "---------------------------------------------------------------------"
echo
186 changes: 131 additions & 55 deletions docs/architecture/bootstrapping.md
@@ -1,86 +1,162 @@

# Agent Bootstrapping
# Pilot Structure

## `bootstrap_0.sh`
Pilots are defined via the RP API, and are shaped (in space, time and
capabilities) according to application requirements. The pilot is structured
in three layers: pilot job, pilot partition, and pilot agent.

`bootstrapper_0.sh` is the original workload when the pilot agent gets placed on
the target resource. It is usually placed by the batch system on the first of
The pilot job is a placeholder job which gets submitted to a target resource
(cluster), usually via the target resource's batch system. The pilot job has
two purposes: (i) to acquire sufficient resources and capabilities from the
target resource, and (ii) to host the bootstrapping chain - described below.
One of the bootstrapping stages (`bootstrap_1.py`) partitions the resources
acquired by the pilot job into one or more partitions. On each partition, the
bootstrapper places one pilot agent which manages that partition and executes
units on its resources.
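
To illustrate that partitioning step, here is a minimal sketch - hypothetical
function and names, not the actual `bootstrap_1.py` code - which splits the
acquired node list into disjoint slices and places one agent per slice:

```python
# Hypothetical sketch of the partitioning step (not the actual
# bootstrap_1.py code): split the node list acquired by the pilot job
# into disjoint partitions and place one agent per partition.
def make_partitions(nodes, sizes):
    parts, idx = [], 0
    for size in sizes:
        if idx + size > len(nodes):
            raise ValueError('pilot job too small for requested partitions')
        parts.append(nodes[idx:idx + size])
        idx += size
    return parts

nodes = ['node%03d' % i for i in range(8)]
for partition in make_partitions(nodes, [4, 4]):
    print('agent on %s manages %s' % (partition[0], partition))
```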

### Implementation:

A pilot agent consists of different types of components which can be distributed
over different compute nodes, and which communicate with each other over
a network of ZMQ channels. The agent can be configured to create multiple
instances of each component type, for scaling and reliability purposes.
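
As a toy illustration of that communication layout, the following pyzmq
sketch wires one component instance to a simple forwarding bridge; addresses,
ports and the message format are invented for the example and do not reflect
RP's actual channel protocol:

```python
# Toy pyzmq sketch of one component instance behind a forwarding bridge.
import threading
import zmq

def bridge(ctx):
    # forward every message from the input channel to the output channel
    pull = ctx.socket(zmq.PULL); pull.bind('tcp://127.0.0.1:10000')
    push = ctx.socket(zmq.PUSH); push.bind('tcp://127.0.0.1:10001')
    while True:
        push.send(pull.recv())

ctx = zmq.Context()
t = threading.Thread(target=bridge, args=(ctx,))
t.daemon = True
t.start()

# component instance: connects to the bridge output; starting several
# instances would let the PUSH socket load-balance messages among them
comp = ctx.socket(zmq.PULL)
comp.connect('tcp://127.0.0.1:10001')

feed = ctx.socket(zmq.PUSH)
feed.connect('tcp://127.0.0.1:10000')
feed.send(b'unit.0001')

print(comp.recv().decode())      # -> unit.0001
```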


### Multiplicity:

* 1 application : 1..n pilot jobs
* 1 pilot job : 1..m pilot partitions
* 1 pilot partition : 1 pilot agent
* 1 pilot agent : k pilot agent components
* 1 pilot agent component: 1..i pilot agent component instances


## Pilot Job Specification

The pilot job needs to have sufficient resources to hold the pilot partitions
required by the application. RP additionally needs to know where and how to
submit the pilot job. This information is passed via the pilot description,
which additionally references a list of partition descriptions - these
describe the partitions to be created within the pilot job.

The pilot description specifies:

* resource target
* resource access mechanism (schema, project)
* runtime environment settings (sandbox, STDOUT/STDERR)
* runtime (optional)

Other parameters needed for the pilot job submission are stored in the resource
configuration files. Resource requirements are derived from the partition
specifications (CPUs, GPUs, Memory, ...).
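
A corresponding pilot description in the RP API might look like this - the
resource label, project and sizes are placeholders, a MongoDB endpoint is
assumed to be configured via `RADICAL_PILOT_DBURL`, and the `partitions`
attribute belongs to the design sketched above, not necessarily to the
released API:

```python
import radical.pilot as rp

session = rp.Session()
pmgr    = rp.PilotManager(session=session)

pd = rp.ComputePilotDescription()
pd.resource      = 'xsede.stampede2'    # resource target (example label)
pd.access_schema = 'ssh'                # access mechanism
pd.project       = 'TG-ABC123456'       # allocation to charge
pd.runtime       = 60                   # runtime in minutes (optional)
pd.cores         = 128                  # derived from partition needs
# pd.partitions  = [...]                # partition descriptions, per the
                                        # design above (hypothetical)

pilot = pmgr.submit_pilots(pd)
session.close()
```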


## Pilot Partitions

A pilot partition represents a subset of the resources acquired by a pilot job.
One or more partitions can co-exist in a pilot job - but it is guaranteed that
any resources managed by one partition are not concurrently used by any other
partition. A pilot partition thus represents an abstracted view of a resource
partition to the pilot agent.

Partition lifetimes are independent of pilot lifetimes - but a pilot will always
start with a pre-defined set of partitions which manage the pilot's resources.


# Pilot Bootstrapping

## Stage 0: `bootstrap_0.sh`

`bootstrap_0.sh` is the original workload of the pilot job as it is placed on
the target resource. It is usually started by the batch system on the first of
the allocated nodes. In cases where multiple pilots get submitted in one
request, several `bootstrap_0.sh` instances may get started simultaneously - but
this document is only concerned with individual instances, and assumes that any
separation wrt. environment and available resources is take care of *before*
separation wrt. environment and available resources is taken care of *before*
instantiation.

The purpose of stage 0 is to prepare the Shell and Python environment for the
later bootstrapping stages. Specifically, the stage will
The purpose of stage 0 is to prepare the Shell and Python environment for later
bootstrapping stages. Specifically, the stage will:

- creation of the pilot sandbox under `$PWD`;
- if needed, create a tunnel for MongoDB access;
- perform resource specific environment settings to ensure the availability
of a functional Python (`module load` commands etc);
- create a virtualenv, or ensure if a static VE exists;
- create the pilot sandbox under `$PWD`;
- create a tunnel for MongoDB access (if needed);
- perform resource specific environment settings to ensure the availability of
a functional Python (`module load` commands etc);
- create a virtualenv, or ensure a VE exists and is usable;
- activate that virtualenv;
- install all python module dependencies;
- install the RCT stack (into a tree outside of the VE);
- create `bootstrap_2,sh` on the fly
- start the next bootstrapper stage (`bootstrapper_1.py`) in Python land.
- create `bootstrap_2.sh` on the fly;
- start the next bootstrapping stage (`bootstrap_1.py`) in Python land.


## `bootstrap_1.py`
## Stage 1: `bootstrap_1.py`

The second stage enters Python, and will

- pull pilot creation and cancellation requests from MongoDB;
- invoke LRMS specific code parts of RP to set up a partition for each pilot
reqest (pilots fail immediately otherwise);
- run `bootstrap_2.sh` for each pilot to get it enroled on its partition.
The second stage enters Python, and will:

- pull pilot partition creation and termination requests from MongoDB;
- invoke LRMS specific code parts of RP to set up the requested partitions;
- run `bootstrap_2.sh` for each partition to get the partition's pilot agent
started.
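
A hedged sketch of that control loop follows - the collection name, document
fields and launch command are invented for illustration, and the LRMS call is
a stand-in for RP's resource-manager-specific code:

```python
# Hypothetical sketch of the stage-1 control flow: pull partition
# requests, set up each partition, then start one agent per partition
# via bootstrap_2.sh.
import subprocess
import pymongo

def lrms_setup_partition(nodes):
    # stand-in for RP's LRMS-specific setup code (SLURM, PBS, ALPS, ...)
    return nodes

db = pymongo.MongoClient('mongodb://localhost:27017/')['rp']

for req in db['partition_requests'].find({'state': 'NEW'}):
    lrms_setup_partition(req['nodes'])
    subprocess.Popen(['/bin/sh', 'bootstrap_2.sh', req['uid']],
                     cwd=req['sandbox'])
    db['partition_requests'].update_one({'_id': req['_id']},
                                        {'$set': {'state': 'ACTIVE'}})
```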

## `bootstrap_2.sh`

This has been created on the fly by `bootstrap_0.sh` to make sure that all
agents and sub-agents can use the exact environment settings as created and
defined by `bootstrap_0.sh`, w/o needing to go through the whole process again,
on the same node, or on other node. More specifically, `bootstrap_2.sh` will
launch the `radical-pilot-agent` python script on the first node of the agent's
partition.
## Stage 2: `bootstrap_2.sh`

This script has been created on the fly by stage 0 to make sure that all pilot
agents and sub-agents use the exact same environment settings as created and
defined by stage 0, w/o the need to redo the necessary setup steps. More
specifically, `bootstrap_2.sh` will launch the `radical-pilot-agent` Python
script on the partition. It will already use the configured agent launch
methods (`agent_launch_method` and `agent_spawner`) for placing and starting the
agents.

## `radical-pilot-agent` (`bootstrap_3`)

The `radical-pilot-agent` script and its only method `bootstrap_3` manage the
lifetime of `Agent_0` (and later `Agent_n`) class instances.
## Stage 3: `radical-pilot-agent` (`bootstrap_3`)

Up to this point we do not actively place any element of the bootstrapping
chain, and the elements thus live wherever the batch system happens to
originally place them. This changes now: depending on the agent configuration,
this script will land on a specific node which is part of the partition the
agent is supposed to manage (but it can also live on a MOM-node or other special
nodes on certain architectures).

## `Agent_0`
The `radical-pilot-agent` Python script and its only method (`bootstrap_3()`)
manage the lifetime of the `Agent_0` class instance, which *is* the agent
responsible for managing the partition it got started on.
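
In outline, the stage amounts to the following - a simplified sketch with a
stand-in `Agent` class, not the verbatim RP code:

```python
# Simplified outline of radical-pilot-agent / bootstrap_3(): create the
# agent named on the command line, then manage its lifetime.
import sys
import time

class Agent(object):
    """Stand-in for RP's Agent_0 / Agent_n classes (illustration only)."""
    def __init__(self, name):
        self.name = name
        self._t0  = time.time()
    def start(self):
        print('%s: bridges and components come up here' % self.name)
    def is_alive(self):
        return time.time() - self._t0 < 3    # pretend to work for 3 seconds
    def stop(self):
        print('%s: orderly shutdown' % self.name)

def bootstrap_3(agent_name):
    agent = Agent(agent_name)
    agent.start()
    try:
        while agent.is_alive():              # watch the agent until it ends
            time.sleep(1)
    finally:
        agent.stop()

if __name__ == '__main__':
    bootstrap_3(sys.argv[1] if len(sys.argv) > 1 else 'agent_0')
```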

Up to this point we do not actively place any element of the bootstrapping
chain, and the elements thus live whereever the batch system happens to
originally place them. This changes now.

The `Agent_0` class constructor will now inspect the job environment, and will
determine whats compute nodes are available. This will respect the partitioning
information from `bootstrap_1.py`. Based on that information, and on the agent
## Stage 4: `Agent_0`

The `Agent_0` class constructor will now inspect its environment, and will
determine what resources are available. This will respect the partitioning
information from `bootstrap_1.py`. Based on that information and on the agent
configuration file (`agent_0.cfg`), the class will then instantiate all
communication bridges and several RP agent components. But more importantly,
the config file will also contain information about the placement of the
sub-agents, which will live on the compute nodes and will instantiate the
remaining agent components on those nodes.

Once the sub-agent placement decisions have been made, `Agent_0` will write the
configuration files for those sub-agents, and then use the `agent_launch_method`
and `agent_spawner` (as defined in its config file) to execute `bootstrap_2.sh
agent_$n` on the target node. Using `bootstrap_2.sh` will ensure that the
sub-agents find the same environment as `Agent_0`. Using the launch and spawner
methods of RP avoids special code pathes for agent and unit execution - which
are ultimately the same thing.
communication bridges and several RP agent components. The configuration file
also contains information about any sub-agents to be placed on the compute
nodes.

Once the respective sub-agent placement decisions have been made, `Agent_0` will
write configuration files for those sub-agents, and then again use the
`agent_launch_method` and `agent_spawner` (as defined in its configuration file)
to execute `bootstrap_2.sh agent_$n` on the target node. Using `bootstrap_2.sh`
will ensure that the sub-agents find the same environment as `Agent_0`. Using
the launch and spawner methods of RP avoids special code paths for agent and
unit execution - which are ultimately the same thing.
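
Roughly, the launch step could look like this - the configuration layout,
bridge addresses and node names are invented, and a plain `Popen` stands in
for the configured launch method:

```python
# Rough illustration of the sub-agent launch step: write one config per
# sub-agent, then start 'bootstrap_2.sh agent_n' for each of them.
import json
import subprocess

bridges   = {'update': 'tcp://node001:10000'}   # example bridge addresses
subagents = {'agent_1': 'node002', 'agent_2': 'node003'}

for name, node in sorted(subagents.items()):
    cfg = {'name': name, 'node': node, 'bridges': bridges}
    with open('%s.cfg' % name, 'w') as f:
        json.dump(cfg, f)                       # sub-agent reads this on start
    # agent_launch_method / agent_spawner would place this on 'node';
    # a local Popen stands in for that here
    subprocess.Popen(['/bin/sh', 'bootstrap_2.sh', name])
```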


## `Agent_n`

The sub-agents will connect to the communication bridges (their addresses have
been stored in the sub-agent config files by `Agent_0`), and will report
successful startup and component instantiation. Once all sub-agents report for
duty, the bootstrapping process is complete, and the agent(s) start operation by
pulling units from the database.
The `Agent_n` sub-agents, which host additional agent component instances, are
bootstrapped by the same mechanism as `Agent_0`: `bootstrap_2.sh`,
`radical-pilot-agent`, `bootstrap_3()`, `Agent_n` instance. The only
differences are the more limited configuration file and the inability of
`Agent_n` to spawn further sub-agents.

The sub-agent components will connect to the communication bridges (their
addresses have been stored in the sub-agent configuration files by `Agent_0`),
and will report successful startup and component instantiation. Once all
sub-agents report for duty, the bootstrapping process is complete, and the
agents start operation by pulling units from the database.
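
On the sub-agent side, that handshake might reduce to the following sketch -
again with invented configuration and message layout:

```python
# Invented illustration of the sub-agent handshake: read the config that
# Agent_0 wrote, connect to the listed bridges, and report for duty.
import json
import zmq

with open('agent_1.cfg') as f:
    cfg = json.load(f)

ctx    = zmq.Context()
report = ctx.socket(zmq.PUSH)
report.connect(cfg['bridges']['update'])   # address provided by Agent_0

# ... instantiate the configured component instances here ...

report.send_string(json.dumps({'from': cfg['name'], 'state': 'ALIVE'}))
```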

---

6 changes: 3 additions & 3 deletions docs/source/events.md
@@ -95,9 +95,9 @@ indication on event ordering *within each individual component*.
* per file : staging_in_start, staging_in_stop


### bootstrap_1.sh
### bootstrap_0.sh

bootstrap_1_start : pilot bootstrapper 1 starts (uid: pid)
bootstrap_0_start : pilot bootstrapper 0 starts (uid: pid)
tunnel_setup_start : setting up tunnel starts (uid: pid)
tunnel_setup_stop : setting up tunnel stops (uid: pid, [CFG-R])
ve_setup_start : pilot ve setup starts (uid: pid)
@@ -117,7 +117,7 @@ indication on event ordering *within each individual component*.
sync_rel : time sync event (uid: pid, msg: 'agent_0 start')
cleanup_start : sandbox deletion starts (uid: pid)
cleanup_stop : sandbox deletion stops (uid: pid)
bootstrap_1_stop : pilot bootstrapper 1 stops (uid: pid)
bootstrap_0_stop : pilot bootstrapper 0 stops (uid: pid)

partial orders
* as above
6 changes: 3 additions & 3 deletions docs/source/machconf.rst
@@ -133,7 +133,7 @@ A configuration file has to be valid JSON. The structure is as follows:
"mpi_launch_method" : "MPIEXEC",
"forward_tunnel_endpoint" : "login03",
"global_virtenv" : "/home/hpc/pr87be/di29sut/pilotve",
"pre_bootstrap_1" : ["source /etc/profile",
"pre_bootstrap_0" : ["source /etc/profile",
"source /etc/profile.d/modules.sh",
"module load python/2.7.6",
"module unload mpi.ibm", "module load mpi.intel",
@@ -170,8 +170,8 @@ All fields are mandatory, unless indicated otherwise below.
* ``mpi_launch_method``: type of MPI support, required for MPI units. Valid values are: ``MPIRUN``, ``MPIEXEC``, ``APRUN``, ``IBRUN`` or ``POE``.
* ``python_interpreter``: path to python (optional).
* ``python_dist``: `anaconda` or `default`, i.e. not `anaconda` (mandatory).
* ``pre_bootstrap_1``: list of commands to execute for initialization of main agent (optional).
* ``pre_bootstrap_2``: list of commands to execute for initialization of sub-agent (optional).
* ``pre_bootstrap_0``: list of commands to execute for initialization of main agent (optional).
* ``pre_bootstrap_1``: list of commands to execute for initialization of sub-agent (optional).
* ``valid_roots``: list of shared file system roots (optional). Note: pilot sandboxes must lie under these roots.
* ``pilot_agent``: type of pilot agent to use. Currently: ``radical-pilot-agent-multicore.py``.
* ``forward_tunnel_endpoint``: name of the host which can be used to create ssh tunnels from the compute nodes to the outside world (optional).
8 changes: 4 additions & 4 deletions docs/source/schedulers.rst
@@ -23,8 +23,8 @@ agent scheduler will place the units on the set of resources (cores) that agent
is managing. The agent scheduler can be configured via agent and resource
configuration files (see :ref:`chapter_resources`).

Round-Robin Scheduler (``SCHED_ROUND_ROBIN``)
---------------------------------------------
Round-Robin Scheduler (``SCHEDULER_ROUND_ROBIN``)
-------------------------------------------------

The Round-Robin scheduler will fairly distribute arriving compute units over
the set of known pilots, independent of unit state, expected workload, pilot
@@ -33,8 +33,8 @@ fast scheduler, which does not impose any additional communication roundtrips
between the unit manager and pilot agents.


Backfilling Scheduler (``SCHED_BACKFILLING``)
---------------------------------------------
Backfilling Scheduler (``SCHEDULER_BACKFILLING``)
-------------------------------------------------

The backfilling scheduler does a better job at actual load balancing, but at the
cost of additional communication roundtrips. It depends on the actual
4 changes: 2 additions & 2 deletions docs/source/user_guide/04_scheduler_selection.rst
@@ -10,10 +10,10 @@ matches submitted units to pilots for execution. On constructing the unit
manager, it can be configured to use a specific scheduling policy for that. The
following policies are implemented:

* `rp.SCHED_ROUND_ROBIN`: alternate units between all available pilot. This
* `rp.SCHEDULER_ROUND_ROBIN`: alternate units between all available pilots. This
policy leads to a static and fair, but not necessarily load-balanced unit
assignment.
* `rp.SCHED_BACKFILLING`: dynamic unit scheduling based on pilot capacity and
* `rp.SCHEDULER_BACKFILLING`: dynamic unit scheduling based on pilot capacity and
availability. This is the most intelligent scheduler with good load
balancing, but it comes with a certain scheduling overhead.

2 changes: 1 addition & 1 deletion docs/source/user_guide/simple_bot_multi.rst
@@ -62,7 +62,7 @@ scheduler.

.. code-block:: python
umgr = rp.UnitManager (session=session,
scheduler=rp.SCHED_ROUND_ROBIN)
scheduler=rp.SCHEDULER_ROUND_ROBIN)
------------
