diff --git a/examples/01_unit_details.py b/examples/01_unit_details.py
index 449bf4c019..5e374e8bd9 100755
--- a/examples/01_unit_details.py
+++ b/examples/01_unit_details.py
@@ -79,18 +79,22 @@
         # Create a workload of ComputeUnits.
         # Each compute unit runs '/bin/date'.

-        n = 128   # number of units to run
-        report.info('create %d unit description(s)\n\t' % n)
+        report.info('create %d unit description(s)\n\t' % 2)

         cuds = list()
-        for i in range(0, n):
-
-            # create a new CU description, and fill it.
-            # Here we don't use dict initialization.
-            cud = rp.ComputeUnitDescription()
-            cud.executable = '/bin/date'
-            cuds.append(cud)
-            report.progress()
+
+        cud = rp.ComputeUnitDescription()
+        cud.executable = '/usr/bin/mongo'
+        cud.arguments  = 'mongodb://$RP_APP_TUNNEL/ --eval db.stats()'.split()
+        cuds.append(cud)
+        report.progress()
+
+        cud = rp.ComputeUnitDescription()
+        cud.executable = 'echo'
+        cud.arguments  = '$RP_APP_TUNNEL'
+        cuds.append(cud)
+        report.progress()
+
         report.ok('>>ok\n')

         # Submit the previously created ComputeUnit descriptions to the
@@ -106,13 +110,21 @@

         for unit in units:
             report.plain('  * %s: %s, exit: %3s, out: %s\n' \
                     % (unit.uid, unit.state[:4],
-                        unit.exit_code, unit.stdout.strip()[:35]))
+                        unit.exit_code, unit.stderr[:35]))

         # get some more details for one unit:
         unit_dict = units[0].as_dict()
         report.plain("unit workdir : %s\n" % unit_dict['unit_sandbox'])
         report.plain("pilot id     : %s\n" % unit_dict['pilot'])
         report.plain("exit code    : %s\n" % unit_dict['exit_code'])
+        report.plain("exit stdout  : %s\n" % unit_dict['stdout'])
+
+        # get some more details for one unit:
+        unit_dict = units[1].as_dict()
+        report.plain("unit workdir : %s\n" % unit_dict['unit_sandbox'])
+        report.plain("pilot id     : %s\n" % unit_dict['pilot'])
+        report.plain("exit code    : %s\n" % unit_dict['exit_code'])
+        report.plain("exit stdout  : %s\n" % unit_dict['stdout'])

     except Exception as e:
diff --git a/src/radical/pilot/agent/bootstrap_0.sh b/src/radical/pilot/agent/bootstrap_0.sh
index 4f7c270e67..8733b18669 100755
--- a/src/radical/pilot/agent/bootstrap_0.sh
+++ b/src/radical/pilot/agent/bootstrap_0.sh
@@ -1546,17 +1546,35 @@ RUNTIME=$((RUNTIME * 60))
 # down on its own
 RUNTIME=$((RUNTIME + 60))

+# ------------------------------------------------------------------------------
 # If the host that will run the agent is not capable of communication
 # with the outside world directly, we will setup a tunnel.
-if [[ $FORWARD_TUNNEL_ENDPOINT ]]; then
+get_tunnel(){
+
+    addr=$1

     profile_event 'tunnel_setup_start'

     echo "# -------------------------------------------------------------------"
-    echo "# Setting up forward tunnel for MongoDB to $FORWARD_TUNNEL_ENDPOINT."
+    echo "# Setting up forward tunnel to $addr."

     # Bind to localhost
-    BIND_ADDRESS=`/sbin/ifconfig $TUNNEL_BIND_DEVICE|grep "inet addr"|cut -f2 -d:|cut -f1 -d" "`
+    BIND_ADDRESS=$(/sbin/ifconfig $TUNNEL_BIND_DEVICE|grep "inet addr"|cut -f2 -d:|cut -f1 -d" ")
+
+    if test -z "$BIND_ADDRESS"
+    then
+        BIND_ADDRESS=$(/sbin/ifconfig lo | grep 'inet' | xargs echo | cut -f 2 -d ' ')
+    fi
+
+    if test -z "$BIND_ADDRESS"
+    then
+        BIND_ADDRESS=$(ip addr               \
+                       | grep 'state UP' -A2 \
+                       | grep 'inet'         \
+                       | awk '{print $2}'    \
+                       | cut -f1 -d'/')
+      # BIND_ADDRESS="127.0.0.1"
+    fi

     # Look for an available port to bind to.
     # This might be necessary if multiple agents run on one host.
@@ -1573,25 +1591,51 @@ if [[ $FORWARD_TUNNEL_ENDPOINT ]]; then

     # Set up tunnel
     # TODO: Extract port and host
     FORWARD_TUNNEL_ENDPOINT_PORT=22
-    if test "$FORWARD_TUNNEL_ENDPOINT" = "BIND_ADDRESS"; then
+
+    if test -z "$FORWARD_TUNNEL_ENDPOINT"
+    then
+        FORWARD_TUNNEL_ENDPOINT_HOST=$BIND_ADDRESS
+
+    elif test "$FORWARD_TUNNEL_ENDPOINT" = "BIND_ADDRESS"; then
         # On some systems, e.g. Hopper, sshd on the mom node is not bound to 127.0.0.1
         # In those situations, and if configured, bind to the just obtained bind address.
         FORWARD_TUNNEL_ENDPOINT_HOST=$BIND_ADDRESS
+
     else
+        # FIXME: ensure FT_EP is set
         FORWARD_TUNNEL_ENDPOINT_HOST=$FORWARD_TUNNEL_ENDPOINT
     fi
-    ssh -o StrictHostKeyChecking=no -x -a -4 -T -N -L $BIND_ADDRESS:$DBPORT:$HOSTPORT -p $FORWARD_TUNNEL_ENDPOINT_PORT $FORWARD_TUNNEL_ENDPOINT_HOST &
+
+    # FIXME: check if tunnel stays up
+    echo ssh -o StrictHostKeyChecking=no -x -a -4 -T -N -L $BIND_ADDRESS:$DBPORT:$addr -p $FORWARD_TUNNEL_ENDPOINT_PORT $FORWARD_TUNNEL_ENDPOINT_HOST
+    ssh -o StrictHostKeyChecking=no -x -a -4 -T -N -L $BIND_ADDRESS:$DBPORT:$addr -p $FORWARD_TUNNEL_ENDPOINT_PORT $FORWARD_TUNNEL_ENDPOINT_HOST &

     # Kill ssh process when bootstrap_0 dies, to prevent lingering ssh's
     trap 'jobs -p | grep ssh | xargs kill' EXIT

     # and export to agent
-    export RADICAL_PILOT_DB_HOSTPORT=$BIND_ADDRESS:$DBPORT
+    export RP_BS_TUNNEL="$BIND_ADDRESS:$DBPORT"

     profile_event 'tunnel_setup_stop'
+}
+if ! test -z "$FORWARD_TUNNEL_ENDPOINT"
+then
+    get_tunnel "$HOSTPORT"
+    export RADICAL_PILOT_DB_HOSTPORT="$RP_BS_TUNNEL"
 fi

+# we also set up a tunnel for the application to use, if a respective endpoint
+# is requested in the environment
+if ! test -z "$RP_APP_TUNNEL_ADDR"
+then
+    echo "app tunnel addr : $RP_APP_TUNNEL_ADDR"
+    get_tunnel "$RP_APP_TUNNEL_ADDR"
+    export RP_APP_TUNNEL="$RP_BS_TUNNEL"
+    echo "app tunnel setup: $RP_APP_TUNNEL"
+fi
+
+
 rehash "$PYTHON"

 # ready to setup the virtenv
diff --git a/src/radical/pilot/agent/executing/popen.py b/src/radical/pilot/agent/executing/popen.py
index e412178635..abf423ae44 100644
--- a/src/radical/pilot/agent/executing/popen.py
+++ b/src/radical/pilot/agent/executing/popen.py
@@ -257,6 +257,9 @@ def spawn(self, launcher, cu):
         else:
             env_string += 'unset RP_PROF\n'

+        if 'RP_APP_TUNNEL' in os.environ:
+            env_string += 'export RP_APP_TUNNEL="%s"\n' % os.environ['RP_APP_TUNNEL']
+
         env_string += '''
 prof(){
     if test -z "$RP_PROF"
diff --git a/src/radical/pilot/configs/resource_local.json b/src/radical/pilot/configs/resource_local.json
index 971759474d..dabbfa4a76 100644
--- a/src/radical/pilot/configs/resource_local.json
+++ b/src/radical/pilot/configs/resource_local.json
@@ -14,6 +14,10 @@
             "job_manager_endpoint"    : "fork://localhost/",
             "filesystem_endpoint"     : "file://localhost/"
         },
+        "pre_bootstrap_1"             : ["export RP_APP_TUNNEL_ADDR=144.76.72.175:27017",
+                                         "echo $RP_APP_TUNNEL_ADDR"
+                                        ],
+
         "default_remote_workdir"      : "$HOME",
         "lrms"                        : "FORK",
         "agent_scheduler"             : "CONTINUOUS",
diff --git a/src/radical/pilot/configs/resource_vtarc_dt.json b/src/radical/pilot/configs/resource_vtarc_dt.json
new file mode 100644
index 0000000000..eacc121697
--- /dev/null
+++ b/src/radical/pilot/configs/resource_vtarc_dt.json
@@ -0,0 +1,50 @@
+
+{
+    "stampede_ssh": {
+        "description"                 : "The XSEDE 'Stampede' cluster at TACC (https://www.tacc.utexas.edu/stampede/).",
+        "notes"                       : "Always set the ``project`` attribute in the ComputePilotDescription or the pilot will fail.",
+        "schemas"                     : ["gsissh", "ssh", "go"],
+        "mandatory_args"              : ["project"],
+        "gsissh"                      :
+        {
+            "job_manager_endpoint"    : "slurm+gsissh://stampede.tacc.utexas.edu:2222/",
+            "filesystem_endpoint"     : "gsisftp://stampede.tacc.utexas.edu:2222/"
+        },
+        "ssh"                         :
+        {
+            "job_manager_endpoint"    : "slurm+ssh://stampede.tacc.utexas.edu/",
+            "filesystem_endpoint"     : "sftp://stampede.tacc.utexas.edu/"
+        },
+        "go"                          :
+        {
+            "job_manager_endpoint"    : "slurm+ssh://stampede.tacc.utexas.edu/",
+            "filesystem_endpoint"     : "go://xsede#stampede/"
+        },
+        "default_queue"               : "normal",
+        "lrms"                        : "SLURM",
+        "agent_scheduler"             : "CONTINUOUS",
+        "agent_spawner"               : "POPEN",
+        "agent_launch_method"         : "SSH",
+        "task_launch_method"          : "SSH",
+        "mpi_launch_method"           : "MPIRUN_RSH",
+        "pre_bootstrap_1"             : ["module purge",
+                                         "module load TACC",
+                                         "module load intel/15.0.2",
+                                         "module load python/2.7.12",
+                                         "module unload xalt",
+                                         # Load ICC license so we can build during bootstrap
+                                         "source ~train00/ssi_sourceme",
+                                         "export TACC_DELETE_FILES=TRUE"
+                                        ],
+        "default_remote_workdir"      : "$WORK",
+        "valid_roots"                 : ["/scratch", "$SCRATCH", "/work", "$WORK"],
+        "rp_version"                  : "local",
+        "virtenv_mode"                : "create",
+        "python_dist"                 : "default",
+        "export_to_cu"                : ["LMOD_CMD",
+                                         "LMOD_SYSTEM_DEFAULT_MODULES",
+                                         "LD_LIBRARY_PATH"],
+        "cu_pre_exec"                 : ["module restore"]
+    }
+}
+