# Radical Pilot Tutorial


## 1. Installation

    pip install radical.pilot
    
## 2. Radical Pilot Setup

Import the dependencies, point RADICAL-Pilot at its MongoDB database, and define the callbacks that report pilot and unit state changes.

Documentation: http://radicalpilot.readthedocs.org/en/latest/machconf.html#preconfigured-resources

In [8]:
import os, sys
import commands
import radical.pilot as rp

# MongoDB endpoint for RADICAL-Pilot (presumably read when rp.Session is
# created further down -- confirm against the RADICAL-Pilot documentation).
os.environ["RADICAL_PILOT_DBURL"]="mongodb://ec2-54-221-194-147.compute-1.amazonaws.com:24242/giannis"

# Alternative database endpoint, kept for reference.
#os.environ["RADICAL_PILOT_DBURL"]="mongodb://129.114.108.124:27017/sctut15"
# Request debug-level output (presumably controls RADICAL-Pilot log verbosity).
os.environ["RADICAL_PILOT_VERBOSE"]="DEBUG"

# NOTE: the string below follows other statements, so it is a plain
# expression statement with no effect, not a module docstring.
""" DESCRIPTION: Tutorial 1: A Simple Workload consisting of a Bag-of-Tasks
"""

def pilot_state_cb(pilot, state):
    """Report a ComputePilot state change and abort if the pilot failed.

    Registered with the PilotManager via ``register_callback``.

    Args:
        pilot: the pilot whose state changed; a falsy value is ignored.
        state: the new state, compared against ``rp.FAILED``.

    Raises:
        SystemExit: with exit code 1 when ``state`` equals ``rp.FAILED``.
    """
    if not pilot:
        return

    # Single-argument parenthesized print: identical output under the
    # Python 2 print statement, and valid syntax under Python 3 as well.
    print("[Callback]: ComputePilot '%s' state: %s." % (pilot.uid, state))

    if state == rp.FAILED:
        # A failed pilot cannot run any units, so give up.
        sys.exit(1)

def unit_state_cb(unit, state):
    """Report a ComputeUnit state change and abort if the unit failed.

    Registered with the UnitManager via ``register_callback``.

    Args:
        unit: the unit whose state changed; a falsy value is ignored.
        state: the new state, compared against ``rp.FAILED``.

    Raises:
        SystemExit: with exit code 2 when ``state`` equals ``rp.FAILED``.
    """
    if not unit:
        return

    # Single-argument parenthesized print: identical output under the
    # Python 2 print statement, and valid syntax under Python 3 as well.
    print("[Callback]: unit %s on %s: %s." % (unit.uid, unit.pilot_id, state))

    if state == rp.FAILED:
        # Show the unit's stderr before exiting so the failure can be diagnosed.
        print("stderr: %s" % unit.stderr)
        sys.exit(2)

In [9]:
# Run `echo $RADICAL_PILOT_VERBOSE` through a shell to confirm that the
# variable set above is visible to child processes. The call returns an
# (exit_status, output) tuple.
commands.getstatusoutput('echo $RADICAL_PILOT_VERBOSE')

(0, 'DEBUG')

## 3. Submit Pilot

In [2]:
# Create a new session. No need to try/except this: if session creation
# fails, there is not much we can do anyway.
session = rp.Session(name="Pilot-Test")
# NOTE: prints in this cell use the single-argument parenthesized form, which
# produces identical output under Python 2 and also parses under Python 3.
print("session id: %s" % session.uid)


try:
    # SSH context attached to the session.
    # NOTE: the user id is account-specific -- replace it with your own login.
    c = rp.Context('ssh')
    c.user_id = "tg824689"
    session.add_context(c)

    # Add a Pilot Manager. Pilot managers manage one or more ComputePilots.
    print("Initializing Pilot Manager ...")
    pmgr = rp.PilotManager(session=session)
    # Register our callback with the PilotManager.
    pmgr.register_callback(pilot_state_cb)

    # Describe the pilot: target resource, wall time, size, and accounting.
    pdesc = rp.ComputePilotDescription()
    pdesc.resource = "xsede.stampede"  # NOTE: This is a "label", not a hostname
    #pdesc.resource = "local.localhost"
    #pdesc.resource = "yarn.localhost"  # NOTE: This is a "label", not a hostname
    pdesc.runtime  = 10 # minutes
    pdesc.cores    = 16
    pdesc.cleanup  = False
    pdesc.queue    = 'development'
    # NOTE: the project/allocation id is account-specific as well.
    pdesc.project  = 'TG-MCB090174'

    # Submit the pilot.
    print("Submitting Compute Pilot to Pilot Manager ...")
    pilot = pmgr.submit_pilots(pdesc)

    # Combine the ComputePilot, the ComputeUnits and a scheduler via
    # a UnitManager object.
    print("Initializing Unit Manager ...")
    umgr = rp.UnitManager(session=session,
                          scheduler=rp.SCHED_DIRECT_SUBMISSION)

    # Register our callback with the UnitManager. This callback will get
    # called every time any of the units managed by the UnitManager
    # change their state.
    umgr.register_callback(unit_state_cb)

    # Add the created ComputePilot to the UnitManager.
    print("Registering Compute Pilot with Unit Manager ...")
    umgr.add_pilots(pilot)

    NUMBER_JOBS  = 16 # the total number of CUs to run

    # Build one ComputeUnit description per job.
    cudesc_list = []
    for i in range(NUMBER_JOBS):

        # -------- BEGIN USER DEFINED CU DESCRIPTION --------- #
        cudesc = rp.ComputeUnitDescription()
        # Each unit receives its index through the CU_NO environment
        # variable, which the argument string below refers to.
        cudesc.environment = {'CU_NO': i}
        cudesc.executable  = "/bin/echo"
        cudesc.arguments   = ['I am CU number $CU_NO']
        cudesc.cores       = 1
        # -------- END USER DEFINED CU DESCRIPTION --------- #

        cudesc_list.append(cudesc)

    # Submit the previously created ComputeUnit descriptions to the
    # UnitManager. This will trigger the selected scheduler to start
    # assigning ComputeUnits to the ComputePilots.
    print("Submit Compute Units to Unit Manager ...")
    cu_set = umgr.submit_units(cudesc_list)

    # Block until the unit manager reports that the units are finished.
    print("Waiting for CUs to complete ...")
    umgr.wait_units()
    print("All CUs completed successfully!")

    # Show what each unit wrote to its standard output.
    for cu in cu_set:
        print(cu.stdout)


except Exception as e:
    # Something unexpected happened in the pilot code above: report it and
    # re-raise so the traceback is not lost.
    print("caught Exception: %s" % e)
    raise

except (KeyboardInterrupt, SystemExit) as e:
    # KeyboardInterrupt (e.g. Ctrl-C) and SystemExit (e.g. sys.exit() called
    # by one of the state callbacks) do not derive from Exception, so they
    # are handled separately here. They are reported but not re-raised; the
    # finally clause below still runs.
    print("need to exit now: %s" % e)

finally:
    # Always clean up the session, whether or not an exception was caught.
    print("closing session")
    session.close()


session id: Pilot-Test
Initializing Pilot Manager ...
Submitting Compute Pilot to Pilot Manager ...
Initializing Unit Manager ...
Registering Compute Pilot with Unit Manager ...
Submit Compute Units to Unit Manager ...
Waiting for CUs to complete ...
[Callback]: unit unit.000005 on pilot.0000: StagingInput.
[Callback]: unit unit.000001 on pilot.0000: StagingInput.
[Callback]: unit unit.000002 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000007 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000009 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000008 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000003 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000004 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000000 on pilot.0000: PendingInputStaging.
[Callback]: unit unit.000006 on pilot.0000: PendingInputStaging.
[Callback]: ComputePilot 'pilot.0000' state: Launching.
[Callback]: unit unit.000005 on pilot.0000: PendingAgentI