# SLURM job management tests

This notebook tests the `SlurmJob` helper from `slurm_helpers`: it starts a SLURM allocation, submits tasks into that allocation, and checks their status and output.

In [1]:
from slurm_helpers import SlurmJob

## Start an allocation

In [2]:
# Settings for the allocation request; all four are handed to SlurmJob below.

# Where to run
qos = 'interactive'
node_type = 'haswell'

# How much to ask for
n_nodes = 2
time = 10  # NOTE(review): presumably minutes — confirm against slurm_helpers

In [3]:
# Request the allocation using the settings from the configuration cell.
# The recorded output shows salloc messages up to the nodes being ready.
job = SlurmJob(
    node_type=node_type,
    n_nodes=n_nodes,
    qos=qos,
    time=time,
)

Launched in background. Redirecting stdin to /dev/null
salloc: Pending job allocation 15662545
salloc: job 15662545 queued and waiting for resources
salloc: job 15662545 has been allocated resources
salloc: Granted job allocation 15662545
salloc: Waiting for resource configuration
salloc: Nodes nid00[220-221] are ready for job



In [4]:
# Display the SLURM job id of the allocation (matches the id in the salloc messages)
job.jobid

15662545

## Submitting tasks to the allocation

In [5]:
# Submit two tasks into this allocation.
# Each call returns a process handle that is queried later with poll() and
# communicate(); the recorded `args` show the command being run as
# `srun --jobid <id> -N 1 <command>`.
# Note that sacct may number the steps in a different order than they were
# submitted here (in the recorded run `which` is step 0 and `sleep` is step 1).
p1 = job.submit_task('sleep 1m')
p2 = job.submit_task('which python')

In [6]:
# Show the accounting records of this allocation and its steps only.
# Selecting by job id avoids guessing how many trailing lines of the user's
# full sacct listing belong to this job (the count changes with every new
# step and with any other job the user has run); --noheader keeps just the
# job and step rows.
!sacct --noheader -j {job.jobid}

15662545     allocation interacti+    dasrepo        128    RUNNING      0:0 
15662545.ex+     extern               dasrepo        128    RUNNING      0:0 
15662545.0        which               dasrepo          1  COMPLETED      0:0 
15662545.1        sleep               dasrepo          1    RUNNING      0:0 


In [7]:
# Submit a multi-node task which must wait until p1, p2 are done.
# It asks for n_nodes=2, i.e. every node of this 2-node allocation, so its
# step can only be created once the earlier steps have released their nodes
# (the recorded stderr shows srun retrying step creation before it starts).
p3 = job.submit_task('hostname', n_nodes=2)

In [12]:
# Show the accounting records of this allocation and its steps only.
# Selecting by job id means the command does not need a hand-tuned
# `tail -n <count>` that must grow each time another step is submitted;
# --noheader keeps just the job and step rows.
!sacct --noheader -j {job.jobid}

15662545     allocation interacti+    dasrepo        128    RUNNING      0:0 
15662545.ex+     extern               dasrepo        128    RUNNING      0:0 
15662545.0        which               dasrepo          1  COMPLETED      0:0 
15662545.1        sleep               dasrepo          1  COMPLETED      0:0 
15662545.2     hostname               dasrepo          2  COMPLETED      0:0 


In [13]:
# Have all three tasks finished? poll() gives the exit code once a task is
# done (the recorded run prints 0 for each).
print(*(task.poll() for task in (p1, p2, p3)))

0 0 0


In [14]:
# Report the launched command line and the captured streams of every task.
for proc in (p1, p2, p3):
    print('command:', proc.args)
    # communicate() returns a (stdout, stderr) pair of bytes
    out_bytes, err_bytes = proc.communicate()
    print('stdout:', out_bytes.decode().strip())
    print('stderr:', err_bytes.decode().strip())

command: ['srun', '--jobid', '15662545', '-N', '1', 'sleep', '1m']
stdout: 
stderr: Launched in background. Redirecting stdin to /dev/null
command: ['srun', '--jobid', '15662545', '-N', '1', 'which', 'python']
stdout: /global/common/cori/software/python/3.6-anaconda-5.2/bin/python
stderr: Launched in background. Redirecting stdin to /dev/null
command: ['srun', '--jobid', '15662545', '-N', '2', 'hostname']
stdout: nid00221
nid00220
stderr: Launched in background. Redirecting stdin to /dev/null
srun: Job 15662545 step creation temporarily disabled, retrying
srun: Step created for job 15662545


In [15]:
# Job is canceled when SlurmJob deleted.
# NOTE(review): `del` only unbinds the name `job`; the cancellation presumably
# happens in SlurmJob's finalizer, so it will not run while any other
# reference to the object is still alive — confirm in slurm_helpers.
del job

In [16]:
%%bash

# Check the queue to confirm the allocation is gone after `del job`.
# NOTE(review): sqs is presumably a site-provided queue-status command — confirm.
sqs

No jobs found
