Hi,
First of all, thank you for this magnificent piece of work.
I would like to run tests using Spack as the build system and Slurm as the launcher.
This is my `settings.py` file:

```python
site_configuration = {
"systems": [
{
"name": "cluster",
"descr": "cluster",
"hostnames": [".*"],
"partitions": [
{
"name": "hpc",
"scheduler": "slurm",
"launcher": "srun",
"access": ["--exclusive", "--partition hpc"],
"max_jobs": 2,
"environs": ["gnu"],
},
],
},
],
"environments": [
{
"name": "gnu",
"cc": "gcc-9",
"cxx": "g++-9",
"ftn": "gfortran-9"
},
],
"logging": [
{
"level": "debug",
"handlers": [
{
"type": "stream",
"name": "stdout",
"level": "info",
"format": "%(message)s"
},
{
"type": "file",
"level": "debug",
"format": "[%(asctime)s] %(levelname)s: %(check_info)s: %(message)s",
"append": False
}
],
"handlers_perflog": [
{
"type": "filelog",
"prefix": "%(check_system)s/%(check_partition)s",
"level": "info",
"format": (
"%(check_job_completion_time)s|reframe %(version)s|"
"%(check_info)s|jobid=%(check_jobid)s|"
"%(check_perf_var)s=%(check_perf_value)s|"
"ref=%(check_perf_ref)s "
"(l=%(check_perf_lower_thres)s, "
"u=%(check_perf_upper_thres)s)|"
"%(check_perf_unit)s"
),
"append": True
}
]
}
],
}
```

This is the test that I am trying to run:

```python
import reframe as rfm
import reframe.utility.sanity as sn
@rfm.simple_test
class Fio(rfm.RegressionTest):
descr = "fio benchmark"
valid_systems = ["cluster:hpc"]
valid_prog_environs = ["*"]
vars_dirs = parameter(["/mnt/resource"])
vars_rw = parameter(["read", "write"])
vars_bs_size = parameter([("4K", "128M"), ("4M", "2G")])
prerun_cmds = ["set -euo pipefail"]
executable = "fio"
    executable_opts = [
        "--name=fio",
        "--directory=$VARS_DIRS",
        "--rw=$VARS_RW",
        "--blocksize=$VARS_BS",
        "--size=$VARS_SIZE",
"--iodepth=64",
"--numjobs=$VARS_NUM_JOBS",
"--group_reporting",
"--direct=1",
"--time_based",
"--runtime=10s",
]
postrun_cmds = ["echo FINISHED"]
# num_tasks = 0
build_system = "Spack"
@run_after("init")
def set_description(self):
self.descr = f"FIO check ({self.vars_dirs})"
@run_before("compile")
def setup_build_system(self):
self.build_system.specs = ["fio"]
self.build_system.install_tree = "/shared/home/vasco/spack/opt/spack"
@run_before("run")
def set_variables(self):
self.variables = {
"VARS_DIRS": str(self.vars_dirs),
"VARS_RW": str(self.vars_rw),
"VARS_BS": str(self.vars_bs_size[0]),
"VARS_SIZE": str(self.vars_bs_size[1]),
"VARS_NUM_JOBS": "$(nproc)",
}
@sanity_function
def assert_FINISHED(self):
return sn.assert_found("FINISHED", self.stdout)
    @run_before("performance")
    def set_perf_patterns(self):
        self.perf_patterns = {
            "IOPS": sn.extractsingle(r"IOPS=(\S+),", self.stdout, 1, int),
            "BW": sn.extractsingle(r"BW=(\d+\.\d+)(\S+)", self.stdout, 1, float),
        }
```

Some questions:

- I have set `prerun_cmds = ["set -euo pipefail"]` so that the job fails fast. Is there a way to set this at the settings level, i.e. for all tests at once? (See the first sketch after this list.)
- As it is now, the test builds on the login node and then runs through Slurm:
  - is there a way to split the build and the run into two separate Slurm jobs, so that the build happens on the same kind of node as the run?
  - and can the dependency be kept within each partition? For example, if I run the same test with 4 parameters on 2 partitions, I would expect 10 Slurm jobs (2 builds and 8 runs), where the 4 runs on each partition depend on that partition's build, so that as soon as a build finishes its 4 runs can start in parallel. (See the second sketch below.)
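
For the first question, the closest thing I can think of is a shared base class that all tests inherit from; this is just a sketch, and `FailFastTest` is my own name, not a ReFrame builtin:

```python
import reframe as rfm


class FailFastTest(rfm.RegressionTest):
    # Hypothetical shared base class: every test that inherits from it
    # gets the fail-fast shell options as its default prerun_cmds.
    prerun_cmds = ["set -euo pipefail"]


@rfm.simple_test
class Fio(FailFastTest):
    # ... rest of the test unchanged ...
    valid_systems = ["cluster:hpc"]
    valid_prog_environs = ["*"]
```

It still has to be spelled out in every test file, though, so a settings-level option would be nicer.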
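
For the second question, I imagine something along these lines could work: `build_locally = False` should push the compile stage through the scheduler, and ReFrame's test dependencies are, as far as I understand, resolved per partition and environment, which would give exactly the 2-builds-plus-8-runs layout. A sketch (class names are mine; wiring the run test to the build's Spack environment, e.g. via `@require_deps`, is left out):

```python
import reframe as rfm
import reframe.utility.sanity as sn


@rfm.simple_test
class FioBuild(rfm.CompileOnlyRegressionTest):
    descr = "build fio via Spack"
    valid_systems = ["cluster:hpc"]
    valid_prog_environs = ["gnu"]
    build_system = "Spack"
    build_locally = False  # submit the compile stage as a Slurm job

    @run_before("compile")
    def setup_build_system(self):
        self.build_system.specs = ["fio"]

    @sanity_function
    def validate_build(self):
        # Placeholder check; a real one could inspect the build log.
        return sn.assert_true(True)


@rfm.simple_test
class FioRun(rfm.RunOnlyRegressionTest):
    descr = "run fio built by FioBuild"
    valid_systems = ["cluster:hpc"]
    valid_prog_environs = ["gnu"]
    executable = "fio"

    @run_after("init")
    def set_dependencies(self):
        # By default a dependency binds to the same partition and
        # environment, so each partition's runs wait for its own build.
        self.depends_on("FioBuild")
```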
Finally, if no `num_tasks` is specified, ReFrame automatically adds `#SBATCH --ntasks=1`, whereas with `num_tasks = 0` it adds `#SBATCH --ntasks=5000`. The second case makes `sbatch` fail:

```console
$ cat rfm_Fio__tmp_read___4K____128M___job.sh
#!/bin/bash
#SBATCH --job-name="rfm_Fio__tmp_read___4K____128M___job"
#SBATCH --ntasks=5000
#SBATCH --output=rfm_Fio__tmp_read___4K____128M___job.out
#SBATCH --error=rfm_Fio__tmp_read___4K____128M___job.err
#SBATCH --exclusive
#SBATCH --partition hpc
export VARS_DIRS=/tmp
export VARS_RW=read
export VARS_BS=4K
export VARS_SIZE=128M
export VARS_NUM_JOBS=$(nproc)
. $SPACK_ROOT/share/spack/setup-env.sh
spack env activate -V -d rfm_spack_env
spack load fio
set -euo pipefail
srun fio --name=fio --directory=$VARS_DIRS --rw=$VARS_RW --blocksize=$VARS_BS --size=$VARS_BS --iodepth=64 --numjobs=$VARS_NUM_JOBS --group_reporting --direct=1 --time_based --runtime=10s
echo FINISHED
$ sbatch rfm_Fio__tmp_read___4K____128M___job.sh
sbatch: error: Batch job submission failed: Requested node configuration is not available
```

Is there a way to completely avoid the `#SBATCH --ntasks=*` line?
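
(If it helps: my understanding is that `num_tasks = 0` turns on ReFrame's flexible node allocation, so the 5000 presumably comes from counting the cores of all eligible nodes. If that is right, the allocation can at least be capped from the command line, e.g.

```console
$ reframe -C settings.py -c fio_check.py --flex-alloc-nodes=2 -r
```

where `fio_check.py` is just what I call the test file above, but I would still prefer to drop the `--ntasks` line entirely.)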
Thanks