# Wrapping the call for the Bayesian optimization loop

In [1]:
# Load the autoreload extension and set it to mode 2, so imported modules are
# re-imported before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Turn on IPython's greedy tab completion.
%config IPCompleter.greedy=True

In [2]:
import os
import numpy as np
import netCDF4 as nc
import xarray as xr
import datatree as dt
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from sithom.plot import plot_defaults, label_subplots, lim
from tcpips.constants import DATA_PATH, FIGURE_PATH
from adforce.mesh import xr_loader

plot_defaults()

In [5]:
from slurmpy import Slurm

In [11]:
# Scheduler header options for the "nws13" job: one node for at most an hour,
# with stdout and stderr both written to test.out.
nws13_options = {
    "nodes": 1,
    "account": "n01-SOWISE",
    "partition": "standard",
    "qos": "standard",
    "time": "1:0:0",
    "tasks-per-node": 128,
    "cpus-per-task": 1,
    "output": "test.out",
    "error": "test.out",
    "mail-type": "ALL",
    "mail-user": "sdat2@cam.ac.uk",
}

# Shell payload: load the compiler/HDF5/netCDF modules, activate the base conda
# environment, then print the versions of the key Python packages on the node.
nws13_script = """
module load PrgEnv-gnu/8.3.3
module load cray-hdf5-parallel
module load cray-netcdf-hdf5parallel

work=/mnt/lustre/a2fs-work1/work/n01/n01/sithom
source $work/.bashrc

conda activate base

python -c "import numpy as np; print('numpy', np.__version__)"
python -c "import netCDF4 as nc; print('netCDF4', nc.__version__)"
python -c "import xarray as xr; print('xarray', xr.__version__)"
python -c "import cartopy; print('cartopy', cartopy.__version__)"
"""

s = Slurm("nws13", nws13_options)
jid = s.run(nws13_script)
# Display the id of the submitted job as the cell output.
jid

b'Submitted batch job 5751164'


5751164

In [6]:
# Submit the "angle-test" ADCIRC case: one node, 128 tasks, one hour wall time,
# with stdout and stderr both written to test.out.
s = Slurm(
    "angle-test",
    {
        "nodes": 1,
        "account": "n01-SOWISE",
        "partition": "standard",
        "qos": "standard",
        "time": "1:0:0",
        "tasks-per-node": 128,
        "cpus-per-task": 1,
        "output": "test.out",
        "error": "test.out",
        "mail-type": "ALL",
        "mail-user": "sdat2@cam.ac.uk",
    },
)

# The shell payload prepares the case with adcprep and then runs padcirc.
# Two safeguards compared with a naive script:
#   * `cd ... || exit 1` stops the job if the experiment directory is missing,
#     rather than running adcprep in whatever directory the job started in.
#   * the two adcprep calls are chained with `&&`, so `$?` in the following
#     `if` reflects a failure of either step (not just the last one).
jid = s.run(
"""
module load PrgEnv-gnu/8.3.3
module load cray-hdf5-parallel
module load cray-netcdf-hdf5parallel

cd /mnt/lustre/a2fs-work1/work/n01/n01/sithom/adcirc-swan/angle_test/exp_000 || exit 1

work=/mnt/lustre/a2fs-work1/work/n01/n01/sithom
source $work/.bashrc

d1=/work/n01/n01/sithom/adcirc-swan/katrina1

echo "hook 1"
eval "$(conda shell.bash hook)"

# define variables
case_name=$SLURM_JOB_NAME # name for printing
np=128 # how many parallel tasks to define

export OMP_NUM_THREADS=1

# Propagate the cpus-per-task setting from script to srun commands
#    By default, Slurm does not propagate this setting from the sbatch
#    options to srun commands in the job script. If this is not done,
#    process/thread pinning may be incorrect leading to poor performance
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

#...Run the case
echo ""
echo "|---------------------------------------------|"
echo "    TEST CASE: $case_name"
echo ""
echo -n "    Prepping case..."
${d1}/adcprep --np $np --partmesh >  adcprep.log &&
${d1}/adcprep --np $np --prepall  >> adcprep.log
if [ $? == 0 ] ; then
    echo "done!"
else
    echo "ERROR!"
    exit 1
fi

echo -n "    Runnning case..."
srun --distribution=block:block --hint=nomultithread ${d1}/padcirc > padcirc_log.txt
exitstat=$?
echo "Finished"
echo "    ADCIRC Exit Code: $exitstat"
if [ "x$exitstat" != "x0" ] ; then
    echo "    ERROR: ADCIRC did not exit cleanly."
    exit 1
fi
echo ""

"""
)
# Display the id of the submitted job as the cell output.
jid

b'Submitted batch job 5754241'


5754241

In [26]:
import os, sys
import subprocess

# Ask the Slurm accounting tool for the state column of the job submitted above.
args = f"sacct -j {jid} -o state"
sacct_output = os.popen(args).read()
job_states = [line.strip() for line in sacct_output.strip().split("\n")]

# The first two lines are skipped as header rows; with nothing after them there
# is no state to inspect, so the job is treated as not finished.
if len(job_states) > 2:
    is_finished = np.all([state == "COMPLETED" for state in job_states[2:]])
else:
    is_finished = False

In [28]:
# Show whether every sacct row inspected in the previous cell read "COMPLETED".
is_finished

True

In [30]:
import time
from sithom.time import timeit


@timeit
def run_and_wait(dir: str, jobname="run", poll_interval: float = 10) -> int:
    """Submit an ADCIRC run to Slurm and block until the job has ended.

    The job changes into ``dir``, prepares the case with ``adcprep`` and runs
    ``padcirc`` through ``srun``. The function then polls ``sacct`` until every
    reported state for the job is ``COMPLETED``.

    Args:
        dir (str): Experiment directory to run in. Also receives the job's
            combined stdout/stderr file ``test.out``.
        jobname (str, optional): Name passed to ``Slurm``. Defaults to "run".
        poll_interval (float, optional): Seconds to sleep between ``sacct``
            queries. Defaults to 10.

    Returns:
        int: The job id returned by ``Slurm.run``.

    Raises:
        RuntimeError: If ``sacct`` reports a terminal non-success state
            (failed, cancelled, timed out, ...) for the job or any of its
            steps. Without this check the polling loop would never exit.
    """
    import shlex  # local import: only needed to quote the directory for the shell

    s = Slurm(
        jobname,
        {
            "nodes": 1,
            "account": "n01-SOWISE",
            "partition": "standard",
            "qos": "standard",
            "time": "1:0:0",
            "tasks-per-node": 128,
            "cpus-per-task": 1,
            "output": os.path.join(dir, "test.out"),
            "error": os.path.join(dir, "test.out"),
            "mail-type": "ALL",
            "mail-user": "sdat2@cam.ac.uk",
        },
    )

    # Notes on the shell payload:
    #   * the directory is shell-quoted and `cd` is guarded so the job stops
    #     rather than running adcprep somewhere unexpected;
    #   * the two adcprep calls are chained with `&&` so the `$?` test sees a
    #     failure of either step, not only of the second one.
    jid = s.run(
    f"""
module load PrgEnv-gnu/8.3.3
module load cray-hdf5-parallel
module load cray-netcdf-hdf5parallel

cd {shlex.quote(dir)} || exit 1

work=/mnt/lustre/a2fs-work1/work/n01/n01/sithom
source $work/.bashrc

d1=/work/n01/n01/sithom/adcirc-swan/katrina1

echo "hook 1"
eval "$(conda shell.bash hook)"

# define variables
case_name=$SLURM_JOB_NAME # name for printing
np=128 # how many parallel tasks to define

export OMP_NUM_THREADS=1

# Propagate the cpus-per-task setting from script to srun commands
#    By default, Slurm does not propagate this setting from the sbatch
#    options to srun commands in the job script. If this is not done,
#    process/thread pinning may be incorrect leading to poor performance
export SRUN_CPUS_PER_TASK=$SLURM_CPUS_PER_TASK

#...Run the case
echo ""
echo "|---------------------------------------------|"
echo "    TEST CASE: $case_name"
echo ""
echo -n "    Prepping case..."
$d1/adcprep --np $np --partmesh >  adcprep.log &&
$d1/adcprep --np $np --prepall  >> adcprep.log
if [ $? == 0 ] ; then
    echo "done!"
else
    echo "ERROR!"
    exit 1
fi

echo -n "    Runnning case..."
srun --distribution=block:block --hint=nomultithread $d1/padcirc > padcirc_log.txt
exitstat=$?
echo "Finished"
echo "    ADCIRC Exit Code: $exitstat"
if [ "x$exitstat" != "x0" ] ; then
    echo "    ERROR: ADCIRC did not exit cleanly."
    exit 1
fi
echo ""

"""
    )

    # sacct truncates the state column (e.g. "CANCELLED+", "OUT_OF_ME+"), so
    # terminal failures are recognised by prefix rather than exact match.
    failed_prefixes = (
        "FAILED",
        "CANCELLED",
        "TIMEOUT",
        "NODE_FAIL",
        "OUT_OF_ME",
        "BOOT_FAIL",
        "DEADLINE",
    )

    def query_states(jid: int) -> list:
        """Return the state strings sacct reports for the job and its steps."""
        args = f"sacct -j {jid} -o state"
        with os.popen(args) as pipe:
            lines = [x.strip() for x in pipe.read().strip().split("\n")]
        # The first two lines are the column title and the dashed separator.
        return lines[2:]

    while True:
        states = query_states(jid)
        failed = [x for x in states if x.startswith(failed_prefixes)]
        if failed:
            raise RuntimeError(
                f"Job {jid} did not complete successfully; sacct states: {states}"
            )
        # An empty list means the job is not yet visible to the accounting
        # database, so keep waiting.
        if states and all(x == "COMPLETED" for x in states):
            break
        time.sleep(poll_interval)

    print(f"Job {jid} finished")

    return jid


# Submit the exp_000 angle-test case and block until sacct reports it complete;
# the returned job id is displayed as the cell output.
run_and_wait("/mnt/lustre/a2fs-work1/work/n01/n01/sithom/adcirc-swan/angle_test/exp_000")

b'Submitted batch job 5754702'


'run_and_wait'  07 min 14 s 



5754702