# training_noodles/specs/defaults.yml

# Human-readable name of this spec
name: 'Default spec'

# What this spec is for
description: >-
  All default settings are in this spec. Properties in user specs
  would inherit from these settings if user doesn't specify them.

################################################################################
# Experiments
################################################################################

# Fallback properties for every experiment; an experiment that doesn't specify
# a property gets the value given here
experiment_default:
  # Experiment name used when none is given
  name: '<default>'
  # Environment variables added to every experiment regardless of the command
  # type, so that common variables don't have to be repeated each time
  envs: {}
  # Per command type: a list of names of experiments this one depends on. If
  # any of the dependencies hasn't been deployed, this experiment won't be
  # deployed either
  depends_on: {}
  # Per command type: a list of requirement IDs, each with a comparison
  # operator and value (e.g., "==Yes", ">=10"). The corresponding command
  # group (see the top-level "requirements" key) is run to check whether the
  # resulting metrics meet the comparison
  requirements: {}
  # Per command type: a single command or a list of commands, each run either
  # locally or remotely (remote is the default), e.g., "git clone...",
  # "remote:git clone...", "local:scp..."
  commands: {}
  # Per command type: a dict(stdout_to=<path1>, stderr_to=<path2>) giving the
  # paths for the STDOUT and STDERR outputs produced by the commands
  write_outputs: {}

# Run before the main experiments start
before_all_experiments: []

# The main experiments
experiments: []

# Run after the main experiments have finished
after_all_experiments: []

################################################################################
# Servers
################################################################################

# Fallback properties for every server; a server that doesn't specify a
# property gets the value given here
server_default:
  # Server name used when none is given
  name: '<default>'
  # Local path to the private key
  private_key_path: '$HOME/.ssh/id_rsa'
  # Port to connect to
  port: 22
  # Login username on the server (e.g., 'user1')
  username: '$USER'
  # Server hostname (e.g., 'example.com', '123.123.123.123')
  hostname: 'localhost'

# All servers
servers: []

################################################################################
# Requirements
################################################################################

# Commands to run to check requirements on servers. Each key is a requirement
# ID; its value is a single command or a list of commands.
requirements:
  # Get average CPU usage over 3 seconds (Output: Three floats between 0.0-1.0)
  # Reference: https://askubuntu.com/a/941997
  # Each sample reads the aggregate "cpu" line of /proc/stat twice, 0.1s apart,
  # and prints (user+system delta) / (user+system+idle delta).
  # (If there are multiple outputs in each line, Noodles would try to calculate
  # the average)
  cpu_usage:
    - "(grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) | awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'"
    - "sleep 1.5"
    - "(grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) | awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'"
    - "sleep 1.5"
    - "(grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) | awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'"
  # Get average CPU load over last 1 minute (Output: CPU load greater or equal
  # to 0.0)
  # Reference: https://stackoverflow.com/a/24839903
  # (Please note that CPU usage and load are two different concepts, please
  # refer to https://estl.tech/cpu-usage-vs-load-ecca22287b21)
  cpu_load: "awk '{print $1}' /proc/loadavg"
  # Get memory usage (Output: A float between 0.0-1.0)
  # Reference: https://askubuntu.com/a/941997
  memory_usage: "awk '/MemTotal/{t=$2}/MemAvailable/{a=$2}END{print 1-a/t}' /proc/meminfo"
  # Get disk usage of the filesystem mounted at "/" (Output: A float between
  # 0.0-1.0)
  # Reference: https://askubuntu.com/a/941997
  # The backslash is doubled on purpose: inside a double-quoted YAML scalar
  # "\/" is an escape sequence that yields a bare "/", which would leave awk
  # with the malformed program "/ /$/...". With "\\/" awk receives the intended
  # regex "/ \/$/" (lines ending in " /").
  disk_usage: "df | awk '/ \\/$/{print substr($5, 1, length($5)-1)/100}'"
  # Get CUDA GPU utilization averaged over all GPUs (Output: A float between
  # 0.0-1.0)
  # Reference: https://gist.github.com/jonatw/9322244
  cuda_gpu_utilization: "nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits | awk '{s+=$1; n++} END {print s/100/n}'"
  # Get CUDA memory usage averaged over all GPUs (Output: A float between
  # 0.0-1.0)
  # Reference: https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries
  cuda_memory_usage: "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits | awk '{p=$1/$2; s+=p; n++} END {print s/n}'"

################################################################################
# Deployment
################################################################################

# Per command type: path (as a string) of the file to which Noodles writes the
# current deployment status
write_status_to: {}

# Interval between deployment rounds
# NOTE(review): unit is not stated in this file, presumably seconds; confirm
round_interval: 10

# Interval between deploying each experiment within a round
# NOTE(review): presumably the same unit as round_interval; confirm
deployment_interval: 0

# Interval between executing the commands
# NOTE(review): presumably the same unit as round_interval; confirm
commands_interval: 0

################################################################################
# Error Handling
################################################################################

# Whether to check for any nonzero return code or nonempty stderr and raise an
# error when one is found
check_any_errors: true

# List of error handlers
error_handlers: []

################################################################################
# Shell Commands
################################################################################

# Shell invocation used when the commands are passed as a string
# See: https://linux.die.net/man/1/bash
shell_string: 'bash -c'

# Shell invocation used when the commands are read from standard input (STDIN)
# See: https://linux.die.net/man/1/bash
shell_stdin: 'bash -s'