# defaults.yml
---
# Display name of this spec
name: Default spec

# Description of this spec. The body of a folded scalar (">-") must be indented
# deeper than its key; the lines are joined with single spaces and the trailing
# newline is stripped.
description: >-
  All default settings are in this spec. Properties in user specs would inherit
  from these settings if user doesn't specify them.
################################################################################
# Experiments
################################################################################

# Default properties shared by every experiment. The keys below are nested
# under experiment_default so that they do not collide with the top-level
# "name" and "requirements" keys of this spec.
experiment_default:
  # Default name of the experiment
  name: '<default>'

  # Default environment variables; these are added to every experiment
  # regardless of the command type, so the same variable does not have to be
  # repeated in each experiment
  envs: {}

  # Default experiment dependencies for each command type, given as a list of
  # experiment names; if any dependency has not been deployed, the current
  # experiment is not deployed either
  depends_on: {}

  # Default requirements for each command type, given as a list of requirement
  # IDs with their associated comparison operator and value (e.g., "==Yes",
  # ">=10"); each command group (see the "Requirements" section later in this
  # file) is run to check whether the metrics (results) are met
  requirements: {}

  # Default commands for each command type, given as a single command or a list
  # of commands to run either locally or remotely (remote is the default)
  # (e.g., "git clone...", "remote:git clone...", "local:scp...")
  commands: {}

  # Default output paths for each command type, given as a
  # dict(stdout_to=<path1>, stderr_to=<path2>) for the STDOUT and STDERR
  # outputs produced by the commands
  write_outputs: {}

# Runs before the main experiments start
before_all_experiments: []

# The main experiments
experiments: []

# Runs after the main experiments have finished
after_all_experiments: []
################################################################################
# Servers
################################################################################

# Default properties shared by every server. The keys below are nested under
# server_default so that "name" does not collide with the spec's own "name".
server_default:
  # Default name of the server
  name: '<default>'

  # Path to the private key on the local machine
  private_key_path: '$HOME/.ssh/id_rsa'

  # Port to connect to
  port: 22

  # Username on the server (e.g., 'user1')
  username: '$USER'

  # Hostname of the server (e.g., 'example.com', '123.123.123.123')
  hostname: localhost

# All servers
servers: []
################################################################################
# Requirements
################################################################################

# Commands run on servers to check requirements.
# Commands are written as folded block scalars (">-"): the lines are joined
# with a single space, and backslashes and quotes are passed through verbatim,
# so no YAML escaping is needed inside the shell/awk code.
requirements:
  # CPU usage, sampled three times over roughly 3 seconds (Output: three floats
  # between 0.0-1.0). Each sample reads the "cpu " line of /proc/stat twice,
  # 0.1 s apart, and prints a ratio computed from the differences between the
  # two reads.
  # Reference: https://askubuntu.com/a/941997
  # (If there are multiple outputs, Noodles would try to calculate their
  # average)
  cpu_usage:
    - >-
      (grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) |
      awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'
    - "sleep 1.5"
    - >-
      (grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) |
      awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'
    - "sleep 1.5"
    - >-
      (grep 'cpu ' /proc/stat;sleep 0.1;grep 'cpu ' /proc/stat) |
      awk -v RS='' '{print ($13-$2+$15-$4)/($13-$2+$15-$4+$16-$5)}'

  # Average CPU load over the last 1 minute (Output: CPU load greater than or
  # equal to 0.0)
  # Reference: https://stackoverflow.com/a/24839903
  # (Note that CPU usage and CPU load are two different concepts; see
  # https://estl.tech/cpu-usage-vs-load-ecca22287b21)
  cpu_load: >-
    awk '{print $1}' /proc/loadavg

  # Memory usage (Output: a float between 0.0-1.0), computed as
  # 1 - MemAvailable / MemTotal from /proc/meminfo
  # Reference: https://askubuntu.com/a/941997
  memory_usage: >-
    awk '/MemTotal/{t=$2}/MemAvailable/{a=$2}END{print 1-a/t}' /proc/meminfo

  # Disk usage (Output: a float between 0.0-1.0), taken from the df line that
  # ends in " /".
  # The awk pattern needs a literal backslash ("\/"). Inside a double-quoted
  # YAML scalar "\/" is an escape sequence that collapses to "/", which would
  # hand awk the broken pattern "/ /$/"; a block scalar keeps the backslash.
  # Reference: https://askubuntu.com/a/941997
  disk_usage: >-
    df | awk '/ \/$/{print substr($5, 1, length($5)-1)/100}'

  # CUDA GPU utilization, averaged over all reported GPUs (Output: a float
  # between 0.0-1.0)
  # Reference: https://gist.github.com/jonatw/9322244
  cuda_gpu_utilization: >-
    nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits |
    awk '{s+=$1; n++} END {print s/100/n}'

  # CUDA memory usage, averaged over all reported GPUs (Output: a float between
  # 0.0-1.0)
  # Reference: https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries
  cuda_memory_usage: >-
    nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader,nounits |
    awk '{p=$1/$2; s+=p; n++} END {print s/n}'
################################################################################
# Deployment
################################################################################

# For each command type: path of the file to which Noodles writes the current
# deployment status
write_status_to: {}

# NOTE(review): the unit of the three intervals below is not stated in this
# file -- presumably seconds; confirm against the consumer.

# Interval between deployment rounds
round_interval: 10

# Interval between deploying individual experiments within a round
deployment_interval: 0

# Interval between executing commands
commands_interval: 0
################################################################################
# Error Handling
################################################################################

# Whether to treat any nonzero return code or nonempty stderr as an error and
# raise it (canonical lowercase boolean, read the same by YAML 1.1 and 1.2
# parsers)
check_any_errors: true

# List of error handlers
error_handlers: []
################################################################################
# Shell Commands
################################################################################

# Shell invocation used when the commands are passed as a string argument
# See: https://linux.die.net/man/1/bash
shell_string: 'bash -c'

# Shell invocation used when the commands are fed through standard input
# (STDIN)
# See: https://linux.die.net/man/1/bash
shell_stdin: 'bash -s'