Skip to content

Commit

Permalink
Make release stress tests work and improve them. (ray-project#4955)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertnishihara authored and pcmoritz committed Jun 11, 2019
1 parent e6baffb commit 6f48992
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 125 deletions.
6 changes: 3 additions & 3 deletions ci/stress_tests/application_cluster_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ provider:
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
availability_zone: us-west-2b

# How Ray will authenticate with newly launched nodes.
auth:
Expand Down Expand Up @@ -90,8 +90,8 @@ file_mounts: {
# List of shell commands to run to set up nodes.
setup_commands:
- echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
- ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev1-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
- rllib || pip install -U ray-0.8.0.dev1-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
- ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
- rllib || pip install -U ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
- pip install tensorflow-gpu==1.12.0
- echo "sudo halt" | at now + 60 minutes
# Consider uncommenting these if you also want to run apt-get commands during setup
Expand Down
88 changes: 56 additions & 32 deletions ci/stress_tests/run_application_stress_tests.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
#!/usr/bin/env bash

# This script should be run as follows:
# ./run_application_stress_tests.sh <ray-version> <ray-commit>
# For example, <ray-version> might be 0.7.1
# and <ray-commit> might be bc3b6efdb6933d410563ee70f690855c05f25483. The commit
# should be the latest commit on the branch "releases/<ray-version>".

# This script runs all of the application tests.
# Currently includes an IMPALA stress test and an SGD stress test,
# each run on both Python 2.7 and 3.6.
Expand All @@ -10,26 +17,39 @@

# This script will exit with code 1 if the test did not run successfully.

# Show explicitly which commands are currently running. This should only be AFTER
# the private key is placed.
set -x

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
RESULT_FILE=$ROOT_DIR/"results-$(date '+%Y-%m-%d_%H-%M-%S').log"

echo "Logging to" $RESULT_FILE
echo -e $RAY_AWS_SSH_KEY > /root/.ssh/ray-autoscaler_us-west-2.pem && chmod 400 /root/.ssh/ray-autoscaler_us-west-2.pem || true
touch "$RESULT_FILE"
echo "Logging to" "$RESULT_FILE"

if [[ -z "$1" ]]; then
echo "ERROR: The first argument must be the Ray version string."
exit 1
else
RAY_VERSION=$1
fi

# Show explicitly which commands are currently running. This should only be AFTER
# the private key is placed.
set -x
if [[ -z "$2" ]]; then
echo "ERROR: The second argument must be the commit hash to test."
exit 1
else
RAY_COMMIT=$2
fi

touch $RESULT_FILE
echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/"

# This function identifies the right string for the Ray wheel.
_find_wheel_str(){
local python_version=$1
# echo "PYTHON_VERSION", $python_version
local wheel_str=""
if [ $python_version == "p27" ]; then
if [ "$python_version" == "p27" ]; then
wheel_str="cp27-cp27mu"
else
wheel_str="cp36-cp36m"
Expand All @@ -41,7 +61,7 @@ _find_wheel_str(){
# Actual test runtime is roughly 10 minutes.
test_impala(){
local PYTHON_VERSION=$1
local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION)
local WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION")

pushd "$ROOT_DIR"
local TEST_NAME="rllib_impala_$PYTHON_VERSION"
Expand All @@ -50,32 +70,34 @@ test_impala(){

cat application_cluster_template.yaml |
sed -e "
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;
s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
s/<<<HEAD_TYPE>>>/g3.16xlarge/;
s/<<<HEAD_TYPE>>>/p3.16xlarge/;
s/<<<WORKER_TYPE>>>/m5.24xlarge/;
s/<<<MIN_WORKERS>>>/5/;
s/<<<MAX_WORKERS>>>/5/;
s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > $CLUSTER
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > "$CLUSTER"

echo "Try running IMPALA stress test."
{
RLLIB_DIR=../../python/ray/rllib/
ray --logging-level=DEBUG up -y $CLUSTER &&
ray rsync_up $CLUSTER $RLLIB_DIR/tuned_examples/ tuned_examples/ &&
ray --logging-level=DEBUG up -y "$CLUSTER" &&
ray rsync_up "$CLUSTER" $RLLIB_DIR/tuned_examples/ tuned_examples/ &&
sleep 1 &&
ray --logging-level=DEBUG exec $CLUSTER "rllib || true" &&
ray --logging-level=DEBUG exec $CLUSTER "
ray --logging-level=DEBUG exec "$CLUSTER" "rllib || true" &&
ray --logging-level=DEBUG exec "$CLUSTER" "
rllib train -f tuned_examples/atari-impala-large.yaml --redis-address='localhost:6379' --queue-trials" &&
echo "PASS: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE
} || echo "FAIL: IMPALA Test for" $PYTHON_VERSION >> $RESULT_FILE
echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
} || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"

# Tear down cluster.
if [ "$DEBUG_MODE" = "" ]; then
ray down -y $CLUSTER
rm $CLUSTER
ray down -y "$CLUSTER"
rm "$CLUSTER"
else
echo "Not tearing down cluster" $CLUSTER
echo "Not tearing down cluster" "$CLUSTER"
fi
popd
}
Expand All @@ -93,32 +115,34 @@ test_sgd(){

cat application_cluster_template.yaml |
sed -e "
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;
s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
s/<<<HEAD_TYPE>>>/g3.16xlarge/;
s/<<<WORKER_TYPE>>>/g3.16xlarge/;
s/<<<HEAD_TYPE>>>/p3.16xlarge/;
s/<<<WORKER_TYPE>>>/p3.16xlarge/;
s/<<<MIN_WORKERS>>>/3/;
s/<<<MAX_WORKERS>>>/3/;
s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > $CLUSTER
s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > "$CLUSTER"

echo "Try running SGD stress test."
{
SGD_DIR=$ROOT_DIR/../../python/ray/experimental/sgd/
ray --logging-level=DEBUG up -y $CLUSTER &&
ray --logging-level=DEBUG up -y "$CLUSTER" &&
# TODO: fix submit so that args work
ray rsync_up $CLUSTER $SGD_DIR/mnist_example.py mnist_example.py &&
ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py &&
sleep 1 &&
ray --logging-level=DEBUG exec $CLUSTER "
ray --logging-level=DEBUG exec "$CLUSTER" "
python mnist_example.py --redis-address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" &&
echo "PASS: SGD Test for" $PYTHON_VERSION >> $RESULT_FILE
} || echo "FAIL: SGD Test for" $PYTHON_VERSION >> $RESULT_FILE
echo "PASS: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
} || echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"

# Tear down cluster.
if [ "$DEBUG_MODE" = "" ]; then
ray down -y $CLUSTER
rm $CLUSTER
ray down -y "$CLUSTER"
rm "$CLUSTER"
else
echo "Not tearing down cluster" $CLUSTER
echo "Not tearing down cluster" "$CLUSTER"
fi
popd
}
Expand All @@ -130,6 +154,6 @@ do
test_sgd $PYTHON_VERSION
done

cat $RESULT_FILE
cat $RESULT_FILE | grep FAIL > test.log
cat "$RESULT_FILE"
cat "$RESULT_FILE" | grep FAIL > test.log
[ ! -s test.log ] || exit 1
47 changes: 34 additions & 13 deletions ci/stress_tests/run_stress_tests.sh
Original file line number Diff line number Diff line change
@@ -1,40 +1,61 @@
#!/usr/bin/env bash

# Show explicitly which commands are currently running.
set -x

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
RESULT_FILE=$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log
echo "Logging to" $RESULT_FILE
echo -e $RAY_AWS_SSH_KEY > /root/.ssh/ray-autoscaler_us-west-2.pem && chmod 400 /root/.ssh/ray-autoscaler_us-west-2.pem || true

touch "$RESULT_FILE"
echo "Logging to" "$RESULT_FILE"

# Show explicitly which commands are currently running. This should only be AFTER
# the private key is placed.
set -x
if [[ -z "$1" ]]; then
echo "ERROR: The first argument must be the Ray version string."
exit 1
else
RAY_VERSION=$1
fi

touch $RESULT_FILE
if [[ -z "$2" ]]; then
echo "ERROR: The second argument must be the commit hash to test."
exit 1
else
RAY_COMMIT=$2
fi

echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/"

run_test(){
local test_name=$1

local CLUSTER="stress_testing_config.yaml"
local CLUSTER="stress_testing_config_temporary.yaml"

cat stress_testing_config.yaml |
sed -e "
s/<<<RAY_VERSION>>>/$RAY_VERSION/g;
s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;" > "$CLUSTER"

echo "Try running $test_name."
{
ray up -y $CLUSTER --cluster-name "$test_name" &&
sleep 1 &&
ray --logging-level=DEBUG submit $CLUSTER --cluster-name "$test_name" "$test_name.py"
} || echo "FAIL: $test_name" >> $RESULT_FILE
ray --logging-level=DEBUG submit "$CLUSTER" --cluster-name "$test_name" "$test_name.py"
} || echo "FAIL: $test_name" >> "$RESULT_FILE"

# Tear down cluster.
if [ "$DEBUG_MODE" = "" ]; then
ray down -y $CLUSTER --cluster-name "$test_name"
rm "$CLUSTER"
else
echo "Not tearing down cluster" $CLUSTER
echo "Not tearing down cluster" "$CLUSTER"
fi
}

pushd "$ROOT_DIR"
run_test test_many_tasks_and_transfers
run_test test_many_tasks
run_test test_dead_actors
popd

cat $RESULT_FILE
[ ! -s $RESULT_FILE ] || exit 1
cat "$RESULT_FILE"
[ ! -s "$RESULT_FILE" ] || exit 1
2 changes: 1 addition & 1 deletion ci/stress_tests/stress_testing_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ setup_commands:
# - ray/ci/travis/install-bazel.sh
- pip install boto3==1.4.8 cython==0.29.0
# - cd ray/python; git checkout master; git pull; pip install -e . --verbose
- pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.8.0.dev1-cp36-cp36m-manylinux1_x86_64.whl
- pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
- echo "sudo halt" | at now + 60 minutes

# Custom commands that will be run on the head node after common setup.
Expand Down
File renamed without changes.
Loading

0 comments on commit 6f48992

Please sign in to comment.