From cb952877233241bf36a99fa904aa6f7ad5d085bb Mon Sep 17 00:00:00 2001 From: JackCaoG <59073027+JackCaoG@users.noreply.github.com> Date: Wed, 18 May 2022 18:28:24 -0700 Subject: [PATCH 1/3] Revert "Revert "Start XRT server in a separate process in CircleCI CPU test (#3519)" (#3536)" This reverts commit 093d443bfe4f21444bdbecf969b4d50363439212. --- .circleci/common.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.circleci/common.sh b/.circleci/common.sh index 04d47ff8cf09..eb9f98a04bc0 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -113,6 +113,7 @@ function run_torch_xla_tests() { export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" XLA_PORT=$(shuf -i 40701-40999 -n 1) export XRT_WORKERS="localservice:0;grpc://localhost:$XLA_PORT" + python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" @@ -141,6 +142,11 @@ function run_torch_xla_tests() { fi fi + # clear the XRT server before cpp test since CPP test won't run torch_xla's + # __init__.py hence will force an in-process server. Note that we cannot use + # -m here since we are in the XLA dir. Trying to run the torch_xla module + # from this dir will result in a `version.py` missing error. 
+ python torch_xla/core/xrt_run_server.py --stop pushd test/cpp echo "Running C++ Tests" ./run_tests.sh From 8c13b0efaf4acfc29a8a1056ba6cadedf12aa509 Mon Sep 17 00:00:00 2001 From: JackCaoG <59073027+JackCaoG@users.noreply.github.com> Date: Wed, 18 May 2022 18:29:31 -0700 Subject: [PATCH 2/3] Update common.sh --- .circleci/common.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index eb9f98a04bc0..ffd410183a44 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -111,13 +111,14 @@ function run_torch_xla_tests() { export GPU_NUM_DEVICES=2 else export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" - XLA_PORT=$(shuf -i 40701-40999 -n 1) + export XLA_PORT=$(shuf -i 40701-40999 -n 1) export XRT_WORKERS="localservice:0;grpc://localhost:$XLA_PORT" - python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" pushd $XLA_DIR + echo "Starting GRPC server" + python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart echo "Running Python Tests" ./test/run_tests.sh # only run test_autocast for cpu and gpu on circleCI. From 298c118f15e3d104789f1689121d512f2d86ea45 Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Thu, 19 May 2022 22:49:19 +0000 Subject: [PATCH 3/3] Remove pjrt test for now --- .circleci/common.sh | 6 ++++-- test/run_tests.sh | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index ffd410183a44..445c4b841550 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -117,8 +117,10 @@ function run_torch_xla_tests() { export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" pushd $XLA_DIR - echo "Starting GRPC server" - python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart + if [[ ! 
-z "${XLA_PORT}" ]]; then + echo "Starting GRPC server" + python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart + fi echo "Running Python Tests" ./test/run_tests.sh # only run test_autocast for cpu and gpu on circleCI. diff --git a/test/run_tests.sh b/test/run_tests.sh index e0c03d67860f..b88d44a2c5d9 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -101,7 +101,6 @@ function run_all_tests { # TODO: enable this test after tf update, currently optimization_barrier does not # work on CPU. # run_test python3 "$CDIR/test_checkpoint.py" - run_pjrt python3 "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY run_test python3 "$CDIR/test_mp_replication.py" run_test python3 "$CDIR/test_mp_all_to_all.py" run_test python3 "$CDIR/test_mp_collective_permute.py"