From 8a65918d2cdee3b4aef3b16f39bff8e30f2a3915 Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Thu, 21 Apr 2022 23:11:26 +0000 Subject: [PATCH 1/4] Start XRT server in a separate process in CircleCI CPU test --- .circleci/common.sh | 9 +++++++++ .circleci/test.sh | 2 ++ 2 files changed, 11 insertions(+) diff --git a/.circleci/common.sh b/.circleci/common.sh index 92f2a805a677..3d306ef7532f 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -21,6 +21,11 @@ function apply_patches() { ./xla/scripts/apply_patches.sh } +function clean_xrt_server() { + echo "Cleanning up XRT server" + python -m torch_xla.core.xrt_run_server --stop +} + function rebase_pull_request_on_target_branch() { # TODO: directly use ENV_VAR when CircleCi exposes base branch. # Try rebasing on top of base (dest) branch first. @@ -113,6 +118,7 @@ function run_torch_xla_tests() { export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" XLA_PORT=$(shuf -i 40701-40999 -n 1) export XRT_WORKERS="localservice:0;grpc://localhost:$XLA_PORT" + python -m torch_xla.core.xrt_run_server --port $XLA_PORT --restart fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" @@ -141,6 +147,9 @@ function run_torch_xla_tests() { fi fi + # clear the XRT server before cpp test since CPP test won't run torch_xla's + # __init__.py hence will force a in process server. + clean_xrt_server pushd test/cpp echo "Running C++ Tests" ./run_tests.sh diff --git a/.circleci/test.sh b/.circleci/test.sh index b01c97674bac..3b5f92ccea14 100755 --- a/.circleci/test.sh +++ b/.circleci/test.sh @@ -12,3 +12,5 @@ source "$PYTORCH_DIR/.jenkins/pytorch/common_utils.sh" install_torchvision run_torch_xla_tests $PYTORCH_DIR $XLA_DIR + +trap clean_xrt_server ERR EXIT From cbc941184a651d64ab4d71d9033fe0dc647786c4 Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Fri, 22 Apr 2022 22:29:05 +0000 Subject: [PATCH 2/4] Update python module command to use file directly --- .circleci/common.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index 3d306ef7532f..b1018a84696b 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -118,7 +118,7 @@ function run_torch_xla_tests() { export XRT_DEVICE_MAP="CPU:0;/job:localservice/replica:0/task:0/device:XLA_CPU:0" XLA_PORT=$(shuf -i 40701-40999 -n 1) export XRT_WORKERS="localservice:0;grpc://localhost:$XLA_PORT" - python -m torch_xla.core.xrt_run_server --port $XLA_PORT --restart + python torch_xla/core/xrt_run_server.py --port $XLA_PORT --restart fi export PYTORCH_TESTING_DEVICE_ONLY_FOR="xla" @@ -148,8 +148,10 @@ function run_torch_xla_tests() { fi # clear the XRT server before cpp test since CPP test won't run torch_xla's - # __init__.py hence will force a in process server. - clean_xrt_server + # __init__.py hence will force a in process server. Note that we can not use + # -m here since we are in the XLA dir. Trying to run the torch_xla module + # from this dir will result in a `version.py` missing error. + python torch_xla/core/xrt_run_server.py --stop pushd test/cpp echo "Running C++ Tests" ./run_tests.sh From c683c099c7c143d8321a214e28e01b3616c88d38 Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Mon, 25 Apr 2022 20:50:06 +0000 Subject: [PATCH 3/4] Don't do the clean up since test will be run inside a docker and it should be clean at the end of python test already --- .circleci/test.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/.circleci/test.sh b/.circleci/test.sh index 3b5f92ccea14..b01c97674bac 100755 --- a/.circleci/test.sh +++ b/.circleci/test.sh @@ -12,5 +12,3 @@ source "$PYTORCH_DIR/.jenkins/pytorch/common_utils.sh" install_torchvision run_torch_xla_tests $PYTORCH_DIR $XLA_DIR - -trap clean_xrt_server ERR EXIT From 6b1b72e31cf8f6f602d20ae0acd083a56d18088b Mon Sep 17 00:00:00 2001 From: JackCaoG Date: Tue, 26 Apr 2022 01:46:36 +0000 Subject: [PATCH 4/4] Remove clean_xrt_server as it is not needed anymore --- .circleci/common.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.circleci/common.sh b/.circleci/common.sh index b1018a84696b..4e6e3512f895 100755 --- a/.circleci/common.sh +++ b/.circleci/common.sh @@ -21,11 +21,6 @@ function apply_patches() { ./xla/scripts/apply_patches.sh } -function clean_xrt_server() { - echo "Cleanning up XRT server" - python -m torch_xla.core.xrt_run_server --stop -} - function rebase_pull_request_on_target_branch() { # TODO: directly use ENV_VAR when CircleCi exposes base branch. # Try rebasing on top of base (dest) branch first.