diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py index 670fff29771a..4ee461445132 100644 --- a/tests/ignite/conftest.py +++ b/tests/ignite/conftest.py @@ -236,6 +236,8 @@ def distributed_context_multi_node_nccl(multi_node_conf): assert "MASTER_ADDR" in os.environ assert "MASTER_PORT" in os.environ + os.environ["MASTER_PORT"] = str(int(os.getenv("MASTER_PORT")) + 1) + dist_info = { "backend": "nccl", "init_method": "env://", diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py index 17ecdfb0059d..53e9ef9c998e 100644 --- a/tests/ignite/engine/test_deterministic.py +++ b/tests/ignite/engine/test_deterministic.py @@ -573,6 +573,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo): _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") +@pytest.mark.xfail @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 714b949d4aaa..592824b1b02e 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -75,7 +75,7 @@ def test_reset(): loss.compute() -def _test_distrib_compute_on_criterion(device): +def _test_distrib_compute_on_criterion(device, tol=None): def _test(metric_device): criterion = nn.NLLLoss().to(device) loss = Loss(criterion, device=metric_device) @@ -104,7 +104,10 @@ def _test(metric_device): y_pred = idist.all_gather(y_pred) y = idist.all_gather(y) true_loss_value = criterion(y_pred, y) - assert_almost_equal(res, true_loss_value.item()) + if tol is None: + assert_almost_equal(res, true_loss_value.item()) + else: + assert pytest.approx(res, rel=tol) == true_loss_value.item() _test("cpu") if device.type != "xla": @@ -178,7 +181,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): device = torch.device("cpu") - _test_distrib_compute_on_criterion(device) + _test_distrib_compute_on_criterion(device, tol=1e-6) _test_distrib_accumulator_device(device) diff --git a/tests/run_multinode_tests_in_docker.sh b/tests/run_multinode_tests_in_docker.sh index fa0cfe28f8c1..0dca1b603278 100644 --- a/tests/run_multinode_tests_in_docker.sh +++ b/tests/run_multinode_tests_in_docker.sh @@ -1,9 +1,26 @@ #!/bin/bash # Tests configuration: -export nnodes=2 -export nproc_per_node=4 -export gpu=0 +if [[ -z "$1" || "$1" -lt 2 ]]; then + echo "nnodes setting default to 2" + export nnodes=2 +else + export nnodes=$1 +fi + +if [[ -z "$2" || "$2" -lt 1 ]]; then + echo "nproc_per_node setting default to 4" + export nproc_per_node=4 +else + export nproc_per_node=$2 +fi + +if [ -z "$3" ]; then + echo "gpu setting default to 0 ( False )" + export gpu=0 +else + export gpu=$3 +fi # Start script from ignite root folder if [ ! -d tests ]; then @@ -11,10 +28,15 @@ if [ ! -d tests ]; then exit 1 fi -docker_image="pytorch/pytorch:latest" -install_test_requirements="pip install mock pytest pytest-xdist scikit-learn" -cmd="pytest --dist=each --tx $nproc_per_node*popen//python=python3.6 tests -m multinode_distributed -vvv $@" +docker_image="pytorchignite/tests:latest" +docker build -t $docker_image -<