From e31bfc425fbc7c9f60a4e4b6315b4f8e36181312 Mon Sep 17 00:00:00 2001
From: vfdev-5
Date: Sat, 29 May 2021 17:07:00 -0500
Subject: [PATCH 1/5] WIP enabled gpu device for gloo backend

---
 ignite/distributed/auto.py               |  6 ++++--
 ignite/distributed/comp_models/native.py | 14 +++++++++-----
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py
index 1d6a585b0e63..7953eca5545c 100644
--- a/ignite/distributed/auto.py
+++ b/ignite/distributed/auto.py
@@ -188,7 +188,8 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod
     # distributed data parallel model
     if idist.get_world_size() > 1:
         bnd = idist.backend()
-        if idist.has_native_dist_support and bnd == idist_native.NCCL:
+        # if idist.has_native_dist_support and bnd == idist_native.NCCL:
+        if idist.has_native_dist_support and torch.cuda.is_available():
             if sync_bn:
                 logger.info("Convert batch norm to sync batch norm")
                 model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
@@ -199,7 +200,8 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod
             lrank = idist.get_local_rank()
             logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}")
             model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank,], **kwargs)
-        elif idist.has_native_dist_support and bnd == idist_native.GLOO:
+        # elif idist.has_native_dist_support and bnd == idist_native.GLOO:
+        elif idist.has_native_dist_support:
             if sync_bn:
                 logger.info("Convert batch norm to sync batch norm")
                 model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py
index c723f4184347..c2d8bfd7ebfb 100644
--- a/ignite/distributed/comp_models/native.py
+++ b/ignite/distributed/comp_models/native.py
@@ -31,7 +31,7 @@ class _NativeDistModel(ComputationModel):
     In this implementation we assume the following mapping between backend and devices:
 
     - NCCL <-> GPU
-    - GLOO <-> CPU
+    - GLOO <-> CPU or GPU
     - MPI <-> CPU
 
     """
@@ -127,7 +127,8 @@ def _create_from_backend(
         # https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
         dist.barrier()
 
-        if backend == dist.Backend.NCCL:
+        # if backend in (dist.Backend.NCCL, dist.Backend.GLOO) and torch.cuda.is_available():
+        if torch.cuda.is_available():
             torch.cuda.set_device(self._local_rank)
 
         self._setup_attrs()
@@ -140,7 +141,8 @@ def _init_from_context(self) -> None:
     def _compute_nproc_per_node(self) -> int:
         local_rank = self.get_local_rank()
         device = torch.device("cpu")
-        if self.backend() == dist.Backend.NCCL:
+        # if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             # we manually set cuda device to local rank in order to avoid a hang on all_reduce
             device = torch.device(f"cuda:{local_rank}")
         tensor = torch.tensor([self.get_local_rank() + 1]).to(device)
@@ -151,7 +153,8 @@ def _get_all_hostnames(self) -> List[Tuple[str, ...]]:
         import socket
 
         device = "cpu"
-        if self.backend() == dist.Backend.NCCL:
+        # if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             index = torch.cuda.current_device()
             device = f"cuda:{index}"
         hostname = socket.gethostname()
@@ -281,7 +284,8 @@ def get_node_rank(self) -> int:
         return cast(int, self._node)
 
     def device(self) -> torch.device:
-        if self.backend() == dist.Backend.NCCL:
+        # if self.backend() == dist.Backend.NCCL:
+        if torch.cuda.is_available():
             index = torch.cuda.current_device()
             if index < self.get_local_rank():
                 warnings.warn(

From 
741591cd34b898af5cb9f0a82d187eac5ca3bfba Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Sun, 30 May 2021 22:35:57 +0000 Subject: [PATCH 2/5] WIP on adapting and improving distributed tests --- tests/ignite/conftest.py | 9 ++-- tests/ignite/contrib/engines/test_common.py | 33 +++--------- .../contrib/handlers/test_clearml_logger.py | 13 +++-- .../ignite/contrib/handlers/test_lr_finder.py | 10 ++-- .../contrib/handlers/test_neptune_logger.py | 9 ++-- .../regression/test_canberra_metric.py | 15 +++--- .../test_fractional_absolute_error.py | 16 +++--- .../regression/test_fractional_bias.py | 15 +++--- .../test_geometric_mean_absolute_error.py | 14 ++--- .../regression/test_manhattan_distance.py | 15 +++--- .../regression/test_maximum_absolute_error.py | 15 +++--- .../test_mean_absolute_relative_error.py | 15 +++--- .../regression/test_mean_normalized_bias.py | 15 +++--- .../regression/test_median_absolute_error.py | 15 +++--- .../test_median_absolute_percentage_error.py | 15 +++--- .../test_median_relative_absolute_error.py | 15 +++--- .../metrics/regression/test_r2_score.py | 15 +++--- .../regression/test_wave_hedges_distance.py | 15 +++--- .../contrib/metrics/test_average_precision.py | 12 ++--- .../contrib/metrics/test_cohen_kappa.py | 12 ++--- tests/ignite/contrib/metrics/test_roc_auc.py | 12 ++--- .../distributed/comp_models/test_base.py | 2 +- .../distributed/comp_models/test_native.py | 31 ++++++----- tests/ignite/distributed/test_auto.py | 16 +++--- tests/ignite/distributed/test_launcher.py | 16 +++--- tests/ignite/distributed/utils/__init__.py | 7 ++- tests/ignite/distributed/utils/test_native.py | 53 +++++++++++-------- tests/ignite/engine/test_custom_events.py | 15 +++--- tests/ignite/engine/test_deterministic.py | 24 +++++---- tests/ignite/engine/test_engine.py | 4 +- tests/ignite/handlers/test_checkpoint.py | 8 +-- tests/ignite/handlers/test_early_stopping.py | 16 +++--- tests/ignite/metrics/nlp/test_bleu.py | 16 +++--- tests/ignite/metrics/nlp/test_rouge.py | 16 +++--- tests/ignite/metrics/test_accumulation.py | 35 +++--------- tests/ignite/metrics/test_accuracy.py | 31 ++--------- .../metrics/test_classification_report.py | 34 ++++++------ tests/ignite/metrics/test_confusion_matrix.py | 26 ++------- tests/ignite/metrics/test_epoch_metric.py | 12 +++-- tests/ignite/metrics/test_fbeta.py | 26 +++------ tests/ignite/metrics/test_loss.py | 28 +++------- .../metrics/test_mean_absolute_error.py | 17 +++--- .../metrics/test_mean_pairwise_distance.py | 16 +++--- .../ignite/metrics/test_mean_squared_error.py | 15 +++--- tests/ignite/metrics/test_metric.py | 14 ++--- tests/ignite/metrics/test_metrics_lambda.py | 14 ++--- .../test_multilabel_confusion_matrix.py | 14 ++--- tests/ignite/metrics/test_precision.py | 16 +++--- tests/ignite/metrics/test_psnr.py | 17 +++--- tests/ignite/metrics/test_recall.py | 16 +++--- .../metrics/test_root_mean_squared_error.py | 14 ++--- tests/ignite/metrics/test_running_average.py | 14 ++--- tests/ignite/metrics/test_ssim.py | 15 +++--- .../test_top_k_categorical_accuracy.py | 18 ++++--- 54 files changed, 475 insertions(+), 446 deletions(-) diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py index a909d1c695e6..5e0fed143e91 100644 --- a/tests/ignite/conftest.py +++ b/tests/ignite/conftest.py @@ -94,7 +94,8 @@ def _create_dist_context(dist_info, lrank): dist.init_process_group(**dist_info) dist.barrier() - if dist_info["backend"] == "nccl": + # if dist_info["backend"] == "nccl": + if torch.cuda.is_available(): torch.cuda.set_device(lrank) return 
{"local_rank": lrank, "world_size": dist_info["world_size"], "rank": dist_info["rank"]} @@ -150,8 +151,6 @@ def distributed_context_single_node_nccl(local_rank, world_size): free_port = _setup_free_port(local_rank) - print(local_rank, "Port:", free_port) - dist_info = { "backend": "nccl", "world_size": world_size, @@ -174,7 +173,6 @@ def distributed_context_single_node_gloo(local_rank, world_size): init_method = f'file:///{temp_file.name.replace(backslash, "/")}' else: free_port = _setup_free_port(local_rank) - print(local_rank, "Port:", free_port) init_method = f"tcp://localhost:{free_port}" temp_file = None @@ -213,7 +211,8 @@ def _create_mnodes_dist_context(dist_info, mnodes_conf): dist.init_process_group(**dist_info) dist.barrier() - if dist_info["backend"] == "nccl": + # if dist_info["backend"] == "nccl": + if torch.cuda.is_available(): torch.cuda.device(mnodes_conf["local_rank"]) return mnodes_conf diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index 292a3ab553dd..1c514e65e97b 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -56,7 +56,7 @@ def _test_setup_common_training_handlers( num_epochs = 10 model = DummyModel().to(device) - if distributed and "cuda" in device: + if distributed and "cuda" in torch.device(device).type: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank,], output_device=local_rank) optimizer = torch.optim.SGD(model.parameters(), lr=lr) @@ -581,17 +581,19 @@ def test_setup_neptune_logging(dirname): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(dirname, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(dirname, distributed_context_single_node_nccl): + local_rank = distributed_context_single_node_nccl["local_rank"] - device = f"cuda:{local_rank}" + device = idist.device() _test_setup_common_training_handlers(dirname, device, rank=local_rank, local_rank=local_rank, distributed=True) test_add_early_stopping_by_val_score() @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(dirname, distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo): + + device = idist.device() local_rank = distributed_context_single_node_gloo["local_rank"] _test_setup_common_training_handlers(dirname, device, rank=local_rank, local_rank=local_rank, distributed=True) _test_setup_common_training_handlers( @@ -601,24 +603,3 @@ def test_distrib_cpu(dirname, distributed_context_single_node_gloo): dirname, device, rank=local_rank, local_rank=local_rank, distributed=True, lr_scheduler="ignite" ) test_add_early_stopping_by_val_score() - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(dirname, distributed_context_multi_node_gloo): - device = "cpu" - rank = distributed_context_multi_node_gloo["rank"] - _test_setup_common_training_handlers(dirname, device, rank=rank) - test_add_early_stopping_by_val_score() - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not 
idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(dirname, distributed_context_multi_node_nccl): - local_rank = distributed_context_multi_node_nccl["local_rank"] - rank = distributed_context_multi_node_nccl["rank"] - device = f"cuda:{local_rank}" - _test_setup_common_training_handlers(dirname, device, rank=rank, local_rank=local_rank, distributed=True) - test_add_early_stopping_by_val_score() diff --git a/tests/ignite/contrib/handlers/test_clearml_logger.py b/tests/ignite/contrib/handlers/test_clearml_logger.py index 0d2f9e7c8fcd..9ac30f7cda56 100644 --- a/tests/ignite/contrib/handlers/test_clearml_logger.py +++ b/tests/ignite/contrib/handlers/test_clearml_logger.py @@ -888,18 +888,21 @@ def update_fn(engine, batch): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu") - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu", on_zero_rank=True) +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_save_model_optimizer_lr_scheduler_with_state_dict(device) + _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + device = idist.device() _test_save_model_optimizer_lr_scheduler_with_state_dict(device) - _test_save_model_optimizer_lr_scheduler_with_state_dict("cpu", on_zero_rank=True) + _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=True) @pytest.mark.tpu diff --git a/tests/ignite/contrib/handlers/test_lr_finder.py b/tests/ignite/contrib/handlers/test_lr_finder.py index 8ffd2536901d..c0ac998bb4af 100644 --- a/tests/ignite/contrib/handlers/test_lr_finder.py +++ b/tests/ignite/contrib/handlers/test_lr_finder.py @@ -539,8 +539,9 @@ def forward(self, x): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(device) @@ -548,8 +549,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_log_lr_and_loss(device) _test_distrib_integration_mnist(device) diff --git a/tests/ignite/contrib/handlers/test_neptune_logger.py b/tests/ignite/contrib/handlers/test_neptune_logger.py index b26e1e8546ca..c50ac64bcfda 100644 --- 
a/tests/ignite/contrib/handlers/test_neptune_logger.py +++ b/tests/ignite/contrib/handlers/test_neptune_logger.py @@ -516,13 +516,16 @@ def test_no_neptune_client(no_site_packages): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_neptune_saver_integration("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_neptune_saver_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + device = idist.device() _test_neptune_saver_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py index 677e1c274313..4bc403296b7a 100644 --- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py +++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py @@ -184,17 +184,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -215,7 +216,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -224,7 +226,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py index 39c0ec6d324a..f0e15e57f950 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py @@ -191,16 
+191,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -220,7 +222,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -229,7 +232,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py index b3b32c504a32..d5bbf35a1271 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py @@ -197,17 +197,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -228,7 +229,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) 
_test_distrib_integration(device) @@ -237,7 +239,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py index 299098697df3..d1dd5a65c6a6 100644 --- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py @@ -194,17 +194,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -225,7 +226,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") _test_distrib_compute(device) _test_distrib_integration(device) @@ -234,7 +235,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py index ce079eaaae4c..a46990707830 100644 --- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py @@ -185,17 +185,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - 
device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -216,7 +217,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -225,7 +227,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py index 20c44d862148..ab4f0db6500a 100644 --- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py @@ -181,17 +181,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -212,7 +213,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -221,7 +223,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py index 4841139e712f..8af23968924e 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py @@ -205,17 +205,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -236,7 +237,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -245,7 +247,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py index 41d7a46c7725..878a51436bd3 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py @@ -199,17 +199,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) 
@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -230,7 +231,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -239,7 +241,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py index a77c1de06bdd..a58ed5860991 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py @@ -199,17 +199,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -230,7 +231,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -239,7 +241,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = 
idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py index da1f89c9098b..f6a8210cf5d9 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py @@ -209,17 +209,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -240,7 +241,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -249,7 +251,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py index b311761f414d..792e9ffa6477 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py @@ -200,17 +200,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def 
test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -231,7 +232,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -240,7 +242,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_r2_score.py b/tests/ignite/contrib/metrics/regression/test_r2_score.py index ad5e3fdf5460..097f41eac686 100644 --- a/tests/ignite/contrib/metrics/regression/test_r2_score.py +++ b/tests/ignite/contrib/metrics/regression/test_r2_score.py @@ -171,17 +171,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -202,7 +203,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -211,7 +213,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py index 
1fc746d5cbaf..0a62dd215aed 100644 --- a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py @@ -166,17 +166,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -197,7 +198,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) @@ -206,7 +208,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_compute(device) _test_distrib_integration(device) diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py index c1f2fcab1c55..2d988f0811c3 100644 --- a/tests/ignite/contrib/metrics/test_average_precision.py +++ b/tests/ignite/contrib/metrics/test_average_precision.py @@ -278,18 +278,18 @@ def update_fn(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -311,7 +311,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def 
test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -321,7 +321,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py index e237794f25fc..72f1c4b6e380 100644 --- a/tests/ignite/contrib/metrics/test_cohen_kappa.py +++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py @@ -273,18 +273,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -310,7 +310,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) @@ -320,7 +320,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_input(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py b/tests/ignite/contrib/metrics/test_roc_auc.py index 957a97bb6b7c..1fab388b8831 100644 --- a/tests/ignite/contrib/metrics/test_roc_auc.py +++ b/tests/ignite/contrib/metrics/test_roc_auc.py @@ -291,18 +291,18 @@ def update_fn(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) 
_test_distrib_integration_binary_input(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -324,7 +324,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) @@ -334,7 +334,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) _test_distrib_integration_binary_input(device) diff --git a/tests/ignite/distributed/comp_models/test_base.py b/tests/ignite/distributed/comp_models/test_base.py index b944ea61cf9f..6a7ca20d35dd 100644 --- a/tests/ignite/distributed/comp_models/test_base.py +++ b/tests/ignite/distributed/comp_models/test_base.py @@ -34,7 +34,7 @@ def test_serial_model(): def test__encode_str__decode_str(): - device = torch.device("cpu") + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") s = "test-abcedfg" encoded_s = ComputationModel._encode_str(s, device, 1024) diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py index 61499f35c0d6..4aacd5e2909b 100644 --- a/tests/ignite/distributed/comp_models/test_native.py +++ b/tests/ignite/distributed/comp_models/test_native.py @@ -279,16 +279,18 @@ def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_si @pytest.mark.distributed @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config") def test__native_dist_model_create_no_dist_gloo(clean_env): - _test__native_dist_model_create_from_backend_no_dist("gloo", "cpu") - _test__native_dist_model_create_from_context_no_dist("gloo", "cpu") + device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + _test__native_dist_model_create_from_backend_no_dist("gloo", device) + _test__native_dist_model_create_from_context_no_dist("gloo", device) @pytest.mark.distributed @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test__native_dist_model_create_no_dist_nccl(clean_env): - _test__native_dist_model_create_from_backend_no_dist("nccl", "cuda:0") - _test__native_dist_model_create_from_context_no_dist("nccl", "cuda:0") + device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + _test__native_dist_model_create_from_backend_no_dist("nccl", device) + _test__native_dist_model_create_from_context_no_dist("nccl", device) @pytest.mark.distributed @@ -297,12 +299,15 @@ def test__native_dist_model_create_dist_gloo_1(init_method, get_fixed_dirname, l if init_method == "FILE": init_method = 
f"file://{get_fixed_dirname('native_dist_model_create_dist_gloo_1')}/shared" - _test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", "cpu") + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") + _test__native_dist_model_create_from_backend_dist(init_method, local_rank, local_rank, world_size, "gloo", device) @pytest.mark.distributed def test__native_dist_model_create_dist_gloo_2(local_rank, world_size): - _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu") + + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") + _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", device) @pytest.mark.distributed @@ -354,10 +359,7 @@ def _test_dist_spawn_fn(local_rank, backend, world_size, device): assert _model.get_local_rank() == local_rank assert _model.get_world_size() == world_size - if backend == "nccl": - assert _model.device() == torch.device(f"{device}:{local_rank}") - elif backend == "gloo": - assert _model.device() == torch.device(device) + assert _model.device().type == torch.device(device).type def _test__native_dist_model_spawn(backend, num_workers_per_machine, device, init_method=None, **spawn_kwargs): @@ -379,10 +381,15 @@ def test__native_dist_model_spawn_gloo(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - _test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu", init_method=init_method) + nproc = torch.cuda.device_count() if torch.cuda.is_available() else 4 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") _test__native_dist_model_spawn( - "gloo", num_workers_per_machine=4, device="cpu", start_method="fork", init_method=init_method + "gloo", num_workers_per_machine=nproc, device=device, init_method=init_method ) + if device.type == "cpu": + _test__native_dist_model_spawn( + "gloo", num_workers_per_machine=nproc, device=device, start_method="fork", init_method=init_method + ) @pytest.mark.distributed diff --git a/tests/ignite/distributed/test_auto.py b/tests/ignite/distributed/test_auto.py index 0a979c1de835..e77ea3ac6dc3 100644 --- a/tests/ignite/distributed/test_auto.py +++ b/tests/ignite/distributed/test_auto.py @@ -78,7 +78,7 @@ def _test_auto_dataloader(ws, nproc, batch_size, num_workers=1, sampler_name=Non def _test_auto_model(model, ws, device, sync_bn=False, **kwargs): model = auto_model(model, sync_bn=sync_bn, **kwargs) bnd = idist.backend() - if ws > 1 and device in ("cuda", "cpu"): + if ws > 1 and torch.device(device).type in ("cuda", "cpu"): if idist.has_native_dist_support and bnd in ("nccl", "gloo"): assert isinstance(model, nn.parallel.DistributedDataParallel) if sync_bn: @@ -93,8 +93,8 @@ def _test_auto_model(model, ws, device, sync_bn=False, **kwargs): assert isinstance(model, nn.Module) assert all( - [p.device.type == device for p in model.parameters()] - ), f"{[p.device.type for p in model.parameters()]} vs {device}" + [p.device.type == torch.device(device).type for p in model.parameters()] + ), f"{[p.device.type for p in model.parameters()]} vs {torch.device(device).type}" def _test_auto_model_optimizer(ws, device): @@ -103,7 +103,7 @@ def _test_auto_model_optimizer(ws, device): _test_auto_model(model, ws, device) model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100)) - _test_auto_model(model, ws, device, sync_bn="cuda" in device) + _test_auto_model(model, ws, device, 
sync_bn="cuda" in torch.device(device).type) if ws > 1: _test_auto_model(model, ws, device, find_unused_parameters=True) _test_auto_model(model, ws, device, find_unused_parameters=False) @@ -138,9 +138,10 @@ def test_auto_methods_gloo(distributed_context_single_node_gloo): _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=2) _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, sampler_name="WeightedRandomSampler") - _test_auto_model_optimizer(ws, "cpu") + device = idist.device() + _test_auto_model_optimizer(ws, device) - if ws > 1: + if ws > 1 and device.type == "cpu": with pytest.raises(AssertionError, match=r"SyncBatchNorm layers only work with GPU modules"): model = nn.Sequential(nn.Linear(20, 100), nn.BatchNorm1d(100)) auto_model(model, sync_bn=True) @@ -156,7 +157,8 @@ def test_auto_methods_nccl(distributed_context_single_node_nccl): _test_auto_dataloader(ws=ws, nproc=ws, batch_size=10, num_workers=10) _test_auto_dataloader(ws=ws, nproc=ws, batch_size=1, sampler_name="WeightedRandomSampler") - _test_auto_model_optimizer(ws, "cuda") + device = idist.device() + _test_auto_model_optimizer(ws, device) if ws > 1: with pytest.raises(ValueError, match=r"Argument kwargs should not contain 'device_ids'"): diff --git a/tests/ignite/distributed/test_launcher.py b/tests/ignite/distributed/test_launcher.py index 5a268f59632e..5dbd8146d751 100644 --- a/tests/ignite/distributed/test_launcher.py +++ b/tests/ignite/distributed/test_launcher.py @@ -92,7 +92,8 @@ def test_check_idist_parallel_torch_launch_n_procs_gloo(init_method, dirname, ex if init_method == "FILE": init_method = f"file://{dirname}/shared" - _test_check_idist_parallel_torch_launch(init_method, exec_filepath, "gloo", 4) + np = torch.cuda.device_count() if torch.cuda.is_available() else 4 + _test_check_idist_parallel_torch_launch(init_method, exec_filepath, "gloo", np) @pytest.mark.distributed @@ -150,7 +151,8 @@ def _test_check_idist_parallel_spawn(fp, backend, nprocs): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_check_idist_parallel_spawn_n_procs_gloo(exec_filepath): - _test_check_idist_parallel_spawn(exec_filepath, "gloo", 4) + np = 4 if not torch.cuda.is_available() else torch.cuda.device_count() + _test_check_idist_parallel_spawn(exec_filepath, "gloo", np) @pytest.mark.distributed @@ -182,7 +184,7 @@ def _test_func(index, ws, device, backend, true_init_method): assert 0 <= index < ws assert index == idist.get_local_rank() assert ws == idist.get_world_size() - assert device in idist.device().type + assert torch.device(device).type == idist.device().type assert backend == idist.backend() if idist.model_name() == "native-dist": @@ -203,8 +205,8 @@ def test_idist_parallel_spawn_n_procs_native(init_method, backend, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - nproc_per_node = 4 if "gloo" == backend else torch.cuda.device_count() - device = "cpu" if "gloo" == backend else "cuda" + nproc_per_node = torch.cuda.device_count() if torch.cuda.is_available() else 4 + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=backend, nproc_per_node=nproc_per_node, init_method=init_method) as parallel: parallel.run(_test_func, ws=nproc_per_node, device=device, backend=backend, true_init_method=init_method) @@ -222,14 +224,14 @@ def test_idist_parallel_n_procs_native(init_method, backend, get_fixed_dirname, 
init_method = f"file://{get_fixed_dirname('idist_parallel_n_procs_native')}/shared" os.environ["RANK"] = str(local_rank) - device = "cuda" if "nccl" in backend else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=backend, init_method=init_method) as parallel: parallel.run(_test_func, ws=world_size, device=device, backend=backend, true_init_method=init_method) @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_idist_parallel_no_dist(): - device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") with idist.Parallel(backend=None) as parallel: parallel.run(_test_func, ws=1, device=device, backend=None, true_init_method=None) diff --git a/tests/ignite/distributed/utils/__init__.py b/tests/ignite/distributed/utils/__init__.py index 7af11ac620bd..91f15958431b 100644 --- a/tests/ignite/distributed/utils/__init__.py +++ b/tests/ignite/distributed/utils/__init__.py @@ -21,11 +21,10 @@ def _test_distrib_config(local_rank, backend, ws, true_device, rank=None, true_i this_device = idist.device() assert isinstance(this_device, torch.device) - if backend in ("nccl", "horovod") and "cuda" in this_device.type: - true_device = torch.device(f"{true_device}:{local_rank}") - assert this_device == true_device, f"{this_device} vs {true_device}" + if backend in ("nccl", "gloo", "horovod") and "cuda" in this_device.type: + assert this_device.type == torch.device(true_device).type, f"{this_device} vs {true_device}" elif backend in ("gloo", "horovod"): - assert this_device == torch.device(true_device) + assert this_device.type == torch.device(true_device).type elif backend == "xla-tpu": assert true_device in this_device.type diff --git a/tests/ignite/distributed/utils/test_native.py b/tests/ignite/distributed/utils/test_native.py index 89a93ce9f08d..4e18c79c31f3 100644 --- a/tests/ignite/distributed/utils/test_native.py +++ b/tests/ignite/distributed/utils/test_native.py @@ -42,8 +42,9 @@ def test_native_distrib_single_node_launch_tool_gloo(init_method, get_fixed_dirn if init_method == "FILE": init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_gloo')}/shared" + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") _test_native_distrib_single_node_launch_tool( - "gloo", "cpu", local_rank, world_size, timeout=timeout, init_method=init_method + "gloo", device, local_rank, world_size, timeout=timeout, init_method=init_method ) @@ -56,11 +57,12 @@ def test_native_distrib_single_node_launch_tool_nccl(init_method, get_fixed_dirn if init_method == "FILE": init_method = f"file://{get_fixed_dirname('native_distrib_single_node_launch_tool_nccl')}/shared" - _test_native_distrib_single_node_launch_tool("nccl", "cuda", local_rank, world_size, init_method=init_method) + device = torch.device(f"cuda:{local_rank}") + _test_native_distrib_single_node_launch_tool("nccl", device, local_rank, world_size, init_method=init_method) def _test_native_distrib_single_node_spawn(init_method, backend, device, **kwargs): - world_size = 4 if device == "cpu" else torch.cuda.device_count() + world_size = 4 if torch.device(device).type == "cpu" else torch.cuda.device_count() idist.spawn( backend, _test_distrib_config, @@ -84,7 +86,8 @@ def test_native_distrib_single_node_spawn_gloo(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - 
_test_native_distrib_single_node_spawn(init_method, "gloo", "cpu", timeout=timeout) + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + _test_native_distrib_single_node_spawn(init_method, "gloo", device, timeout=timeout) @@ -96,7 +99,8 @@ def test_native_distrib_single_node_spawn_nccl(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - _test_native_distrib_single_node_spawn(init_method, "nccl", "cuda") + device = torch.device("cuda") + _test_native_distrib_single_node_spawn(init_method, "nccl", device) @@ -132,7 +136,8 @@ def _test_idist_methods_in_native_context(backend, device, local_rank): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_methods_in_native_gloo_context(distributed_context_single_node_gloo): local_rank = distributed_context_single_node_gloo["local_rank"] - _test_idist_methods_in_native_context("gloo", "cpu", local_rank) + device = torch.device(f"cuda:{local_rank}" if torch.cuda.is_available() else "cpu") + _test_idist_methods_in_native_context("gloo", device, local_rank) @@ -140,7 +145,8 @@ def test_idist_methods_in_native_gloo_context(distributed_context_single_node_gl @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_methods_in_native_nccl_context(distributed_context_single_node_nccl): local_rank = distributed_context_single_node_nccl["local_rank"] - _test_idist_methods_in_native_context("nccl", "cuda", local_rank) + device = torch.device(f"cuda:{local_rank}") + _test_idist_methods_in_native_context("nccl", device, local_rank) def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_rank): @@ -166,8 +172,10 @@ def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_ @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context_single_node_gloo): + local_rank = distributed_context_single_node_gloo["local_rank"] - _test_idist_methods_in_native_context_set_local_rank("gloo", "cpu", local_rank) + device = idist.device() + _test_idist_methods_in_native_context_set_local_rank("gloo", device, local_rank) @@ -175,7 +183,8 @@ def test_idist_methods_in_native_gloo_context_set_local_rank(distributed_context @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context_single_node_nccl): local_rank = distributed_context_single_node_nccl["local_rank"] - _test_idist_methods_in_native_context_set_local_rank("nccl", "cuda", local_rank) + device = idist.device() + _test_idist_methods_in_native_context_set_local_rank("nccl", device, local_rank) @@ -183,7 +192,7 @@ def test_idist_methods_in_native_nccl_context_set_local_rank(distributed_context @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist__model_methods_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib__get_max_length(device) @@ -191,7 +200,7 @@ @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def 
test_idist__model_methods_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib__get_max_length(device) @@ -200,7 +209,7 @@ def test_idist__model_methods_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_all_reduce(device) @@ -208,7 +217,7 @@ def test_idist_all_reduce_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_all_reduce(device) @@ -217,7 +226,7 @@ def test_idist_all_reduce_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_all_gather_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_all_gather(device) @@ -225,7 +234,7 @@ def test_idist_all_gather_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_all_gather_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_all_gather(device) @@ -234,7 +243,7 @@ def test_idist_all_gather_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_broadcast_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_broadcast(device) @@ -242,7 +251,7 @@ def test_idist_broadcast_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_broadcast_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_broadcast(device) @@ -251,7 +260,7 @@ def test_idist_broadcast_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_barrier_nccl(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_barrier(device) @@ -259,7 +268,7 @@ def test_idist_barrier_nccl(distributed_context_single_node_nccl): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_barrier_gloo(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_barrier(device) @@ -325,7 +334,8 @@ def test_idist_methods_overhead_nccl(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) @@ -334,6 +344,7 @@ def test_idist_one_rank_only_gloo(distributed_context_single_node_gloo): @pytest.mark.skipif(not has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test_idist_one_rank_only_nccl(local_rank, distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" + + device = idist.device() _test_distrib_one_rank_only(device=device) _test_distrib_one_rank_only_with_engine(device=device) diff --git a/tests/ignite/engine/test_custom_events.py b/tests/ignite/engine/test_custom_events.py index 2a09e116b1ca..6c9bf230dce5 100644 --- a/tests/ignite/engine/test_custom_events.py +++ b/tests/ignite/engine/test_custom_events.py @@ -455,7 +455,7 @@ def _test(num_workers): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, drop_last=True, shuffle=True, ) @@ -489,16 +489,19 @@ def test_every_event_filter_with_engine_with_dataloader(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_every_event_filter_with_engine() - _test_every_event_filter_with_engine_with_dataloader("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_every_event_filter_with_engine(device) + _test_every_event_filter_with_engine_with_dataloader(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_every_event_filter_with_engine(device) _test_every_event_filter_with_engine_with_dataloader(device) diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py index c1cd8e90cb46..9e626fc5bf10 100644 --- a/tests/ignite/engine/test_deterministic.py +++ b/tests/ignite/engine/test_deterministic.py @@ -262,7 +262,7 @@ def _test(epoch_length=None): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -294,7 +294,7 @@ def _(engine): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -370,7 +370,7 @@ def _test(epoch_length=None): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -401,7 +401,7 @@ def _(engine): data, batch_size=batch_size, num_workers=num_workers, - pin_memory="cuda" in device, + pin_memory="cuda" in torch.device(device).type, sampler=sampler, drop_last=True, shuffle=sampler is None, @@ -563,16 +563,18 @@ def test_resume_random_data_iterator_from_iter(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() 
_test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -582,7 +584,8 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") @@ -591,7 +594,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" + + device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") _test_resume_random_dataloader_from_epoch(device, setup_sampler, sampler_type="distributed") diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 3500ad4328a1..52841d8a102d 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -500,14 +500,14 @@ def test_run_check_triggered_events_on_iterator(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() diff --git a/tests/ignite/handlers/test_checkpoint.py b/tests/ignite/handlers/test_checkpoint.py index 2319be61f92b..de35a1eb4247 100644 --- a/tests/ignite/handlers/test_checkpoint.py +++ b/tests/ignite/handlers/test_checkpoint.py @@ -1188,8 +1188,9 @@ def _test_checkpoint_load_objects_ddp(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo, get_rank_zero_dirname): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo, get_rank_zero_dirname): + + 
device = idist.device() dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "1")) _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "2"), on_zero_rank=True) @@ -1200,7 +1201,8 @@ def test_distrib_cpu(distributed_context_single_node_gloo, get_rank_zero_dirname @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl, get_rank_zero_dirname): + device = idist.device() dirname = get_rank_zero_dirname() _test_save_model_optimizer_lr_scheduler_with_state_dict(device, os.path.join(dirname, "1")) diff --git a/tests/ignite/handlers/test_early_stopping.py b/tests/ignite/handlers/test_early_stopping.py index 712338e11f45..61af9270af96 100644 --- a/tests/ignite/handlers/test_early_stopping.py +++ b/tests/ignite/handlers/test_early_stopping.py @@ -336,16 +336,18 @@ def evaluation(engine): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -354,7 +356,8 @@ def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) @@ -363,6 +366,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" + + device = idist.device() _test_distrib_with_engine_early_stopping(device) _test_distrib_integration_engine_early_stopping(device) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index c98143cf98d2..3e9dd148422f 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -184,15 +184,17 @@ def _test(metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist 
support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -211,7 +213,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) @@ -219,7 +222,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index 40aafae189c2..1b59cde18d9f 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -176,15 +176,17 @@ def _test(metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -203,7 +205,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) @@ -211,7 +214,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + 
device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py index e47a9273e583..012cc8875ac4 100644 --- a/tests/ignite/metrics/test_accumulation.py +++ b/tests/ignite/metrics/test_accumulation.py @@ -425,9 +425,9 @@ def _test_apex_average(device, amp_mode, opt_level): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) _test_distrib_geom_average(device) @@ -437,21 +437,9 @@ def test_distrib_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - - device = torch.device("cpu") - _test_distrib_variable_accumulation(device) - _test_distrib_average(device) - _test_distrib_geom_average(device) - _test_distrib_integration(device) - _test_distrib_accumulator_device(device) - +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) _test_distrib_geom_average(device) @@ -464,7 +452,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") def test_distrib_hvd(gloo_hvd_executor): - device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + device = idist.device() nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() gloo_hvd_executor(_test_distrib_variable_accumulation, (device,), np=nproc, do_init=True) @@ -474,22 +462,11 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_variable_accumulation(device) - _test_distrib_average(device) - _test_distrib_geom_average(device) - _test_distrib_integration(device) - _test_distrib_accumulator_device(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_variable_accumulation(device) _test_distrib_average(device) diff --git 
a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index 6f84735cc3ee..dd48a341257c 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -503,8 +503,9 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_single_node_nccl['local_rank']}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -513,9 +514,9 @@ def test_distrib_gpu(distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_multilabel_input_NHW(device) _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) @@ -536,28 +537,6 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_multilabel_input_NHW(device) - _test_distrib_integration_multiclass(device) - _test_distrib_integration_multilabel(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_multilabel_input_NHW(device) - _test_distrib_integration_multiclass(device) - _test_distrib_integration_multilabel(device) - _test_distrib_accumulator_device(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index 3a15987a6a04..b42fe5b50b36 100644 --- a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -141,23 +141,23 @@ def update(engine, i): _test(metric_device, 2, ["0", "1", "2", "3", "4", "5", "6"]) -@pytest.mark.multinode_distributed +@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): 
+@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) _test_integration_multilabel(device, True) _test_integration_multilabel(device, False) -@pytest.mark.multinode_distributed +@pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(local_rank, distributed_context_single_node_gloo): + + device = idist.device() _test_integration_multiclass(device, True) _test_integration_multiclass(device, False) _test_integration_multilabel(device, True) @@ -165,13 +165,17 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = torch.device("cpu") - _test_integration_multiclass(device, True) - _test_integration_multiclass(device, False) - _test_integration_multilabel(device, True) - _test_integration_multilabel(device, False) +@pytest.mark.skipif(not idist.has_hvd_support, reason="Skip if no Horovod dist support") +@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc") +def test_distrib_hvd(gloo_hvd_executor): + + device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") + nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count() + + gloo_hvd_executor(_test_integration_multiclass, (device, True), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multiclass, (device, False), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multilabel, (device, True), np=nproc, do_init=True) + gloo_hvd_executor(_test_integration_multilabel, (device, False), np=nproc, do_init=True) def _test_distrib_xla_nprocs(index): diff --git a/tests/ignite/metrics/test_confusion_matrix.py b/tests/ignite/metrics/test_confusion_matrix.py index 1745b532408f..f6a9bee590ca 100644 --- a/tests/ignite/metrics/test_confusion_matrix.py +++ b/tests/ignite/metrics/test_confusion_matrix.py @@ -589,18 +589,18 @@ def _test(average=None): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_multiclass_images(device) _test_distrib_accumulator_device(device) @@ -617,24 +617,6 @@ def test_distrib_hvd(gloo_hvd_executor): 
gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_multiclass_images(device) - _test_distrib_accumulator_device(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_multiclass_images(device) - _test_distrib_accumulator_device(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") diff --git a/tests/ignite/metrics/test_epoch_metric.py b/tests/ignite/metrics/test_epoch_metric.py index ef660a16be41..70296d484180 100644 --- a/tests/ignite/metrics/test_epoch_metric.py +++ b/tests/ignite/metrics/test_epoch_metric.py @@ -193,14 +193,18 @@ def assert_data_fn(all_preds, all_targets): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): - _test_distrib_integration(device="cuda") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() + _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - _test_distrib_integration(device="cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() + _test_distrib_integration(device) @pytest.mark.tpu diff --git a/tests/ignite/metrics/test_fbeta.py b/tests/ignite/metrics/test_fbeta.py index 8e38d516140c..c3d86d1af936 100644 --- a/tests/ignite/metrics/test_fbeta.py +++ b/tests/ignite/metrics/test_fbeta.py @@ -146,15 +146,17 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) @@ -169,22 +171,6 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_integration, (device,), np=nproc, do_init=True) -@pytest.mark.multinode_distributed 
-@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_integration(device) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_integration(device) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 353933b0ee04..90f263aeb324 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -181,18 +181,18 @@ def test_sum_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) @@ -209,34 +209,18 @@ def test_distrib_hvd(gloo_hvd_executor): gloo_hvd_executor(_test_distrib_accumulator_device, (device, y_test_1()), np=nproc, do_init=True) -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") - _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6) - _test_distrib_accumulator_device(device, y_test_1()) - - -@pytest.mark.multinode_distributed -@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") - _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) - _test_distrib_accumulator_device(device, y_test_1()) - - @pytest.mark.tpu @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch 
XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) def _test_distrib_xla_nprocs(index): + device = idist.device() _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) _test_distrib_accumulator_device(device, y_test_1()) diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index cfd662dedddf..a2fba0b55bd0 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -129,16 +129,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -159,7 +161,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -168,7 +171,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -177,6 +181,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index 2e042fd4703d..0c44d78ac26b 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -138,16 +138,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def 
test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -168,7 +170,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -177,7 +180,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index 5f5a169d2770..b73b60749d1c 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -131,17 +131,18 @@ def test_accumulator_detached(): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -162,7 +163,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -171,7 +173,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") 
@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py index 06997139281b..8eb84b4cd7d8 100644 --- a/tests/ignite/metrics/test_metric.py +++ b/tests/ignite/metrics/test_metric.py @@ -668,18 +668,18 @@ def _test_creating_on_xla_fails(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = f"cuda:{distributed_context_single_node_nccl['local_rank']}" + device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = "cpu" + device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) @@ -700,7 +700,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) @@ -709,7 +710,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" + + device = idist.device() _test_distrib_sync_all_reduce_decorator(device) _test_invalid_sync_all_reduce(device) diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py index 618cb35fd965..7629d59816f6 100644 --- a/tests/ignite/metrics/test_metrics_lambda.py +++ b/tests/ignite/metrics/test_metrics_lambda.py @@ -402,18 +402,18 @@ def update(engine, i): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_integration(device) _test_distrib_metrics_on_diff_devices(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): +def 
test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_integration(device) @@ -433,7 +433,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) @@ -441,7 +442,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) _test_distrib_metrics_on_diff_devices(device) diff --git a/tests/ignite/metrics/test_multilabel_confusion_matrix.py b/tests/ignite/metrics/test_multilabel_confusion_matrix.py index c4ea2bc2987a..7a21825aa1b3 100644 --- a/tests/ignite/metrics/test_multilabel_confusion_matrix.py +++ b/tests/ignite/metrics/test_multilabel_confusion_matrix.py @@ -359,18 +359,18 @@ def test_simple_batched(): # @pytest.mark.distributed # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") # @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -# def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +# def test_distrib_nccl_gpu(distributed_context_single_node_nccl): -# device = torch.device(f"cuda:{local_rank}") +# device = idist.device() # _test_distrib_multiclass_images(device) # _test_distrib_accumulator_device(device) # @pytest.mark.distributed # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -# def test_distrib_cpu(distributed_context_single_node_gloo): +# def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): -# device = torch.device("cpu") +# device = idist.device() # _test_distrib_multiclass_images(device) # _test_distrib_accumulator_device(device) @@ -391,7 +391,8 @@ def test_simple_batched(): # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") # @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") # def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): -# device = torch.device("cpu") +# +# device = idist.device() # _test_distrib_multiclass_images(device) # _test_distrib_accumulator_device(device) @@ -400,7 +401,8 @@ def test_simple_batched(): # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") # @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") # def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): -# device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") +# +# device = idist.device() # _test_distrib_multiclass_images(device) # _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index bff02cc65c27..8162fd4de1d1 100644 --- 
a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -518,8 +518,9 @@ def _test(average, metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -528,8 +529,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -554,7 +556,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -565,7 +568,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_psnr.py b/tests/ignite/metrics/test_psnr.py index e1abf0f1fff7..cea2c3ea8ab3 100644 --- a/tests/ignite/metrics/test_psnr.py +++ b/tests/ignite/metrics/test_psnr.py @@ -239,8 +239,9 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -248,8 +249,9 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ 
-258,7 +260,8 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -267,7 +270,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -276,6 +280,7 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index fe9b14e93dc8..76a9d67599d1 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -519,8 +519,9 @@ def _test(average, metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -529,8 +530,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -555,7 +557,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) @@ -566,7 +569,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, 
reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration_multiclass(device) _test_distrib_integration_multilabel(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py b/tests/ignite/metrics/test_root_mean_squared_error.py index e5c66616456f..c4d13e259b02 100644 --- a/tests/ignite/metrics/test_root_mean_squared_error.py +++ b/tests/ignite/metrics/test_root_mean_squared_error.py @@ -103,17 +103,17 @@ def _test(metric_device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_integration(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_integration(device) @@ -132,7 +132,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) @@ -140,7 +141,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index 2e06ad589ecf..b83b92239405 100644 --- a/tests/ignite/metrics/test_running_average.py +++ b/tests/ignite/metrics/test_running_average.py @@ -392,9 +392,9 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -402,9 +402,9 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): +def 
test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): - device = torch.device("cpu") + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -427,7 +427,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) @@ -437,7 +438,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_on_output(device) _test_distrib_on_metric(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_ssim.py b/tests/ignite/metrics/test_ssim.py index d612bd521920..4d5ab9766817 100644 --- a/tests/ignite/metrics/test_ssim.py +++ b/tests/ignite/metrics/test_ssim.py @@ -193,17 +193,18 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): - device = f"cuda:{local_rank}" + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(distributed_context_single_node_gloo): - device = "cpu" +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -212,7 +213,8 @@ def test_distrib_cpu(distributed_context_single_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = "cpu" + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -221,7 +223,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = f"cuda:{distributed_context_multi_node_nccl['local_rank']}" + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py 
index e2e4bb39ad85..859c4ba61bcd 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -128,16 +128,18 @@ def _test_distrib_accumulator_device(device): @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") -def test_distrib_gpu(local_rank, distributed_context_single_node_nccl): - device = torch.device(f"cuda:{local_rank}") +def test_distrib_nccl_gpu(distributed_context_single_node_nccl): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @pytest.mark.distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") -def test_distrib_cpu(local_rank, distributed_context_single_node_gloo): - device = torch.device("cpu") +def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -158,7 +160,8 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): - device = torch.device("cpu") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -167,7 +170,8 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): - device = torch.device(f"cuda:{distributed_context_multi_node_nccl['local_rank']}") + + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) @@ -176,12 +180,14 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): @pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars") @pytest.mark.skipif(not idist.has_xla_support, reason="Skip if no PyTorch XLA package") def test_distrib_single_device_xla(): + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) def _test_distrib_xla_nprocs(index): + device = idist.device() _test_distrib_integration(device) _test_distrib_accumulator_device(device) From 990247b9a6fb4ff6e618002ab711f4d2b2af0e37 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 31 May 2021 09:11:51 +0000 Subject: [PATCH 3/5] Fixed formatting and renamed multinode tests --- ignite/distributed/auto.py | 3 ++- tests/ignite/contrib/engines/test_common.py | 23 ++++++++++++++++ .../regression/test_canberra_metric.py | 4 +-- .../test_fractional_absolute_error.py | 4 +-- .../regression/test_fractional_bias.py | 4 +-- .../test_geometric_mean_absolute_error.py | 4 +-- .../regression/test_manhattan_distance.py | 4 +-- .../regression/test_maximum_absolute_error.py | 4 +-- .../test_mean_absolute_relative_error.py | 4 +-- .../regression/test_mean_normalized_bias.py | 4 +-- .../regression/test_median_absolute_error.py | 4 +-- .../test_median_absolute_percentage_error.py | 4 +-- .../test_median_relative_absolute_error.py | 4 +-- 
.../metrics/regression/test_r2_score.py | 4 +-- .../regression/test_wave_hedges_distance.py | 4 +-- .../contrib/metrics/test_average_precision.py | 4 +-- .../contrib/metrics/test_cohen_kappa.py | 4 +-- tests/ignite/contrib/metrics/test_roc_auc.py | 4 +-- .../distributed/comp_models/test_native.py | 8 +++--- tests/ignite/distributed/utils/test_native.py | 2 +- tests/ignite/engine/test_deterministic.py | 4 +-- tests/ignite/engine/test_engine.py | 4 +-- tests/ignite/handlers/test_early_stopping.py | 4 +-- tests/ignite/metrics/nlp/test_bleu.py | 4 +-- tests/ignite/metrics/nlp/test_rouge.py | 4 +-- tests/ignite/metrics/test_accumulation.py | 26 +++++++++++++++++++ tests/ignite/metrics/test_accuracy.py | 24 +++++++++++++++++ .../metrics/test_classification_report.py | 24 +++++++++++++++++ tests/ignite/metrics/test_confusion_matrix.py | 20 ++++++++++++++ tests/ignite/metrics/test_fbeta.py | 18 +++++++++++++ tests/ignite/metrics/test_loss.py | 20 ++++++++++++++ .../metrics/test_mean_absolute_error.py | 4 +-- .../metrics/test_mean_pairwise_distance.py | 4 +-- .../ignite/metrics/test_mean_squared_error.py | 4 +-- tests/ignite/metrics/test_metric.py | 4 +-- tests/ignite/metrics/test_metrics_lambda.py | 4 +-- .../test_multilabel_confusion_matrix.py | 4 +-- tests/ignite/metrics/test_precision.py | 4 +-- tests/ignite/metrics/test_psnr.py | 4 +-- tests/ignite/metrics/test_recall.py | 4 +-- .../metrics/test_root_mean_squared_error.py | 4 +-- tests/ignite/metrics/test_running_average.py | 4 +-- tests/ignite/metrics/test_ssim.py | 4 +-- .../test_top_k_categorical_accuracy.py | 4 +-- 44 files changed, 229 insertions(+), 75 deletions(-) diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py index 7953eca5545c..03d09cbe7b40 100644 --- a/ignite/distributed/auto.py +++ b/ignite/distributed/auto.py @@ -9,8 +9,9 @@ from torch.utils.data.sampler import Sampler from ignite.distributed import utils as idist + +# from ignite.distributed.comp_models import native as idist_native from ignite.distributed.comp_models import horovod as idist_hvd -from ignite.distributed.comp_models import native as idist_native from ignite.distributed.comp_models import xla as idist_xla from ignite.utils import setup_logger diff --git a/tests/ignite/contrib/engines/test_common.py b/tests/ignite/contrib/engines/test_common.py index 1c514e65e97b..aed310d66bd6 100644 --- a/tests/ignite/contrib/engines/test_common.py +++ b/tests/ignite/contrib/engines/test_common.py @@ -603,3 +603,26 @@ def test_distrib_gloo_cpu_or_gpu(dirname, distributed_context_single_node_gloo): dirname, device, rank=local_rank, local_rank=local_rank, distributed=True, lr_scheduler="ignite" ) test_add_early_stopping_by_val_score() + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(dirname, distributed_context_multi_node_gloo): + + device = idist.device() + rank = distributed_context_multi_node_gloo["rank"] + _test_setup_common_training_handlers(dirname, device, rank=rank) + test_add_early_stopping_by_val_score() + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(dirname, 
distributed_context_multi_node_nccl): + + local_rank = distributed_context_multi_node_nccl["local_rank"] + rank = distributed_context_multi_node_nccl["rank"] + device = idist.device() + _test_setup_common_training_handlers(dirname, device, rank=rank, local_rank=local_rank, distributed=True) + test_add_early_stopping_by_val_score() diff --git a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py index 4bc403296b7a..b30a9c402e68 100644 --- a/tests/ignite/contrib/metrics/regression/test_canberra_metric.py +++ b/tests/ignite/contrib/metrics/regression/test_canberra_metric.py @@ -215,7 +215,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -225,7 +225,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py index f0e15e57f950..dc45bd6d0a98 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_absolute_error.py @@ -221,7 +221,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -231,7 +231,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py index d5bbf35a1271..7079b2baa438 100644 --- a/tests/ignite/contrib/metrics/regression/test_fractional_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_fractional_bias.py @@ -228,7 +228,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, 
reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -238,7 +238,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py index d1dd5a65c6a6..eae47c7fa71a 100644 --- a/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_geometric_mean_absolute_error.py @@ -225,7 +225,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = torch.device("cpu" if not torch.cuda.is_available() else "cuda") _test_distrib_compute(device) _test_distrib_integration(device) @@ -234,7 +234,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py index a46990707830..2ed6726db4a6 100644 --- a/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_manhattan_distance.py @@ -216,7 +216,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -226,7 +226,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def 
test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py index ab4f0db6500a..828a9dcc1ee4 100644 --- a/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_maximum_absolute_error.py @@ -212,7 +212,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -222,7 +222,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py index 8af23968924e..90e8baceb497 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_absolute_relative_error.py @@ -236,7 +236,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -246,7 +246,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py index 878a51436bd3..b9036287ff1c 100644 --- a/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py +++ b/tests/ignite/contrib/metrics/regression/test_mean_normalized_bias.py @@ -230,7 +230,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" 
not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -240,7 +240,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py index a58ed5860991..ebe063293626 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_error.py @@ -230,7 +230,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -240,7 +240,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py index f6a8210cf5d9..a463b6406e05 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_absolute_percentage_error.py @@ -240,7 +240,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -250,7 +250,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = 
idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py index 792e9ffa6477..06c5ab2eea53 100644 --- a/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py +++ b/tests/ignite/contrib/metrics/regression/test_median_relative_absolute_error.py @@ -231,7 +231,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -241,7 +241,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_r2_score.py b/tests/ignite/contrib/metrics/regression/test_r2_score.py index 097f41eac686..4a87089f3304 100644 --- a/tests/ignite/contrib/metrics/regression/test_r2_score.py +++ b/tests/ignite/contrib/metrics/regression/test_r2_score.py @@ -202,7 +202,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -212,7 +212,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py index 0a62dd215aed..da8ba88f7f20 100644 --- a/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py +++ b/tests/ignite/contrib/metrics/regression/test_wave_hedges_distance.py @@ -197,7 +197,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def 
test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_compute(device) @@ -207,7 +207,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_compute(device) diff --git a/tests/ignite/contrib/metrics/test_average_precision.py b/tests/ignite/contrib/metrics/test_average_precision.py index 2d988f0811c3..7b7f55aaca0e 100644 --- a/tests/ignite/contrib/metrics/test_average_precision.py +++ b/tests/ignite/contrib/metrics/test_average_precision.py @@ -309,7 +309,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) @@ -319,7 +319,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) diff --git a/tests/ignite/contrib/metrics/test_cohen_kappa.py b/tests/ignite/contrib/metrics/test_cohen_kappa.py index 72f1c4b6e380..32f9bbf9a1f3 100644 --- a/tests/ignite/contrib/metrics/test_cohen_kappa.py +++ b/tests/ignite/contrib/metrics/test_cohen_kappa.py @@ -308,7 +308,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_binary_input(device) @@ -318,7 +318,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_binary_input(device) diff --git a/tests/ignite/contrib/metrics/test_roc_auc.py b/tests/ignite/contrib/metrics/test_roc_auc.py index 1fab388b8831..aa34089cdbc2 100644 --- a/tests/ignite/contrib/metrics/test_roc_auc.py +++ 
b/tests/ignite/contrib/metrics/test_roc_auc.py @@ -322,7 +322,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) @@ -332,7 +332,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_binary_and_multilabel_inputs(device) diff --git a/tests/ignite/distributed/comp_models/test_native.py b/tests/ignite/distributed/comp_models/test_native.py index 4aacd5e2909b..80cb480f5ab4 100644 --- a/tests/ignite/distributed/comp_models/test_native.py +++ b/tests/ignite/distributed/comp_models/test_native.py @@ -279,7 +279,7 @@ def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_si @pytest.mark.distributed @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config") def test__native_dist_model_create_no_dist_gloo(clean_env): - device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") _test__native_dist_model_create_from_backend_no_dist("gloo", device) _test__native_dist_model_create_from_context_no_dist("gloo", device) @@ -288,7 +288,7 @@ def test__native_dist_model_create_no_dist_gloo(clean_env): @pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Should be no-dist config") @pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU") def test__native_dist_model_create_no_dist_nccl(clean_env): - device = torch.device(f"cuda:0" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") _test__native_dist_model_create_from_backend_no_dist("nccl", device) _test__native_dist_model_create_from_context_no_dist("nccl", device) @@ -383,9 +383,7 @@ def test__native_dist_model_spawn_gloo(init_method, dirname): nproc = torch.cuda.device_count() if torch.cuda.is_available() else 4 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - _test__native_dist_model_spawn( - "gloo", num_workers_per_machine=nproc, device=device, init_method=init_method - ) + _test__native_dist_model_spawn("gloo", num_workers_per_machine=nproc, device=device, init_method=init_method) if device.type == "cpu": _test__native_dist_model_spawn( "gloo", num_workers_per_machine=nproc, device=device, start_method="fork", init_method=init_method diff --git a/tests/ignite/distributed/utils/test_native.py b/tests/ignite/distributed/utils/test_native.py index 4e18c79c31f3..55ce5ebb7647 100644 --- a/tests/ignite/distributed/utils/test_native.py +++ b/tests/ignite/distributed/utils/test_native.py @@ -86,7 +86,7 @@ def test_native_distrib_single_node_spawn_gloo(init_method, dirname): if init_method == "FILE": init_method = f"file://{dirname}/shared" - 
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu") + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") _test_native_distrib_single_node_spawn(init_method, "gloo", device, timeout=timeout) diff --git a/tests/ignite/engine/test_deterministic.py b/tests/ignite/engine/test_deterministic.py index 9e626fc5bf10..f7a581ba4343 100644 --- a/tests/ignite/engine/test_deterministic.py +++ b/tests/ignite/engine/test_deterministic.py @@ -583,7 +583,7 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") @@ -593,7 +593,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_resume_random_dataloader_from_iter(device, setup_sampler, sampler_type="distributed") diff --git a/tests/ignite/engine/test_engine.py b/tests/ignite/engine/test_engine.py index 52841d8a102d..ebbdf74c5c5e 100644 --- a/tests/ignite/engine/test_engine.py +++ b/tests/ignite/engine/test_engine.py @@ -515,7 +515,7 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() @@ -523,7 +523,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): _test_run_check_triggered_events_on_iterator() _test_run_check_triggered_events() diff --git a/tests/ignite/handlers/test_early_stopping.py b/tests/ignite/handlers/test_early_stopping.py index 61af9270af96..66b96f757042 100644 --- a/tests/ignite/handlers/test_early_stopping.py +++ b/tests/ignite/handlers/test_early_stopping.py @@ -355,7 +355,7 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def 
test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_with_engine_early_stopping(device) @@ -365,7 +365,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_with_engine_early_stopping(device) diff --git a/tests/ignite/metrics/nlp/test_bleu.py b/tests/ignite/metrics/nlp/test_bleu.py index 3e9dd148422f..745678449830 100644 --- a/tests/ignite/metrics/nlp/test_bleu.py +++ b/tests/ignite/metrics/nlp/test_bleu.py @@ -212,7 +212,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -221,7 +221,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/nlp/test_rouge.py b/tests/ignite/metrics/nlp/test_rouge.py index 1b59cde18d9f..7102a6f98f12 100644 --- a/tests/ignite/metrics/nlp/test_rouge.py +++ b/tests/ignite/metrics/nlp/test_rouge.py @@ -204,7 +204,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -213,7 +213,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_accumulation.py b/tests/ignite/metrics/test_accumulation.py index 012cc8875ac4..1034c3cfd21c 100644 --- a/tests/ignite/metrics/test_accumulation.py +++ b/tests/ignite/metrics/test_accumulation.py @@ -500,3 +500,29 @@ def 
test_apex_average_on_cuda(): _test_apex_average(device, amp_mode="apex", opt_level="O1") _test_apex_average(device, amp_mode="apex", opt_level="O2") _test_apex_average(device, amp_mode="apex", opt_level="O3") + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_variable_accumulation(device) + _test_distrib_average(device) + _test_distrib_geom_average(device) + _test_distrib_integration(device) + _test_distrib_accumulator_device(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_variable_accumulation(device) + _test_distrib_average(device) + _test_distrib_geom_average(device) + _test_distrib_integration(device) + _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_accuracy.py b/tests/ignite/metrics/test_accuracy.py index dd48a341257c..c997d2f62a0a 100644 --- a/tests/ignite/metrics/test_accuracy.py +++ b/tests/ignite/metrics/test_accuracy.py @@ -562,3 +562,27 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_multilabel_input_NHW(device) + _test_distrib_integration_multiclass(device) + _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_multilabel_input_NHW(device) + _test_distrib_integration_multiclass(device) + _test_distrib_integration_multilabel(device) + _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_classification_report.py b/tests/ignite/metrics/test_classification_report.py index b42fe5b50b36..57b80cfcf364 100644 --- a/tests/ignite/metrics/test_classification_report.py +++ b/tests/ignite/metrics/test_classification_report.py @@ -201,3 +201,27 @@ def to_numpy_multilabel(y): num_classes = y.shape[0] y = y.reshape((num_classes, -1)).transpose(1, 0) return y + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + 
_test_integration_multiclass(device, True) + _test_integration_multiclass(device, False) + _test_integration_multilabel(device, True) + _test_integration_multilabel(device, False) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_integration_multiclass(device, True) + _test_integration_multiclass(device, False) + _test_integration_multilabel(device, True) + _test_integration_multilabel(device, False) diff --git a/tests/ignite/metrics/test_confusion_matrix.py b/tests/ignite/metrics/test_confusion_matrix.py index f6a9bee590ca..af85e060d9b3 100644 --- a/tests/ignite/metrics/test_confusion_matrix.py +++ b/tests/ignite/metrics/test_confusion_matrix.py @@ -638,3 +638,23 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_multiclass_images(device) + _test_distrib_accumulator_device(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_multiclass_images(device) + _test_distrib_accumulator_device(device) diff --git a/tests/ignite/metrics/test_fbeta.py b/tests/ignite/metrics/test_fbeta.py index c3d86d1af936..27eb28905189 100644 --- a/tests/ignite/metrics/test_fbeta.py +++ b/tests/ignite/metrics/test_fbeta.py @@ -190,3 +190,21 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_integration(device) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_loss.py b/tests/ignite/metrics/test_loss.py index 90f263aeb324..23749b960072 100644 --- a/tests/ignite/metrics/test_loss.py +++ b/tests/ignite/metrics/test_loss.py @@ -232,3 +232,23 @@ def _test_distrib_xla_nprocs(index): def test_distrib_xla_nprocs(xmp_executor): n = int(os.environ["NUM_TPU_WORKERS"]) 
xmp_executor(_test_distrib_xla_nprocs, args=(), nprocs=n) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): + + device = idist.device() + _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2(), tol=1e-6) + _test_distrib_accumulator_device(device, y_test_1()) + + +@pytest.mark.multinode_distributed +@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") +@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): + + device = idist.device() + _test_distrib_compute_on_criterion(device, y_test_1(), y_test_2()) + _test_distrib_accumulator_device(device, y_test_1()) diff --git a/tests/ignite/metrics/test_mean_absolute_error.py b/tests/ignite/metrics/test_mean_absolute_error.py index a2fba0b55bd0..0e9e2a75140f 100644 --- a/tests/ignite/metrics/test_mean_absolute_error.py +++ b/tests/ignite/metrics/test_mean_absolute_error.py @@ -160,7 +160,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -170,7 +170,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_mean_pairwise_distance.py b/tests/ignite/metrics/test_mean_pairwise_distance.py index 0c44d78ac26b..4ba5bdf4ec00 100644 --- a/tests/ignite/metrics/test_mean_pairwise_distance.py +++ b/tests/ignite/metrics/test_mean_pairwise_distance.py @@ -169,7 +169,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -179,7 +179,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def 
test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_mean_squared_error.py b/tests/ignite/metrics/test_mean_squared_error.py index b73b60749d1c..a1df3fb3a5cf 100644 --- a/tests/ignite/metrics/test_mean_squared_error.py +++ b/tests/ignite/metrics/test_mean_squared_error.py @@ -162,7 +162,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -172,7 +172,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_metric.py b/tests/ignite/metrics/test_metric.py index 8eb84b4cd7d8..8664d6c230aa 100644 --- a/tests/ignite/metrics/test_metric.py +++ b/tests/ignite/metrics/test_metric.py @@ -699,7 +699,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_sync_all_reduce_decorator(device) @@ -709,7 +709,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_sync_all_reduce_decorator(device) diff --git a/tests/ignite/metrics/test_metrics_lambda.py b/tests/ignite/metrics/test_metrics_lambda.py index 7629d59816f6..1376f414fc9b 100644 --- a/tests/ignite/metrics/test_metrics_lambda.py +++ b/tests/ignite/metrics/test_metrics_lambda.py @@ -432,7 +432,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -441,7 +441,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): 
@pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_multilabel_confusion_matrix.py b/tests/ignite/metrics/test_multilabel_confusion_matrix.py index 7a21825aa1b3..01c959332fb8 100644 --- a/tests/ignite/metrics/test_multilabel_confusion_matrix.py +++ b/tests/ignite/metrics/test_multilabel_confusion_matrix.py @@ -390,7 +390,7 @@ def test_simple_batched(): # @pytest.mark.multinode_distributed # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") # @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -# def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +# def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): # # device = idist.device() # _test_distrib_multiclass_images(device) @@ -400,7 +400,7 @@ def test_simple_batched(): # @pytest.mark.multinode_distributed # @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") # @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -# def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +# def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): # # device = idist.device() # _test_distrib_multiclass_images(device) diff --git a/tests/ignite/metrics/test_precision.py b/tests/ignite/metrics/test_precision.py index 8162fd4de1d1..be2f6a909fd9 100644 --- a/tests/ignite/metrics/test_precision.py +++ b/tests/ignite/metrics/test_precision.py @@ -555,7 +555,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration_multiclass(device) @@ -567,7 +567,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration_multiclass(device) diff --git a/tests/ignite/metrics/test_psnr.py b/tests/ignite/metrics/test_psnr.py index cea2c3ea8ab3..f39fbd46e763 100644 --- a/tests/ignite/metrics/test_psnr.py +++ b/tests/ignite/metrics/test_psnr.py @@ -259,7 +259,7 @@ def test_distrib_nccl_gpu(distributed_context_single_node_nccl): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node 
distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -269,7 +269,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_recall.py b/tests/ignite/metrics/test_recall.py index 76a9d67599d1..8f3cdb9c67e4 100644 --- a/tests/ignite/metrics/test_recall.py +++ b/tests/ignite/metrics/test_recall.py @@ -556,7 +556,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration_multiclass(device) @@ -568,7 +568,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration_multiclass(device) diff --git a/tests/ignite/metrics/test_root_mean_squared_error.py b/tests/ignite/metrics/test_root_mean_squared_error.py index c4d13e259b02..7c0ccee4d60b 100644 --- a/tests/ignite/metrics/test_root_mean_squared_error.py +++ b/tests/ignite/metrics/test_root_mean_squared_error.py @@ -131,7 +131,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -140,7 +140,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_running_average.py b/tests/ignite/metrics/test_running_average.py index b83b92239405..857e929c8509 100644 --- a/tests/ignite/metrics/test_running_average.py +++ 
b/tests/ignite/metrics/test_running_average.py @@ -426,7 +426,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_on_output(device) @@ -437,7 +437,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_on_output(device) diff --git a/tests/ignite/metrics/test_ssim.py b/tests/ignite/metrics/test_ssim.py index 4d5ab9766817..ae62768eda45 100644 --- a/tests/ignite/metrics/test_ssim.py +++ b/tests/ignite/metrics/test_ssim.py @@ -212,7 +212,7 @@ def test_distrib_gloo_cpu_or_gpu(distributed_context_single_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -222,7 +222,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) diff --git a/tests/ignite/metrics/test_top_k_categorical_accuracy.py b/tests/ignite/metrics/test_top_k_categorical_accuracy.py index 859c4ba61bcd..2282248a96a1 100644 --- a/tests/ignite/metrics/test_top_k_categorical_accuracy.py +++ b/tests/ignite/metrics/test_top_k_categorical_accuracy.py @@ -159,7 +159,7 @@ def test_distrib_hvd(gloo_hvd_executor): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): +def test_multinode_distrib_gloo_cpu_or_gpu(distributed_context_multi_node_gloo): device = idist.device() _test_distrib_integration(device) @@ -169,7 +169,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo): @pytest.mark.multinode_distributed @pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support") @pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed") -def test_multinode_distrib_gpu(distributed_context_multi_node_nccl): +def 
test_multinode_distrib_nccl_gpu(distributed_context_multi_node_nccl): device = idist.device() _test_distrib_integration(device) From 12bfaa3275cdca2b0df2274e271efa1d2efbae89 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 31 May 2021 11:51:47 +0000 Subject: [PATCH 4/5] Fixed issue with backends in auto_model method --- ignite/distributed/auto.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py index 03d09cbe7b40..a0209e5c0b7a 100644 --- a/ignite/distributed/auto.py +++ b/ignite/distributed/auto.py @@ -10,7 +10,7 @@ from ignite.distributed import utils as idist -# from ignite.distributed.comp_models import native as idist_native +from ignite.distributed.comp_models import native as idist_native from ignite.distributed.comp_models import horovod as idist_hvd from ignite.distributed.comp_models import xla as idist_xla from ignite.utils import setup_logger @@ -189,25 +189,21 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod # distributed data parallel model if idist.get_world_size() > 1: bnd = idist.backend() - # if idist.has_native_dist_support and bnd == idist_native.NCCL: - if idist.has_native_dist_support and torch.cuda.is_available(): + if idist.has_native_dist_support and bnd in (idist_native.NCCL, idist_native.GLOO, idist_native.MPI): if sync_bn: logger.info("Convert batch norm to sync batch norm") model = nn.SyncBatchNorm.convert_sync_batchnorm(model) - if "device_ids" in kwargs: - raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}") + if torch.cuda.is_available(): + if "device_ids" in kwargs: + raise ValueError(f"Argument kwargs should not contain 'device_ids', but got {kwargs}") - lrank = idist.get_local_rank() - logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}") - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[lrank,], **kwargs) - # elif idist.has_native_dist_support and bnd == idist_native.GLOO: - elif idist.has_native_dist_support: - if sync_bn: - logger.info("Convert batch norm to sync batch norm") - model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + lrank = idist.get_local_rank() + logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}") + kwargs["device_ids"] = [lrank, ] + else: + logger.info("Apply torch DistributedDataParallel on model") - logger.info("Apply torch DistributedDataParallel on model") model = torch.nn.parallel.DistributedDataParallel(model, **kwargs) elif idist.has_hvd_support and bnd == idist_hvd.HOROVOD: import horovod.torch as hvd From 2d23ce5ad684756cac493029dd28a2bf844d5338 Mon Sep 17 00:00:00 2001 From: vfdev-5 Date: Mon, 31 May 2021 11:59:15 +0000 Subject: [PATCH 5/5] Removed commented code and fixed formatting --- ignite/distributed/auto.py | 7 ++++--- ignite/distributed/comp_models/native.py | 4 ---- tests/ignite/conftest.py | 2 -- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/ignite/distributed/auto.py b/ignite/distributed/auto.py index a0209e5c0b7a..e92bec43a764 100644 --- a/ignite/distributed/auto.py +++ b/ignite/distributed/auto.py @@ -9,9 +9,8 @@ from torch.utils.data.sampler import Sampler from ignite.distributed import utils as idist - -from ignite.distributed.comp_models import native as idist_native from ignite.distributed.comp_models import horovod as idist_hvd +from ignite.distributed.comp_models import native as idist_native from ignite.distributed.comp_models import xla as 
idist_xla from ignite.utils import setup_logger @@ -200,7 +199,9 @@ def auto_model(model: nn.Module, sync_bn: bool = False, **kwargs: Any) -> nn.Mod lrank = idist.get_local_rank() logger.info(f"Apply torch DistributedDataParallel on model, device id: {lrank}") - kwargs["device_ids"] = [lrank, ] + kwargs["device_ids"] = [ + lrank, + ] else: logger.info("Apply torch DistributedDataParallel on model") diff --git a/ignite/distributed/comp_models/native.py b/ignite/distributed/comp_models/native.py index c2d8bfd7ebfb..d5bde07329cf 100644 --- a/ignite/distributed/comp_models/native.py +++ b/ignite/distributed/comp_models/native.py @@ -127,7 +127,6 @@ def _create_from_backend( # https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 dist.barrier() - # if backend in (dist.Backend.NCCL, dist.Backend.GLOO) and torch.cuda.is_available(): if torch.cuda.is_available(): torch.cuda.set_device(self._local_rank) @@ -141,7 +140,6 @@ def _init_from_context(self) -> None: def _compute_nproc_per_node(self) -> int: local_rank = self.get_local_rank() device = torch.device("cpu") - # if self.backend() == dist.Backend.NCCL: if torch.cuda.is_available(): # we manually set cuda device to local rank in order to avoid a hang on all_reduce device = torch.device(f"cuda:{local_rank}") @@ -153,7 +151,6 @@ def _get_all_hostnames(self) -> List[Tuple[str, ...]]: import socket device = "cpu" - # if self.backend() == dist.Backend.NCCL: if torch.cuda.is_available(): index = torch.cuda.current_device() device = f"cuda:{index}" @@ -284,7 +281,6 @@ def get_node_rank(self) -> int: return cast(int, self._node) def device(self) -> torch.device: - # if self.backend() == dist.Backend.NCCL: if torch.cuda.is_available(): index = torch.cuda.current_device() if index < self.get_local_rank(): diff --git a/tests/ignite/conftest.py b/tests/ignite/conftest.py index 5e0fed143e91..ce52495c32c5 100644 --- a/tests/ignite/conftest.py +++ b/tests/ignite/conftest.py @@ -94,7 +94,6 @@ def _create_dist_context(dist_info, lrank): dist.init_process_group(**dist_info) dist.barrier() - # if dist_info["backend"] == "nccl": if torch.cuda.is_available(): torch.cuda.set_device(lrank) @@ -211,7 +210,6 @@ def _create_mnodes_dist_context(dist_info, mnodes_conf): dist.init_process_group(**dist_info) dist.barrier() - # if dist_info["backend"] == "nccl": if torch.cuda.is_available(): torch.cuda.device(mnodes_conf["local_rank"]) return mnodes_conf
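
A minimal usage sketch of the behaviour introduced above, assuming a single-node, two-process gloo launch; the toy model and the training function are hypothetical placeholders, not taken from the patches. After PATCH 4/5, auto_model wraps the model in DistributedDataParallel for the native nccl, gloo, and mpi backends, passing device_ids=[local_rank] only when CUDA is available.

    import torch.nn as nn
    import ignite.distributed as idist

    def training(local_rank):
        # auto_model moves the model to idist.device() and, for the native
        # nccl/gloo/mpi backends, wraps it in DistributedDataParallel.
        # With the gloo backend and CUDA available, device_ids=[local_rank]
        # is now set; on CPU-only nodes the model is wrapped without device_ids.
        model = idist.auto_model(nn.Linear(10, 2))
        print(idist.backend(), idist.device(), type(model).__name__)

    if __name__ == "__main__":
        # Spawn two workers on the local node using the gloo backend.
        with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
            parallel.run(training)

On a CUDA machine this prints a cuda:<local_rank> device for each worker even though the backend is gloo, which is the new mapping documented in _NativeDistModel (GLOO <-> CPU or GPU); on a CPU-only machine the same script runs unchanged on CPU.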