From dfaa7267051dc52bedfe60f9b17f49367561d1e3 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Wed, 23 Feb 2022 22:18:10 +0000
Subject: [PATCH] Add barrier() after init_process_group()

---
 references/classification/utils.py       | 1 +
 references/optical_flow/utils.py         | 1 +
 references/segmentation/utils.py         | 1 +
 references/video_classification/utils.py | 1 +
 4 files changed, 4 insertions(+)

diff --git a/references/classification/utils.py b/references/classification/utils.py
index 4afe9bf68f1..473f4815265 100644
--- a/references/classification/utils.py
+++ b/references/classification/utils.py
@@ -274,6 +274,7 @@ def init_distributed_mode(args):
     torch.distributed.init_process_group(
         backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
     )
+    torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
 
 
diff --git a/references/optical_flow/utils.py b/references/optical_flow/utils.py
index e3643a91663..acdc49bd1f7 100644
--- a/references/optical_flow/utils.py
+++ b/references/optical_flow/utils.py
@@ -267,6 +267,7 @@ def setup_ddp(args):
         world_size=args.world_size,
         init_method=args.dist_url,
     )
+    torch.distributed.barrier()
 
 
 def reduce_across_processes(val):
diff --git a/references/segmentation/utils.py b/references/segmentation/utils.py
index 5084ef5cc79..22096c9dd2c 100644
--- a/references/segmentation/utils.py
+++ b/references/segmentation/utils.py
@@ -291,4 +291,5 @@ def init_distributed_mode(args):
     torch.distributed.init_process_group(
         backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
     )
+    torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
diff --git a/references/video_classification/utils.py b/references/video_classification/utils.py
index 5c4bdf89c0c..a68c2386bcf 100644
--- a/references/video_classification/utils.py
+++ b/references/video_classification/utils.py
@@ -250,4 +250,5 @@ def init_distributed_mode(args):
     torch.distributed.init_process_group(
         backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
     )
+    torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
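
For reference, a minimal sketch of the pattern this patch establishes in each
reference script: init_process_group() can return on a rank before every peer
has finished joining the rendezvous, so the explicit barrier() blocks each
process until the whole group is up. The rank discovery below is illustrative
and assumes a torchrun-style launch that sets RANK, WORLD_SIZE, and LOCAL_RANK;
only the barrier() line is what the patch actually adds.

import os

import torch
import torch.distributed as dist


def init_distributed_mode(args):
    # Illustrative rank discovery for a torchrun-style launch
    # (environment variables set by torchrun / torch.distributed.launch).
    args.rank = int(os.environ["RANK"])
    args.world_size = int(os.environ["WORLD_SIZE"])
    args.gpu = int(os.environ["LOCAL_RANK"])

    torch.cuda.set_device(args.gpu)
    dist.init_process_group(
        backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank
    )
    # The line this patch adds: every rank waits here until the whole
    # group has finished initializing, so no process races ahead.
    dist.barrier()

Note that with the nccl backend the barrier runs a collective on the current
CUDA device, which is why torch.cuda.set_device() should precede it.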