From fe7a5f0d9df85354f407662756dbfe0652693052 Mon Sep 17 00:00:00 2001
From: Yosua Michael Maranatha
Date: Thu, 14 Apr 2022 13:05:00 +0100
Subject: [PATCH 1/5] Change code to reduce variance in eval

---
 references/classification/train.py       |  1 +
 references/detection/train.py            | 15 ++++++++++-
 references/optical_flow/train.py         |  9 +++++++
 references/segmentation/train.py         | 32 +++++++++++++++++++++++-
 references/segmentation/utils.py         | 24 ++++++++++--------
 references/similarity/train.py           | 23 +++++++++++++++++
 references/video_classification/train.py | 30 ++++++++++++++++++++--
 references/video_classification/utils.py | 17 +++++++++----
 8 files changed, 131 insertions(+), 20 deletions(-)

diff --git a/references/classification/train.py b/references/classification/train.py
index 6a3c289bc04..92a77768ff3 100644
--- a/references/classification/train.py
+++ b/references/classification/train.py
@@ -59,6 +59,7 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, arg
 
 
 def evaluate(model, criterion, data_loader, device, print_freq=100, log_suffix=""):
+
     model.eval()
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = f"Test: {log_suffix}"
diff --git a/references/detection/train.py b/references/detection/train.py
index 758171013e8..d3b394b8bd0 100644
--- a/references/detection/train.py
+++ b/references/detection/train.py
@@ -132,6 +132,10 @@ def get_args_parser(add_help=True):
         action="store_true",
     )
 
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
+
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
     parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
@@ -153,6 +157,12 @@ def main(args):
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     # Data loading code
     print("Loading data")
     print("Creating data loaders")
     if args.distributed:
         train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
     else:
         train_sampler = torch.utils.data.RandomSampler(dataset)
         test_sampler = torch.utils.data.SequentialSampler(dataset_test)
@@ -243,6 +253,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         evaluate(model, data_loader_test, device=device)
         return
diff --git a/references/optical_flow/train.py b/references/optical_flow/train.py
index 5070cb554d4..18c31a75e65 100644
--- a/references/optical_flow/train.py
+++ b/references/optical_flow/train.py
@@ -209,6 +209,12 @@ def main(args):
         raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun")
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     model = torchvision.models.optical_flow.__dict__[args.model](weights=args.weights)
 
     if args.distributed:
@@ -372,6 +378,9 @@ def get_args_parser(add_help=True):
     parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load.")
     parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu, Default: cuda)")
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     return parser
diff --git a/references/segmentation/train.py b/references/segmentation/train.py
index e8570ab7f69..2ea2d8482a1 100644
--- a/references/segmentation/train.py
+++ b/references/segmentation/train.py
@@ -61,6 +61,7 @@ def evaluate(model, data_loader, device, num_classes):
     confmat = utils.ConfusionMatrix(num_classes)
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
+    num_processed_samples = 0
     with torch.inference_mode():
         for image, target in metric_logger.log_every(data_loader, 100, header):
             image, target = image.to(device), target.to(device)
@@ -68,9 +69,26 @@
             output = output["out"]
             confmat.update(target.flatten(), output.argmax(1).flatten())
+            # FIXME need to take into account that the datasets
+            # could have been padded in distributed setup
+            num_processed_samples += image.shape[0]
 
         confmat.reduce_from_all_processes()
 
+    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
+    if (
+        hasattr(data_loader.dataset, "__len__")
+        and len(data_loader.dataset) != num_processed_samples
+        and torch.distributed.get_rank() == 0
+    ):
+        # See FIXME above
+        warnings.warn(
+            f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
+            "samples were used for the validation, which might bias the results. "
+            "Try adjusting the batch size and / or the world size. "
+            "Setting the world size to 1 is always a safe bet."
+        )
+
     return confmat
@@ -108,12 +126,18 @@ def main(args):
     device = torch.device(args.device)
 
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args))
     dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args))
 
     if args.distributed:
         train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
-        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
+        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
     else:
         train_sampler = torch.utils.data.RandomSampler(dataset)
         test_sampler = torch.utils.data.SequentialSampler(dataset_test)
@@ -191,6 +215,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         confmat = evaluate(model, data_loader_test, device=device, num_classes=num_classes)
         print(confmat)
         return
@@ -261,6 +288,9 @@ def get_args_parser(add_help=True):
         help="Only test the model",
         action="store_true",
     )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
     parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
diff --git a/references/segmentation/utils.py b/references/segmentation/utils.py
index 27c8f4ce51e..dfd12726b53 100644
--- a/references/segmentation/utils.py
+++ b/references/segmentation/utils.py
@@ -30,11 +30,7 @@ def synchronize_between_processes(self):
         """
         Warning: does not synchronize the deque!
         """
-        if not is_dist_avail_and_initialized():
-            return
-        t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda")
-        dist.barrier()
-        dist.all_reduce(t)
+        t = reduce_across_processes([self.count, self.total])
         t = t.tolist()
         self.count = int(t[0])
         self.total = t[1]
@@ -92,12 +88,7 @@ def compute(self):
         return acc_global, acc, iu
 
     def reduce_from_all_processes(self):
-        if not torch.distributed.is_available():
-            return
-        if not torch.distributed.is_initialized():
-            return
-        torch.distributed.barrier()
-        torch.distributed.all_reduce(self.mat)
+        reduce_across_processes(self.mat)
 
     def __str__(self):
         acc_global, acc, iu = self.compute()
@@ -296,3 +287,14 @@ def init_distributed_mode(args):
     )
     torch.distributed.barrier()
     setup_for_distributed(args.rank == 0)
+
+
+def reduce_across_processes(val):
+    if not is_dist_avail_and_initialized():
+        # nothing to sync, but we still convert to tensor for consistency with the distributed case.
+        return torch.tensor(val)
+
+    t = torch.tensor(val, device="cuda")
+    dist.barrier()
+    dist.all_reduce(t)
+    return t
diff --git a/references/similarity/train.py b/references/similarity/train.py
index 9c24ce73f3c..146e2bef688 100644
--- a/references/similarity/train.py
+++ b/references/similarity/train.py
@@ -88,6 +88,13 @@ def save(model, epoch, save_dir, file_name):
 
 def main(args):
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
+
     p = args.labels_per_batch
     k = args.samples_per_label
     batch_size = p * k
@@ -126,6 +133,13 @@ def main(args):
     )
     test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.workers)
 
+    if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
+        evaluate(model, test_loader, device)
+        return
+
     for epoch in range(1, args.epochs + 1):
         print("Training...")
         train_epoch(model, optimizer, criterion, train_loader, device, epoch, args.print_freq)
@@ -155,6 +169,15 @@ def parse_args():
     parser.add_argument("--print-freq", default=20, type=int, help="print frequency")
     parser.add_argument("--save-dir", default=".", type=str, help="Model save directory")
     parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
+    parser.add_argument(
+        "--test-only",
+        dest="test_only",
+        help="Only test the model",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     return parser.parse_args()
diff --git a/references/video_classification/train.py b/references/video_classification/train.py
index 918a012282e..727e4b3cc03 100644
--- a/references/video_classification/train.py
+++ b/references/video_classification/train.py
@@ -50,6 +50,7 @@ def evaluate(model, criterion, data_loader, device):
     model.eval()
     metric_logger = utils.MetricLogger(delimiter="  ")
     header = "Test:"
+    num_processed_samples = 0
     with torch.inference_mode():
         for video, target in metric_logger.log_every(data_loader, 100, header):
             video = video.to(device, non_blocking=True)
@@ -64,7 +65,22 @@ def evaluate(model, criterion, data_loader, device):
             metric_logger.update(loss=loss.item())
             metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
             metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
+            num_processed_samples += batch_size
     # gather the stats from all processes
+    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
+    if (
+        hasattr(data_loader.dataset, "__len__")
+        and len(data_loader.dataset) != num_processed_samples
+        and torch.distributed.get_rank() == 0
+    ):
+        # See FIXME above
+        warnings.warn(
+            f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
+            "samples were used for the validation, which might bias the results. "
+            "Try adjusting the batch size and / or the world size. "
+            "Setting the world size to 1 is always a safe bet."
+        )
+
     metric_logger.synchronize_between_processes()
 
     print(
@@ -99,7 +115,11 @@ def main(args):
     device = torch.device(args.device)
 
-    torch.backends.cudnn.benchmark = True
+    if args.use_deterministic_algorithms:
+        torch.backends.cudnn.benchmark = False
+        torch.use_deterministic_algorithms(True)
+    else:
+        torch.backends.cudnn.benchmark = True
 
     # Data loading code
     print("Loading data")
@@ -173,7 +193,7 @@ def main(args):
         test_sampler = UniformClipSampler(dataset_test.video_clips, args.clips_per_video)
     if args.distributed:
         train_sampler = DistributedSampler(train_sampler)
-        test_sampler = DistributedSampler(test_sampler)
+        test_sampler = DistributedSampler(test_sampler, shuffle=False)
 
     data_loader = torch.utils.data.DataLoader(
         dataset,
@@ -248,6 +268,9 @@ def main(args):
         scaler.load_state_dict(checkpoint["scaler"])
 
     if args.test_only:
+        # We disable the cudnn benchmarking because it can noticeably affect the accuracy
+        torch.backends.cudnn.benchmark = False
+        torch.backends.cudnn.deterministic = True
         evaluate(model, criterion, data_loader_test, device=device)
         return
@@ -335,6 +358,9 @@ def parse_args():
         help="Only test the model",
         action="store_true",
     )
+    parser.add_argument(
+        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
+    )
 
     # distributed training parameters
     parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
diff --git a/references/video_classification/utils.py b/references/video_classification/utils.py
index 116adf8d72f..024426d5916 100644
--- a/references/video_classification/utils.py
+++ b/references/video_classification/utils.py
@@ -30,11 +30,7 @@ def synchronize_between_processes(self):
         """
         Warning: does not synchronize the deque!
""" - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") - dist.barrier() - dist.all_reduce(t) + t = reduce_across_processes([self.count, self.total]) t = t.tolist() self.count = int(t[0]) self.total = t[1] @@ -255,3 +251,14 @@ def init_distributed_mode(args): ) torch.distributed.barrier() setup_for_distributed(args.rank == 0) + + +def reduce_across_processes(val): + if not is_dist_avail_and_initialized(): + # nothing to sync, but we still convert to tensor for consistency with the distributed case. + return torch.tensor(val) + + t = torch.tensor(val, device="cuda") + dist.barrier() + dist.all_reduce(t) + return t From 60347ac503f185bab5a6b69104d1909cfdba1b74 Mon Sep 17 00:00:00 2001 From: Yosua Michael Maranatha Date: Wed, 20 Apr 2022 14:55:28 +0100 Subject: [PATCH 2/5] Remove unnecessary new line --- references/classification/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/references/classification/train.py b/references/classification/train.py index 92a77768ff3..6a3c289bc04 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -59,7 +59,6 @@ def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, arg def evaluate(model, criterion, data_loader, device, print_freq=100, log_suffix=""): - model.eval() metric_logger = utils.MetricLogger(delimiter=" ") header = f"Test: {log_suffix}" From 07f0622ae7cc45d876ae1cb9ed90766e7b5dfc45 Mon Sep 17 00:00:00 2001 From: Yosua Michael Maranatha Date: Thu, 21 Apr 2022 11:02:52 +0000 Subject: [PATCH 3/5] Fix missing import warnings --- references/segmentation/train.py | 1 + references/video_classification/train.py | 1 + 2 files changed, 2 insertions(+) diff --git a/references/segmentation/train.py b/references/segmentation/train.py index 2ea2d8482a1..95dfedb5e9a 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -1,6 +1,7 @@ import datetime import os import time +import warnings import presets import torch diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 727e4b3cc03..bf75cd01453 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -1,6 +1,7 @@ import datetime import os import time +import warnings import presets import torch From e92a13cb8805641517d0029d7ae349f248942090 Mon Sep 17 00:00:00 2001 From: Yosua Michael Maranatha Date: Mon, 25 Apr 2022 14:07:16 +0100 Subject: [PATCH 4/5] Fix the warning on video_classification --- references/video_classification/train.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 727e4b3cc03..2ade3c42e8c 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -68,14 +68,20 @@ def evaluate(model, criterion, data_loader, device): num_processed_samples += batch_size # gather the stats from all processes num_processed_samples = utils.reduce_across_processes(num_processed_samples) + if not utils.is_dist_avail_and_initialized(): + num_data_from_sampler = len(data_loader.sampler) + else: + # Get the len of UniformClipSampler + num_data_from_sampler = len(data_loader.dataset.sampler) + if ( hasattr(data_loader.dataset, "__len__") - and len(data_loader.dataset) != num_processed_samples + and num_data_from_sampler != num_processed_samples and torch.distributed.get_rank() == 0 ): # See 
FIXME above warnings.warn( - f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} " + f"It looks like the sampler has {num_data_from_sampler} samples, but {num_processed_samples} " "samples were used for the validation, which might bias the results. " "Try adjusting the batch size and / or the world size. " "Setting the world size to 1 is always a safe bet." From 13165416ce8237bad4b5253bd19f8fa8b98e9a0f Mon Sep 17 00:00:00 2001 From: Yosua Michael Maranatha Date: Mon, 25 Apr 2022 15:53:02 +0000 Subject: [PATCH 5/5] Fix bug to get len of UniformClipSampler --- references/video_classification/train.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/references/video_classification/train.py b/references/video_classification/train.py index 548b8ee7fb0..26c856da878 100644 --- a/references/video_classification/train.py +++ b/references/video_classification/train.py @@ -69,11 +69,11 @@ def evaluate(model, criterion, data_loader, device): num_processed_samples += batch_size # gather the stats from all processes num_processed_samples = utils.reduce_across_processes(num_processed_samples) - if not utils.is_dist_avail_and_initialized(): - num_data_from_sampler = len(data_loader.sampler) + if isinstance(data_loader.sampler, DistributedSampler): + # Get the len of UniformClipSampler inside DistributedSampler + num_data_from_sampler = len(data_loader.sampler.dataset) else: - # Get the len of UniformClipSampler - num_data_from_sampler = len(data_loader.dataset.sampler) + num_data_from_sampler = len(data_loader.sampler) if ( hasattr(data_loader.dataset, "__len__")
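
Note for readers of the series above: the eval-variance fix boils down to two ideas. Evaluation is made reproducible on request (cudnn benchmarking off, torch.use_deterministic_algorithms(True), no shuffling in the test DistributedSampler), and evaluate() now counts how many samples were actually scored so it can warn when DistributedSampler has padded the validation set with duplicates. The snippet below is a minimal, self-contained illustration of that padding effect; it is not part of the patches, and the dataset size and world size are made-up example numbers.

import math

# Hypothetical numbers for illustration only (not taken from the patches).
dataset_len = 49_817   # validation samples
world_size = 8         # distributed processes

# With drop_last=False, DistributedSampler gives every rank
# ceil(dataset_len / world_size) samples and repeats a few samples
# so that the total divides evenly across ranks.
samples_per_rank = math.ceil(dataset_len / world_size)
num_processed_samples = samples_per_rank * world_size  # what all ranks evaluate together

print(f"dataset has {dataset_len} samples, evaluation processed {num_processed_samples}")
if num_processed_samples != dataset_len:
    extra = num_processed_samples - dataset_len
    print(f"{extra} duplicated samples would slightly bias the reported metrics")

Running the reference scripts with a world size of 1, or with a world size that divides the validation set evenly, avoids the padding entirely, which is what the warning message added in PATCH 1 suggests.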