pytorch · peterbell10 · Aug 18, 2021 · Aug 18, 2021 · Aug 18, 2021 · Aug 19, 2021
diff --git a/aten/src/ATen/LegacyTHFunctionsCUDA.h b/aten/src/ATen/LegacyTHFunctionsCUDA.h
diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh
@@ -5,7 +5,7 @@
 #include <THC/THCAtomics.cuh>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/macros/Macros.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
+#include <ATen/native/Copy.h>
 
 #include <math.h>
 
@@ -453,13 +453,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
 
   if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) {
     // Must perform in contiguous space
-    oldA = a;
-    a = a.contiguous();
+    oldA = std::exchange(a, a.contiguous());
   }
   if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) {
     // Must perform in contiguous space
-    oldB = b;
-    b = b.contiguous();
+    oldB = std::exchange(b, b.contiguous());
   }
 
   // It is possible that the tensor dimensions are able to be collapsed,
@@ -547,17 +545,11 @@ inline bool CUDA_tensor_apply2(at::Tensor a,
 #undef HANDLE_A_CASE
 
   if (oldA.defined()) {
-    // Ignore overlaps when copying back; if we use copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldA contiguous.
-    at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldA, a);
+    at::native::copy_ignoring_overlaps(oldA, a);
   }
 
   if (oldB.defined()) {
-    // Ignore overlaps when copying back; if we use copy
-    // instead, it will recursively try and invoke ourselves to make
-    // oldB contiguous.
-    at::native::legacy::cuda::_th_copy_ignoring_overlaps_(oldB, b);
+    at::native::copy_ignoring_overlaps(oldB, b);
   }
 
   return true;

diff --git a/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp b/aten/src/ATen/cuda/LegacyTHFunctionsCUDA.cpp
diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
@@ -253,6 +253,22 @@ Tensor& copy_(Tensor& self, const Tensor& src, bool non_blocking) {
   return self;
 }
 
+void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src) {
+  // Called when we are copying into an overlapping index `dst`, but we don't
+  // care which writer wins. Hacky but it works. This is only used by
+  // CUDA_tensor_apply2 in case that there are write overlaps.
+  // FIXME: really, overlapping writes should be illegal/an error in Torch
+  auto iter = TensorIteratorConfig()
+      .add_output(dst)
+      .add_input(src)
+      .resize_outputs(false)
+      .set_check_mem_overlap(false)
+      .check_all_same_dtype(true)
+      .check_all_same_device(true)
+      .build();
+  copy_stub(iter.device_type(), iter, /*non_blocking=*/false);
+}
+
 DEFINE_DISPATCH(copy_stub);
 
 } // namespace native

diff --git a/aten/src/ATen/native/Copy.h b/aten/src/ATen/native/Copy.h
@@ -13,5 +13,7 @@ using copy_fn = void (*)(TensorIterator&, bool non_blocking);
 
 DECLARE_DISPATCH(copy_fn, copy_stub);
 
+TORCH_API void copy_ignoring_overlaps(const Tensor &dst, const Tensor &src);
+
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/cuda/DistributionBernoulli.cu b/aten/src/ATen/native/cuda/DistributionBernoulli.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu b/aten/src/ATen/native/cuda/DistributionCauchyKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu b/aten/src/ATen/native/cuda/DistributionExponentialKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu b/aten/src/ATen/native/cuda/DistributionGeometricKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu b/aten/src/ATen/native/cuda/DistributionLogNormalKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionNormal.cu b/aten/src/ATen/native/cuda/DistributionNormal.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/DistributionRandomKernel.cu b/aten/src/ATen/native/cuda/DistributionRandomKernel.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
@@ -16,7 +16,6 @@
 #include <ATen/native/Distributions.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/TensorIterator.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 
 #include <THC/THCGeneral.h>
 #include <THC/THCApply.cuh>

diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -1,6 +1,5 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/CUDAFunctions.h>
 #include <ATen/cuda/CUDAContext.h>

diff --git a/aten/src/ATen/native/cuda/Sort.cu b/aten/src/ATen/native/cuda/Sort.cu
@@ -3,7 +3,6 @@
 #include <ATen/ATen.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/WrapDimUtils.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/core/Array.h>
 #include <ATen/cuda/cub.cuh>
 #include <ATen/cuda/CUDAContext.h>

diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -1,7 +1,6 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/detail/TensorInfo.cuh>
 #include <ATen/cuda/detail/OffsetCalculator.cuh>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/native/Resize.h>
 #include <ATen/native/cuda/SortingCommon.cuh>
 #include <ATen/native/cuda/SortingRadixSelect.cuh>

diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu
@@ -1,6 +1,5 @@
 #include <ATen/ATen.h>
 #include <ATen/AccumulateType.h>
-#include <ATen/LegacyTHFunctionsCUDA.h>
 #include <ATen/NativeFunctions.h>
 #include <ATen/TensorUtils.h>
 #include <ATen/Utils.h>

diff --git a/aten/src/ATen/templates/RegisterDispatchKey.cpp b/aten/src/ATen/templates/RegisterDispatchKey.cpp
@@ -33,7 +33,6 @@
 #include <ATen/core/op_registration/adaption.h>
 #include <torch/library.h>
 $extra_cuda_headers
-$legacy_th_headers
 $external_backend_headers
 $namespaced_headers
 

diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py
@@ -1096,9 +1096,6 @@ def make_file_manager(install_dir: str) -> FileManager:
 
         fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
             'extra_cuda_headers': extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else '',
-            'legacy_th_headers':
-                '#include <ATen/LegacyTHFunctionsCUDA.h>' if dispatch_key == DispatchKey.CUDA else
-                '',
             'external_backend_headers': '',
             'namespaced_headers': f'#include <ATen/{dispatch_key}Functions.h>' if dispatch_key in functions_keys else '',
             'DispatchKey': dispatch_key,

diff --git a/tools/codegen/gen_backend_stubs.py b/tools/codegen/gen_backend_stubs.py
@@ -227,7 +227,6 @@ def make_file_manager(install_dir: str) -> FileManager:
         for dispatch_key in [backend_dispatch_key, autograd_dispatch_key]:
             fm.write_with_template(f'Register{dispatch_key}.cpp', 'RegisterDispatchKey.cpp', lambda: {
                 'extra_cuda_headers': '',
-                'legacy_th_headers': '',
                 'external_backend_headers': f'#include "{output_dir}/{backend_key}NativeFunctions.h"',
                 'namespaced_headers': '',
                 'DispatchKey': dispatch_key,