rebase on "[TensorExpr] Fix lowering for aten::div."
Differential Revision: [D25130750](https://our.internmc.facebook.com/intern/diff/D25130750)

[ghstack-poisoned]
Mikhail Zolotukhin committed Nov 21, 2020
2 parents 946b4ab + 9554129 commit 63c23be
Showing 34 changed files with 228 additions and 99 deletions.
8 changes: 4 additions & 4 deletions .circleci/cimodel/data/windows_build_definitions.py
@@ -131,10 +131,10 @@ def TruePred(_):
WindowsJob(None, _VC2019, CudaVersion(10, 1)),
WindowsJob(1, _VC2019, CudaVersion(10, 1)),
WindowsJob(2, _VC2019, CudaVersion(10, 1)),
-# VS2019 CUDA-11.1
-WindowsJob(None, _VC2019, CudaVersion(11, 1)),
-WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
-WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only_pred=TruePred),
+# VS2019 CUDA-11.0
+WindowsJob(None, _VC2019, CudaVersion(11, 0)),
+WindowsJob(1, _VC2019, CudaVersion(11, 0), master_only_pred=TruePred),
+WindowsJob(2, _VC2019, CudaVersion(11, 0), master_only_pred=TruePred),
# VS2019 CPU-only
WindowsJob(None, _VC2019, None),
WindowsJob(1, _VC2019, None, master_only_pred=TruePred),
10 changes: 5 additions & 5 deletions .circleci/config.yml
@@ -7833,7 +7833,7 @@ workflows:
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
-name: pytorch_windows_vs2019_py36_cuda11.1_build
+name: pytorch_windows_vs2019_py36_cuda11.0_build
python_version: "3.6"
use_cuda: "1"
vc_product: Community
@@ -7849,10 +7849,10 @@
- master
- /ci-all\/.*/
- /release\/.*/
-name: pytorch_windows_vs2019_py36_cuda11.1_test1
+name: pytorch_windows_vs2019_py36_cuda11.0_test1
python_version: "3.6"
requires:
-- pytorch_windows_vs2019_py36_cuda11.1_build
+- pytorch_windows_vs2019_py36_cuda11.0_build
test_name: pytorch-windows-test1
use_cuda: "1"
vc_product: Community
@@ -7868,10 +7868,10 @@
- master
- /ci-all\/.*/
- /release\/.*/
-name: pytorch_windows_vs2019_py36_cuda11.1_test2
+name: pytorch_windows_vs2019_py36_cuda11.0_test2
python_version: "3.6"
requires:
-- pytorch_windows_vs2019_py36_cuda11.1_build
+- pytorch_windows_vs2019_py36_cuda11.0_build
test_name: pytorch-windows-test2
use_cuda: "1"
vc_product: Community
6 changes: 3 additions & 3 deletions .circleci/scripts/windows_cuda_install.sh
@@ -7,10 +7,10 @@ if [[ "$CUDA_VERSION" == "10" ]]; then
msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
elif [[ "$CUDA_VERSION" == "11" ]]; then
-cuda_complete_version="11.1"
-cuda_installer_name="cuda_11.1.0_456.43_win10"
+cuda_complete_version="11.0"
+cuda_installer_name="cuda_11.0.2_451.48_win10"
msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
-cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
+cuda_install_packages="nvcc_11.0 cuobjdump_11.0 nvprune_11.0 nvprof_11.0 cupti_11.0 cublas_11.0 cublas_dev_11.0 cudart_11.0 cufft_11.0 cufft_dev_11.0 curand_11.0 curand_dev_11.0 cusolver_11.0 cusolver_dev_11.0 cusparse_11.0 cusparse_dev_11.0 npp_11.0 npp_dev_11.0 nvrtc_11.0 nvrtc_dev_11.0 nvml_dev_11.0"
else
echo "CUDA_VERSION $CUDA_VERSION is not supported yet"
exit 1
4 changes: 2 additions & 2 deletions .circleci/scripts/windows_cudnn_install.sh
@@ -5,8 +5,8 @@ if [[ "$CUDA_VERSION" == "10" ]]; then
cuda_complete_version="10.1"
cudnn_installer_name="cudnn-10.1-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" == "11" ]]; then
-cuda_complete_version="11.1"
-cudnn_installer_name="cudnn-11.1-windows-x64-v8.0.5.39"
+cuda_complete_version="11.0"
+cudnn_installer_name="cudnn-11.0-windows-x64-v8.0.4.30"
else
echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet"
exit 1
4 changes: 2 additions & 2 deletions .jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -58,8 +58,8 @@ goto cuda_build_common

:cuda_build_11

-set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
-set CUDA_PATH_V11_1=%CUDA_PATH%
+set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0
+set CUDA_PATH_V11_0=%CUDA_PATH%

goto cuda_build_common

2 changes: 1 addition & 1 deletion aten/src/ATen/DynamicLibrary.cpp
@@ -6,7 +6,7 @@
#include <dlfcn.h>
#include <libgen.h>
#else
-#include <Windows.h>
+#include <c10/util/win32-headers.h>
#endif

namespace at {
5 changes: 0 additions & 5 deletions aten/src/ATen/cuda/Exceptions.h
@@ -79,11 +79,6 @@ const char *cusparseGetErrorString(cusparseStatus_t status);

#define AT_CUDA_CHECK(EXPR) C10_CUDA_CHECK(EXPR)

-// This should be used directly after every kernel launch to ensure
-// the launch happened correctly and provide an early, close-to-source
-// diagnostic if it didn't.
-#define TORCH_CUDA_KERNEL_LAUNCH_CHECK() AT_CUDA_CHECK(cudaGetLastError())
-
// For CUDA Driver API
//
// This is here instead of in c10 because NVRTC is loaded dynamically via a stub
2 changes: 1 addition & 1 deletion aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@@ -209,7 +209,7 @@ static void norm_kernel_tensor_iterator_impl(
binary_kernel_reduce(
iter,
AbsMaxOps<scalar_t>(),
-scalar_t(std::numeric_limits<scalar_t>::min())
+scalar_t(0)
);
});
} else if (val == -INFINITY) {
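Why the new identity is correct: for ord = +inf the reduction takes the max over |x_i|, and std::numeric_limits<scalar_t>::min() is, for floating-point types, the smallest positive normal value rather than the most negative value, so it silently dominates an all-zero input. A standalone C++ illustration of the difference (toy code, not from this PR):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  const float xs[] = {0.0f, -0.0f, 0.0f};
  // Old identity: smallest positive normal float (~1.18e-38), which no |x| here can reach.
  float bad = std::numeric_limits<float>::min();
  // New identity: 0 is a true identity for max(|x|), since |x| >= 0 always.
  float good = 0.0f;
  for (float x : xs) {
    bad = std::max(bad, std::fabs(x));
    good = std::max(good, std::fabs(x));
  }
  std::printf("%g vs %g\n", bad, good);  // 1.17549e-38 vs 0: only the latter is the inf-norm
}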
4 changes: 3 additions & 1 deletion aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm
@@ -152,7 +152,9 @@ Tensor max_pool2d(
strideInPixelsX:stride[0]
strideInPixelsY:stride[1]];
[pool setEdgeMode:MPSImageEdgeModeClamp];
-[pool setOffset:{.x = kernel_size[0] / 2, .y = kernel_size[1] / 2, .z = 0}];
+[pool setOffset:{.x = static_cast<NSInteger>(kernel_size[0] / 2),
+                 .y = static_cast<NSInteger>(kernel_size[1] / 2),
+                 .z = 0}];

int64_t oN = iN;
int64_t oC = iC;
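The casts are needed because a braced initializer list rejects implicit narrowing conversions, and the kernel_size elements are int64_t while the offset fields are NSInteger. A minimal repro of the rule in plain C++ (types simplified, not the Metal API):

#include <cstdint>

struct Offset { int x; int y; int z; };  // stand-in for the Metal offset struct

int main() {
  int64_t kernel_w = 3, kernel_h = 3;
  // Offset off{kernel_w / 2, kernel_h / 2, 0};  // ill-formed: narrowing int64_t -> int in {}
  Offset off{static_cast<int>(kernel_w / 2),
             static_cast<int>(kernel_h / 2),
             0};  // explicit casts satisfy the no-narrowing rule
  return off.x + off.y + off.z;
}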
4 changes: 2 additions & 2 deletions aten/src/TH/THAllocator.cpp
@@ -9,7 +9,7 @@

/* stuff for mapped files */
#ifdef _WIN32
-#include <windows.h>
+#include <c10/util/win32-headers.h>
#endif

#if defined(HAVE_MMAP)
@@ -333,7 +333,7 @@ typedef struct{
HANDLE handle;
HANDLE wait;
} ReleaseContext;
-static VOID CALLBACK WaitForReleaseHandle(PVOID lpParam, BOOLEAN TimerOrWaitFired)
+static void CALLBACK WaitForReleaseHandle(PVOID lpParam, BOOLEAN TimerOrWaitFired)
{
if (lpParam) {
ReleaseContext *ctx = (ReleaseContext *)lpParam;
3 changes: 3 additions & 0 deletions binaries/benchmark_helper.cc
@@ -19,6 +19,9 @@
#include <string>
#include <thread>
#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
#include <windows.h>
#include <psapi.h>
#endif
5 changes: 5 additions & 0 deletions c10/cuda/CUDAException.h
@@ -29,3 +29,8 @@
TORCH_WARN("CUDA warning: ", cudaGetErrorString(__err)); \
} \
} while (0)

+// This should be used directly after every kernel launch to ensure
+// the launch happened correctly and provide an early, close-to-source
+// diagnostic if it didn't.
+#define TORCH_CUDA_KERNEL_LAUNCH_CHECK() C10_CUDA_CHECK(cudaGetLastError())
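The launch-check macro moves from ATen's Exceptions.h (deleted above) into c10, now wrapping C10_CUDA_CHECK directly. A minimal sketch of the usage pattern the comment describes; the kernel and launch configuration below are hypothetical, not from this PR:

#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

// Hypothetical elementwise kernel, for illustration only.
__global__ void scale_kernel(float* data, float alpha, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    data[i] *= alpha;
  }
}

void scale(float* data, float alpha, int n, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  scale_kernel<<<blocks, threads, 0, stream>>>(data, alpha, n);
  // Surfaces invalid-configuration and similar launch failures right here,
  // instead of at some later, harder-to-attribute synchronization point.
  TORCH_CUDA_KERNEL_LAUNCH_CHECK();
}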
6 changes: 1 addition & 5 deletions c10/util/Backtrace.cpp
@@ -9,12 +9,8 @@
#include <vector>

#ifdef _MSC_VER
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif
+#include <c10/util/win32-headers.h>
#include <iomanip>
-#include <Windows.h>
-#include <dbghelp.h>
#pragma comment(lib, "Dbghelp.lib")
#endif

4 changes: 4 additions & 0 deletions c10/util/C++17.h
@@ -24,6 +24,10 @@
#error You need C++14 to compile PyTorch
#endif

+#if defined(_WIN32) && (defined(min) || defined(max))
+# error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows
+#endif

/*
* This header adds some polyfills with C++17 functionality
*/
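The new check turns a notoriously cryptic Windows build failure into an explicit diagnostic. A toy sketch of the failure mode it guards against (the #define below simulates what windows.h does when NOMINMAX is absent; not PyTorch code):

#include <algorithm>

#define min(a, b) (((a) < (b)) ? (a) : (b))  // simulating the windows.h macro

int main() {
  // int lo = std::min(1, 2);  // error: expands to std::(((1) < (2)) ? (1) : (2))
  int lo = (std::min)(1, 2);   // parenthesizing the name blocks macro expansion
  return lo;
}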
57 changes: 57 additions & 0 deletions c10/util/win32-headers.h
@@ -0,0 +1,57 @@
#pragma once

#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef NOKERNEL
#define NOKERNEL
#endif
#ifndef NOUSER
#define NOUSER
#endif
#ifndef NOSERVICE
#define NOSERVICE
#endif
#ifndef NOSOUND
#define NOSOUND
#endif
#ifndef NOMCX
#define NOMCX
#endif
#ifndef NOGDI
#define NOGDI
#endif
#ifndef NOMSG
#define NOMSG
#endif
#ifndef NOMB
#define NOMB
#endif
#ifndef NOCLIPBOARD
#define NOCLIPBOARD
#endif

#include <windows.h>
#include <dbghelp.h>

#undef VOID
#undef DELETE
#undef IN
#undef THIS
#undef CONST
#undef NAN
#undef UNKNOWN
#undef NONE
#undef ANY
#undef IGNORE
#undef STRICT
#undef GetObject
#undef CreateSemaphore
#undef Yield
#undef RotateRight32
#undef RotateLeft32
#undef RotateRight64
#undef RotateLeft64
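This new header centralizes Windows.h hygiene that several files above previously duplicated inline: it trims what windows.h declares, then un-defines macros that collide with ordinary identifiers. A hedged usage sketch (hypothetical caller, not from this PR); because the wrapper #undefs VOID, callers must spell the return type with the plain keyword, which is exactly the VOID-to-void change in THAllocator.cpp above:

#include <c10/util/win32-headers.h>

// VOID is gone after the wrapper's #undef, so the Windows-style spelling
// of the return type would not compile; the C++ keyword is required.
static void CALLBACK OnHandleSignaled(PVOID lpParam, BOOLEAN TimerOrWaitFired) {
  // PVOID and BOOLEAN are genuine typedefs rather than macros, so they survive.
  (void)lpParam;
  (void)TimerOrWaitFired;
}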
4 changes: 2 additions & 2 deletions caffe2/operators/gather_ranges_to_dense_op.h
@@ -88,11 +88,11 @@ class GatherRangesToDenseOp final : public Operator<Context> {
CAFFE_ENFORCE_EQ(
ranges.size(1),
lengths_.size(),
"Nummber of ranges should match number of lengths");
"Number of ranges should match number of lengths");
CAFFE_ENFORCE_EQ(
ranges.size(1),
OutputSize(),
"Nummber of ranges should match number of outputs");
"Number of ranges should match number of outputs");
CAFFE_ENFORCE_EQ(
ranges.size(2), 2, "Ranges last dimension should be of size 2");

3 changes: 3 additions & 0 deletions caffe2/python/core.py
@@ -2343,6 +2343,9 @@ def make_builder(t):
)

def is_external_input(self, blob):
+if self._recreate_lookup_tables:
+    self._RecreateLookupTables()

name = str(blob)
return name in self._external_input_map

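is_external_input now refreshes the cached lookup tables when they are flagged stale before consulting _external_input_map. A hedged sketch of the dirty-flag pattern this follows (self-contained toy class, not the real caffe2 Net API):

class CachedLookup:
    """Toy illustration of the rebuild-before-read pattern used above."""

    def __init__(self):
        self._items = []                     # source of truth
        self._lookup = set()                 # derived cache
        self._recreate_lookup_tables = False

    def add(self, name):
        self._items.append(name)
        self._recreate_lookup_tables = True  # mark the cache stale; rebuild lazily

    def _RecreateLookupTables(self):
        self._lookup = set(self._items)
        self._recreate_lookup_tables = False

    def contains(self, name):
        # The fix above: refresh a stale cache before reading from it.
        if self._recreate_lookup_tables:
            self._RecreateLookupTables()
        return name in self._lookup


c = CachedLookup()
c.add("data")
assert c.contains("data")  # True: the stale cache is rebuilt before the lookup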
19 changes: 19 additions & 0 deletions docs/source/index.rst
@@ -10,6 +10,25 @@ PyTorch documentation

PyTorch is an optimized tensor library for deep learning using GPUs and CPUs.

+Features described in this documentation are classified by release status:
+
+*Stable:* These features will be maintained long-term and there should generally
+be no major performance limitations or gaps in documentation.
+We also expect to maintain backwards compatibility (although
+breaking changes can happen and notice will be given one release ahead
+of time).
+
+*Beta:* Features are tagged as Beta because the API may change based on
+user feedback, because the performance needs to improve, or because
+coverage across operators is not yet complete. For Beta features, we are
+committing to seeing the feature through to the Stable classification.
+We are not, however, committing to backwards compatibility.
+
+*Prototype:* These features are typically not available as part of
+binary distributions like PyPI or Conda, except sometimes behind run-time
+flags, and are at an early stage for feedback and testing.
+
+
.. toctree::
:glob:
:maxdepth: 1
3 changes: 3 additions & 0 deletions modules/observers/perf_observer.cc
@@ -18,6 +18,9 @@ defined(TARGET_IPHONE_SIMULATOR)
#endif

#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
#include <windows.h>
#endif

2 changes: 1 addition & 1 deletion test/jit/test_class_type.py
@@ -144,7 +144,7 @@ def test_conditional_set_attr(self):
@torch.jit.script
class FooTest(object):
def __init__(self, x):
-if True:
+if 1 == 1:
self.attr = x

def test_class_type_as_param(self):
22 changes: 11 additions & 11 deletions test/jit/test_list_dict.py
@@ -46,52 +46,52 @@ def str_in(x):
def test_list_literal(self):
def reassign():
x = [1]
-if True:
+if 1 == 1:
x = [2, 3]
return
self.checkScript(reassign, (), optimize=False)

def reassign_arity_change():
x = [1]
-if True:
+if 1 == 1:
x = [1, 2, 3]
return
self.checkScript(reassign_arity_change, (), optimize=False)

def reassign_from_empty_literal():
x = []
-if True:
+if 1 == 1:
x = [1, 2, 3]
return
with self.assertRaisesRegex(RuntimeError, r"previously has type List\[Tensor\]"):
self.checkScript(reassign_from_empty_literal, (), optimize=False)

def reassign_from_empty_builtin():
x = torch.jit.annotate(List[int], [])
-if True:
+if 1 == 1:
x = [1, 2, 3]
y = torch.jit.annotate(List[float], [])
-if True:
+if 1 == 1:
y = [1.0, 2.0, 3.0]
z = []
-if True:
+if 1 == 1:
z = [torch.randn([1])]
return
self.checkScript(reassign_from_empty_builtin, (), optimize=False)

def reassign_bad_type():
x = [1]
-if True:
+if 1 == 1:
x = [1.0]
return
with self.assertRaisesRegex(RuntimeError, "previously has type"):
self.checkScript(reassign_bad_type, (), optimize=False)

def reassign_nested():
x = torch.jit.annotate(List[int], [])
-if True:
+if 1 == 1:
x = [1, 2, 3]
-if True:
+if 1 == 1:
x = [1.0]
return
with self.assertRaisesRegex(RuntimeError, "previously has type"):
@@ -554,15 +554,15 @@ def test_append_2():
def test_mutable_list_append_if(self):
def test_append_if():
a = [1]
-if True:
+if 1 == 1:
a.append(4)
return a == [1, 4]
self.checkScript(test_append_if, ())

def test_mutable_list_append_if_else(self):
def test_append_if_else():
a = [1]
-if False:
+if 1 == 2:
a.append(4)
else:
a.append(10)
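Throughout these tests, literal conditions (if True: / if False:) become runtime comparisons (if 1 == 1: / if 1 == 2:). A plausible reading, not stated in the commit, is that the TorchScript compiler resolves literal boolean conditions statically and compiles only the taken branch, so a runtime-evaluated condition is needed to keep genuine control flow, and the type unification at the branch join, under test. A small scripted sketch of the construct being exercised:

import torch

@torch.jit.script
def append_if() -> bool:
    a = [1]
    if 1 == 1:  # evaluated at runtime, so both paths participate in compilation
        a.append(4)
    return a == [1, 4]

print(append_if())  # True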
