Update on "[PyTorch] Avoid move-constructing a List in listConstruct"
List's move ctor is a little bit more expensive than you might expect, but we can easily avoid it.
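The likely source of the cost: `c10::List` keeps the invariant that its internal implementation pointer is never null, so its move constructor can't simply steal the pointer and leave the source empty; it has to give the moved-from list a fresh allocation. The minimal sketch below illustrates that pattern and the cheaper alternative of constructing the storage in its final owner. `ListImpl`, `MyList`, `MyValue`, and both helper functions are hypothetical stand-ins, not the actual PyTorch code.

```cpp
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-in for a list's backing storage.
struct ListImpl {
  std::vector<int> elements;
};

// Like c10::List, this type keeps the invariant that impl_ is never
// null -- even after being moved from.
struct MyList {
  std::shared_ptr<ListImpl> impl_;

  MyList() : impl_(std::make_shared<ListImpl>()) {}

  // The surprising cost: to keep the moved-from list valid, the move
  // constructor must allocate a fresh ListImpl for it.
  MyList(MyList&& rhs)
      : impl_(std::exchange(rhs.impl_, std::make_shared<ListImpl>())) {}
};

// A boxed value that can own list storage, loosely modeled on IValue.
struct MyValue {
  std::shared_ptr<ListImpl> impl_;
  explicit MyValue(MyList list) : impl_(std::move(list.impl_)) {}
  explicit MyValue(std::shared_ptr<ListImpl> impl) : impl_(std::move(impl)) {}
};

// Pattern being avoided: build a MyList, then move the whole list
// into the value. Passing by value move-constructs the parameter,
// which allocates a throwaway ListImpl for the moved-from `vals`.
MyValue constructThenMove() {
  MyList vals;
  vals.impl_->elements = {1, 2, 3};
  return MyValue(std::move(vals));
}

// Cheaper: fill the backing storage directly and hand the pointer to
// the value, so MyList's move constructor never runs.
MyValue constructInPlace() {
  auto impl = std::make_shared<ListImpl>();
  impl->elements = {1, 2, 3};
  return MyValue(std::move(impl));
}
```

In `listConstruct` terms, the idea is to build the list's contents directly in the value that ends up on the stack rather than moving a temporary `List` into it.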

Differential Revision: [D25542190](https://our.internmc.facebook.com/intern/diff/D25542190/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D25542190/)!

[ghstack-poisoned]
swolchok committed Dec 15, 2020
2 parents 454082a + 523b8a0 commit 03d4bda
Showing 104 changed files with 2,714 additions and 1,681 deletions.
3 changes: 3 additions & 0 deletions .circleci/cimodel/data/simple/util/versions.py
@@ -29,3 +29,6 @@ def __init__(self, major, minor):
self.minor = minor

super().__init__([self.major, self.minor], "cuda")

def __str__(self):
return f"{self.major}.{self.minor}"
3 changes: 2 additions & 1 deletion .circleci/cimodel/data/windows_build_definitions.py
@@ -86,10 +86,11 @@ def gen_tree(self):
props_dict["executor"] = "windows-with-nvidia-gpu"

props_dict["cuda_version"] = (
miniutils.quote(str(self.cuda_version.major))
miniutils.quote(str(self.cuda_version))
if self.cuda_version
else "cpu"
)

props_dict["name"] = "_".join(name_parts)

return [{key_name: props_dict}]
20 changes: 10 additions & 10 deletions .circleci/config.yml
@@ -325,7 +325,7 @@ pytorch_windows_params: &pytorch_windows_params
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -675,7 +675,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -737,7 +737,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -8077,7 +8077,7 @@ workflows:
- postnightly
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
name: pytorch_windows_vs2019_py36_cuda10.1_build
python_version: "3.6"
use_cuda: "1"
@@ -8086,7 +8086,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test1
python_version: "3.6"
@@ -8099,7 +8099,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test2
python_version: "3.6"
@@ -8112,7 +8112,7 @@
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
name: pytorch_windows_vs2019_py36_cuda11.1_build
python_version: "3.6"
use_cuda: "1"
@@ -8121,7 +8121,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
@@ -8140,7 +8140,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
@@ -8204,7 +8204,7 @@ workflows:
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
filters:
branches:
only:
10 changes: 4 additions & 6 deletions .circleci/scripts/windows_cuda_install.sh
@@ -1,13 +1,11 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" == "10" ]]; then
cuda_complete_version="10.1"
if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_installer_name="cuda_10.1.243_426.00_win10"
msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
elif [[ "$CUDA_VERSION" == "11" ]]; then
cuda_complete_version="11.1"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
cuda_installer_name="cuda_11.1.0_456.43_win10"
msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
@@ -16,7 +14,7 @@ else
exit 1
fi

if [[ "${CUDA_VERSION}" != "10" && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
if [[ "$CUDA_VERSION" =~ ^11.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
cuda_install_packages="${cuda_install_packages} Display.Driver"
fi

@@ -48,7 +46,7 @@ then
export NVTOOLSEXT_PATH="C:\\Program Files\\NVIDIA Corporation\\NvToolsExt\\"
fi

if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/bin/nvcc.exe"
if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/bin/nvcc.exe"
then
echo "CUDA installation failed"
mkdir -p /c/w/build-results
12 changes: 5 additions & 7 deletions .circleci/scripts/windows_cudnn_install.sh
@@ -1,12 +1,10 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" == "10" ]]; then
cuda_complete_version="10.1"
cudnn_installer_name="cudnn-10.1-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" == "11" ]]; then
cuda_complete_version="11.1"
cudnn_installer_name="cudnn-11.1-windows-x64-v8.0.5.39"
if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39"
else
echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet"
exit 1
@@ -16,6 +14,6 @@ cudnn_installer_link="https://ossci-windows.s3.amazonaws.com/${cudnn_installer_n

curl --retry 3 -O $cudnn_installer_link
7z x ${cudnn_installer_name}.zip -ocudnn
cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/"
cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/"
rm -rf cudnn
rm -f ${cudnn_installer_name}.zip
@@ -59,7 +59,7 @@ pytorch_windows_params: &pytorch_windows_params
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
4 changes: 2 additions & 2 deletions .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -237,7 +237,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -299,7 +299,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
2 changes: 2 additions & 0 deletions .jenkins/pytorch/test.sh
@@ -11,6 +11,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

echo "Testing pytorch"

export LANG=C.UTF-8

if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then
export PYTORCH_TEST_WITH_SLOW=1
export PYTORCH_TEST_SKIP_FAST=1
34 changes: 10 additions & 24 deletions .jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -37,33 +37,19 @@ if "%VC_VERSION%" == "" (
@echo on
popd

if "%CUDA_VERSION%" == "9" goto cuda_build_9
if "%CUDA_VERSION%" == "10" goto cuda_build_10
if "%CUDA_VERSION%" == "11" goto cuda_build_11
goto cuda_build_end
if not "%USE_CUDA%"=="1" goto cuda_build_end

:cuda_build_9
set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2
set CUDA_PATH_V9_2=%CUDA_PATH%
rem transform the version string, for example 10.1 to 10_1.
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%

goto cuda_build_common

:cuda_build_10

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
set CUDA_PATH_V10_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_11

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
set CUDA_PATH_V11_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_common
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
set CUDNN_ROOT_DIR=%CUDA_PATH%
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%

set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
@@ -1,9 +1,9 @@
if "%CUDA_VERSION%" == "9" set CUDA_SUFFIX=cuda92
if "%CUDA_VERSION%" == "10" set CUDA_SUFFIX=cuda101
if "%CUDA_VERSION%" == "11" set CUDA_SUFFIX=cuda110
rem remove the dot in CUDA_VERSION, for example 11.1 to 111
set VERSION_SUFFIX=%CUDA_VERSION:.=%
set CUDA_SUFFIX=cuda%VERSION_SUFFIX%

if "%CUDA_SUFFIX%" == "" (
echo unknown CUDA version, please set `CUDA_VERSION` to 9, 10 or 11.
echo unknown CUDA version, please set `CUDA_VERSION` to a version higher than 9.2
exit /b 1
)

30 changes: 5 additions & 25 deletions .jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -46,33 +46,13 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% )

set DISTUTILS_USE_SDK=1

if "%CUDA_VERSION%" == "9" goto cuda_build_9
if "%CUDA_VERSION%" == "10" goto cuda_build_10
if "%CUDA_VERSION%" == "11" goto cuda_build_11
goto cuda_build_end
if not "%USE_CUDA%"=="1" goto cuda_build_end

:cuda_build_9
set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2
set CUDA_PATH_V9_2=%CUDA_PATH%

goto cuda_build_common

:cuda_build_10

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
set CUDA_PATH_V10_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_11

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
set CUDA_PATH_V11_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_common
rem transform the version string, for example 10.1 to 10_1.
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%

set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
3 changes: 2 additions & 1 deletion BUILD.bazel
@@ -339,7 +339,8 @@ filegroup(
"aten/src/ATen/cuda/CUDABlas.cpp",
"aten/src/ATen/cuda/CUDASolver.cpp",
"aten/src/ATen/cuda/CUDAContext.cpp",
"aten/src/ATen/cuda/CUDAGenerator.cpp",
"aten/src/ATen/cuda/CUDAGeneratorImpl.cpp",
"aten/src/ATen/cuda/CUDAGraph.cpp",
"aten/src/ATen/cuda/CuSparseHandlePool.cpp",
"aten/src/ATen/cuda/CublasHandlePool.cpp",
"aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -674,7 +674,7 @@ ccache -M 25Gi
```

To check this is working, do two clean builds of pytorch in a row. The second
build should be substantially and noticeably faster than the first build.
build should be substantially and noticeably faster than the first build. If this doesn't seem to be the case, check that each of the symlinks above actually links to your installation of `ccache`. For example, if you followed the first option and installed `ccache` from source on a Linux machine, running `readlink -e $(which g++)` should return `~/ccache/bin/ccache`.


#### Use a faster linker
6 changes: 3 additions & 3 deletions android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp
@@ -90,13 +90,13 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
#endif

#ifdef TRACE_ENABLED
static bool onFunctionEnter(
static std::unique_ptr<at::ObserverContext> onFunctionEnter(
const at::RecordFunction& fn) {
Trace::beginSection(fn.name().str());
return true;
return nullptr;
}

static void onFunctionExit(const at::RecordFunction&) {
static void onFunctionExit(const at::RecordFunction&, at::ObserverContext*) {
Trace::endSection();
}
#endif
5 changes: 3 additions & 2 deletions aten/src/ATen/CUDAGeneratorImpl.h
@@ -131,8 +131,8 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
void graph_prologue(int64_t* offset_extragraph);
uint64_t graph_epilogue();
void capture_prologue(int64_t* offset_extragraph);
uint64_t capture_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);

// Temporarily accommodates call sites that use philox_engine_inputs.
@@ -147,6 +147,7 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t philox_offset_per_thread_ = 0;
int64_t* offset_extragraph_;
uint32_t offset_intragraph_ = 0;
bool graph_expects_this_gen_ = false;
};

namespace cuda {
3 changes: 1 addition & 2 deletions aten/src/ATen/cpu/vec256/missing_vst1_neon.h
@@ -4,6 +4,5 @@ __extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
{
asm ("st1 {%S0.4s - %T0.4s}, [%1]" :: "w" (val), "r"(__a) :);
asm ("st1 {%S1.4s - %T1.4s}, [%2]" : "=m" (*__a) : "w" (val), "r"(__a) : "memory");
}

2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_bfloat16.h
@@ -25,7 +25,7 @@ static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
__m256i lo = _mm256_castps_si256(a);
__m256i hi = _mm256_castps_si256(b);
__m256i nan = _mm256_set1_epi32(0x7fc0);
__m256i nan = _mm256_set1_epi32(0xffff);
__m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q));
__m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q));
__m256i ones = _mm256_set1_epi32(0x1);
