Update on "[PyTorch] Avoid move-constructing a List in listConstruct"
List's move ctor is a little bit more expensive than you might expect, but we can easily avoid it.
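The likely source of the cost: `c10::List` keeps the invariant that its internal implementation pointer is never null, so its move constructor can't simply steal the pointer and leave the source empty; it has to give the moved-from list a fresh allocation. The minimal sketch below illustrates that pattern and the cheaper alternative of constructing the storage in its final owner. `ListImpl`, `MyList`, `MyValue`, and both helper functions are hypothetical stand-ins, not the actual PyTorch code.

```cpp
#include <memory>
#include <utility>
#include <vector>

// Hypothetical stand-in for a list's backing storage.
struct ListImpl {
  std::vector<int> elements;
};

// Like c10::List, this type keeps the invariant that impl_ is never
// null -- even after being moved from.
struct MyList {
  std::shared_ptr<ListImpl> impl_;

  MyList() : impl_(std::make_shared<ListImpl>()) {}

  // The surprising cost: to keep the moved-from list valid, the move
  // constructor must allocate a fresh ListImpl for it.
  MyList(MyList&& rhs)
      : impl_(std::exchange(rhs.impl_, std::make_shared<ListImpl>())) {}
};

// A boxed value that can own list storage, loosely modeled on IValue.
struct MyValue {
  std::shared_ptr<ListImpl> impl_;
  explicit MyValue(MyList list) : impl_(std::move(list.impl_)) {}
  explicit MyValue(std::shared_ptr<ListImpl> impl) : impl_(std::move(impl)) {}
};

// Pattern being avoided: build a MyList, then move the whole list
// into the value. Passing by value move-constructs the parameter,
// which allocates a throwaway ListImpl for the moved-from `vals`.
MyValue constructThenMove() {
  MyList vals;
  vals.impl_->elements = {1, 2, 3};
  return MyValue(std::move(vals));
}

// Cheaper: fill the backing storage directly and hand the pointer to
// the value, so MyList's move constructor never runs.
MyValue constructInPlace() {
  auto impl = std::make_shared<ListImpl>();
  impl->elements = {1, 2, 3};
  return MyValue(std::move(impl));
}
```

In `listConstruct` terms, the idea is to build the list's contents directly in the value that ends up on the stack rather than moving a temporary `List` into it.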

Differential Revision: [D25542190](https://our.internmc.facebook.com/intern/diff/D25542190/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D25542190/)!

[ghstack-poisoned]
swolchok committed Dec 15, 2020
2 parents 454082a + 523b8a0 commit 03d4bda
Showing 104 changed files with 2,714 additions and 1,681 deletions.
3 changes: 3 additions & 0 deletions .circleci/cimodel/data/simple/util/versions.py
@@ -29,3 +29,6 @@ def __init__(self, major, minor):
self.minor = minor

super().__init__([self.major, self.minor], "cuda")

def __str__(self):
return f"{self.major}.{self.minor}"
3 changes: 2 additions & 1 deletion .circleci/cimodel/data/windows_build_definitions.py
@@ -86,10 +86,11 @@ def gen_tree(self):
props_dict["executor"] = "windows-with-nvidia-gpu"

props_dict["cuda_version"] = (
miniutils.quote(str(self.cuda_version.major))
miniutils.quote(str(self.cuda_version))
if self.cuda_version
else "cpu"
)

props_dict["name"] = "_".join(name_parts)

return [{key_name: props_dict}]
20 changes: 10 additions & 10 deletions .circleci/config.yml
@@ -325,7 +325,7 @@ pytorch_windows_params: &pytorch_windows_params
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -675,7 +675,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -737,7 +737,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -8077,7 +8077,7 @@ workflows:
- postnightly
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
name: pytorch_windows_vs2019_py36_cuda10.1_build
python_version: "3.6"
use_cuda: "1"
@@ -8086,7 +8086,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test1
python_version: "3.6"
@@ -8099,7 +8099,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
executor: windows-with-nvidia-gpu
name: pytorch_windows_vs2019_py36_cuda10.1_test2
python_version: "3.6"
@@ -8112,7 +8112,7 @@
vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
name: pytorch_windows_vs2019_py36_cuda11.1_build
python_version: "3.6"
use_cuda: "1"
@@ -8121,7 +8121,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
@@ -8140,7 +8140,7 @@
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11"
cuda_version: "11.1"
executor: windows-with-nvidia-gpu
filters:
branches:
@@ -8204,7 +8204,7 @@ workflows:
vc_year: "2019"
- pytorch_windows_test:
build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
cuda_version: "10"
cuda_version: "10.1"
filters:
branches:
only:
10 changes: 4 additions & 6 deletions .circleci/scripts/windows_cuda_install.sh
@@ -1,13 +1,11 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" == "10" ]]; then
cuda_complete_version="10.1"
if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_installer_name="cuda_10.1.243_426.00_win10"
msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
elif [[ "$CUDA_VERSION" == "11" ]]; then
cuda_complete_version="11.1"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
cuda_installer_name="cuda_11.1.0_456.43_win10"
msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
@@ -16,7 +14,7 @@ else
exit 1
fi

if [[ "${CUDA_VERSION}" != "10" && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
if [[ "$CUDA_VERSION" =~ ^11.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
cuda_install_packages="${cuda_install_packages} Display.Driver"
fi

@@ -48,7 +46,7 @@ then
export NVTOOLSEXT_PATH="C:\\Program Files\\NVIDIA Corporation\\NvToolsExt\\"
fi

if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/bin/nvcc.exe"
if ! ls "/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/bin/nvcc.exe"
then
echo "CUDA installation failed"
mkdir -p /c/w/build-results
12 changes: 5 additions & 7 deletions .circleci/scripts/windows_cudnn_install.sh
@@ -1,12 +1,10 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" == "10" ]]; then
cuda_complete_version="10.1"
cudnn_installer_name="cudnn-10.1-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" == "11" ]]; then
cuda_complete_version="11.1"
cudnn_installer_name="cudnn-11.1-windows-x64-v8.0.5.39"
if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39"
else
echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet"
exit 1
@@ -16,6 +14,6 @@ cudnn_installer_link="https://ossci-windows.s3.amazonaws.com/${cudnn_installer_n

curl --retry 3 -O $cudnn_installer_link
7z x ${cudnn_installer_name}.zip -ocudnn
cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${cuda_complete_version}/"
cp -r cudnn/cuda/* "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${CUDA_VERSION}/"
rm -rf cudnn
rm -f ${cudnn_installer_name}.zip
@@ -59,7 +59,7 @@ pytorch_windows_params: &pytorch_windows_params
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
4 changes: 2 additions & 2 deletions .circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -237,7 +237,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
@@ -299,7 +299,7 @@ jobs:
default: ""
cuda_version:
type: string
default: "10"
default: "10.1"
python_version:
type: string
default: "3.6"
2 changes: 2 additions & 0 deletions .jenkins/pytorch/test.sh
@@ -11,6 +11,8 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

echo "Testing pytorch"

export LANG=C.UTF-8

if [[ "$BUILD_ENVIRONMENT" == *-slow-* ]]; then
export PYTORCH_TEST_WITH_SLOW=1
export PYTORCH_TEST_SKIP_FAST=1
34 changes: 10 additions & 24 deletions .jenkins/pytorch/win-test-helpers/build_pytorch.bat
@@ -37,33 +37,19 @@ if "%VC_VERSION%" == "" (
@echo on
popd

if "%CUDA_VERSION%" == "9" goto cuda_build_9
if "%CUDA_VERSION%" == "10" goto cuda_build_10
if "%CUDA_VERSION%" == "11" goto cuda_build_11
goto cuda_build_end
if not "%USE_CUDA%"=="1" goto cuda_build_end

:cuda_build_9
set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2
set CUDA_PATH_V9_2=%CUDA_PATH%
rem transform the version string, for example 10.1 to 10_1.
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%

goto cuda_build_common

:cuda_build_10

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
set CUDA_PATH_V10_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_11

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
set CUDA_PATH_V11_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_common
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
set CUDNN_ROOT_DIR=%CUDA_PATH%
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%

set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
@@ -1,9 +1,9 @@
if "%CUDA_VERSION%" == "9" set CUDA_SUFFIX=cuda92
if "%CUDA_VERSION%" == "10" set CUDA_SUFFIX=cuda101
if "%CUDA_VERSION%" == "11" set CUDA_SUFFIX=cuda110
rem remove the dot in CUDA_VERSION, for example 11.1 to 111
set VERSION_SUFFIX=%CUDA_VERSION:.=%
set CUDA_SUFFIX=cuda%VERSION_SUFFIX%

if "%CUDA_SUFFIX%" == "" (
echo unknown CUDA version, please set `CUDA_VERSION` to 9, 10 or 11.
echo unknown CUDA version, please set `CUDA_VERSION` to a version higher than 9.2
exit /b 1
)

30 changes: 5 additions & 25 deletions .jenkins/pytorch/win-test-helpers/setup_pytorch_env.bat
@@ -46,33 +46,13 @@ if %errorlevel% neq 0 ( exit /b %errorlevel% )

set DISTUTILS_USE_SDK=1

if "%CUDA_VERSION%" == "9" goto cuda_build_9
if "%CUDA_VERSION%" == "10" goto cuda_build_10
if "%CUDA_VERSION%" == "11" goto cuda_build_11
goto cuda_build_end
if not "%USE_CUDA%"=="1" goto cuda_build_end

:cuda_build_9
set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2
set CUDA_PATH_V9_2=%CUDA_PATH%

goto cuda_build_common

:cuda_build_10

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
set CUDA_PATH_V10_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_11

set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1
set CUDA_PATH_V11_1=%CUDA_PATH%

goto cuda_build_common

:cuda_build_common
rem transform the version string, for example 10.1 to 10_1.
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%

set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
3 changes: 2 additions & 1 deletion BUILD.bazel
@@ -339,7 +339,8 @@ filegroup(
"aten/src/ATen/cuda/CUDABlas.cpp",
"aten/src/ATen/cuda/CUDASolver.cpp",
"aten/src/ATen/cuda/CUDAContext.cpp",
"aten/src/ATen/cuda/CUDAGenerator.cpp",
"aten/src/ATen/cuda/CUDAGeneratorImpl.cpp",
"aten/src/ATen/cuda/CUDAGraph.cpp",
"aten/src/ATen/cuda/CuSparseHandlePool.cpp",
"aten/src/ATen/cuda/CublasHandlePool.cpp",
"aten/src/ATen/cuda/CusolverDnHandlePool.cpp",
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -674,7 +674,7 @@ ccache -M 25Gi
```

To check this is working, do two clean builds of pytorch in a row. The second
build should be substantially and noticeably faster than the first build.
build should be substantially and noticeably faster than the first build. If this doesn't seem to be the case, check that each of the symlinks above actually links to your installation of `ccache`. For example, if you followed the first option and installed `ccache` from source on a Linux machine, running `readlink -e $(which g++)` should return `~/ccache/bin/ccache`.


#### Use a faster linker
6 changes: 3 additions & 3 deletions android/pytorch_android/src/main/cpp/pytorch_jni_jit.cpp
@@ -90,13 +90,13 @@ class PytorchJni : public facebook::jni::HybridClass<PytorchJni> {
#endif

#ifdef TRACE_ENABLED
static bool onFunctionEnter(
static std::unique_ptr<at::ObserverContext> onFunctionEnter(
const at::RecordFunction& fn) {
Trace::beginSection(fn.name().str());
return true;
return nullptr;
}

static void onFunctionExit(const at::RecordFunction&) {
static void onFunctionExit(const at::RecordFunction&, at::ObserverContext*) {
Trace::endSection();
}
#endif
5 changes: 3 additions & 2 deletions aten/src/ATen/CUDAGeneratorImpl.h
@@ -131,8 +131,8 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t seed() override;
void set_philox_offset_per_thread(uint64_t offset);
uint64_t philox_offset_per_thread();
void graph_prologue(int64_t* offset_extragraph);
uint64_t graph_epilogue();
void capture_prologue(int64_t* offset_extragraph);
uint64_t capture_epilogue();
PhiloxCudaState philox_cuda_state(uint64_t increment);

// Temporarily accommodates call sites that use philox_engine_inputs.
@@ -147,6 +147,7 @@ struct TORCH_CUDA_API CUDAGeneratorImpl : public c10::GeneratorImpl {
uint64_t philox_offset_per_thread_ = 0;
int64_t* offset_extragraph_;
uint32_t offset_intragraph_ = 0;
bool graph_expects_this_gen_ = false;
};

namespace cuda {
3 changes: 1 addition & 2 deletions aten/src/ATen/cpu/vec256/missing_vst1_neon.h
@@ -4,6 +4,5 @@ __extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_f32_x2 (float32_t * __a, float32x4x2_t val)
{
asm ("st1 {%S0.4s - %T0.4s}, [%1]" :: "w" (val), "r"(__a) :);
asm ("st1 {%S1.4s - %T1.4s}, [%2]" : "=m" (*__a) : "w" (val), "r"(__a) : "memory");
}

2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_bfloat16.h
@@ -25,7 +25,7 @@ static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) {
__m256i lo = _mm256_castps_si256(a);
__m256i hi = _mm256_castps_si256(b);
__m256i nan = _mm256_set1_epi32(0x7fc0);
__m256i nan = _mm256_set1_epi32(0xffff);
__m256i mask_lo = _mm256_castps_si256(_mm256_cmp_ps(a, a, _CMP_ORD_Q));
__m256i mask_hi = _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_ORD_Q));
__m256i ones = _mm256_set1_epi32(0x1);
