Update on "[GPU] Calculate strides for metal tensors"
Previously, in order to unblock dogfooding, we used some hacks to calculate the strides for the output tensor. This diff fixes that properly.

Differential Revision: [D25821766](https://our.internmc.facebook.com/intern/diff/D25821766/)

[ghstack-poisoned]
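
The Metal stride computation itself is not shown in this excerpt. As a rough illustration of what "calculate the strides" means, here is a minimal Python sketch that derives row-major (contiguous) strides from a size list — an assumption about the layout for illustration only, not the actual Metal implementation:

from typing import List

def contiguous_strides(sizes: List[int]) -> List[int]:
    # stride[i] = number of elements to skip to advance one step along dim i,
    # i.e. the product of all sizes after i for a contiguous (row-major) tensor.
    strides = [1] * len(sizes)
    for i in range(len(sizes) - 2, -1, -1):
        strides[i] = strides[i + 1] * sizes[i + 1]
    return strides

print(contiguous_strides([2, 3, 4]))  # [12, 4, 1] for a contiguous 2x3x4 tensor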
xta0 committed Jan 11, 2021
1 parent f4faf08 commit 63e740f
Showing 366 changed files with 8,682 additions and 5,210 deletions.
8 changes: 5 additions & 3 deletions .circleci/scripts/windows_cuda_install.sh
@@ -1,11 +1,13 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_major_version=${CUDA_VERSION%.*}

if [[ "$cuda_major_version" == "10" ]]; then
cuda_installer_name="cuda_10.1.243_426.00_win10"
msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
elif [[ "$cuda_major_version" == "11" ]]; then
cuda_installer_name="cuda_11.1.0_456.43_win10"
msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
@@ -14,7 +16,7 @@ else
exit 1
fi

if [[ "$CUDA_VERSION" =~ ^11.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
if [[ "$cuda_major_version" == "11" && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
cuda_install_packages="${cuda_install_packages} Display.Driver"
fi
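
The refactor above (and the matching cudnn change below) derives the CUDA major version once via the shell parameter expansion ${CUDA_VERSION%.*}, which strips the shortest trailing ".*" suffix. A hedged Python equivalent, assuming CUDA_VERSION is a MAJOR.MINOR string such as "10.1" or "11.1":

def cuda_major_version(cuda_version: str) -> str:
    # Mirrors ${CUDA_VERSION%.*}: drop everything from the last "." onward.
    return cuda_version.rsplit(".", 1)[0]

print(cuda_major_version("10.1"))  # "10"
print(cuda_major_version("11.1"))  # "11"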

6 changes: 4 additions & 2 deletions .circleci/scripts/windows_cudnn_install.sh
@@ -1,9 +1,11 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_major_version=${CUDA_VERSION%.*}

if [[ "$cuda_major_version" == "10" ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
elif [[ "$cuda_major_version" == "11" ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39"
else
echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet"
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@

.coverage
coverage.xml
.dmypy.json
.gradle
.hypothesis
.mypy_cache
18 changes: 16 additions & 2 deletions CMakeLists.txt
@@ -207,7 +207,7 @@ cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
option(USE_VULKAN "Use Vulkan GPU backend" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)
@@ -318,7 +318,7 @@ set(OP_DEPENDENCY "" CACHE STRING
# symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk
# https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu
if(LINUX)
set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed")
endif()

if(MSVC)
@@ -505,6 +505,20 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
endif()

# The below means we are cross compiling for arm64 or x86_64 on MacOSX
if(NOT IOS AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
set(CROSS_COMPILING_MACOSX TRUE)
# We need to compile a universal protoc to not fail protobuf build
execute_process(COMMAND ./scripts/build_host_protoc.sh --other-flags "-DCMAKE_OSX_ARCHITECTURES=x86_64;arm64"
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE BUILD_HOST_PROTOC_RESULT)
if(NOT BUILD_HOST_PROTOC_RESULT EQUAL "0")
message(FATAL_ERROR "Could not compile universal protoc.")
endif()
set(PROTOBUF_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
set(CAFFE2_CUSTOM_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
endif()

# ---[ Misc checks to cope with various compiler modes
include(cmake/MiscCheck.cmake)

2 changes: 2 additions & 0 deletions Dockerfile
@@ -59,6 +59,7 @@ RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERS
RUN /opt/conda/bin/pip install torchelastic

FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN --mount=type=cache,id=apt-final,target=/var/cache/apt \
apt-get update && apt-get install -y --no-install-recommends \
Expand All @@ -71,6 +72,7 @@ ENV PATH /opt/conda/bin:$PATH
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV PYTORCH_VERSION ${PYTORCH_VERSION}
WORKDIR /workspace

FROM official as dev
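
The new ARG/ENV pair records the PyTorch version inside the official image; the actual value comes from whatever --build-arg PYTORCH_VERSION=... is supplied at build time (not shown in this diff). A minimal sketch of reading it from inside a running container, purely illustrative:

import os

# Set by `ENV PYTORCH_VERSION ${PYTORCH_VERSION}` in the Dockerfile above;
# falls back to a placeholder if the build arg was never provided (assumption).
print(os.environ.get("PYTORCH_VERSION", "<unset>"))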
60 changes: 20 additions & 40 deletions android/pytorch_android/generate_test_torchscripts.py
@@ -20,101 +20,85 @@ def forward(self, input):
return None

@torch.jit.script_method
def eqBool(self, input):
# type: (bool) -> bool
def eqBool(self, input: bool) -> bool:
return input

@torch.jit.script_method
def eqInt(self, input):
# type: (int) -> int
def eqInt(self, input: int) -> int:
return input

@torch.jit.script_method
def eqFloat(self, input):
# type: (float) -> float
def eqFloat(self, input: float) -> float:
return input

@torch.jit.script_method
def eqStr(self, input):
# type: (str) -> str
def eqStr(self, input: str) -> str:
return input

@torch.jit.script_method
def eqTensor(self, input):
# type: (Tensor) -> Tensor
def eqTensor(self, input: Tensor) -> Tensor:
return input

@torch.jit.script_method
def eqDictStrKeyIntValue(self, input):
# type: (Dict[str, int]) -> Dict[str, int]
def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]:
return input

@torch.jit.script_method
def eqDictIntKeyIntValue(self, input):
# type: (Dict[int, int]) -> Dict[int, int]
def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]:
return input

@torch.jit.script_method
def eqDictFloatKeyIntValue(self, input):
# type: (Dict[float, int]) -> Dict[float, int]
def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]:
return input

@torch.jit.script_method
def listIntSumReturnTuple(self, input):
# type: (List[int]) -> Tuple[List[int], int]
def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

@torch.jit.script_method
def listBoolConjunction(self, input):
# type: (List[bool]) -> bool
def listBoolConjunction(self, input: List[bool]) -> bool:
res = True
for x in input:
res = res and x
return res

@torch.jit.script_method
def listBoolDisjunction(self, input):
# type: (List[bool]) -> bool
def listBoolDisjunction(self, input: List[bool]) -> bool:
res = False
for x in input:
res = res or x
return res

@torch.jit.script_method
def tupleIntSumReturnTuple(self, input):
# type: (Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]
def tupleIntSumReturnTuple(self, input: Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

@torch.jit.script_method
def optionalIntIsNone(self, input):
# type: (Optional[int]) -> bool
def optionalIntIsNone(self, input: Optional[int]) -> bool:
return input is None

@torch.jit.script_method
def intEq0None(self, input):
# type: (int) -> Optional[int]
def intEq0None(self, input: int) -> Optional[int]:
if input == 0:
return None
return input

@torch.jit.script_method
def str3Concat(self, input):
# type: (str) -> str
def str3Concat(self, input: str) -> str:
return input + input + input

@torch.jit.script_method
def newEmptyShapeWithItem(self, input):
return torch.tensor([int(input.item())])[0]

@torch.jit.script_method
def testAliasWithOffset(self):
# type: () -> List[Tensor]
def testAliasWithOffset(self) -> List[Tensor]:
x = torch.tensor([100, 200])
a = [x[0], x[1]]
return a
@@ -128,8 +112,7 @@ def testNonContiguous(self):
return x

@torch.jit.script_method
def conv2d(self, x, w, toChannelsLast):
# type: (Tensor, Tensor, bool) -> Tensor
def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.nn.functional.conv2d(x, w)
if (toChannelsLast):
r = r.contiguous(memory_format=torch.channels_last)
@@ -138,18 +121,15 @@ def conv2d(self, x, w, toChannelsLast):
return r

@torch.jit.script_method
def contiguous(self, x):
# type: (Tensor) -> Tensor
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()

@torch.jit.script_method
def contiguousChannelsLast(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast(self, x: Tensor) -> Tensor:
return x.contiguous(memory_format=torch.channels_last)

@torch.jit.script_method
def contiguousChannelsLast3d(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast3d(self, x: Tensor) -> Tensor:
return x.contiguous(memory_format=torch.channels_last_3d)

scriptAndSave(Test(), "test.pt")
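
For context, this file (and the test_asset.jit diff below) migrates TorchScript signatures from mypy-style "# type:" comments to Python 3 inline annotations; both spellings are accepted by the TorchScript compiler. A minimal standalone sketch of the new style, hypothetical and not part of this commit:

import torch
from torch import Tensor
from typing import List, Tuple

class AnnotatedExample(torch.jit.ScriptModule):
    @torch.jit.script_method
    def listIntSum(self, input: List[int]) -> Tuple[List[int], int]:
        # Inline annotations replace the old "# type: (List[int]) -> ..." comment.
        total = 0
        for x in input:
            total += x
        return (input, total)

    @torch.jit.script_method
    def contiguousChannelsLast(self, x: Tensor) -> Tensor:
        return x.contiguous(memory_format=torch.channels_last)

m = AnnotatedExample()
print(m.listIntSum([1, 2, 3]))  # ([1, 2, 3], 6)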
60 changes: 20 additions & 40 deletions android/pytorch_android/test_asset.jit
@@ -1,85 +1,69 @@
def forward(self, input):
return None

def eqBool(self, input):
# type: (bool) -> bool
def eqBool(self, input: bool) -> bool:
return input

def eqInt(self, input):
# type: (int) -> int
def eqInt(self, input: int) -> int:
return input

def eqFloat(self, input):
# type: (float) -> float
def eqFloat(self, input: float) -> float:
return input

def eqStr(self, input):
# type: (str) -> str
def eqStr(self, input: str) -> str:
return input

def eqTensor(self, input):
# type: (Tensor) -> Tensor
def eqTensor(self, input: Tensor) -> Tensor:
return input

def eqDictStrKeyIntValue(self, input):
# type: (Dict[str, int]) -> Dict[str, int]
def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]:
return input

def eqDictIntKeyIntValue(self, input):
# type: (Dict[int, int]) -> Dict[int, int]
def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]:
return input

def eqDictFloatKeyIntValue(self, input):
# type: (Dict[float, int]) -> Dict[float, int]
def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]:
return input

def listIntSumReturnTuple(self, input):
# type: (List[int]) -> Tuple[List[int], int]
def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

def listBoolConjunction(self, input):
# type: (List[bool]) -> bool
def listBoolConjunction(self, input: List[bool]) -> bool:
res = True
for x in input:
res = res and x
return res

def listBoolDisjunction(self, input):
# type: (List[bool]) -> bool
def listBoolDisjunction(self, input: List[bool]) -> bool:
res = False
for x in input:
res = res or x
return res

def tupleIntSumReturnTuple(self, input):
# type: (Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]
def tupleIntSumReturnTuple(self, input: Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

def optionalIntIsNone(self, input):
# type: (Optional[int]) -> bool
def optionalIntIsNone(self, input: Optional[int]) -> bool:
return input is None

def intEq0None(self, input):
# type: (int) -> Optional[int]
def intEq0None(self, input: int) -> Optional[int]:
if input == 0:
return None
return input

def str3Concat(self, input):
# type: (str) -> str
def str3Concat(self, input: str) -> str:
return input + input + input

def newEmptyShapeWithItem(self, input):
return torch.tensor([int(input.item())])[0]

def testAliasWithOffset(self):
# type: () -> List[Tensor]
def testAliasWithOffset(self) -> List[Tensor]:
x = torch.tensor([100, 200])
a = [x[0], x[1]]
return a
@@ -91,8 +75,7 @@ def testNonContiguous(self):
assert x[1] == 300
return x

def conv2d(self, x, w, toChannelsLast):
# type: (Tensor, Tensor, bool) -> Tensor
def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.conv2d(x, w)
if (toChannelsLast):
# memory_format=torch.channels_last
@@ -101,16 +84,13 @@ def conv2d(self, x, w, toChannelsLast):
r = r.contiguous()
return r

def contiguous(self, x):
# type: (Tensor) -> Tensor
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()

def contiguousChannelsLast(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast(self, x: Tensor) -> Tensor:
# memory_format=torch.channels_last
return x.contiguous(memory_format=2)

def contiguousChannelsLast3d(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast3d(self, x: Tensor) -> Tensor:
# memory_format=torch.channels_last_3d
return x.contiguous(memory_format=3)
