Update on "[GPU] Calculate strides for metal tensors"
Previously, in order to unblock dogfooding, we used some hacks to calculate the strides for the output tensor. This diff fixes that properly.

Differential Revision: [D25821766](https://our.internmc.facebook.com/intern/diff/D25821766/)

[ghstack-poisoned]
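
The Metal stride computation itself is not shown in this excerpt. As a rough illustration of what "calculate the strides" means, here is a minimal Python sketch that derives row-major (contiguous) strides from a size list — an assumption about the layout for illustration only, not the actual Metal implementation:

from typing import List

def contiguous_strides(sizes: List[int]) -> List[int]:
    # stride[i] = number of elements to skip to advance one step along dim i,
    # i.e. the product of all sizes after i for a contiguous (row-major) tensor.
    strides = [1] * len(sizes)
    for i in range(len(sizes) - 2, -1, -1):
        strides[i] = strides[i + 1] * sizes[i + 1]
    return strides

print(contiguous_strides([2, 3, 4]))  # [12, 4, 1] for a contiguous 2x3x4 tensor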
xta0 committed Jan 11, 2021
1 parent f4faf08 commit 63e740f
Showing 366 changed files with 8,682 additions and 5,210 deletions.
8 changes: 5 additions & 3 deletions .circleci/scripts/windows_cuda_install.sh
@@ -1,11 +1,13 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_major_version=${CUDA_VERSION%.*}

if [[ "$cuda_major_version" == "10" ]]; then
cuda_installer_name="cuda_10.1.243_426.00_win10"
msbuild_project_dir="CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_10.1 cuobjdump_10.1 nvprune_10.1 cupti_10.1 cublas_10.1 cublas_dev_10.1 cudart_10.1 cufft_10.1 cufft_dev_10.1 curand_10.1 curand_dev_10.1 cusolver_10.1 cusolver_dev_10.1 cusparse_10.1 cusparse_dev_10.1 nvgraph_10.1 nvgraph_dev_10.1 npp_10.1 npp_dev_10.1 nvrtc_10.1 nvrtc_dev_10.1 nvml_dev_10.1"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
elif [[ "$cuda_major_version" == "11" ]]; then
cuda_installer_name="cuda_11.1.0_456.43_win10"
msbuild_project_dir="visual_studio_integration/CUDAVisualStudioIntegration/extras/visual_studio_integration/MSBuildExtensions"
cuda_install_packages="nvcc_11.1 cuobjdump_11.1 nvprune_11.1 nvprof_11.1 cupti_11.1 cublas_11.1 cublas_dev_11.1 cudart_11.1 cufft_11.1 cufft_dev_11.1 curand_11.1 curand_dev_11.1 cusolver_11.1 cusolver_dev_11.1 cusparse_11.1 cusparse_dev_11.1 npp_11.1 npp_dev_11.1 nvrtc_11.1 nvrtc_dev_11.1 nvml_dev_11.1"
@@ -14,7 +16,7 @@ else
exit 1
fi

if [[ "$CUDA_VERSION" =~ ^11.* && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
if [[ "$cuda_major_version" == "11" && "${JOB_EXECUTOR}" == "windows-with-nvidia-gpu" ]]; then
cuda_install_packages="${cuda_install_packages} Display.Driver"
fi
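
The refactor above (and the matching cudnn change below) derives the CUDA major version once via the shell parameter expansion ${CUDA_VERSION%.*}, which strips the shortest trailing ".*" suffix. A hedged Python equivalent, assuming CUDA_VERSION is a MAJOR.MINOR string such as "10.1" or "11.1":

def cuda_major_version(cuda_version: str) -> str:
    # Mirrors ${CUDA_VERSION%.*}: drop everything from the last "." onward.
    return cuda_version.rsplit(".", 1)[0]

print(cuda_major_version("10.1"))  # "10"
print(cuda_major_version("11.1"))  # "11"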

6 changes: 4 additions & 2 deletions .circleci/scripts/windows_cudnn_install.sh
@@ -1,9 +1,11 @@
#!/bin/bash
set -eux -o pipefail

if [[ "$CUDA_VERSION" =~ ^10.* ]]; then
cuda_major_version=${CUDA_VERSION%.*}

if [[ "$cuda_major_version" == "10" ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows10-x64-v7.6.4.38"
elif [[ "$CUDA_VERSION" =~ ^11.* ]]; then
elif [[ "$cuda_major_version" == "11" ]]; then
cudnn_installer_name="cudnn-${CUDA_VERSION}-windows-x64-v8.0.5.39"
else
echo "CUDNN for CUDA_VERSION $CUDA_VERSION is not supported yet"
1 change: 1 addition & 0 deletions .gitignore
@@ -10,6 +10,7 @@

.coverage
coverage.xml
.dmypy.json
.gradle
.hypothesis
.mypy_cache
18 changes: 16 additions & 2 deletions CMakeLists.txt
@@ -207,7 +207,7 @@ cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
option(USE_VULKAN "Use Vulkan GPU backend" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference even on fp32 tensors" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation (needs libshaderc)" OFF)
option(USE_VULKAN_WRAPPER "Vulkan - Dynamically load Vulkan functions" ON)
@@ -318,7 +318,7 @@ set(OP_DEPENDENCY "" CACHE STRING
# symbol lookup error: miniconda3/envs/pytorch-py3.7/lib/libmkl_intel_lp64.so: undefined symbol: mkl_blas_dsyrk
# https://software.intel.com/en-us/articles/symbol-lookup-error-when-linking-intel-mkl-with-gcc-on-ubuntu
if(LINUX)
set(CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-as-needed ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed")
endif()

if(MSVC)
@@ -505,6 +505,20 @@ if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build from: Debug Release RelWithDebInfo MinSizeRel Coverage." FORCE)
endif()

# The below means we are cross compiling for arm64 or x86_64 on MacOSX
if(NOT IOS AND CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
set(CROSS_COMPILING_MACOSX TRUE)
# We need to compile a universal protoc to not fail protobuf build
execute_process(COMMAND ./scripts/build_host_protoc.sh --other-flags "-DCMAKE_OSX_ARCHITECTURES=x86_64;arm64"
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
RESULT_VARIABLE BUILD_HOST_PROTOC_RESULT)
if(NOT BUILD_HOST_PROTOC_RESULT EQUAL "0")
message(FATAL_ERROR "Could not compile universal protoc.")
endif()
set(PROTOBUF_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
set(CAFFE2_CUSTOM_PROTOC_EXECUTABLE "${PROJECT_SOURCE_DIR}/build_host_protoc/bin/protoc")
endif()

# ---[ Misc checks to cope with various compiler modes
include(cmake/MiscCheck.cmake)

2 changes: 2 additions & 0 deletions Dockerfile
@@ -59,6 +59,7 @@ RUN /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -y python=${PYTHON_VERS
RUN /opt/conda/bin/pip install torchelastic

FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
LABEL com.nvidia.volumes.needed="nvidia_driver"
RUN --mount=type=cache,id=apt-final,target=/var/cache/apt \
apt-get update && apt-get install -y --no-install-recommends \
Expand All @@ -71,6 +72,7 @@ ENV PATH /opt/conda/bin:$PATH
ENV NVIDIA_VISIBLE_DEVICES all
ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64
ENV PYTORCH_VERSION ${PYTORCH_VERSION}
WORKDIR /workspace

FROM official as dev
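
The new ARG/ENV pair records the PyTorch version inside the official image; the actual value comes from whatever --build-arg PYTORCH_VERSION=... is supplied at build time (not shown in this diff). A minimal sketch of reading it from inside a running container, purely illustrative:

import os

# Set by `ENV PYTORCH_VERSION ${PYTORCH_VERSION}` in the Dockerfile above;
# falls back to a placeholder if the build arg was never provided (assumption).
print(os.environ.get("PYTORCH_VERSION", "<unset>"))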
60 changes: 20 additions & 40 deletions android/pytorch_android/generate_test_torchscripts.py
@@ -20,101 +20,85 @@ def forward(self, input):
return None

@torch.jit.script_method
def eqBool(self, input):
# type: (bool) -> bool
def eqBool(self, input: bool) -> bool:
return input

@torch.jit.script_method
def eqInt(self, input):
# type: (int) -> int
def eqInt(self, input: int) -> int:
return input

@torch.jit.script_method
def eqFloat(self, input):
# type: (float) -> float
def eqFloat(self, input: float) -> float:
return input

@torch.jit.script_method
def eqStr(self, input):
# type: (str) -> str
def eqStr(self, input: str) -> str:
return input

@torch.jit.script_method
def eqTensor(self, input):
# type: (Tensor) -> Tensor
def eqTensor(self, input: Tensor) -> Tensor:
return input

@torch.jit.script_method
def eqDictStrKeyIntValue(self, input):
# type: (Dict[str, int]) -> Dict[str, int]
def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]:
return input

@torch.jit.script_method
def eqDictIntKeyIntValue(self, input):
# type: (Dict[int, int]) -> Dict[int, int]
def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]:
return input

@torch.jit.script_method
def eqDictFloatKeyIntValue(self, input):
# type: (Dict[float, int]) -> Dict[float, int]
def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]:
return input

@torch.jit.script_method
def listIntSumReturnTuple(self, input):
# type: (List[int]) -> Tuple[List[int], int]
def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

@torch.jit.script_method
def listBoolConjunction(self, input):
# type: (List[bool]) -> bool
def listBoolConjunction(self, input: List[bool]) -> bool:
res = True
for x in input:
res = res and x
return res

@torch.jit.script_method
def listBoolDisjunction(self, input):
# type: (List[bool]) -> bool
def listBoolDisjunction(self, input: List[bool]) -> bool:
res = False
for x in input:
res = res or x
return res

@torch.jit.script_method
def tupleIntSumReturnTuple(self, input):
# type: (Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]
def tupleIntSumReturnTuple(self, input: Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

@torch.jit.script_method
def optionalIntIsNone(self, input):
# type: (Optional[int]) -> bool
def optionalIntIsNone(self, input: Optional[int]) -> bool:
return input is None

@torch.jit.script_method
def intEq0None(self, input):
# type: (int) -> Optional[int]
def intEq0None(self, input: int) -> Optional[int]:
if input == 0:
return None
return input

@torch.jit.script_method
def str3Concat(self, input):
# type: (str) -> str
def str3Concat(self, input: str) -> str:
return input + input + input

@torch.jit.script_method
def newEmptyShapeWithItem(self, input):
return torch.tensor([int(input.item())])[0]

@torch.jit.script_method
def testAliasWithOffset(self):
# type: () -> List[Tensor]
def testAliasWithOffset(self) -> List[Tensor]:
x = torch.tensor([100, 200])
a = [x[0], x[1]]
return a
@@ -128,8 +112,7 @@ def testNonContiguous(self):
return x

@torch.jit.script_method
def conv2d(self, x, w, toChannelsLast):
# type: (Tensor, Tensor, bool) -> Tensor
def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.nn.functional.conv2d(x, w)
if (toChannelsLast):
r = r.contiguous(memory_format=torch.channels_last)
@@ -138,18 +121,15 @@ def conv2d(self, x, w, toChannelsLast):
return r

@torch.jit.script_method
def contiguous(self, x):
# type: (Tensor) -> Tensor
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()

@torch.jit.script_method
def contiguousChannelsLast(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast(self, x: Tensor) -> Tensor:
return x.contiguous(memory_format=torch.channels_last)

@torch.jit.script_method
def contiguousChannelsLast3d(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast3d(self, x: Tensor) -> Tensor:
return x.contiguous(memory_format=torch.channels_last_3d)

scriptAndSave(Test(), "test.pt")
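
For context, this file (and the test_asset.jit diff below) migrates TorchScript signatures from mypy-style "# type:" comments to Python 3 inline annotations; both spellings are accepted by the TorchScript compiler. A minimal standalone sketch of the new style, hypothetical and not part of this commit:

import torch
from torch import Tensor
from typing import List, Tuple

class AnnotatedExample(torch.jit.ScriptModule):
    @torch.jit.script_method
    def listIntSum(self, input: List[int]) -> Tuple[List[int], int]:
        # Inline annotations replace the old "# type: (List[int]) -> ..." comment.
        total = 0
        for x in input:
            total += x
        return (input, total)

    @torch.jit.script_method
    def contiguousChannelsLast(self, x: Tensor) -> Tensor:
        return x.contiguous(memory_format=torch.channels_last)

m = AnnotatedExample()
print(m.listIntSum([1, 2, 3]))  # ([1, 2, 3], 6)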
60 changes: 20 additions & 40 deletions android/pytorch_android/test_asset.jit
@@ -1,85 +1,69 @@
def forward(self, input):
return None

def eqBool(self, input):
# type: (bool) -> bool
def eqBool(self, input: bool) -> bool:
return input

def eqInt(self, input):
# type: (int) -> int
def eqInt(self, input: int) -> int:
return input

def eqFloat(self, input):
# type: (float) -> float
def eqFloat(self, input: float) -> float:
return input

def eqStr(self, input):
# type: (str) -> str
def eqStr(self, input: str) -> str:
return input

def eqTensor(self, input):
# type: (Tensor) -> Tensor
def eqTensor(self, input: Tensor) -> Tensor:
return input

def eqDictStrKeyIntValue(self, input):
# type: (Dict[str, int]) -> Dict[str, int]
def eqDictStrKeyIntValue(self, input: Dict[str, int]) -> Dict[str, int]:
return input

def eqDictIntKeyIntValue(self, input):
# type: (Dict[int, int]) -> Dict[int, int]
def eqDictIntKeyIntValue(self, input: Dict[int, int]) -> Dict[int, int]:
return input

def eqDictFloatKeyIntValue(self, input):
# type: (Dict[float, int]) -> Dict[float, int]
def eqDictFloatKeyIntValue(self, input: Dict[float, int]) -> Dict[float, int]:
return input

def listIntSumReturnTuple(self, input):
# type: (List[int]) -> Tuple[List[int], int]
def listIntSumReturnTuple(self, input: List[int]) -> Tuple[List[int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

def listBoolConjunction(self, input):
# type: (List[bool]) -> bool
def listBoolConjunction(self, input: List[bool]) -> bool:
res = True
for x in input:
res = res and x
return res

def listBoolDisjunction(self, input):
# type: (List[bool]) -> bool
def listBoolDisjunction(self, input: List[bool]) -> bool:
res = False
for x in input:
res = res or x
return res

def tupleIntSumReturnTuple(self, input):
# type: (Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]
def tupleIntSumReturnTuple(self, input: Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], int]:
sum = 0
for x in input:
sum += x
return (input, sum)

def optionalIntIsNone(self, input):
# type: (Optional[int]) -> bool
def optionalIntIsNone(self, input: Optional[int]) -> bool:
return input is None

def intEq0None(self, input):
# type: (int) -> Optional[int]
def intEq0None(self, input: int) -> Optional[int]:
if input == 0:
return None
return input

def str3Concat(self, input):
# type: (str) -> str
def str3Concat(self, input: str) -> str:
return input + input + input

def newEmptyShapeWithItem(self, input):
return torch.tensor([int(input.item())])[0]

def testAliasWithOffset(self):
# type: () -> List[Tensor]
def testAliasWithOffset(self) -> List[Tensor]:
x = torch.tensor([100, 200])
a = [x[0], x[1]]
return a
@@ -91,8 +75,7 @@ def testNonContiguous(self):
assert x[1] == 300
return x

def conv2d(self, x, w, toChannelsLast):
# type: (Tensor, Tensor, bool) -> Tensor
def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.conv2d(x, w)
if (toChannelsLast):
# memory_format=torch.channels_last
@@ -101,16 +84,13 @@ def conv2d(self, x, w, toChannelsLast):
r = r.contiguous()
return r

def contiguous(self, x):
# type: (Tensor) -> Tensor
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()

def contiguousChannelsLast(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast(self, x: Tensor) -> Tensor:
# memory_format=torch.channels_last
return x.contiguous(memory_format=2)

def contiguousChannelsLast3d(self, x):
# type: (Tensor) -> Tensor
def contiguousChannelsLast3d(self, x: Tensor) -> Tensor:
# memory_format=torch.channels_last_3d
return x.contiguous(memory_format=3)
