Commit cb03e29

Update base for Update on "Revert "Skip test_memory_format_nn_BatchNorm2d in inductor (#125970)""


This reverts commit 0a9c6e9.


Enable the test since it's fixed.

[ghstack-poisoned]
shunting314 committed May 20, 2024
2 parents e02f02a + 53f73cd commit cb03e29
Showing 266 changed files with 9,974 additions and 7,165 deletions.
15 changes: 15 additions & 0 deletions .ci/docker/build.sh
@@ -149,6 +149,21 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=8
2 changes: 1 addition & 1 deletion .ci/docker/common/install_acl.sh
@@ -1,6 +1,6 @@
set -euo pipefail

readonly version=v23.08
readonly version=v24.04
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

1 change: 1 addition & 0 deletions .github/workflows/docker-builds.yml
@@ -42,6 +42,7 @@ jobs:
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
pytorch-linux-focal-py3.8-clang10,
pytorch-linux-focal-py3.11-clang10,
21 changes: 21 additions & 0 deletions .github/workflows/inductor.yml
@@ -107,6 +107,27 @@ jobs:
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

  linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
    name: cuda12.1-py3.12-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
    name: cuda12.1-py3.12-gcc9-sm86
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build
    with:
      build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
      docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}

linux-jammy-cpu-py3_8-gcc11-inductor-build:
name: linux-jammy-cpu-py3.8-gcc11-inductor
uses: ./.github/workflows/_linux-build.yml
3 changes: 2 additions & 1 deletion .github/workflows/trunk.yml
@@ -194,6 +194,7 @@ jobs:
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_8-test:
@@ -209,4 +210,4 @@ jobs:
build-environment: linux-focal-rocm6.1-py3.8
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
43 changes: 43 additions & 0 deletions .github/workflows/upload_test_stats_intermediate.yml
@@ -0,0 +1,43 @@
name: Upload test stats intermediate

on:
  workflow_dispatch:
    inputs:
      workflow_id:
        description: workflow_id of the run
        required: true
      workflow_run_attempt:
        description: workflow_run_attempt of the run
        required: true

jobs:
  intermediate_upload_test_stats:
    name: Intermediate upload test stats for ${{ inputs.workflow_id }}
    runs-on: ubuntu-22.04
    environment: upload-stats
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          fetch-depth: 1
          submodules: false

      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          cache: pip

      - run: |
          pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12

      - name: Upload test stats
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          WORKFLOW_RUN_ID: ${{ inputs.workflow_id }}
          WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }}
        run: |
          python3 -m tools.stats.upload_test_stats_intermediate \
            --workflow-run-id "${WORKFLOW_RUN_ID}" \
            --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}"
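
Since the new workflow is `workflow_dispatch`-only, it has to be triggered manually or from a script. A minimal sketch of a scripted trigger via the GitHub REST API, assuming a token with workflow-write permission in `GITHUB_TOKEN`; the run id below is a placeholder:

```python
import os

import requests

# Dispatch the workflow above; the "inputs" keys mirror the two
# workflow_dispatch inputs it declares.
resp = requests.post(
    "https://api.github.com/repos/pytorch/pytorch/actions/workflows/"
    "upload_test_stats_intermediate.yml/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={
        "ref": "main",
        "inputs": {"workflow_id": "1234567890", "workflow_run_attempt": "1"},
    },
    timeout=30,
)
resp.raise_for_status()  # GitHub answers 204 No Content on success
```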
16 changes: 4 additions & 12 deletions .gitmodules
@@ -2,10 +2,6 @@
ignore = dirty
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/cub"]
ignore = dirty
path = third_party/cub
url = https://github.com/NVlabs/cub.git
[submodule "third_party/eigen"]
ignore = dirty
path = third_party/eigen
@@ -50,10 +46,6 @@
ignore = dirty
path = third_party/psimd
url = https://github.com/Maratyszcza/psimd.git
[submodule "third_party/zstd"]
ignore = dirty
path = third_party/zstd
url = https://github.com/facebook/zstd.git
[submodule "third_party/cpuinfo"]
ignore = dirty
path = third_party/cpuinfo
@@ -66,10 +58,6 @@
ignore = dirty
path = third_party/onnx
url = https://github.com/onnx/onnx.git
[submodule "third_party/onnx-tensorrt"]
ignore = dirty
path = third_party/onnx-tensorrt
url = https://github.com/onnx/onnx-tensorrt
[submodule "third_party/sleef"]
ignore = dirty
path = third_party/sleef
@@ -152,3 +140,7 @@
[submodule "third_party/opentelemetry-cpp"]
path = third_party/opentelemetry-cpp
url = https://github.com/open-telemetry/opentelemetry-cpp.git
[submodule "third_party/cpp-httplib"]
path = third_party/cpp-httplib
url = https://github.com/yhirose/cpp-httplib.git
branch = v0.15.3
2 changes: 0 additions & 2 deletions .lintrunner.toml
@@ -1929,8 +1929,6 @@ exclude_patterns = [
'torch/utils/_mode_utils.py',
'torch/utils/_python_dispatch.py',
'torch/utils/_stats.py',
'torch/utils/_sympy/__init__.py',
'torch/utils/_sympy/functions.py',
'torch/utils/_traceback.py',
'torch/utils/_zip.py',
'torch/utils/backcompat/__init__.py',
2 changes: 1 addition & 1 deletion BUILD.bazel
@@ -772,7 +772,7 @@ cc_library(
[
"torch/*.h",
"torch/csrc/**/*.h",
"torch/csrc/distributed/c10d/*.hpp",
"torch/csrc/distributed/c10d/**/*.hpp",
"torch/lib/libshm/*.h",
],
exclude = [
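
The BUILD.bazel change above widens a glob from a single directory level to a recursive match, so headers in subdirectories of `torch/csrc/distributed/c10d` are now picked up. The same distinction in miniature, as a pathlib sketch (the results depend on whatever your checkout contains):

```python
from pathlib import Path

root = Path("torch/csrc/distributed/c10d")
flat = sorted(root.glob("*.hpp"))     # only files directly under c10d/
deep = sorted(root.glob("**/*.hpp"))  # c10d/ plus every subdirectory
# The BUILD.bazel edit is exactly this flat -> deep switch.
```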
9 changes: 3 additions & 6 deletions CMakeLists.txt
@@ -265,7 +265,6 @@ option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
option(USE_SYSTEM_EIGEN_INSTALL
"Use system Eigen instead of the one under third_party" OFF)
option(USE_TENSORRT "Using Nvidia TensorRT library" OFF)
cmake_dependent_option(
USE_VALGRIND "Use Valgrind. Only available on Linux." ON
"LINUX" OFF)
@@ -279,11 +278,13 @@ endif()
option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
cmake_dependent_option(
USE_LITE_AOTI "Include AOTI sources" OFF
"BUILD_LITE_INTERPRETER" OFF)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
# option USE_XNNPACK: try to enable xnnpack by default.
option(USE_XNNPACK "Use XNNPACK" ON)
option(USE_ZSTD "Use ZSTD" OFF)
option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF)
# Ensure that an ITT build is the default for x86 CPUs
cmake_dependent_option(
@@ -413,7 +414,6 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF)
option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF)
option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF)
option(USE_SYSTEM_ZSTD "Use system-provided zstd." OFF)
option(USE_GOLD_LINKER "Use ld.gold to link" OFF)
if(USE_SYSTEM_LIBS)
set(USE_SYSTEM_CPUINFO ON)
@@ -435,9 +435,6 @@ if(USE_SYSTEM_LIBS)
if(USE_TBB)
set(USE_SYSTEM_TBB ON)
endif()
if(USE_ZSTD)
set(USE_SYSTEM_ZSTD ON)
endif()
endif()

# Used when building Caffe2 through setup.py
5 changes: 0 additions & 5 deletions WORKSPACE
@@ -355,9 +355,4 @@ local_repository(
path = "third_party/onnx/third_party/benchmark",
)

local_repository(
name = "unused_onnx_tensorrt_benchmark",
path = "third_party/onnx-tensorrt/third_party/onnx/third_party/benchmark",
)

### Unused repos end
8 changes: 4 additions & 4 deletions aten/src/ATen/core/ivalue.cpp
@@ -887,12 +887,12 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::create(
}

IValue IValue::deepcopy(std::optional<at::Device> device) const {
IValue::HashAliasedIValueMap memo;
IValue::HashIdentityIValueMap memo;
return deepcopy(memo, device);
}

IValue IValue::deepcopy(
IValue::HashAliasedIValueMap& memo,
IValue::HashIdentityIValueMap& memo,
std::optional<at::Device> device) const {
if (memo.count(*this)) {
return memo.at(*this);
@@ -1028,12 +1028,12 @@ c10::intrusive_ptr<ivalue::Object> ivalue::Object::copy_to_weak_compilation_ref(

c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
std::optional<at::Device> device) const {
IValue::HashAliasedIValueMap memo;
IValue::HashIdentityIValueMap memo;
return deepcopy(memo, device);
}

c10::intrusive_ptr<ivalue::Object> ivalue::Object::deepcopy(
IValue::HashAliasedIValueMap& memo,
IValue::HashIdentityIValueMap& memo,
std::optional<at::Device> device) const {
auto cu = type_.cu_;
auto object = ivalue::Object::create(WeakOrStrongTypePtr(type_.cu_, type_.type_), type()->numAttributes());
19 changes: 18 additions & 1 deletion aten/src/ATen/core/ivalue.h
@@ -1117,6 +1117,23 @@ struct TORCH_API IValue final {
using HashAliasedIValueMap =
std::unordered_map<IValue, IValue, HashAliasedIValue, CompAliasedIValues>;

  struct HashIdentityIValue {
    size_t operator()(const IValue& val) const {
      return val.payload.u.as_int;
    }
  };

  struct CompIdentityIValues {
    bool operator()(const IValue& lhs, const IValue& rhs) const {
      return lhs.is(rhs);
    }
  };

  using HashIdentityIValues =
      std::unordered_set<IValue, HashIdentityIValue, CompIdentityIValues>;
  using HashIdentityIValueMap =
      std::unordered_map<IValue, IValue, HashIdentityIValue, CompIdentityIValues>;

// Checks if this and rhs have subvalues in common.
// [t1, t2] and [t2, t3] returns true.
bool overlaps(const IValue& rhs) const;
@@ -1130,7 +1147,7 @@
void visit(const std::function<bool(const IValue&)>& visitor) const;
IValue deepcopy(std::optional<at::Device> device = c10::nullopt) const;
IValue deepcopy(
HashAliasedIValueMap& memo,
HashIdentityIValueMap& memo,
std::optional<at::Device> device = c10::nullopt) const;

private:
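
The deepcopy memo table now collapses two entries only when `IValue::is()` says they are the same object, not merely when they alias the same memory. Python's `copy.deepcopy`, whose memo dict is keyed on `id(obj)`, has the same identity contract; a small sketch of that semantics (plain Python, not the C++ API):

```python
import copy

a = [1, 2]
b = a[:]             # equal contents, but a distinct object
outer = [a, a, b]

deep = copy.deepcopy(outer)
assert deep[0] is deep[1]      # the same object is copied exactly once
assert deep[0] is not deep[2]  # distinct objects each get their own copy
```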
2 changes: 1 addition & 1 deletion aten/src/ATen/core/ivalue_inl.h
@@ -1589,7 +1589,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target {
std::optional<at::Device> device = c10::nullopt) const;

c10::intrusive_ptr<Object> deepcopy(
IValue::HashAliasedIValueMap& memo,
IValue::HashIdentityIValueMap& memo,
std::optional<at::Device> device = c10::nullopt) const;

bool is_weak_compilation_ref() const {
9 changes: 6 additions & 3 deletions aten/src/ATen/cuda/CUDABlas.cpp
@@ -1422,10 +1422,13 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200)
// Amax support in ROCm as of 6.2
if (isFloat8Type(result_dtype)) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
}
#endif
#ifndef USE_ROCM
if (isFloat8Type(result_dtype)) {
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr);
}
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode);
#endif
CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't');
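
Net effect of the reshuffled guards in scaled_gemm: the amax pointer is attached for float8 results on all CUDA builds, and on ROCm builds only from ROCm 6.2 (encoded as 60200) on, while fast-accumulation mode stays CUDA-only. A hypothetical helper restating that gating as a truth table:

```python
def sets_amax(is_float8_result: bool, use_rocm: bool, rocm_version: int = 0) -> bool:
    """Mirror of the preprocessor gating around CUBLASLT_MATMUL_DESC_AMAX_D_POINTER."""
    if not is_float8_result:
        return False
    return (not use_rocm) or rocm_version >= 60200

assert sets_amax(True, use_rocm=False)
assert not sets_amax(True, use_rocm=True, rocm_version=60100)
assert sets_amax(True, use_rocm=True, rocm_version=60200)
```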
