Update on "[GPU] Add an attribute to the torchscript model exported b…
Browse files Browse the repository at this point in the history
…y metal"

As title

Differential Revision: [D24616430](https://our.internmc.facebook.com/intern/diff/D24616430/)

[ghstack-poisoned]
xta0 committed Nov 3, 2020
2 parents d999abe + 1fe273d commit 46292f9
Showing 92 changed files with 1,721 additions and 780 deletions.
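
As context for the title: a minimal Python sketch of what stamping a marker attribute onto an exported TorchScript module can look like. This is an illustration only; the attribute name optimized_for_metal is hypothetical, not the one this diff defines.

import torch

class MyModel(torch.nn.Module):
    # Hypothetical marker; a Metal-aware exporter could stamp a flag like
    # this onto the scripted module so runtimes can detect how it was built.
    optimized_for_metal: bool

    def __init__(self):
        super().__init__()
        self.optimized_for_metal = True

    def forward(self, x):
        return x + 1

scripted = torch.jit.script(MyModel())
print(scripted.optimized_for_metal)  # True; the flag survives torch.jit.save/load
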
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/simple/docker_definitions.py
@@ -43,7 +43,7 @@ def get_workflow_jobs():
parameters = OrderedDict({
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
})
})
if image_name == "pytorch-linux-xenial-py3.6-gcc5.4":
# pushing documentation on tags requires CircleCI to also
# build all the dependencies on tags, including this docker image
18 changes: 1 addition & 17 deletions .circleci/docker/build.sh
@@ -77,9 +77,7 @@ TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/u
# from scratch
case "$image" in
pytorch-linux-xenial-py3.8)
# TODO: This is a hack, get rid of this as soon as you get rid of the travis downloads
TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/16.04/x86_64"
TRAVIS_PYTHON_VERSION=3.8
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=7
# Do not install PROTOBUF, DB, and VISION as a test
;;
@@ -362,7 +360,6 @@ docker build \
--build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
--build-arg "CLANG_VERSION=${CLANG_VERSION}" \
--build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
--build-arg "TRAVIS_PYTHON_VERSION=${TRAVIS_PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
@@ -405,19 +402,6 @@ if [[ "$OS" == "ubuntu" ]]; then
fi
fi

if [ -n "$TRAVIS_PYTHON_VERSION" ]; then
if [[ "$TRAVIS_PYTHON_VERSION" != nightly ]]; then
if !(drun python --version 2>&1 | grep -qF "Python $TRAVIS_PYTHON_VERSION"); then
echo "TRAVIS_PYTHON_VERSION=$TRAVIS_PYTHON_VERSION, but:"
drun python --version
exit 1
fi
else
echo "Please manually check nightly is OK:"
drun python --version
fi
fi

if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
if !(drun python --version 2>&1 | grep -qF "Python $ANACONDA_PYTHON_VERSION"); then
echo "ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION, but:"
5 changes: 0 additions & 5 deletions .circleci/docker/common/install_base.sh
@@ -18,7 +18,6 @@ install_ubuntu() {
# Install common dependencies
apt-get update
# TODO: Some of these may not be necessary
# TODO: libiomp also gets installed by conda, aka there's a conflict
ccache_deps="asciidoc docbook-xml docbook-xsl xsltproc"
numpy_deps="gfortran"
apt-get install -y --no-install-recommends \
@@ -40,10 +39,6 @@ install_ubuntu() {
libjpeg-dev \
libasound2-dev \
libsndfile-dev \
python \
python-dev \
python-setuptools \
python-wheel \
software-properties-common \
sudo \
wget \
79 changes: 0 additions & 79 deletions .circleci/docker/common/install_travis_python.sh

This file was deleted.

6 changes: 0 additions & 6 deletions .circleci/docker/ubuntu-cuda/Dockerfile
@@ -40,12 +40,6 @@ ARG CLANG_VERSION
ADD ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh

# Install non-standard Python versions (via Travis binaries)
ARG TRAVIS_PYTHON_VERSION
ENV PATH /opt/python/$TRAVIS_PYTHON_VERSION/bin:$PATH
ADD ./common/install_travis_python.sh install_travis_python.sh
RUN bash ./install_travis_python.sh && rm install_travis_python.sh

# (optional) Install protobuf for ONNX
ARG PROTOBUF
ADD ./common/install_protobuf.sh install_protobuf.sh
7 changes: 0 additions & 7 deletions .circleci/docker/ubuntu/Dockerfile
@@ -48,13 +48,6 @@ RUN bash ./install_gcc.sh && rm install_gcc.sh
ADD ./common/install_lcov.sh install_lcov.sh
RUN bash ./install_lcov.sh && rm install_lcov.sh

# Install non-standard Python versions (via Travis binaries)
ARG TRAVIS_PYTHON_VERSION
ARG TRAVIS_DL_URL_PREFIX
ENV PATH /opt/python/$TRAVIS_PYTHON_VERSION/bin:$PATH
ADD ./common/install_travis_python.sh install_travis_python.sh
RUN bash ./install_travis_python.sh && rm install_travis_python.sh

# (optional) Install protobuf for ONNX
ARG PROTOBUF
ADD ./common/install_protobuf.sh install_protobuf.sh
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -69,7 +69,7 @@ jobs:
id: get_pr_tip
- name: Run flake8
run: |
set -eux -o pipefail
set -eux
pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
flake8 --version
flake8 | tee ${GITHUB_WORKSPACE}/flake8-output.txt
3 changes: 0 additions & 3 deletions .jenkins/caffe2/test.sh
@@ -168,9 +168,6 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
# default pip version is too old (9.0.2), unable to support tag `manylinux2010`.
# Fix the pip error: Couldn't find a version that satisfies the requirement
pip install --upgrade pip
pip install -q --user onnxruntime==1.5.2
fi
"$ROOT_DIR/scripts/onnx/test.sh"
2 changes: 2 additions & 0 deletions aten/src/ATen/CMakeLists.txt
@@ -443,6 +443,8 @@ endif()

list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/tensor_add.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/quantize_per_channel.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/stateful_conv1d.cpp)

43 changes: 0 additions & 43 deletions aten/src/ATen/LegacyTHFunctionsCPU.cpp
@@ -780,50 +780,7 @@ Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) {
}
return result;
}
Tensor _th_trace(const Tensor & self) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Byte: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<uint8_t>(THByteTensor_trace(self_)), options(ScalarType::Byte));
break;
}
case ScalarType::Char: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int8_t>(THCharTensor_trace(self_)), options(ScalarType::Char));
break;
}
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<double>(THDoubleTensor_trace(self_)), options(ScalarType::Double));
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<float>(THFloatTensor_trace(self_)), options(ScalarType::Float));
break;
}
case ScalarType::Int: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int>(THIntTensor_trace(self_)), options(ScalarType::Int));
break;
}
case ScalarType::Long: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int64_t>(THLongTensor_trace(self_)), options(ScalarType::Long));
break;
}
case ScalarType::Short: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int16_t>(THShortTensor_trace(self_)), options(ScalarType::Short));
break;
}
default:
AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type);
}
}
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);
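
With the per-dtype TH branches above deleted, torch.trace is served by the native kernel instead (assuming the native port landed alongside this cleanup). A quick sanity check from Python:

import torch

x = torch.arange(9, dtype=torch.float32).reshape(3, 3)
print(torch.trace(x))  # tensor(12.) = 0 + 4 + 8, the sum of the diagonal
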
1 change: 0 additions & 1 deletion aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -36,7 +36,6 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_trace(const Tensor & self);
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
85 changes: 85 additions & 0 deletions aten/src/ATen/benchmarks/quantize_per_channel.cpp
@@ -0,0 +1,85 @@
#include <ATen/ATen.h>
#include <iostream>

#include <benchmark/benchmark.h>

static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
const size_t batches = static_cast<size_t>(state.range(0));
const size_t channels = static_cast<size_t>(state.range(1));
const size_t height = static_cast<size_t>(state.range(2));
const size_t width = static_cast<size_t>(state.range(3));

at::Tensor a = at::rand({batches, channels, height, width});
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 1, at::ScalarType::QUInt8);
}
}

static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
const size_t batches = static_cast<size_t>(state.range(0));
const size_t channels = static_cast<size_t>(state.range(1));
const size_t height = static_cast<size_t>(state.range(2));
const size_t width = static_cast<size_t>(state.range(3));

at::Tensor a = at::rand(
{batches, channels, height, width},
at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 1, at::ScalarType::QUInt8);
}
}

static void quantize_per_channel_2d(benchmark::State& state) {
const size_t channels = static_cast<size_t>(state.range(0));
const size_t nelem = static_cast<size_t>(state.range(1));

at::Tensor a = at::rand({channels, nelem});
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 0, at::ScalarType::QUInt8);
}
}

static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
b->ArgNames({"N", "C", "H", "W"});

for (size_t n = 16; n < 256; n *= 2) {
for (size_t c = 4; c < 256; c *= 2) {
for (size_t hw = 4; hw < 256; hw *= 2) {
b->Args({n, c, hw, hw});
}
}
}
}

static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
b->ArgNames({"C", "N"});

for (size_t c = 4; c < 512; c *= 2) {
for (size_t n = 4; n < 512; n *= 2) {
b->Args({c, n});
}
}
}

BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();
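
The kernel exercised above backs torch.quantize_per_channel; a short Python sketch of the same per-channel quantization the 4d benchmarks measure, with arbitrarily chosen shapes:

import torch

x = torch.rand(16, 4, 8, 8)              # N, C, H, W, contiguous
scales = torch.rand(4)
zero_points = torch.randint(0, 10, (4,))
# axis=1 quantizes along the channel dimension, matching the benchmarks
qx = torch.quantize_per_channel(x, scales, zero_points, axis=1, dtype=torch.quint8)
print(qx.dtype, qx.q_per_channel_scales().shape)  # torch.quint8 torch.Size([4])
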
2 changes: 1 addition & 1 deletion aten/src/ATen/core/Dimname.h
@@ -21,7 +21,7 @@ struct CAFFE2_API Dimname {
bool isWildcard() const { return type_ == NameType::WILDCARD; }

bool matches(Dimname other) const;
optional<Dimname> unify(Dimname other) const;
c10::optional<Dimname> unify(Dimname other) const;

private:
Dimname(Symbol name)
6 changes: 3 additions & 3 deletions aten/src/ATen/core/NamedTensor.h
@@ -99,7 +99,7 @@ void check_names_valid_for(const Tensor& tensor, DimnameList names);
void check_names_valid_for(size_t tensor_dim, DimnameList names);

// Sets the names of `tensor` to be `names`.
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, c10::optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, std::vector<Dimname>&& names, bool validate_names);

constexpr size_t kMaxNamedTensorDim = 64;
@@ -110,7 +110,7 @@ namespace impl {

// Some helper functions on TensorImpl. Useful for working with names in TH.
// XXX: Ideally these would exist as methods on TensorImpl
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, c10::optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);

void check_names_valid_for(TensorImpl* impl, DimnameList names);
@@ -131,7 +131,7 @@ CAFFE2_API DimnameList get_names(const TensorImpl* impl);
// Returns the names of the tensor if they have been allocated; returns nullopt
// instead if they haven't been. The names of a tensor are not allocated if a
// tensor is constructed with names=None.
CAFFE2_API optional<DimnameList> get_opt_names(const TensorImpl* impl);
CAFFE2_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl);


} // namespace impl
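
The get_opt_names contract above is visible from Python: a tensor built with names=None never allocates names. A sketch of both cases (named tensors were a prototype feature at the time):

import torch

named = torch.zeros(2, 3, names=("N", "C"))
unnamed = torch.zeros(2, 3)   # names=None, so no names are allocated
print(named.names)            # ('N', 'C')
print(unnamed.names)          # (None, None)
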
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_bfloat16.h
@@ -465,7 +465,7 @@ Vec256<BFloat16> inline Vec256<BFloat16>::operator==(const Vec256<BFloat16>& oth
}
Vec256<BFloat16> inline Vec256<BFloat16>::operator!=(const Vec256<BFloat16>& other) const {
return bfloat16_binary_op_as_fp32(*this, other, [](__m256 x, __m256 y) {
return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ);
return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ);
});
}

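
The one-character change above swaps the AVX compare predicate from unordered to ordered: _CMP_NEQ_UQ reports true whenever either lane is NaN, while _CMP_NEQ_OQ reports false in that case. A scalar Python model of the two predicates:

import math

def cmp_neq_uq(x, y):
    # unordered not-equal: true if the operands differ or either is NaN
    return math.isnan(x) or math.isnan(y) or x != y

def cmp_neq_oq(x, y):
    # ordered not-equal: true only when both are ordinary numbers and differ
    return not (math.isnan(x) or math.isnan(y)) and x != y

nan = float("nan")
print(cmp_neq_uq(nan, 1.0), cmp_neq_oq(nan, 1.0))  # True False
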
