Update on "[GPU] Add an attribute to the torchscript model exported b…
Browse files Browse the repository at this point in the history
…y metal"

As title

Differential Revision: [D24616430](https://our.internmc.facebook.com/intern/diff/D24616430/)

[ghstack-poisoned]
xta0 committed Nov 3, 2020
2 parents d999abe + 1fe273d commit 46292f9
Showing 92 changed files with 1,721 additions and 780 deletions.
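
As context for the title: a minimal Python sketch of what stamping a marker attribute onto an exported TorchScript module can look like. This is an illustration only; the attribute name optimized_for_metal is hypothetical, not the one this diff defines.

import torch

class MyModel(torch.nn.Module):
    # Hypothetical marker; a Metal-aware exporter could stamp a flag like
    # this onto the scripted module so runtimes can detect how it was built.
    optimized_for_metal: bool

    def __init__(self):
        super().__init__()
        self.optimized_for_metal = True

    def forward(self, x):
        return x + 1

scripted = torch.jit.script(MyModel())
print(scripted.optimized_for_metal)  # True; the flag survives torch.jit.save/load
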
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/simple/docker_definitions.py
@@ -43,7 +43,7 @@ def get_workflow_jobs():
parameters = OrderedDict({
"name": quote(f"docker-{image_name}"),
"image_name": quote(image_name),
})
})
if image_name == "pytorch-linux-xenial-py3.6-gcc5.4":
# pushing documentation on tags requires CircleCI to also
# build all the dependencies on tags, including this docker image
18 changes: 1 addition & 17 deletions .circleci/docker/build.sh
@@ -77,9 +77,7 @@ TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/u
# from scratch
case "$image" in
pytorch-linux-xenial-py3.8)
# TODO: This is a hack, get rid of this as soon as you get rid of the travis downloads
TRAVIS_DL_URL_PREFIX="https://s3.amazonaws.com/travis-python-archives/binaries/ubuntu/16.04/x86_64"
TRAVIS_PYTHON_VERSION=3.8
ANACONDA_PYTHON_VERSION=3.8
GCC_VERSION=7
# Do not install PROTOBUF, DB, and VISION as a test
;;
@@ -362,7 +360,6 @@ docker build \
--build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
--build-arg "CLANG_VERSION=${CLANG_VERSION}" \
--build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
--build-arg "TRAVIS_PYTHON_VERSION=${TRAVIS_PYTHON_VERSION}" \
--build-arg "GCC_VERSION=${GCC_VERSION}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
@@ -405,19 +402,6 @@ if [[ "$OS" == "ubuntu" ]]; then
fi
fi

if [ -n "$TRAVIS_PYTHON_VERSION" ]; then
if [[ "$TRAVIS_PYTHON_VERSION" != nightly ]]; then
if !(drun python --version 2>&1 | grep -qF "Python $TRAVIS_PYTHON_VERSION"); then
echo "TRAVIS_PYTHON_VERSION=$TRAVIS_PYTHON_VERSION, but:"
drun python --version
exit 1
fi
else
echo "Please manually check nightly is OK:"
drun python --version
fi
fi

if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
if !(drun python --version 2>&1 | grep -qF "Python $ANACONDA_PYTHON_VERSION"); then
echo "ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION, but:"
5 changes: 0 additions & 5 deletions .circleci/docker/common/install_base.sh
@@ -18,7 +18,6 @@ install_ubuntu() {
# Install common dependencies
apt-get update
# TODO: Some of these may not be necessary
# TODO: libiomp also gets installed by conda, aka there's a conflict
ccache_deps="asciidoc docbook-xml docbook-xsl xsltproc"
numpy_deps="gfortran"
apt-get install -y --no-install-recommends \
@@ -40,10 +39,6 @@ install_ubuntu() {
libjpeg-dev \
libasound2-dev \
libsndfile-dev \
python \
python-dev \
python-setuptools \
python-wheel \
software-properties-common \
sudo \
wget \
79 changes: 0 additions & 79 deletions .circleci/docker/common/install_travis_python.sh

This file was deleted.

6 changes: 0 additions & 6 deletions .circleci/docker/ubuntu-cuda/Dockerfile
@@ -40,12 +40,6 @@ ARG CLANG_VERSION
ADD ./common/install_clang.sh install_clang.sh
RUN bash ./install_clang.sh && rm install_clang.sh

# Install non-standard Python versions (via Travis binaries)
ARG TRAVIS_PYTHON_VERSION
ENV PATH /opt/python/$TRAVIS_PYTHON_VERSION/bin:$PATH
ADD ./common/install_travis_python.sh install_travis_python.sh
RUN bash ./install_travis_python.sh && rm install_travis_python.sh

# (optional) Install protobuf for ONNX
ARG PROTOBUF
ADD ./common/install_protobuf.sh install_protobuf.sh
7 changes: 0 additions & 7 deletions .circleci/docker/ubuntu/Dockerfile
@@ -48,13 +48,6 @@ RUN bash ./install_gcc.sh && rm install_gcc.sh
ADD ./common/install_lcov.sh install_lcov.sh
RUN bash ./install_lcov.sh && rm install_lcov.sh

# Install non-standard Python versions (via Travis binaries)
ARG TRAVIS_PYTHON_VERSION
ARG TRAVIS_DL_URL_PREFIX
ENV PATH /opt/python/$TRAVIS_PYTHON_VERSION/bin:$PATH
ADD ./common/install_travis_python.sh install_travis_python.sh
RUN bash ./install_travis_python.sh && rm install_travis_python.sh

# (optional) Install protobuf for ONNX
ARG PROTOBUF
ADD ./common/install_protobuf.sh install_protobuf.sh
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -69,7 +69,7 @@ jobs:
id: get_pr_tip
- name: Run flake8
run: |
set -eux -o pipefail
set -eux
pip install flake8==3.8.2 flake8-bugbear flake8-comprehensions flake8-executable flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0
flake8 --version
flake8 | tee ${GITHUB_WORKSPACE}/flake8-output.txt
3 changes: 0 additions & 3 deletions .jenkins/caffe2/test.sh
@@ -168,9 +168,6 @@ if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
if [[ "$BUILD_ENVIRONMENT" == *py3* ]]; then
# default pip version is too old (9.0.2), unable to support tag `manylinux2010`.
# Fix the pip error: Couldn't find a version that satisfies the requirement
pip install --upgrade pip
pip install -q --user onnxruntime==1.5.2
fi
"$ROOT_DIR/scripts/onnx/test.sh"
2 changes: 2 additions & 0 deletions aten/src/ATen/CMakeLists.txt
@@ -443,6 +443,8 @@ endif()

list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/tensor_add.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/quantize_per_channel.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/stateful_conv1d.cpp)

43 changes: 0 additions & 43 deletions aten/src/ATen/LegacyTHFunctionsCPU.cpp
@@ -780,50 +780,7 @@ Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) {
}
return result;
}
Tensor _th_trace(const Tensor & self) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);

switch (dispatch_scalar_type) {
case ScalarType::Byte: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<uint8_t>(THByteTensor_trace(self_)), options(ScalarType::Byte));
break;
}
case ScalarType::Char: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int8_t>(THCharTensor_trace(self_)), options(ScalarType::Char));
break;
}
case ScalarType::Double: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<double>(THDoubleTensor_trace(self_)), options(ScalarType::Double));
break;
}
case ScalarType::Float: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<float>(THFloatTensor_trace(self_)), options(ScalarType::Float));
break;
}
case ScalarType::Int: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int>(THIntTensor_trace(self_)), options(ScalarType::Int));
break;
}
case ScalarType::Long: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int64_t>(THLongTensor_trace(self_)), options(ScalarType::Long));
break;
}
case ScalarType::Short: {
auto self_ = checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
return at::scalar_tensor(convert<int16_t>(THShortTensor_trace(self_)), options(ScalarType::Short));
break;
}
default:
AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type);
}
}
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);
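
With the per-dtype TH branches above deleted, torch.trace is served by the native kernel instead (assuming the native port landed alongside this cleanup). A quick sanity check from Python:

import torch

x = torch.arange(9, dtype=torch.float32).reshape(3, 3)
print(torch.trace(x))  # tensor(12.) = 0 + 4 + 8, the sum of the diagonal
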
1 change: 0 additions & 1 deletion aten/src/ATen/LegacyTHFunctionsCPU.h
@@ -36,7 +36,6 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_trace(const Tensor & self);
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
85 changes: 85 additions & 0 deletions aten/src/ATen/benchmarks/quantize_per_channel.cpp
@@ -0,0 +1,85 @@
#include <ATen/ATen.h>
#include <iostream>

#include <benchmark/benchmark.h>

static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
const size_t batches = static_cast<size_t>(state.range(0));
const size_t channels = static_cast<size_t>(state.range(1));
const size_t height = static_cast<size_t>(state.range(2));
const size_t width = static_cast<size_t>(state.range(3));

at::Tensor a = at::rand({batches, channels, height, width});
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 1, at::ScalarType::QUInt8);
}
}

static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
const size_t batches = static_cast<size_t>(state.range(0));
const size_t channels = static_cast<size_t>(state.range(1));
const size_t height = static_cast<size_t>(state.range(2));
const size_t width = static_cast<size_t>(state.range(3));

at::Tensor a = at::rand(
{batches, channels, height, width},
at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 1, at::ScalarType::QUInt8);
}
}

static void quantize_per_channel_2d(benchmark::State& state) {
const size_t channels = static_cast<size_t>(state.range(0));
const size_t nelem = static_cast<size_t>(state.range(1));

at::Tensor a = at::rand({channels, nelem});
at::Tensor scales = at::rand({channels});
at::Tensor zero_points = at::randint(
0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

at::Tensor qa;
for (auto _ : state) {
qa = at::native::quantize_per_channel_cpu(
a, scales, zero_points, 0, at::ScalarType::QUInt8);
}
}

static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
b->ArgNames({"N", "C", "H", "W"});

for (size_t n = 16; n < 256; n *= 2) {
for (size_t c = 4; c < 256; c *= 2) {
for (size_t hw = 4; hw < 256; hw *= 2) {
b->Args({n, c, hw, hw});
}
}
}
}

static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
b->ArgNames({"C", "N"});

for (size_t c = 4; c < 512; c *= 2) {
for (size_t n = 4; n < 512; n *= 2) {
b->Args({c, n});
}
}
}

BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();
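
The kernel exercised above backs torch.quantize_per_channel; a short Python sketch of the same per-channel quantization the 4d benchmarks measure, with arbitrarily chosen shapes:

import torch

x = torch.rand(16, 4, 8, 8)              # N, C, H, W, contiguous
scales = torch.rand(4)
zero_points = torch.randint(0, 10, (4,))
# axis=1 quantizes along the channel dimension, matching the benchmarks
qx = torch.quantize_per_channel(x, scales, zero_points, axis=1, dtype=torch.quint8)
print(qx.dtype, qx.q_per_channel_scales().shape)  # torch.quint8 torch.Size([4])
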
2 changes: 1 addition & 1 deletion aten/src/ATen/core/Dimname.h
@@ -21,7 +21,7 @@ struct CAFFE2_API Dimname {
bool isWildcard() const { return type_ == NameType::WILDCARD; }

bool matches(Dimname other) const;
optional<Dimname> unify(Dimname other) const;
c10::optional<Dimname> unify(Dimname other) const;

private:
Dimname(Symbol name)
6 changes: 3 additions & 3 deletions aten/src/ATen/core/NamedTensor.h
@@ -99,7 +99,7 @@ void check_names_valid_for(const Tensor& tensor, DimnameList names);
void check_names_valid_for(size_t tensor_dim, DimnameList names);

// Sets the names of `tensor` to be `names`.
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, c10::optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, std::vector<Dimname>&& names, bool validate_names);

constexpr size_t kMaxNamedTensorDim = 64;
@@ -110,7 +110,7 @@ namespace impl {

// Some helper functions on TensorImpl. Useful for working with names in TH.
// XXX: Ideally these would exist as methods on TensorImpl
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, c10::optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);

void check_names_valid_for(TensorImpl* impl, DimnameList names);
@@ -131,7 +131,7 @@ CAFFE2_API DimnameList get_names(const TensorImpl* impl);
// Returns the names of the tensor if they have been allocated; returns nullopt
// instead if they haven't been. The names of a tensor are not allocated if a
// tensor is constructed with names=None.
CAFFE2_API optional<DimnameList> get_opt_names(const TensorImpl* impl);
CAFFE2_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl);


} // namespace impl
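
The get_opt_names contract above is visible from Python: a tensor built with names=None never allocates names. A sketch of both cases (named tensors were a prototype feature at the time):

import torch

named = torch.zeros(2, 3, names=("N", "C"))
unnamed = torch.zeros(2, 3)   # names=None, so no names are allocated
print(named.names)            # ('N', 'C')
print(unnamed.names)          # (None, None)
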
2 changes: 1 addition & 1 deletion aten/src/ATen/cpu/vec256/vec256_bfloat16.h
@@ -465,7 +465,7 @@ Vec256<BFloat16> inline Vec256<BFloat16>::operator==(const Vec256<BFloat16>& oth
}
Vec256<BFloat16> inline Vec256<BFloat16>::operator!=(const Vec256<BFloat16>& other) const {
return bfloat16_binary_op_as_fp32(*this, other, [](__m256 x, __m256 y) {
return _mm256_cmp_ps(x, y, _CMP_NEQ_UQ);
return _mm256_cmp_ps(x, y, _CMP_NEQ_OQ);
});
}

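
The one-character change above swaps the AVX compare predicate from unordered to ordered: _CMP_NEQ_UQ reports true whenever either lane is NaN, while _CMP_NEQ_OQ reports false in that case. A scalar Python model of the two predicates:

import math

def cmp_neq_uq(x, y):
    # unordered not-equal: true if the operands differ or either is NaN
    return math.isnan(x) or math.isnan(y) or x != y

def cmp_neq_oq(x, y):
    # ordered not-equal: true only when both are ordinary numbers and differ
    return not (math.isnan(x) or math.isnan(y)) and x != y

nan = float("nan")
print(cmp_neq_uq(nan, 1.0), cmp_neq_oq(nan, 1.0))  # True False
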
