Skip to content

Commit

Permalink
Update on "[quant][graphmode][fx] Add support for qat convbn{relu}1d"
Browse files Browse the repository at this point in the history
Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

Differential Revision: [D24696524](https://our.internmc.facebook.com/intern/diff/D24696524)

[ghstack-poisoned]
  • Loading branch information
jerryzh168 committed Nov 3, 2020
2 parents 602acd6 + 782f92b commit 2abea89
Show file tree
Hide file tree
Showing 87 changed files with 2,263 additions and 1,160 deletions.
1 change: 0 additions & 1 deletion .circleci/docker/centos-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
ENV PATH /opt/rocm/opencl/bin:$PATH
ENV PATH /opt/rocm/llvm/bin:$PATH
ENV HIP_PLATFORM hcc
ENV LANG en_US.utf8
ENV LC_ALL en_US.utf8

Expand Down
1 change: 0 additions & 1 deletion .circleci/docker/ubuntu-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
ENV PATH /opt/rocm/opencl/bin:$PATH
ENV PATH /opt/rocm/llvm/bin:$PATH
ENV HIP_PLATFORM hcc
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8

Expand Down
2 changes: 2 additions & 0 deletions .jenkins/caffe2/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ if [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
fi
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
unset HIP_PLATFORM
if which sccache > /dev/null; then
# Save sccache logs to file
sccache --stop-server || true
Expand Down
2 changes: 2 additions & 0 deletions .jenkins/pytorch/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ SCRIPT_DIR="$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )"

# Figure out which Python to use for ROCm
if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]] && [[ "${BUILD_ENVIRONMENT}" =~ py((2|3)\.?[0-9]?\.?[0-9]?) ]]; then
# HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
unset HIP_PLATFORM
PYTHON=$(which "python${BASH_REMATCH[1]}")
# non-interactive bashs do not expand aliases by default
shopt -s expand_aliases
Expand Down
28 changes: 3 additions & 25 deletions .jenkins/pytorch/macos-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,8 @@ git submodule update --init --recursive
export CMAKE_PREFIX_PATH=${WORKSPACE_DIR}/miniconda3/

# Build PyTorch
if [[ "${BUILD_ENVIRONMENT}" == *cuda9.2* ]]; then
export CUDA_VERSION=9.2
export TORCH_CUDA_ARCH_LIST=5.2
export PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/bin${PATH:+:${PATH}}
export DYLD_LIBRARY_PATH=/Developer/NVIDIA/CUDA-${CUDA_VERSION}/lib${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}
export CUDA_HOME=/Developer/NVIDIA/CUDA-${CUDA_VERSION}
export USE_CUDA=1

if [ -z "${IN_CI}" ]; then
# Eigen gives "explicit specialization of class must precede its first use" error
# when compiling with Xcode 9.1 toolchain, so we have to use Xcode 8.2 toolchain instead.
export DEVELOPER_DIR=/Library/Developer/CommandLineTools
fi
else
if [ -z "${IN_CI}" ]; then
export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
fi
if [ -z "${IN_CI}" ]; then
export DEVELOPER_DIR=/Applications/Xcode9.app/Contents/Developer
fi

if which sccache > /dev/null; then
Expand All @@ -34,17 +19,10 @@ if which sccache > /dev/null; then
printf "#!/bin/sh\nexec sccache $(which clang) \$*" > "${WORKSPACE_DIR}/clang"
chmod a+x "${WORKSPACE_DIR}/clang"

if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
printf "#!/bin/sh\nexec sccache $(which nvcc) \$*" > "${WORKSPACE_DIR}/nvcc"
chmod a+x "${WORKSPACE_DIR}/nvcc"
export CUDA_NVCC_EXECUTABLE="${WORKSPACE_DIR}/nvcc"
fi

export PATH="${WORKSPACE_DIR}:$PATH"
fi

# If we run too many parallel jobs, we will OOM
MAX_JOBS=2 USE_DISTRIBUTED=1 python setup.py install
USE_DISTRIBUTED=1 python setup.py install

assert_git_not_dirty

Expand Down
22 changes: 12 additions & 10 deletions .jenkins/pytorch/win-test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash -ex

#!/bin/bash
set -ex
# shellcheck disable=SC2034
COMPACT_JOB_NAME=pytorch-win-ws2019-cuda10-cudnn7-py3-test

Expand Down Expand Up @@ -42,28 +42,30 @@ fi

run_tests() {
if [ -z "${JOB_BASE_NAME}" ] || [[ "${JOB_BASE_NAME}" == *-test ]]; then
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat && \
$SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
$SCRIPT_HELPERS_DIR/test_custom_backend.bat
$SCRIPT_HELPERS_DIR/test_libtorch.bat
else
if [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
export PYTORCH_COLLECT_COVERAGE=1
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_python_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_libtorch.bat
if [[ "${USE_CUDA}" == "1" ]]; then
$SCRIPT_HELPERS_DIR/test_python_jit_legacy.bat "$DETERMINE_FROM"
fi
elif [[ "${JOB_BASE_NAME}" == *-test2 ]]; then
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM" && \
$SCRIPT_HELPERS_DIR/test_custom_backend.bat && \
$SCRIPT_HELPERS_DIR/test_python_all_except_nn.bat "$DETERMINE_FROM"
$SCRIPT_HELPERS_DIR/test_custom_backend.bat
$SCRIPT_HELPERS_DIR/test_custom_script_ops.bat
fi
fi
}

run_tests && assert_git_not_dirty && echo "TEST PASSED"
run_tests
assert_git_not_dirty
echo "TEST PASSED"

if [[ "${BUILD_ENVIRONMENT}" == "pytorch-win-vs2019-cuda10-cudnn7-py3" ]] && [[ "${JOB_BASE_NAME}" == *-test1 ]]; then
pushd $TEST_DIR
Expand Down
2 changes: 2 additions & 0 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,8 @@ endif()

list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/tensor_add.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/quantize_per_channel.cpp)
list(APPEND ATen_MOBILE_BENCHMARK_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/benchmarks/stateful_conv1d.cpp)

Expand Down
43 changes: 0 additions & 43 deletions aten/src/ATen/LegacyTHFunctionsCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -780,50 +780,7 @@ Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max) {
}
return result;
}
// Legacy TH-backed CPU implementation of trace.
//
// Infers the scalar type of `self`, calls the TH*Tensor_trace routine that
// matches it, converts the value to the corresponding C++ type and returns it
// wrapped by at::scalar_tensor. Scalar types without a case below are
// reported through AT_ERROR.
Tensor _th_trace(const Tensor & self) {
    // DeviceGuard omitted
    const auto dispatch_scalar_type = infer_scalar_type(self);

    // The unwrap is shared by every supported case but is only invoked inside
    // a case, so an unsupported scalar type goes straight to AT_ERROR.
    const auto unwrap_self = [&] {
        return checked_dense_tensor_unwrap(self, "self", 1, "_th_trace", false, DeviceType::CPU, dispatch_scalar_type);
    };

    switch (dispatch_scalar_type) {
        case ScalarType::Byte: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<uint8_t>(THByteTensor_trace(self_)), options(ScalarType::Byte));
        }
        case ScalarType::Char: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<int8_t>(THCharTensor_trace(self_)), options(ScalarType::Char));
        }
        case ScalarType::Double: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<double>(THDoubleTensor_trace(self_)), options(ScalarType::Double));
        }
        case ScalarType::Float: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<float>(THFloatTensor_trace(self_)), options(ScalarType::Float));
        }
        case ScalarType::Int: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<int>(THIntTensor_trace(self_)), options(ScalarType::Int));
        }
        case ScalarType::Long: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<int64_t>(THLongTensor_trace(self_)), options(ScalarType::Long));
        }
        case ScalarType::Short: {
            auto self_ = unwrap_self();
            return at::scalar_tensor(convert<int16_t>(THShortTensor_trace(self_)), options(ScalarType::Short));
        }
        default:
            AT_ERROR("_th_trace not supported on CPUType for ", dispatch_scalar_type);
    }
}
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A) {
// DeviceGuard omitted
auto dispatch_scalar_type = infer_scalar_type(self);
Expand Down
1 change: 0 additions & 1 deletion aten/src/ATen/LegacyTHFunctionsCPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ Tensor _th_renorm(const Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_renorm_(Tensor & self, Scalar p, int64_t dim, Scalar maxnorm);
Tensor & _th_histc_out(Tensor & result, const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_histc(const Tensor & self, int64_t bins, Scalar min, Scalar max);
Tensor _th_trace(const Tensor & self);
std::tuple<Tensor &,Tensor &> _th_gels_out(Tensor & res1, Tensor & res2, const Tensor & self, const Tensor & A);
std::tuple<Tensor,Tensor> _th_gels(const Tensor & self, const Tensor & A);
std::tuple<Tensor &,Tensor &> _th_eig_out(Tensor & res1, Tensor & res2, const Tensor & self, bool eigenvectors);
Expand Down
85 changes: 85 additions & 0 deletions aten/src/ATen/benchmarks/quantize_per_channel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include <ATen/ATen.h>
#include <cstdint>
#include <iostream>

#include <benchmark/benchmark.h>

// Benchmarks at::native::quantize_per_channel_cpu on a 4-D float tensor
// allocated with at::rand's default options, quantizing to QUInt8 with the
// per-channel axis set to 1.
//
// Benchmark arguments (state.range): 0 = batches, 1 = channels, 2 = height,
// 3 = width.
static void quantize_per_channel_4d_contiguous(benchmark::State& state) {
  // Sizes stay int64_t end to end: casting them to size_t and then listing
  // them in braces for the tensor shape would be a narrowing conversion.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand({batches, channels, height, width});
  // One scale and one integer zero point per channel.
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  // Declared outside the timed loop; each iteration overwrites it.
  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1, at::ScalarType::QUInt8);
  }
}

// Benchmarks at::native::quantize_per_channel_cpu on a 4-D float tensor that
// is requested with MemoryFormat::ChannelsLast, quantizing to QUInt8 with the
// per-channel axis set to 1.
//
// Benchmark arguments (state.range): 0 = batches, 1 = channels, 2 = height,
// 3 = width.
static void quantize_per_channel_4d_channels_last(benchmark::State& state) {
  // Sizes stay int64_t end to end: casting them to size_t and then listing
  // them in braces for the tensor shape would be a narrowing conversion.
  const int64_t batches = state.range(0);
  const int64_t channels = state.range(1);
  const int64_t height = state.range(2);
  const int64_t width = state.range(3);

  at::Tensor a = at::rand(
      {batches, channels, height, width},
      at::TensorOptions().memory_format(at::MemoryFormat::ChannelsLast));
  // One scale and one integer zero point per channel.
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  // Declared outside the timed loop; each iteration overwrites it.
  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 1, at::ScalarType::QUInt8);
  }
}

// Benchmarks at::native::quantize_per_channel_cpu on a 2-D float tensor of
// shape {channels, nelem}, quantizing to QUInt8 with the per-channel axis set
// to 0.
//
// Benchmark arguments (state.range): 0 = channels, 1 = elements per channel.
static void quantize_per_channel_2d(benchmark::State& state) {
  // Sizes stay int64_t end to end: casting them to size_t and then listing
  // them in braces for the tensor shape would be a narrowing conversion.
  const int64_t channels = state.range(0);
  const int64_t nelem = state.range(1);

  at::Tensor a = at::rand({channels, nelem});
  // One scale and one integer zero point per channel.
  at::Tensor scales = at::rand({channels});
  at::Tensor zero_points = at::randint(
      0, 10, {channels}, at::TensorOptions().dtype(at::ScalarType::Int));

  // Declared outside the timed loop; each iteration overwrites it.
  at::Tensor qa;
  for (auto _ : state) {
    qa = at::native::quantize_per_channel_cpu(
        a, scales, zero_points, 0, at::ScalarType::QUInt8);
  }
}

// Registers the argument sets for the 4-D benchmarks.
//
// Arguments are named N, C, H, W. N doubles from 16 while below 256; C and
// the shared H/W extent each double from 4 while below 256, and every
// combination is registered with H == W.
static void GenerateSizes4d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "C", "H", "W"});

  // int64_t counters match the element type expected by Args(), so the
  // braced list below involves no narrowing conversion.
  for (int64_t n = 16; n < 256; n *= 2) {
    for (int64_t c = 4; c < 256; c *= 2) {
      for (int64_t hw = 4; hw < 256; hw *= 2) {
        b->Args({n, c, hw, hw});
      }
    }
  }
}

// Registers the argument sets for the 2-D benchmark.
//
// Arguments are named C and N. Each doubles from 4 while below 512, and every
// (C, N) combination is registered.
static void GenerateSizes2d(benchmark::internal::Benchmark* b) {
  b->ArgNames({"C", "N"});

  // int64_t counters match the element type expected by Args(), so the
  // braced list below involves no narrowing conversion.
  for (int64_t c = 4; c < 512; c *= 2) {
    for (int64_t n = 4; n < 512; n *= 2) {
      b->Args({c, n});
    }
  }
}

BENCHMARK(quantize_per_channel_2d)->Apply(GenerateSizes2d);
BENCHMARK(quantize_per_channel_4d_contiguous)->Apply(GenerateSizes4d);
BENCHMARK(quantize_per_channel_4d_channels_last)->Apply(GenerateSizes4d);
BENCHMARK_MAIN();
2 changes: 1 addition & 1 deletion aten/src/ATen/core/Dimname.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct CAFFE2_API Dimname {
bool isWildcard() const { return type_ == NameType::WILDCARD; }

bool matches(Dimname other) const;
optional<Dimname> unify(Dimname other) const;
c10::optional<Dimname> unify(Dimname other) const;

private:
Dimname(Symbol name)
Expand Down
6 changes: 3 additions & 3 deletions aten/src/ATen/core/NamedTensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ void check_names_valid_for(const Tensor& tensor, DimnameList names);
void check_names_valid_for(size_t tensor_dim, DimnameList names);

// Sets the names of `tensor` to be `names`.
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, c10::optional<DimnameList> names);
CAFFE2_API Tensor& internal_set_names_inplace(Tensor& tensor, std::vector<Dimname>&& names, bool validate_names);

constexpr size_t kMaxNamedTensorDim = 64;
Expand All @@ -110,7 +110,7 @@ namespace impl {

// Some helper functions on TensorImpl. Useful for working with names in TH.
// XXX: Ideally these would exist as methods on TensorImpl
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, c10::optional<DimnameList> names, bool validate_names);
CAFFE2_API void internal_set_names_inplace(TensorImpl* impl, std::vector<Dimname>&& names, bool validate_names);

void check_names_valid_for(TensorImpl* impl, DimnameList names);
Expand All @@ -131,7 +131,7 @@ CAFFE2_API DimnameList get_names(const TensorImpl* impl);
// Returns the names of the tensor if they have been allocated; returns nullopt
// instead if they haven't been. The names of a tensor are not allocated if a
// tensor is constructed with names=None.
CAFFE2_API optional<DimnameList> get_opt_names(const TensorImpl* impl);
CAFFE2_API c10::optional<DimnameList> get_opt_names(const TensorImpl* impl);


} // namespace impl
Expand Down
26 changes: 26 additions & 0 deletions aten/src/ATen/native/TensorShape.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <algorithm>
#include <vector>
#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/ExpandUtils.h>
#include <ATen/InferSize.h>
#include <ATen/NativeFunctions.h>
Expand Down Expand Up @@ -1984,4 +1985,29 @@ Tensor movedim(const Tensor& self, int64_t src, int64_t dst) {
return at::movedim(self, IntArrayRef{src}, IntArrayRef{dst});
}

// CPU implementation of trace: sums the elements on the main diagonal of a
// 2-D tensor.
//
// The result is a 0-dim tensor created with self.options(). The sum is
// accumulated in at::acc_type<scalar_t, false> and then stored as scalar_t.
// A TORCH_CHECK rejects any input whose dim() is not 2.
Tensor trace_cpu(const Tensor& self) {
  Tensor result = at::empty({}, self.options());
  AT_DISPATCH_ALL_TYPES(self.scalar_type(), "trace", [&] {
    using accscalar_t = at::acc_type<scalar_t, false>;
    const auto* elements = self.data_ptr<scalar_t>();

    TORCH_CHECK(self.dim() == 2, "trace: expected a matrix, but got tensor with dim ", self.dim());

    // One step along the diagonal advances both dimensions together, so the
    // element offset grows by the sum of the two strides.
    const int64_t diag_step = self.stride(0) + self.stride(1);
    const int64_t diag_len = std::min(self.size(0), self.size(1));

    accscalar_t total = 0;
    for (int64_t k = 0; k < diag_len; k++) {
      total += elements[k * diag_step];
    }

    *result.data_ptr<scalar_t>() = total;
  });

  return result;
}

}} // at::native
3 changes: 2 additions & 1 deletion aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/metal/MetalPrepackOpContext.h>
#include <torch/script.h>
#include <ATen/ATen.h>


#if defined(C10_IOS)
#import <ATen/native/metal/mpscnn/MPSCNNOps.h>
Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/native/native_functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5650,7 +5650,7 @@
use_c10_dispatcher: full
variants: method, function
dispatch:
CPU: legacy::cpu::_th_trace
CPU: trace_cpu
CUDA: trace_cuda

- func: trace_backward(Tensor grad, int[] sizes) -> Tensor
Expand Down

0 comments on commit 2abea89

Please sign in to comment.