-
Notifications
You must be signed in to change notification settings - Fork 21.3k
/
qtanh.cpp
91 lines (78 loc) · 2.94 KB
/
qtanh.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <torch/library.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <ATen/quantized/Quantizer.h>
#include <ATen/native/quantized/cpu/quantized_ops.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <algorithm>
#include <limits>
#include <memory>
namespace at {
namespace native {
// Dispatch stub used by tanh_quantized_cpu below, which invokes it with the
// input's device type, the input tensor and an output tensor.
// NOTE(review): the kernel(s) registered for this stub live outside this file.
DEFINE_DISPATCH(qtanh_stub);
#ifdef USE_PYTORCH_QNNPACK
// Computes quantized tanh on a quint8 tensor through the QNNPACK TanH operator.
//
// The result is ALWAYS quantized with scale=2.0/256, zp=128, dtype=quint8,
// independent of the input's own quantization parameters.
//
// input: quantized tensor with at least one dimension and dtype quint8.
//        Dimension 0 is handed to QNNPACK as the batch size; the product of
//        the remaining dimensions is handed over as the channel count and as
//        the input/output stride.
// Returns a newly allocated quantized tensor with the same sizes as `input`.
// Fails via TORCH_CHECK for a 0-dim or non-quint8 input, and via
// TORCH_INTERNAL_ASSERT when a QNNPACK create/setup/run call does not report
// success.
Tensor qnnpack_tanh(Tensor input) {
  TORCH_CHECK(input.ndimension() > 0, "qnnpack_tanh(): Got empty input tensor");
  // The buffers below are passed to QNNPACK as raw uint8_t, so reject any
  // other dtype before an operator is created.
  TORCH_CHECK(
      input.scalar_type() == kQUInt8,
      "qnnpack_tanh(): Expected input data type ",
      toString(kQUInt8),
      " but got ",
      toString(input.scalar_type()));

  constexpr float output_scale = 2.0f / 256.0f;
  constexpr int32_t output_zero_point = 128;

  initQNNPACK();

  Tensor input_contig = input.contiguous(input.suggest_memory_format());

  // Everything except dimension 0 is flattened into the "channels" count.
  size_t num_elems = 1;
  for (int64_t dim = 1; dim < input_contig.ndimension(); ++dim) {
    num_elems *= input_contig.size(dim);
  }

  const auto zero_point = input_contig.q_zero_point();
  const auto scale = input_contig.q_scale();

  pytorch_qnnp_operator_t tanh_op{nullptr};
  const pytorch_qnnp_status createStatus = pytorch_qnnp_create_tanh_nc_q8(
      num_elems /* channels */,
      zero_point /* input zero point */,
      scale /* input scale */,
      output_zero_point /* output zero point */,
      output_scale /* output scale */,
      std::numeric_limits<uint8_t>::min() /* output min */,
      std::numeric_limits<uint8_t>::max() /* output max */,
      0 /* flags */,
      &tanh_op);

  // Take ownership immediately so the operator is released on every exit
  // path, including the assertion failures below. Previously it was never
  // deleted. A null handle (failed create) is simply not deleted.
  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      tanh_op_guard(tanh_op);

  TORCH_INTERNAL_ASSERT(createStatus == pytorch_qnnp_status_success,
                        "failed to create QNNPACK TanH operator");

  Tensor qy = at::_empty_affine_quantized(
      input_contig.sizes(),
      at::device(kCPU).dtype(input_contig.dtype()),
      output_scale,
      output_zero_point,
      input_contig.suggest_memory_format());

  const pytorch_qnnp_status setupStatus = pytorch_qnnp_setup_tanh_nc_q8(
      tanh_op,
      input_contig.size(0) /* batch size */,
      reinterpret_cast<uint8_t*>(
          input_contig.data_ptr<c10::quint8>()) /* input data */,
      num_elems /* input stride */,
      reinterpret_cast<uint8_t*>(
          qy.data_ptr<c10::quint8>()) /* output data */,
      num_elems /* output stride */);
  TORCH_INTERNAL_ASSERT(setupStatus == pytorch_qnnp_status_success,
                        "failed to setup QNNPACK TanH operator");

  pthreadpool_t threadpool = caffe2::pthreadpool_();

  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(tanh_op, threadpool);
  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK TanH operator");

  return qy;
}
#endif // USE_PYTORCH_QNNPACK
// Quantized tanh for CPU tensors.
//
// When built with USE_PYTORCH_QNNPACK, the QNNPACK engine is selected in the
// global context, and `qx` is quint8, the work is delegated to qnnpack_tanh.
// In every other case the result is produced by qtanh_stub, dispatched on the
// device type of `qx`.
Tensor tanh_quantized_cpu(const Tensor& qx) {
#ifdef USE_PYTORCH_QNNPACK
  const bool qnnpack_engine_selected =
      at::globalContext().qEngine() == at::QEngine::QNNPACK;
  if (qnnpack_engine_selected && qx.scalar_type() == kQUInt8) {
    return qnnpack_tanh(qx);
  }
#endif // USE_PYTORCH_QNNPACK

  // Fallback path: the stub fills in the output tensor it is given.
  Tensor result;
  const auto device_type = qx.device().type();
  qtanh_stub(device_type, qx, result);
  return result;
}
}} // namespace at::native