// qreduction.cpp
#include <ATen/ATen.h>
#include <ATen/NamedTensorUtils.h>
#include <ATen/NativeFunctions.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/qnnpack_utils.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>

#include <limits> // std::numeric_limits
#include <memory> // std::unique_ptr

namespace at {
namespace native {
#ifdef USE_PYTORCH_QNNPACK
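// Fast path: compute the mean over the spatial dimensions (H, W) of a 4-D
// quantized tensor by running QNNPACK's global average pooling operator
// directly on the quantized data, avoiding a dequantize/requantize round trip.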
Tensor qnnpack_mean(const Tensor& input, IntArrayRef dim) {
  Tensor output;
  TORCH_CHECK(
      input.ndimension() == 4,
      "qnnpack_global_average_pool: Expected input to be 4-dimensional: got ",
      input.ndimension());
  TORCH_CHECK(
      dim.size() == 2,
      "qnnpack_global_average_pool: dim size must be a tuple of two ints");
  TORCH_CHECK(
      dim[0] == 2 && dim[1] == 3,
      "qnnpack_global_average_pool: Reduction dimensions must match last 2 dimensions of input tensor");

  const int64_t batch_size = input.size(0);
  const int64_t inC = input.size(1);
  const int64_t inH = input.size(2);
  const int64_t inW = input.size(3);
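  // QNNPACK's *_nwc kernels read channels-last memory, so force the input
  // into NHWC layout before taking its data pointer.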
  Tensor input_contig = input.contiguous(MemoryFormat::ChannelsLast);

  initQNNPACK();
  const auto scale = input_contig.q_scale();
  const auto zero_point = input_contig.q_zero_point();
  const auto outC = inC;

  output = at::_empty_affine_quantized(
      {batch_size, outC}, at::device(kCPU).dtype(kQUInt8), scale, zero_point);
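
  // Create the pooling operator. Input and output share the same
  // scale/zero_point, so the requantization inside QNNPACK is effectively an
  // identity clamped to the full quint8 range.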
  pytorch_qnnp_operator_t qnnpack_operator{nullptr};
  const pytorch_qnnp_status createStatus =
      pytorch_qnnp_create_global_average_pooling_nwc_q8(
          inC,
          zero_point,
          scale,
          zero_point,
          scale,
          std::numeric_limits<uint8_t>::min() /* output min */,
          std::numeric_limits<uint8_t>::max() /* output max */,
          0,
          &qnnpack_operator);
  CAFFE_ENFORCE(
      createStatus == pytorch_qnnp_status_success,
      "failed to create QNNPACK Global Average Pooling operator");

  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>
      qnnpack_uniq_ptr(qnnpack_operator);
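
  // Bind shapes and data pointers. The two spatial dims collapse into a
  // single "width" of inH * inW pixels, since global average pooling treats
  // every spatial position identically.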
  const pytorch_qnnp_status setupStatus =
      pytorch_qnnp_setup_global_average_pooling_nwc_q8(
          qnnpack_operator,
          batch_size,
          inH * inW,
          (uint8_t*)input_contig.data_ptr<c10::quint8>() /* input data */,
          inC,
          (uint8_t*)output.data_ptr<c10::quint8>() /* output data */,
          outC);
  CAFFE_ENFORCE(
      setupStatus == pytorch_qnnp_status_success,
      "failed to setup QNNPACK Global Average Pooling operator");
  pthreadpool_t threadpool = caffe2::pthreadpool_();
  const pytorch_qnnp_status runStatus =
      pytorch_qnnp_run_operator(qnnpack_operator, threadpool);
  TORCH_INTERNAL_ASSERT(
      runStatus == pytorch_qnnp_status_success,
      "failed to run QNNPACK Global Average Pool operator");
  return output;
}
#endif // USE_PYTORCH_QNNPACK
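
// Generic path: dequantize, reduce in floating point with the regular CPU
// mean kernel, then requantize with the input's scale and zero_point. The
// QNNPACK branch above only covers the 4-D, dim=(2, 3), quint8 case.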
Tensor& mean_out_quantized_cpu(
    Tensor& result,
    const Tensor& self,
    IntArrayRef dim,
    bool keepdim,
    c10::optional<ScalarType> opt_dtype) {
#ifdef USE_PYTORCH_QNNPACK
  if (at::globalContext().qEngine() == at::QEngine::QNNPACK &&
      self.scalar_type() == kQUInt8 &&
      // QNNPACK currently is only supported for NCHW + dim=(2, 3)
      // Remove these checks after generic version is implemented.
      self.ndimension() == 4 &&
      dim.size() == 2 &&
      dim[0] == 2 &&
      dim[1] == 3) {
    result = qnnpack_mean(self, dim);
    return result;
  }
#endif
  auto self_dequantized = self.dequantize();
  auto result_dequantized =
      at::native::mean_cpu_gpu(self_dequantized, dim, keepdim, opt_dtype);
  result = at::quantize_per_tensor(
      result_dequantized,
      self.q_scale(),
      self.q_zero_point(),
      opt_dtype.value_or(self.scalar_type()));
  return result;
}
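
// The remaining overloads funnel into mean_out_quantized_cpu: the dim-less
// variant reduces over all dimensions, and the DimnameList variants first
// translate named dimensions into positional ones.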
Tensor mean_quantized_cpu(const Tensor& self, c10::optional<ScalarType> dtype) {
  Tensor result;
  mean_out_quantized_cpu(result, self, IntArrayRef{}, false, dtype);
  return result;
}

Tensor mean_quantized_cpu(
    const Tensor& self,
    IntArrayRef dim,
    bool keepdim,
    c10::optional<ScalarType> dtype) {
  Tensor result;
  mean_out_quantized_cpu(result, self, dim, keepdim, dtype);
  return result;
}

Tensor mean_quantized_cpu(
    const Tensor& self,
    DimnameList dim,
    bool keepdim,
    c10::optional<ScalarType> dtype) {
  return mean_quantized_cpu(
      self, dimnames_to_positions(self, dim), keepdim, dtype);
}

Tensor& mean_out_quantized_cpu(
    Tensor& result,
    const Tensor& self,
    DimnameList dim,
    bool keepdim,
    c10::optional<ScalarType> opt_dtype) {
  return mean_out_quantized_cpu(
      result, self, dimnames_to_positions(self, dim), keepdim, opt_dtype);
}

} // namespace native
} // namespace at
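
// A minimal usage sketch (not part of this file; assumes a QNNPACK-enabled
// ATen build):
//
//   at::globalContext().setQEngine(at::QEngine::QNNPACK);
//   auto x = at::quantize_per_tensor(
//       at::rand({1, 8, 4, 4}), /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
//   auto y = at::mean(x, {2, 3}); // hits the QNNPACK fast path above
//   auto z = at::mean(x, {1});    // falls back to dequantize -> mean -> quantize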