
Commit e79700f

Author: ilia-cher (committed)

Update on "Add HIP to the memory profiler device list"

Summary: Add HIP alongside CUDA
Test Plan: rocm CI
Differential Revision: [D21665627](https://our.internmc.facebook.com/intern/diff/D21665627)
[ghstack-poisoned]

2 parents 7461432 + 5b1814e; commit e79700f

74 files changed (+8300, -867 lines). Large commit: only a subset of the file diffs is shown below.

aten/src/ATen/native/quantized/cpu/qconv.cpp

Lines changed: 39 additions & 33 deletions
@@ -474,12 +474,7 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       act.ndimension(), stride_.size(), padding_.size(), dilation_.size());

   auto* pack_w = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  const auto& kernel_scale = w_scale;

-  const uint32_t kernel_h = kernel[0];
-  const uint32_t kernel_w = kernel[1];
   // TODO Can be replaced with packB->getOutputChannels() when update pre-pack
   // to actually do the packing.
   const auto out_ch = bias.size(0);
@@ -490,14 +485,12 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
   const int W = act.size(3);
   const int M = out_ch; // output channels

-  const at::Tensor act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast);
+  TORCH_CHECK( M == orig_weight.size(0),
+      "Output channel size of weight and bias must match.");
+  TORCH_CHECK( C == groups_ * orig_weight.size(1),
+      "Output channel size of weight and bias must match.");

-  const uint32_t stride_h = stride_[0];
-  const uint32_t stride_w = stride_[1];
-  const uint32_t pad_h = padding_[0];
-  const uint32_t pad_w = padding_[1];
-  const uint32_t dilation_h = dilation_[0];
-  const uint32_t dilation_w = dilation_[1];
+  const at::Tensor act_nhwc = act.contiguous(c10::MemoryFormat::ChannelsLast);

   auto output_min = kReluFused
       ? activationLimits(output_scale, output_zero_point, Activation::RELU)
@@ -507,20 +500,6 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       ? activationLimits(output_scale, output_zero_point, Activation::RELU)
            .second
       : std::numeric_limits<uint8_t>::max();
-  qnnpack::conv_param_t conv_p(
-      {kernel_w, kernel_h},
-      {stride_w, stride_h},
-      {dilation_w, dilation_h},
-      {pad_h, pad_w, pad_h, pad_w},
-      /*adjustment=*/{0, 0},
-      groups_,
-      C,
-      M,
-      kernel_zp,
-      kernel_scale,
-      output_min,
-      output_max,
-      /*transpose=*/false);

   double act_input_scale = act_nhwc.q_scale();

@@ -532,27 +511,51 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
     auto bias_fp32 = bias;
     int8_t* w_data =
         reinterpret_cast<int8_t*>(weight_contig.template data_ptr<c10::qint8>());
+
+    float* weight_scales_data = w_scales.data_ptr<float>();
+    // We calculate requant scale here as the vector holding the requant scale
+    // is owned by this module. The pointer is then passed to qnnpack backend.
+    generate_requantization_scales(
+        w_scales, act_input_scale, output_scale, requantization_scales);
+
+    // TODO Kimish, we are allocating affine_quantized regardless of per channel or not.
+    // This allocation is actually used only for packing weight and thus will be freed.
+    // Still we should be consistent. Fix this.
     at::Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU)
             .dtype(c10::kQUInt8)
             .memory_format(c10::MemoryFormat::ChannelsLast),
-        kernel_scale,
-        kernel_zp,
+        weight_scales_data[0],
+        w_zero_points[0],
         c10::nullopt);
     auto* qnnp_w_data = qnnp_weight.template data_ptr<c10::quint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }
+    at::Tensor qbias;
     // Original bias was float, so we requantize it here.
-    auto qbias = at::quantize_per_tensor(
-        bias_fp32, kernel_scale * act_input_scale, 0, c10::kQInt32);
+    if (conv_p.per_channel) {
+      at::Tensor bias_quant_scales =
+          weight_contig.q_per_channel_scales() * act_input_scale;
+      at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+      qbias = at::native::quantize_per_channel_cpu(
+          bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+    } else {
+      qbias = at::native::quantize_per_tensor(
+          bias_fp32,
+          weight_contig.q_scale() * act_input_scale,
+          0,
+          c10::kQInt32);
+    }
+
     // Update the input scale to not pack again.
     input_scale = act_input_scale;
     w.reset();
     w = std::make_unique<qnnpack::PrePackConvWeights>(
         conv_p,
+        w_zero_points.data(),
         reinterpret_cast<uint8_t*>(qnnp_w_data),
         reinterpret_cast<int32_t*>(qbias.template data_ptr<c10::qint32>()));
     pack_w = w.get();
@@ -562,9 +565,10 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       orig_weight.reset();
     }
   }
+
   TORCH_INTERNAL_ASSERT(pack_w != nullptr, "Packed Weights are NULL");
   const auto output_shape = MakeConvOutputShape<kSpatialDim>(
-      N, M, {H, W}, kernel, stride_, padding_, dilation_);
+      N, M, {H, W}, kernel_, stride_, padding_, dilation_);
   if (act_nhwc.numel() > 0) {
     TORCH_CHECK(
         std::all_of(
@@ -591,11 +595,13 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl(
       N,
       H,
       W,
-      act_nhwc.q_scale(),
       act_nhwc.q_zero_point(),
       reinterpret_cast<uint8_t*>(act_nhwc.template data_ptr<c10::quint8>()),
-      output.q_scale(),
+      w_zero_points.data(),
+      requantization_scales.data(),
       output.q_zero_point(),
+      output_min,
+      output_max,
       reinterpret_cast<uint8_t*>(output.template data_ptr<c10::quint8>()),
       caffe2::mobile_pthreadpool());
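
The conv path now fills a vector of per-output-channel requantization scales via generate_requantization_scales(w_scales, act_input_scale, output_scale, requantization_scales) and hands the raw pointer to QNNPACK instead of a single kernel_scale. As a rough illustration of the arithmetic such a helper is conventionally expected to perform (requant_scale[c] = w_scale[c] * act_scale / out_scale), here is a minimal standalone C++ sketch; the helper name and values below are illustrative assumptions, not the actual ATen implementation.

#include <cstdio>
#include <vector>

// Hypothetical standalone helper mirroring a conventional per-channel
// requantization-scale computation: the int32 accumulator is in units of
// w_scale[c] * act_scale, so dividing by the output scale maps it into the
// quantized output domain.
static void sketch_requant_scales(
    const std::vector<float>& weight_scales, // one scale per output channel
    float act_scale,                         // activation (input) scale
    float out_scale,                         // output tensor scale
    std::vector<float>& requant_scales) {    // buffer owned by the packed-weights module
  requant_scales.resize(weight_scales.size());
  for (size_t c = 0; c < weight_scales.size(); ++c) {
    requant_scales[c] = weight_scales[c] * act_scale / out_scale;
  }
}

int main() {
  std::vector<float> w_scales{0.02f, 0.05f, 0.01f};
  std::vector<float> requant;
  sketch_requant_scales(w_scales, /*act_scale=*/0.1f, /*out_scale=*/0.25f, requant);
  for (float s : requant) std::printf("%f\n", s); // 0.008000, 0.020000, 0.004000
  return 0;
}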

aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp

Lines changed: 9 additions & 30 deletions
@@ -172,11 +172,6 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       weight.ndimension() == 4,
       "quantized::conv2d_prepack (qnnpack): Weights are expected to have 4 "
       "dimensions");
-  const auto qtype = weight.qscheme();
-  TORCH_CHECK(
-      weight.qscheme() == c10::kPerTensorAffine,
-      "quantized::conv2d_prepack (qnnpack): only supports Per Tensor "
-      "Quantization Scheme")
   TORCH_CHECK(
       stride.size() == 2,
       "quantized::conv2d_prepack (qnnpack): 2D convolution only");
@@ -193,7 +188,6 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
   // QNNPACK expects weights to be of the format {out_c, kH, kW, in_c/groups},
   // but PyTorch lays them out as {out_c, in_c/groups, kH, kW}
   const size_t out_ch = weight.size(0);
-  const size_t in_ch = weight.size(1) * groups;
   const uint32_t kernel_h = weight.size(2);
   const uint32_t kernel_w = weight.size(3);

@@ -203,6 +197,7 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
   } else {
     bias_fp32 = at::zeros(out_ch, weight.options().dtype(at::kFloat));
   }
+
   TORCH_CHECK(
       !bias_fp32.defined() ||
           (bias_fp32.ndimension() == 1 && bias_fp32.size(0) == out_ch),
@@ -214,30 +209,13 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       bias_fp32.sizes(),
       " instead");

-  uint32_t stride_h = stride[0];
-  uint32_t stride_w = stride[1];
-  uint32_t pad_t = padding[0];
-  uint32_t pad_l = padding[1];
-  uint32_t dilation_h = dilation[0];
-  uint32_t dilation_w = dilation[1];
-
-  qnnpack::conv_param_t conv_p(
-      {kernel_w, kernel_h},
-      {stride_w, stride_h},
-      {dilation_w, dilation_h},
-      {pad_t, pad_l, pad_t, pad_l},
-      /*adjustment=*/{0, 0},
-      groups,
-      in_ch,
-      out_ch,
-      weight.q_zero_point(),
-      weight.q_scale(),
-      std::numeric_limits<uint8_t>::min(),
-      std::numeric_limits<uint8_t>::max(),
-      /*transpose=*/false);
-
   auto weight_contig = weight.contiguous(c10::MemoryFormat::ChannelsLast);
+  const bool is_per_channel = weight_contig.qscheme() == at::kPerChannelAffine;

+  std::vector<uint8_t> w_zero_points;
+  at::Tensor w_scales;
+  std::tie(w_zero_points, w_scales) =
+      make_zero_points_and_scales_tensor(weight_contig);
   // We set the pre-packed conv weights to nullptr below as we call pre-pack
   // during the first invocation of operator run. Refer to qconv.cpp for more
   // details. TODO Update to actually call pre-pack here once bias is removed
@@ -254,8 +232,9 @@ c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> PackedConvWeightsQnnp<
       groups,
       c10::nullopt, /* input_scale */
       {kernel_h, kernel_w},
-      static_cast<float>(weight.q_scale()),
-      static_cast<int32_t>(weight.q_zero_point())});
+      w_scales,
+      std::move(w_zero_points),
+      is_per_channel});

   return ret_ptr;
 }
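
Prepack no longer rejects per-channel weights; it records is_per_channel and collects one zero point and one scale per output channel through make_zero_points_and_scales_tensor(weight_contig). Below is a hedged, standalone sketch of what such a gathering step typically amounts to, assuming the same int8-to-uint8 zero-point shift of +128 that the apply path applies to the weight values; the function name and data layout are illustrative only, not the ATen helper.

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Illustrative stand-in: collect per-output-channel zero points and scales,
// shifting int8 zero points into the uint8 domain that QNNPACK consumes.
static std::pair<std::vector<uint8_t>, std::vector<float>>
sketch_zero_points_and_scales(
    const std::vector<int8_t>& zp_int8,  // per-channel int8 zero points
    const std::vector<float>& scales) {  // per-channel scales
  std::vector<uint8_t> zp_uint8(zp_int8.size());
  for (size_t c = 0; c < zp_int8.size(); ++c) {
    zp_uint8[c] = static_cast<uint8_t>(zp_int8[c] + 128); // same +128 shift as the weight data
  }
  return {std::move(zp_uint8), scales};
}

int main() {
  auto packed = sketch_zero_points_and_scales({-3, 0, 5}, {0.02f, 0.05f, 0.01f});
  for (size_t c = 0; c < packed.first.size(); ++c) {
    std::printf("channel %zu: zp=%u scale=%f\n",
                c, static_cast<unsigned>(packed.first[c]), packed.second[c]);
  }
  return 0;
}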

aten/src/ATen/native/quantized/cpu/qlinear.cpp

Lines changed: 29 additions & 15 deletions
@@ -236,9 +236,6 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
   auto input_contig = input.contiguous();

   auto packB = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  auto kernel_scale = w_scale;
   size_t rows_w = bias_.size(0);
   size_t cols_w = input_contig.size(input_contig.dim() - 1);
   auto input_scale = input_contig.q_scale();
@@ -249,29 +246,48 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
     auto weight_contig = orig_weight;
     auto bias_fp32 = bias_;
     int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
+
+    float* weight_scales_data = w_scales.data_ptr<float>();
+    // We calculate requant scale here as the vector holding the requant scale
+    // is owned by this module. The pointer is then passed to qnnpack backend.
+    generate_requantization_scales(
+        w_scales, input_scale, output_scale, requantization_scales);
+
     at::Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU).dtype(c10::kQUInt8),
-        kernel_scale,
-        kernel_zp);
+        weight_scales_data[0],
+        w_zero_points[0]);
     auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }
     // Original bias was float, so we requantize it here.
-    auto qbias = at::quantize_per_tensor(
-        bias_fp32, kernel_scale * input_scale, 0, c10::kQInt32);
+    const bool is_per_channel = orig_weight.qscheme() == at::kPerChannelAffine;
+    at::Tensor qbias;
+    // Original bias was float, so we requantize it here.
+    if (is_per_channel) {
+      at::Tensor bias_quant_scales =
+          weight_contig.q_per_channel_scales() * input_scale;
+      at::Tensor bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt);
+      qbias = at::native::quantize_per_channel_cpu(
+          bias_fp32, bias_quant_scales, bias_zp, 0, c10::kQInt32);
+    } else {
+      qbias = at::native::quantize_per_tensor(
+          bias_fp32, weight_contig.q_scale() * input_scale, 0, c10::kQInt32);
+    }
+
     // Update the input scale to not pack again.
     this->input_scale = input_scale;
     w.reset();
     w = std::make_unique<qnnpack::PackBMatrix>(
         cols_w /* input_channels */,
         rows_w /* output_channels */,
-        kernel_zp,
-        kernel_scale,
-        (uint8_t*)qnnp_w_data,
-        (int32_t*)qbias.data_ptr<c10::qint32>());
+        w_zero_points.data(),
+        requantization_scales.data(),
+        reinterpret_cast<uint8_t*>(qnnp_w_data),
+        reinterpret_cast<int32_t*>(qbias.data_ptr<c10::qint32>()));
     packB = w.get();
     if (at::globalContext().releaseWeightsWhenPrepacking()) {
       // On mobile, we release the original weight by resetting the intrusive_ptr.
@@ -315,11 +331,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl(
       cols_input /* input_channels */,
       rows_w /* output_channels */,
       input_contig.q_zero_point(),
-      input_contig.q_scale(),
-      kernel_zp,
-      kernel_scale,
+      w_zero_points.data(),
+      requantization_scales.data(),
       output_zero_point,
-      output_scale,
       output_min,
       output_max,
       (uint8_t*)input_contig.data_ptr<c10::quint8>(),
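
The linear path mirrors the conv change: when the weight is per-channel affine, the fp32 bias is quantized per channel with scale w_scale[c] * input_scale and zero point 0, so the resulting int32 values live in the same units as the int32 GEMM accumulator. A minimal sketch of that arithmetic on plain arrays follows; names are illustrative and this is not the ATen quantize_per_channel_cpu implementation.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of per-channel bias requantization: qbias[c] is bias_fp32[c]
// expressed in accumulator units (w_scale[c] * input_scale), zero point 0.
static std::vector<int32_t> sketch_quantize_bias(
    const std::vector<float>& bias_fp32,
    const std::vector<float>& w_scales,
    float input_scale) {
  std::vector<int32_t> qbias(bias_fp32.size());
  for (size_t c = 0; c < bias_fp32.size(); ++c) {
    qbias[c] = static_cast<int32_t>(
        std::lround(bias_fp32[c] / (w_scales[c] * input_scale)));
  }
  return qbias;
}

int main() {
  auto qbias = sketch_quantize_bias({0.5f, -0.25f}, {0.02f, 0.05f}, /*input_scale=*/0.1f);
  for (int32_t b : qbias) std::printf("%d\n", b); // 250, -50
  return 0;
}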

aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp

Lines changed: 22 additions & 13 deletions
@@ -227,9 +227,6 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
   // matrices, respectively.

   auto packB = w.get();
-  // Adjust weight zero point, similar to weight data.
-  auto kernel_zp = w_zp + 128;
-  auto kernel_scale = w_scale;
   size_t rows_w = bias_.size(0);
   size_t cols_w = input_contig.size(input_contig.dim() - 1);

@@ -250,30 +247,38 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
       /*max=*/x_max,
       /*qmin=*/0,
       /*qmax=*/255);
+  float* weight_scales_data = w_scales.data_ptr<float>();
+  if (!input_scale.has_value() || input_scale.value() != q_params.scale) {
+    generate_requantization_scales(
+        w_scales, q_params.scale, 1.f, requantization_scales);
+  }
+
   if (!input_scale.has_value()) {
     // Get the original weight and adjust it to uint8 from int8
     auto weight_contig = orig_weight;
-    int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
+
+    // TODO(kimishpatel), we are allocating affine_quantized regardless of per channel or not.
+    // This allocation is actually used only for packing weight and thus will be freed.
+    // Still we should be consistent. Fix this.
     Tensor qnnp_weight = at::_empty_affine_quantized(
         weight_contig.sizes(),
         at::device(c10::kCPU).dtype(c10::kQUInt8),
-        kernel_scale,
-        kernel_zp);
+        weight_scales_data[0],
+        w_zero_points[0]);
     auto* qnnp_w_data = qnnp_weight.data_ptr<c10::quint8>();
+    int8_t* w_data = (int8_t*)weight_contig.data_ptr<c10::qint8>();
     auto wt_numel = weight_contig.numel();
     for (int i = 0; i < wt_numel; ++i) {
       qnnp_w_data[i] = static_cast<c10::quint8>(w_data[i] + 128);
     }

-    // Update the input scale to not pack again.
     // Pass in nullptr for bias, as we pass FP32 bias to run function.
-    input_scale = q_params.scale;
     w.reset();
     w = std::make_unique<qnnpack::PackBMatrix>(
         cols_w /* input_channels */,
         rows_w /* output_channels */,
-        kernel_zp,
-        kernel_scale,
+        w_zero_points.data(),
+        requantization_scales.data(),
         (uint8_t*)qnnp_w_data,
         nullptr);
     packB = w.get();
@@ -284,6 +289,10 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
     }
   }

+  // Update the input scale to not pack weights again.
+  // as well as to avoid repopulating requant scale if scale has not changed.
+  input_scale = q_params.scale;
+
   // Quantize input
   Tensor q_input = at::quantize_per_tensor(
       input_contig, q_params.scale, q_params.zero_point, c10::kQUInt8);
@@ -307,9 +316,9 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) {
       cols_input /* input_channels */,
       rows_w /* output_channels */,
       q_input.q_zero_point(),
-      q_input.q_scale(),
-      kernel_zp,
-      kernel_scale,
+      w_zero_points.data(),
+      /* for dynamic should really be called dequant scale */
+      requantization_scales.data(),
       (uint8_t*)q_input.data_ptr<c10::quint8>(),
       cols_input /* input_stride */,
       packB->getPackedWeights(),
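
For dynamic linear the output stays fp32, so generate_requantization_scales is invoked with an output scale of 1.f; as the inline comment in the diff notes, the resulting values act as dequantization scales. Below is a small, hedged worked example with assumed scale values (not taken from the commit) showing how an int32 accumulator would then be mapped back to fp32, with the unpacked fp32 bias added afterwards.

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed example values, illustrative only.
  const float w_scale = 0.02f;     // per-channel weight scale
  const float input_scale = 0.1f;  // runtime-chosen activation scale
  const float output_scale = 1.0f; // dynamic path: output stays fp32
  const float dequant = w_scale * input_scale / output_scale; // 0.002
  const int32_t acc = 1234;        // int32 accumulator from the GEMM
  const float bias = 0.5f;         // fp32 bias, passed separately (bias ptr is nullptr when packing)
  std::printf("%f\n", acc * dequant + bias); // 2.968000
  return 0;
}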
