
quantize bias of the quantization parameters (#48749)
Summary:
Pull Request resolved: #48749

This change reverts D25179863 (55e225a), because in 1.0.0.14 this behavior was
reintroduced. We believe this was already working pre-1.0.0.9; Intel then
regressed it, which is why we had to remove this quantization section, and in
1.0.0.14 they fixed it.

Test Plan:
We tested ctr_instagram_5x, which now passes with bitwise matching.
hl475 will test the top 6 models, and if they match, we will use this point
to lock in any further changes going forward.

Reviewed By: venkatacrc

Differential Revision: D25283605

fbshipit-source-id: 33aa9af008c113d4d61e3461a44932b502bf42ea
Hector Yuen authored and facebook-github-bot committed Dec 3, 2020
1 parent dabc286 commit b726a1b
Showing 2 changed files with 13 additions and 1 deletion.
caffe2/contrib/fakelowp/test/test_int8_ops_nnpi.py (6 additions, 1 deletion)

@@ -8,7 +8,12 @@
 from caffe2.python.fakelowp.test_utils import print_test_debug_info
 import caffe2.python.serialized_test.serialized_test_util as serial
 
-core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
+core.GlobalInit(["caffe2",
+                 "--caffe2_log_level=-3",
+                 "--glow_global_fp16=1",
+                 "--glow_clip_quant_range_to_fp16=1",
+                 "--glow_global_fp16_constants=1"
+                 ])
 
 
 class Int8OpsTest(serial.SerializedTestCase):
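The two added Glow flags make the fake-lowp test clamp quantization ranges to fp16 and keep constants in fp16, matching the accelerator numerics that the bias fix below depends on. As a rough illustration (a numpy sketch under the assumption that fbgemm::RoundToFloat16 behaves like a round trip through IEEE fp16; the helper name is hypothetical):

    import numpy as np

    def round_to_float16(x, clamp=False):
        # Approximate fbgemm::RoundToFloat16: round each fp32 value to the
        # nearest representable fp16 value, then widen back to fp32.
        y = np.asarray(x, dtype=np.float32).astype(np.float16).astype(np.float32)
        if clamp:
            # With clamping, out-of-range values saturate at the fp16 max
            # (65504) instead of overflowing to infinity.
            y = np.clip(y, -65504.0, 65504.0)
        return y

    # 1e-8 is below the smallest fp16 subnormal and rounds to zero.
    print(round_to_float16([0.1, 1e-8, 70000.0]))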
caffe2/quantization/server/fbgemm_pack_op.cc (7 additions, 0 deletions)

@@ -207,6 +207,13 @@ void QuantizeConvBias(
         bias.data<int32_t>(), bias.data<int32_t>() + bias.numel());
   } else {
     const float* bdata = bias.data<float>();
+    vector<float> bdata_local;
+    if (use_fp16) {
+      bdata_local.resize(bias.numel());
+      fbgemm::RoundToFloat16(
+          bdata, bdata_local.data(), bias.numel(), false /* FLAGS_caffe2_fbgemm_fake_fp16_clamp */);
+      bdata = bdata_local.data();
+    }
     b_quantized.resize(bias.numel());
     for (int g = 0; g < filter_qparams.size(); ++g) {
       int i_begin = g * (M / filter_qparams.size());
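The added block rounds the fp32 bias through fp16 before the int32 quantization that follows, so the packed bias bitwise-matches an accelerator that stores constants in fp16. A minimal sketch of the idea (hypothetical numpy helper, not the caffe2 API; it assumes the usual convention that bias is quantized with scale = input_scale * weight_scale per filter group and zero point 0):

    import numpy as np

    def quantize_conv_bias(bias_fp32, input_scale, weight_scales, use_fp16=True):
        b = np.asarray(bias_fp32, dtype=np.float32)
        if use_fp16:
            # Mirror the fix above: snap the bias to fp16-representable
            # values first (no clamping), then quantize.
            b = b.astype(np.float16).astype(np.float32)
        out = np.empty(b.size, dtype=np.int32)
        per_group = b.size // len(weight_scales)  # M / filter_qparams.size()
        for g, w_scale in enumerate(weight_scales):
            lo, hi = g * per_group, (g + 1) * per_group
            # The int32 bias scale is the product of input and weight scales.
            out[lo:hi] = np.rint(b[lo:hi] / (input_scale * w_scale)).astype(np.int32)
        return out

    # Per-group weight scales, as in the loop over filter_qparams.
    print(quantize_conv_bias([0.05, -0.2, 0.3, 0.7], 0.02, [0.1, 0.25]))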
