Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Quantization] Add symmetric with power2 scale quantization schema #3437

20 changes: 18 additions & 2 deletions include/glow/Quantization/Base/Base.h
Expand Up @@ -53,7 +53,7 @@ struct QuantizationTransform32To8 {
int32_t transform(int32_t input) {
// The operation x >> y is rounded down to negative infinity. To get to
// round-nearest we add (1 << (shift - 1)) to the value prior to shifting.
int rtn = (1 << (post - 1));
int rtn = (post > 0) ? (1 << (post - 1)) : 0;
mciprian13 marked this conversation as resolved.
Show resolved Hide resolved
return ((((input >> pre) * scale) + rtn) >> post) + offset;
}
};
Expand Down Expand Up @@ -118,6 +118,14 @@ enum Schema {
/// version of the quantized type with an offset of zero:
/// For example, int8 is [-128; 127] - (-128) == uint8 [0; 255] - 0
SymmetricWithUnsigned,
/// Quantization schema with:
/// - range centered on 0 (symmetric): offset == 0.
/// - scale parameter is a power of 2: scale = 2^E where E is a signed
/// exponent. Since the scale parameter is mostly subunitary, the
/// exponent is mostly negative.
/// Since the scale parameter is stored as floating point, the values
/// of E which are exactly representable range from -126 to 127.
SymmetricWithPower2Scale,
};

/// Configuration for Quantization, passed into \ref quantizeFunction().
Expand Down Expand Up @@ -163,7 +171,9 @@ template <class SrcTy, class DestTy> DestTy clip(SrcTy in) {
/// Quantize \p input according to the scale/offset in \p TQP and clip the
/// result to the representable range of \p DestTy.
template <class DestTy = int8_t>
inline DestTy quantize(float input, const TensorQuantizationParams &TQP) {
  float result = input / TQP.scale + TQP.offset;
  // Note: use int64_t since casts of large values might be wrapped around
  // before clipping, for example for result = 2147483648.00 (float).
  return quantization::clip<int64_t, DestTy>((int64_t)nearbyintf(result));
}

/// Converts a quantized value (type eTy) to floating point based on the
Expand Down Expand Up @@ -347,6 +357,12 @@ void tensorFusedRowwiseQuantization(const Tensor &input, Tensor &output) {
}
}

/// Verify if float is an exact power of 2 (mantissa is exactly 1.0).
bool isFloatPowerOf2(float val);

/// Get float 2's exponent.
int getFloat2Exp(float val);

} // namespace quantization
} // namespace glow

Expand Down
49 changes: 48 additions & 1 deletion lib/Quantization/Base/Base.cpp
@@ -1,5 +1,6 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
* Copyright (c) 2019-present, NXP Semiconductor, Inc.
mciprian13 marked this conversation as resolved.
Show resolved Hide resolved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -201,6 +202,27 @@ QuantizationTransform32To8 quantizeScaleOffset32To8(float scale,
int preShift = 0;
int postShift = 0;

// We treat first the particular case when scale is a power of 2 (2 ^ exp,
// where exp is a signed integer exponent). The operation is specialized as:
// - for positive 2's exponent:
// x * scale + offset (pre = 0, post = 0, scale = (int)scale).
// - for negative 2's exponent:
// x >> post + offset (pre = 0, post = -exp, scale = 1).
if (isFloatPowerOf2(scale)) {
int exp = getFloat2Exp(scale);
mciprian13 marked this conversation as resolved.
Show resolved Hide resolved
if (exp > 0) {
return QuantizationTransform32To8(0, // pre
0, // post
static_cast<int>(scale), // scale
offset); // offset
} else {
return QuantizationTransform32To8(0, // pre
-exp, // post
1, // scale
offset); // offset
}
}

// Calculate the post-shift value. It's always safe to increase scale as long
// as it's below one, and it's always legal to shift at least 15 bits for
// small scale values.
Expand Down Expand Up @@ -277,7 +299,8 @@ TensorQuantizationParams chooseQuantizationParams(float min, float max,
schema = quantization::Schema::Symmetric;
}
}
if (schema == quantization::Schema::Symmetric) {
if (schema == quantization::Schema::Symmetric ||
schema == quantization::Schema::SymmetricWithPower2Scale) {
// Check which end saturates the output dynamic range earlier
// and extend the other end to map the zero-point to quantized 0.
double rmin = min / (double)qmin;
Expand Down Expand Up @@ -338,6 +361,11 @@ TensorQuantizationParams chooseQuantizationParams(float min, float max,
nudgedZeroPoint = static_cast<int32_t>(round(initialZeroPoint));
}

// For SymmetricWithPower2Scale, round scale to nearest higher power of 2.
if (schema == quantization::Schema::SymmetricWithPower2Scale) {
scale = std::exp2(std::ceil(std::log2(scale)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we have a test which verifies that for SymmetricWithPower2Scale scale is indeed power of 2?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is an assert in the same function which already verifies this every time.
I did, however, also add a unit test to perform the verification: chooseQuantizationSymmetricWithPower2Scale

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While adding the last unit test I found a bug (or corner case) in the function "chooseQuantizationParams":

  • for the previous "Symmetric" schema (and also for the newly added "SymmetricWithPower2Scale"), when qTy=UInt8QTy (qmin = 0) we get division by zero in the following line of code:
    double rmin = min / (double)qmin;

What should we do about this? This problem was already there and was not exposed by unit tests. Do you think we should exclude this case by putting an assert in the function?

Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. Thanks for flagging this!

Do you think we should exclude this case by putting an assert in the function?

I think that would be reasonable thing to add.

Symmetric with Uint basically means that it's only non negative (non positive, depending on a scale sign) numbers that could be represented, which for majority, if not all, networks would be a weird range.

}

TensorQuantizationParams result{static_cast<float>(scale), nudgedZeroPoint};
// The only valid offset for symmetric quantization is 0.
assert((result.offset == 0 || schema != quantization::Schema::Symmetric) &&
Expand All @@ -349,6 +377,17 @@ TensorQuantizationParams chooseQuantizationParams(float min, float max,
schema != quantization::Schema::SymmetricWithUnsigned) &&
"Symmetric quantization with unsigned should be centered on 0 or on "
"-qmin");

// For SymmetricWithPower2Scale schema the offset should be 0.
assert((result.offset == 0 ||
schema != quantization::Schema::SymmetricWithPower2Scale) &&
"Symmetric quantization should be centered on 0");

// For SymmetricWithPower2Scale schema the scale should be a power of 2.
assert((isFloatPowerOf2(result.scale) ||
schema != quantization::Schema::SymmetricWithPower2Scale) &&
"Scale quantization parameter should be a power of 2");

return result;
}

Expand Down Expand Up @@ -377,5 +416,13 @@ std::vector<int8_t> createMapping(TypeRef inTy, TypeRef outTy,
return mapping;
}

bool isFloatPowerOf2(float val) {
  // frexp normalizes the mantissa into [0.5, 1) (negated for negative
  // inputs), so |val| is an exact power of 2 iff the mantissa is +/-0.5.
  // Zero, NaN and infinity all fail the comparison and return false.
  int unusedExp;
  const float mantissa = std::frexp(val, &unusedExp);
  return (mantissa == 0.5f) || (mantissa == -0.5f);
}

// Extract the unbiased binary exponent E of val, i.e. floor(log2(|val|)).
int getFloat2Exp(float val) {
  const int exponent = std::ilogb(val);
  return exponent;
}

} // namespace quantization
} // namespace glow
44 changes: 43 additions & 1 deletion lib/Quantization/Serialization.cpp
@@ -1,5 +1,6 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
* Copyright (c) 2019-present, NXP Semiconductor, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,11 +29,52 @@
namespace llvm {
namespace yaml {

/// The default behavior of YAML is to serialize floating point numbers
/// using the "%g" format specifier which is not guaranteed to print all
/// the decimals. During a round-trip (serialize, deserialize) decimals
/// might be lost and hence precision is lost. Although this might not be
/// critical for some quantization schema, for "SymmetricWithPower2Scale"
/// the round-trip must preserve the exact representation of the floating
/// point scale which is a power of 2. The code below is a workaround to
/// overwrite the behavior of the YAML serializer to print all the digits.
/// Thin wrapper around a float so that a dedicated ScalarTraits
/// specialization can control exactly how the value is serialized.
/// The constructor is intentionally non-explicit: the YAML normalization
/// code relies on implicit float -> FloatWrapper conversion.
struct FloatWrapper {
  float _val;
  FloatWrapper(float val) : _val(val) {}
};

template <> struct ScalarTraits<FloatWrapper> {
mciprian13 marked this conversation as resolved.
Show resolved Hide resolved
static void output(const FloatWrapper &value, void *ctxt,
                   llvm::raw_ostream &out) {
  // Emit the float with the maximum number of fractional digits so that a
  // serialize/deserialize round-trip preserves the exact value, then trim
  // trailing zeros while keeping at least one digit after the decimal point.
  char buffer[200];
  snprintf(buffer, sizeof(buffer), "%.126f", value._val);
  size_t len = strlen(buffer);
  while (len > 1 && buffer[len - 1] == '0' && buffer[len - 2] != '.') {
    buffer[--len] = '\0';
  }
  out << buffer;
}
static StringRef input(StringRef scalar, void *ctxt, FloatWrapper &value) {
  // to_float returns true on a successful parse; an empty StringRef is the
  // YAML traits convention for "no error".
  if (!to_float(scalar, value._val)) {
    return "invalid floating point number";
  }
  return StringRef();
}
// Numeric scalar text never needs YAML quoting.
static QuotingType mustQuote(StringRef) { return QuotingType::None; }
};

/// Mapping for NodeQuantizationInfo yaml serializer.
template <> struct MappingTraits<glow::NodeQuantizationInfo> {
/// Normalization adapter exposing the plain float scale as a FloatWrapper
/// so the full-precision ScalarTraits serializer is used for it.
struct FloatNormalized {
  FloatNormalized(IO &io) : _val(0.0) {}
  FloatNormalized(IO &, float &val) : _val(val) {}
  // Convert back to the plain float stored in the quantization params.
  float denormalize(IO &) { return _val._val; }
  FloatWrapper _val;
};
/// Map NodeQuantizationInfo fields to/from YAML. The scale goes through
/// the FloatNormalized adapter so it is printed with full precision (a
/// requirement for exact power-of-2 scale round-trips).
static void mapping(IO &io, glow::NodeQuantizationInfo &info) {
  MappingNormalization<FloatNormalized, float> scale(
      io, info.tensorQuantizationParams_.scale);
  io.mapRequired("nodeOutputName", info.nodeOutputName_);
  io.mapRequired("scale", scale->_val);
  io.mapRequired("offset", info.tensorQuantizationParams_.offset);
}
};
Expand Down
56 changes: 54 additions & 2 deletions tests/unittests/QuantizationTest.cpp
Expand Up @@ -102,9 +102,21 @@ void testSerialization(const std::vector<NodeQuantizationInfo> &expected) {
}

// Round-trip a mix of integer and fractional scales through the YAML
// serializer and verify they are read back unchanged.
TEST(Quantization, Serialize) {
  std::vector<NodeQuantizationInfo> expected{{"first", {1, 10}},
                                             {"second", {-1, 3}},
                                             {"third", {-10, 30}},
                                             {"fourth", {0.1, -10}},
                                             {"fifth", {0.123, -30}}};
  testSerialization(expected);
}

// Every scale below is an exact power of two (2^0 down to 2^-9); the
// serialize/deserialize round-trip must reproduce each value bit-exactly.
TEST(Quantization, SerializePower2Scale) {
  std::vector<NodeQuantizationInfo> expected{
      {"pwr_neg_0", {1.0f, 0}},       {"pwr_neg_1", {1.0f / 2, 0}},
      {"pwr_neg_2", {1.0f / 4, 0}},   {"pwr_neg_3", {1.0f / 8, 0}},
      {"pwr_neg_4", {1.0f / 16, 0}},  {"pwr_neg_5", {1.0f / 32, 0}},
      {"pwr_neg_6", {1.0f / 64, 0}},  {"pwr_neg_7", {1.0f / 128, 0}},
      {"pwr_neg_8", {1.0f / 256, 0}}, {"pwr_neg_9", {1.0f / 512, 0}}};
  testSerialization(expected);
}

Expand Down Expand Up @@ -150,6 +162,34 @@ TEST(Quantization, quantScaleOffset) {
}
}

// Verify that quantizeScaleOffset32To8 takes the specialized power-of-2
// path (pre == 0, shift-only or integer-scale-only) and that transform()
// reproduces each input to within one quantization step.
TEST(Quantization, quantScaleOffsetPower2Scale) {
  // Test different power of 2 scale values (from 2^-10 to 2^1).
  float scales[] = {0.0009765625f, 0.0019531250f, 0.0039062500f, 0.0078125000f,
                    0.0156250000f, 0.0312500000f, 0.0625000000f, 0.1250000000f,
                    0.2500000000f, 0.5000000000f, 1.0000000000f, 2.0000000000f};

  // Try all scale factors:
  for (float scale : scales) {
    // The transform parameters only depend on the scale, so compute and
    // check them once per scale instead of once per input value.
    auto TR = quantization::quantizeScaleOffset32To8(scale, 0);
    EXPECT_EQ(quantization::isFloatPowerOf2(scale), true);
    EXPECT_EQ(TR.pre, 0);
    int exp = quantization::getFloat2Exp(scale);
    if (exp > 0) {
      EXPECT_EQ(TR.scale, (int)scale);
      EXPECT_EQ(TR.post, 0);
    } else {
      EXPECT_EQ(TR.scale, 1);
      EXPECT_EQ(TR.post, -exp);
    }
    // Try all legal integers within the int8 range [-128, 127]. Use a plain
    // int counter: the previous int8_t loop had to stop at 126 (incrementing
    // past 127 would wrap), so the value 127 was never exercised.
    for (int input = -128; input <= 127; input++) {
      int32_t sum32num = round(input / scale);
      int32_t computed = TR.transform(sum32num);
      EXPECT_NEAR(input, computed, 1);
    }
  }
}

template <class qtype>
void quantizeTensorTest(ElemKind qTy, quantization::Schema schema) {
// Map float [0.0; 6.0] to a quantized type using its entire value range.
Expand Down Expand Up @@ -233,6 +273,18 @@ TEST(Quantization, quantizeTensorSymmetricUInt32) {
quantizeTensorTest<int32_t>(ElemKind::Int32QTy,
quantization::Schema::SymmetricWithUnsigned);
}
// Verify the SymmetricWithPower2Scale schema end-to-end for int8 tensors.
TEST(Quantization, quantizeTensorSymmetricPwr2Int8) {
quantizeTensorTest<int8_t>(ElemKind::Int8QTy,
quantization::Schema::SymmetricWithPower2Scale);
}
// Same schema, int16 quantized precision.
TEST(Quantization, quantizeTensorSymmetricPwr2Int16) {
quantizeTensorTest<int16_t>(ElemKind::Int16QTy,
quantization::Schema::SymmetricWithPower2Scale);
}
// Same schema, int32 quantized precision.
TEST(Quantization, quantizeTensorSymmetricPwr2Int32) {
quantizeTensorTest<int32_t>(ElemKind::Int32QTy,
quantization::Schema::SymmetricWithPower2Scale);
}

/// Helper for quantizing a simple Conv with precision \p quantizationPrecision.
static void quantizeSimpleConvGraph(ElemKind quantizationPrecision) {
Expand Down
5 changes: 4 additions & 1 deletion tools/loader/Loader.cpp
Expand Up @@ -85,7 +85,10 @@ llvm::cl::opt<quantization::Schema> quantizationSchema(
"Use symmetric ranges"),
clEnumValN(quantization::Schema::SymmetricWithUnsigned,
"symmetric_with_uint8",
"Use symmetric ranges with potentially uint8 ranges")),
"Use symmetric ranges with potentially uint8 ranges"),
clEnumValN(quantization::Schema::SymmetricWithPower2Scale,
mciprian13 marked this conversation as resolved.
Show resolved Hide resolved
"symmetric_with_power2_scale",
"Use symmetric ranges with power of 2 scaling factor")),
llvm::cl::init(quantization::Schema::Asymmetric), llvm::cl::cat(loaderCat));

llvm::cl::opt<ElemKind> quantizationPrecision(
Expand Down