From b0eca69164efc29b149fe42c2c7a5e27c51b107d Mon Sep 17 00:00:00 2001
From: Hongyu Xiong
Date: Mon, 17 Jun 2019 10:36:56 -0700
Subject: [PATCH] Add new regression loss function type to FBLearner (#21080)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/21080

Add Huber loss as a new option for regression training (refer to the
TensorFlow implementation: https://fburl.com/9va71wwo)

    # Huber loss
    def huber(true, pred, delta):
        error = abs(true - pred)
        loss = 0.5 * min(error, delta)**2 + delta * max(error - delta, 0)
        return mean(loss)

As a combination of MSE loss (for `error < delta`) and MAE loss (for
`error >= delta`), Huber loss reduces the sensitivity of training to
outliers. One thing worth noting is that Huber loss is not twice
differentiable at `error = delta`; to address this, one could consider
adopting the `log(cosh(x))` loss instead.

Differential Revision: D15524377

fbshipit-source-id: 4a7b0f69b859de3ca903f3bb070c318e0d1f298a
---
 caffe2/python/layers/batch_huber_loss.py | 119 +++++++++++++++++++++++
 caffe2/python/layers_test.py             |   8 ++
 2 files changed, 127 insertions(+)
 create mode 100644 caffe2/python/layers/batch_huber_loss.py

diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py
new file mode 100644
index 000000000000..48b6ebcf8f58
--- /dev/null
+++ b/caffe2/python/layers/batch_huber_loss.py
@@ -0,0 +1,119 @@
+# @package batch_huber_loss
+# Module caffe2.python.layers.batch_huber_loss
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from caffe2.python import core, schema
+from caffe2.python.layers.layers import (
+    ModelLayer,
+)
+from caffe2.python.layers.tags import (
+    Tags
+)
+import numpy as np
+
+
+class BatchHuberLoss(ModelLayer):
+
+    def __init__(self, model, input_record, name='batch_huber_loss', delta=1.0, **kwargs):
+        super(BatchHuberLoss, self).__init__(model, name, input_record, **kwargs)
+
+        assert delta > 0
+
+        self._delta = delta
+
+        assert schema.is_schema_subset(
+            schema.Struct(
+                ('label', schema.Scalar()),
+                ('prediction', schema.Scalar())
+            ),
+            input_record
+        )
+        self.tags.update([Tags.EXCLUDE_FROM_PREDICTION])
+
+        self.output_schema = schema.Scalar(
+            np.float32,
+            self.get_next_blob_reference('output'))
+
+    def add_ops(self, net):
+        prediction = net.Squeeze(
+            self.input_record.prediction(),
+            net.NextScopedBlob('squeezed_prediction'),
+            dims=[1]
+        )
+
+        label = self.input_record.label.field_blobs()
+        if self.input_record.label.field_type().base != (
+                self.input_record.prediction.field_type().base):
+            label = net.Cast(
+                label,
+                net.NextScopedBlob('cast_label'),
+                to=schema.data_type_for_dtype(
+                    self.input_record.prediction.field_type()
+                )
+            )
+
+        const_delta = net.ConstantFill(
+            label,
+            net.NextScopedBlob("delta"),
+            value=self._delta,
+            dtype=core.DataType.FLOAT,
+        )
+
+        label = net.StopGradient(
+            label,
+            net.NextScopedBlob('stopped_label')
+        )
+
+        const_delta = net.StopGradient(
+            const_delta,
+            net.NextScopedBlob('stopped_delta')
+        )
+
+        # abs_error = np.abs(true - pred)
+        abs_error = net.L1Distance(
+            [label, prediction], net.NextScopedBlob("abs_error")
+        )
+
+        # quadratic = 0.5*min(abs_error, delta)^2, linear = delta*max(abs_error-delta, 0)
+        min_error = net.Min(
+            [abs_error, const_delta], net.NextScopedBlob("min_error_delta")
+        )
+
+        quadratic_term = net.Scale(
+            net.Sqr(min_error), scale=float(0.5)
+        )
+
+        linear_term = net.Mul(
+            [
+                net.Sub([abs_error, min_error]),
+                const_delta,
+            ],
+            net.NextScopedBlob("huber_linear_term")
+        )
+
+        # huber = 0.5 * min(abs_error, delta)^2 + delta * max(abs_error-delta, 0)
+        huber_dist = net.Add(
+            [quadratic_term, linear_term], net.NextScopedBlob("huber_dist")
+        )
+
+        if 'weight' in self.input_record.fields:
+            weight_blob = self.input_record.weight()
+            if self.input_record.weight.field_type().base != np.float32:
+                weight_blob = net.Cast(
+                    weight_blob,
+                    weight_blob + '_float32',
+                    to=core.DataType.FLOAT
+                )
+            weight_blob = net.StopGradient(
+                [weight_blob],
+                [net.NextScopedBlob('weight_stop_gradient')],
+            )
+            huber_dist = net.Mul(
+                [huber_dist, weight_blob],
+                net.NextScopedBlob("weighted_huber_distance"),
+            )
+
+        net.AveragedLoss(huber_dist, self.output_schema.field_blobs())
diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py
index 6551916a7ed1..69118320c62a 100644
--- a/caffe2/python/layers_test.py
+++ b/caffe2/python/layers_test.py
@@ -779,6 +779,14 @@ def testBatchMSELoss(self):
         loss = self.model.BatchMSELoss(input_record)
         self.assertEqual(schema.Scalar((np.float32, tuple())), loss)
 
+    def testBatchHuberLoss(self):
+        input_record = self.new_record(schema.Struct(
+            ('label', schema.Scalar((np.float32, (1,)))),
+            ('prediction', schema.Scalar((np.float32, (2,)))),
+        ))
+        loss = self.model.BatchHuberLoss(input_record)
+        self.assertEqual(schema.Scalar((np.float32, tuple())), loss)
+
     def testBatchSigmoidCrossEntropyLoss(self):
         input_record = self.new_record(schema.Struct(
             ('label', schema.Scalar((np.float32, (32,)))),
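
For reference, a minimal NumPy sketch of the computation the layer assembles
op-by-op above; the `huber_loss` helper and the sample arrays are illustrative
only, not part of this patch:

    import numpy as np

    def huber_loss(label, prediction, delta=1.0):
        # Mirrors BatchHuberLoss.add_ops step by step.
        abs_error = np.abs(label - prediction)     # L1Distance
        min_error = np.minimum(abs_error, delta)   # Min with const_delta
        quadratic = 0.5 * np.square(min_error)     # Scale(Sqr(...), 0.5)
        # Note: abs_error - min_error == max(abs_error - delta, 0)
        linear = delta * (abs_error - min_error)   # Mul(Sub(...), const_delta)
        return np.mean(quadratic + linear)         # AveragedLoss

    label = np.array([0.0, 0.0, 0.0], dtype=np.float32)
    prediction = np.array([0.1, 0.5, 5.0], dtype=np.float32)
    print(huber_loss(label, prediction))  # ~1.5433

With delta=1.0, the outlier at error 5.0 contributes 4.5 to the per-example
loss rather than the 12.5 it would contribute under 0.5 * MSE, which is the
robustness property described in the summary.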