Implement CPU layer ops for ROIPoolingLayer and SmoothL1LossLayer #17

Open
wants to merge 1 commit into base: faster-rcnn
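Several of the CPU paths touched here were previously NOT_IMPLEMENTED (ROIPoolingLayer::Backward_cpu and both SmoothL1LossLayer passes); with this commit both layers run entirely under Caffe::set_mode(Caffe::CPU). Below is a minimal sketch of exercising the ROI pooling CPU forward path. Shapes, ROI coordinates, and parameter values are hypothetical, and it assumes the fork's ROIPoolingParameter fields pooled_h, pooled_w and spatial_scale; it is not part of this diff.

#include <vector>
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/fast_rcnn_layers.hpp"

using namespace caffe;

int main() {
  Caffe::set_mode(Caffe::CPU);

  Blob<float> data(1, 3, 12, 8);   // feature map: N x C x H x W
  Blob<float> rois(2, 5, 1, 1);    // two ROIs, each [batch_index x1 y1 x2 y2]
  Blob<float> top;

  // Fill the ROI blob; coordinates are already in feature-map units here
  // because spatial_scale is set to 1.0 below.
  float roi_values[10] = {0, 0, 0, 7, 5,    // ROI 0 on batch item 0
                          0, 4, 2, 7, 7};   // ROI 1 on batch item 0
  float* roi_data = rois.mutable_cpu_data();
  for (int i = 0; i < 10; ++i) roi_data[i] = roi_values[i];

  LayerParameter param;
  param.mutable_roi_pooling_param()->set_pooled_h(6);
  param.mutable_roi_pooling_param()->set_pooled_w(6);
  param.mutable_roi_pooling_param()->set_spatial_scale(1.0f);

  std::vector<Blob<float>*> bottom_vec, top_vec;
  bottom_vec.push_back(&data);
  bottom_vec.push_back(&rois);
  top_vec.push_back(&top);

  ROIPoolingLayer<float> layer(param);
  layer.SetUp(bottom_vec, top_vec);    // LayerSetUp + Reshape (checks C*H*W == 5)
  layer.Forward(bottom_vec, top_vec);  // dispatches to Forward_cpu in CPU mode
  // top is now 2 x 3 x 6 x 6: one pooled_h x pooled_w window per ROI and channel.
  return 0;
}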
98 changes: 95 additions & 3 deletions src/caffe/layers/roi_pooling_layer.cpp
@@ -16,6 +16,12 @@ using std::ceil;

namespace caffe {

/*
* There are two bottom blobs: 0 (the conv feature map) and 1 (the ROIs).
*
* Each ROI is defined as [batch_index x1 y1 x2 y2], so the ROI blob's
* Channels x Height x Width must equal 5.
*/
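// Example (hypothetical values): a single ROI on batch item 0 spanning the
// box (x1, y1) = (8, 6) to (x2, y2) = (24, 14) in input-image coordinates is
// stored in bottom[1] as the five values {0, 8, 6, 24, 14}; spatial_scale_
// then maps these coordinates onto the conv feature map.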
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
@@ -33,6 +39,9 @@ void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
CHECK_EQ(bottom.size(), 2) << "number of bottom blobs must be == 2";
CHECK_EQ(bottom[1]->channels() * bottom[1]->height() * bottom[1]->width(), 5)
<< "ROI blob C x H x W must be == 5";
channels_ = bottom[0]->channels();
height_ = bottom[0]->height();
width_ = bottom[0]->width();
@@ -52,8 +61,9 @@ void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
int batch_size = bottom[0]->num();
int top_count = top[0]->count();
Dtype* top_data = top[0]->mutable_cpu_data();
caffe_set(top_count, Dtype(-FLT_MAX), top_data);
int* argmax_data = max_idx_.mutable_cpu_data();

caffe_set(top_count, Dtype(-FLT_MAX), top_data);
caffe_set(top_count, -1, argmax_data);

// For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
@@ -127,9 +137,91 @@ void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
NOT_IMPLEMENTED;
}
if (!propagate_down[0])
return;

// Number of ROIs
const int num_rois = bottom[1]->num();
CHECK_EQ(num_rois, top[0]->num());
const int bottom_count = bottom[0]->count();

const Dtype* bottom_rois = bottom[1]->cpu_data();
const Dtype* top_diff = top[0]->cpu_diff();
Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
const int* argmax_data = max_idx_.cpu_data();

caffe_set(bottom_count, Dtype(0), bottom_diff);

for (int n = 0; n < num_rois; ++n) {
int roi_batch_ind = bottom_rois[0];
int roi_start_w = round(bottom_rois[1] * spatial_scale_);
int roi_start_h = round(bottom_rois[2] * spatial_scale_);
int roi_end_w = round(bottom_rois[3] * spatial_scale_);
int roi_end_h = round(bottom_rois[4] * spatial_scale_);

// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);

Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height_);
Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width_);

// Clamp the iteration range to the part of the ROI that lies inside the feature map
int start_h = max(0, roi_start_h);
int end_h = min(height_, roi_end_h + 1);
int start_w = max(0, roi_start_w);
int end_w = min(width_, roi_end_w + 1);

// Reverse engineer indices of elements pooled by this ROI
Dtype* offset_bottom_diff = bottom_diff + bottom[0]->offset(roi_batch_ind);

for (int c = 0; c < channels_; ++c) {
for (int h = start_h; h < end_h; ++h) {
for (int w = start_w; w < end_w; ++w) {
int index = h * width_ + w;

// Compute feasible set of pooled units that could have pooled
// this bottom unit

int phstart = floor(static_cast<Dtype>(h - roi_start_h) / bin_size_h);
int phend = ceil(static_cast<Dtype>(h - roi_start_h + 1) / bin_size_h);
int pwstart = floor(static_cast<Dtype>(w - roi_start_w) / bin_size_w);
int pwend = ceil(static_cast<Dtype>(w - roi_start_w + 1) / bin_size_w);

phstart = min(max(phstart, 0), pooled_height_);
phend = min(max(phend, 0), pooled_height_);
pwstart = min(max(pwstart, 0), pooled_width_);
pwend = min(max(pwend, 0), pooled_width_);

Dtype gradient = Dtype(0);
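// Sum the top gradient over every bin in the feasible set whose stored
// argmax points back at this bottom element; the OpenMP reduction keeps
// the per-thread partial sums race-free.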
#ifdef _OPENMP
#pragma omp parallel for collapse(2) reduction(+:gradient)
#endif
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
int pindex = ph * pooled_width_ + pw;
if (argmax_data[pindex] == index) {
gradient += top_diff[pindex];
}
}
}

offset_bottom_diff[index] += gradient;
}
}

// Increment all data pointers by one channel
offset_bottom_diff += bottom[0]->offset(0, 1);
top_diff += top[0]->offset(0, 1);
argmax_data += max_idx_.offset(0, 1);
}

// Increment ROI data pointer
bottom_rois += bottom[1]->offset(1);
}
}

#ifdef CPU_ONLY
STUB_GPU(ROIPoolingLayer);
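Backward_cpu above routes gradients by inverting the forward pooling: for each bottom element inside an ROI it computes the feasible range of pooled bins that could have selected that element, then accumulates top_diff over the bins whose stored argmax actually points back at it. A standalone sketch of that bin inversion follows; the helper name bin_range and all sizes are hypothetical, and it is not code from this diff.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirror of the phstart/phend computation in Backward_cpu: for a bottom row h
// inside an ROI of roi_height rows pooled into pooled_height bins, return the
// half-open range [phstart, phend) of bins whose span contains h.
void bin_range(int h, int roi_start_h, int roi_height, int pooled_height,
               int* phstart, int* phend) {
  float bin_size_h = static_cast<float>(roi_height) / pooled_height;
  *phstart = static_cast<int>(std::floor((h - roi_start_h) / bin_size_h));
  *phend = static_cast<int>(std::ceil((h - roi_start_h + 1) / bin_size_h));
  *phstart = std::min(std::max(*phstart, 0), pooled_height);
  *phend = std::min(std::max(*phend, 0), pooled_height);
}

int main() {
  // Hypothetical ROI: starts at row 3, is 10 rows tall, pooled into 4 bins,
  // so each bin spans 2.5 rows.
  int phstart, phend;
  bin_range(/*h=*/7, /*roi_start_h=*/3, /*roi_height=*/10, /*pooled_height=*/4,
            &phstart, &phend);
  // Row 7 is offset 4 inside the ROI, which lies only in bin 1's span
  // [2.5, 5.0), so this prints "feasible bins: [1, 2)".
  std::printf("feasible bins: [%d, %d)\n", phstart, phend);
  return 0;
}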
94 changes: 92 additions & 2 deletions src/caffe/layers/smooth_L1_loss_layer.cpp
@@ -6,6 +6,7 @@
// ------------------------------------------------------------------

#include "caffe/fast_rcnn_layers.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {

@@ -51,13 +52,102 @@ void SmoothL1LossLayer<Dtype>::Reshape(
template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
NOT_IMPLEMENTED;
int count = bottom[0]->count();
caffe_sub(
count,
bottom[0]->cpu_data(),
bottom[1]->cpu_data(),
diff_.mutable_cpu_data()); // d := b0 - b1
if (has_weights_) {
// apply "inside" weights
caffe_mul(
count,
bottom[2]->cpu_data(),
diff_.cpu_data(),
diff_.mutable_cpu_data()); // d := w_in * (b0 - b1)
}

// f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma
// |x| - 0.5 / sigma / sigma otherwise
const Dtype* in = diff_.cpu_data();
Dtype* out = errors_.mutable_cpu_data();

#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int index = 0; index < count; ++index) {
Dtype val = in[index];
Dtype abs_val = fabs(val);
if (abs_val < 1.0 / sigma2_) {
out[index] = 0.5 * val * val * sigma2_;
} else {
out[index] = abs_val - 0.5 / sigma2_;
}
}

if (has_weights_) {
// apply "outside" weights
caffe_mul(
count,
bottom[3]->cpu_data(),
errors_.cpu_data(),
errors_.mutable_cpu_data()); // d := w_out * SmoothL1(w_in * (b0 - b1))
}

Dtype loss = caffe_cpu_dot(count, ones_.cpu_data(), errors_.cpu_data());
top[0]->mutable_cpu_data()[0] = loss / bottom[0]->num();
}

template <typename Dtype>
void SmoothL1LossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
NOT_IMPLEMENTED;
// after the forward pass, diff_ holds w_in * (b0 - b1)
int count = diff_.count();

// f'(x) = sigma * sigma * x if |x| < 1 / sigma / sigma
// = sign(x) otherwise
const Dtype* in = diff_.cpu_data();
Dtype* out = diff_.mutable_cpu_data();

#ifdef _OPENMP
#pragma omp parallel for
#endif
for (int index = 0; index < count; ++index) {
Dtype val = in[index];
Dtype abs_val = fabs(val);
if (abs_val < 1.0 / sigma2_) {
out[index] = sigma2_ * val;
} else {
out[index] = (Dtype(0) < val) - (val < Dtype(0));
}
}

for (int i = 0; i < 2; ++i) {
if (propagate_down[i]) {
const Dtype sign = (i == 0) ? 1 : -1;
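// alpha combines the sign of d(b0 - b1)/d(bottom[i]), the upstream gradient
// in top[0]->cpu_diff()[0] (which already carries any loss_weight), and the
// same 1 / num normalization applied to the loss in Forward_cpu.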
const Dtype alpha = sign * top[0]->cpu_diff()[0] / bottom[i]->num();
caffe_cpu_axpby(
count, // count
alpha, // alpha
diff_.cpu_data(), // x
Dtype(0), // beta
bottom[i]->mutable_cpu_diff()); // y
if (has_weights_) {
// Scale by "inside" weight
caffe_mul(
count,
bottom[2]->cpu_data(),
bottom[i]->cpu_diff(),
bottom[i]->mutable_cpu_diff());
// Scale by "outside" weight
caffe_mul(
count,
bottom[3]->cpu_data(),
bottom[i]->cpu_diff(),
bottom[i]->mutable_cpu_diff());
}
}
}
}

#ifdef CPU_ONLY
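Forward_cpu above applies the smooth L1 (Huber-style) function elementwise and Backward_cpu applies its derivative, switching between the quadratic and linear pieces at |x| = 1 / sigma^2. A standalone sketch follows showing both pieces and checking that they meet at the threshold; the helper names and the sigma value are hypothetical, and it is not code from this diff.

#include <cassert>
#include <cmath>
#include <cstdio>

// Smooth L1 with the sigma parameterization used above (sigma2 = sigma^2).
double smooth_l1(double x, double sigma2) {
  double abs_x = std::fabs(x);
  if (abs_x < 1.0 / sigma2) return 0.5 * x * x * sigma2;
  return abs_x - 0.5 / sigma2;
}

// Its derivative, as applied in Backward_cpu.
double smooth_l1_grad(double x, double sigma2) {
  double abs_x = std::fabs(x);
  if (abs_x < 1.0 / sigma2) return sigma2 * x;
  return (x > 0) - (x < 0);  // sign(x)
}

int main() {
  const double sigma2 = 9.0;      // e.g. sigma = 3
  const double t = 1.0 / sigma2;  // threshold between the two pieces
  // At the threshold both pieces evaluate to 0.5 / sigma2, and just below it
  // the quadratic piece's slope has already reached 1.
  assert(std::fabs(smooth_l1(t, sigma2) - 0.5 / sigma2) < 1e-12);
  assert(std::fabs(smooth_l1_grad(t - 1e-9, sigma2) - 1.0) < 1e-6);
  std::printf("f(1/sigma2) = %.6f, f'(just below) = %.6f\n",
              smooth_l1(t, sigma2), smooth_l1_grad(t - 1e-9, sigma2));
  return 0;
}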
4 changes: 1 addition & 3 deletions src/caffe/test/test_roi_pooling_layer.cpp
@@ -25,8 +25,6 @@ using boost::scoped_ptr;

namespace caffe {

typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;

template <typename TypeParam>
class ROIPoolingLayerTest : public MultiDeviceTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
@@ -85,7 +83,7 @@ class ROIPoolingLayerTest : public MultiDeviceTest<TypeParam> {
vector<Blob<Dtype>*> blob_top_vec_;
};

TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypesGPU);
TYPED_TEST_CASE(ROIPoolingLayerTest, TestDtypesAndDevices);

TYPED_TEST(ROIPoolingLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
14 changes: 5 additions & 9 deletions src/caffe/test/test_smooth_L1_loss_layer.cpp
@@ -8,16 +8,13 @@
#include "caffe/blob.hpp"
#include "caffe/common.hpp"
#include "caffe/filler.hpp"
#include "caffe/vision_layers.hpp"
#include "caffe/fast_rcnn_layers.hpp"

#include "caffe/test/test_caffe_main.hpp"
#include "caffe/test/test_gradient_check_util.hpp"

namespace caffe {

typedef ::testing::Types<GPUDevice<float>, GPUDevice<double> > TestDtypesGPU;

template <typename TypeParam>
class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
typedef typename TypeParam::Dtype Dtype;
@@ -30,9 +27,9 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
blob_bottom_outside_weights_(new Blob<Dtype>(10, 5, 1, 1)),
blob_top_loss_(new Blob<Dtype>()) {
// fill the values
FillerParameter const_filler_param;
const_filler_param.set_value(-1.);
ConstantFiller<Dtype> const_filler(const_filler_param);
//FillerParameter const_filler_param;
//const_filler_param.set_value(-1.);
//ConstantFiller<Dtype> const_filler(const_filler_param);
FillerParameter filler_param;
GaussianFiller<Dtype> filler(filler_param);

@@ -67,7 +64,7 @@ class SmoothL1LossLayerTest : public MultiDeviceTest<TypeParam> {
vector<Blob<Dtype>*> blob_top_vec_;
};

TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesGPU);
TYPED_TEST_CASE(SmoothL1LossLayerTest, TestDtypesAndDevices);

TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {
typedef typename TypeParam::Dtype Dtype;
@@ -79,8 +76,7 @@ TYPED_TEST(SmoothL1LossLayerTest, TestGradient) {
const Dtype kLossWeight = 3.7;
layer_param.add_loss_weight(kLossWeight);
SmoothL1LossLayer<Dtype> layer(layer_param);
layer.SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
GradientChecker<Dtype> checker(1e-2, 1e-2, 1701);
GradientChecker<Dtype> checker(1e-2, 1e-2);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_, 0);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,