modules/detectron/sigmoid_focal_loss_op.cu

/**
 * Copyright (c) 2016-present, Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cfloat>

#include "caffe2/core/context_gpu.h"
#include "modules/detectron/sigmoid_focal_loss_op.h"

namespace caffe2 {

namespace {

__global__ void SigmoidFocalLossKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, const float* weight_pos,
    const float gamma, const float alpha,
    const int num_classes, float* losses) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
    int x = i % W;
    int y = (i / W) % H;
    int c = (i / (W * H)) % D;  // channel, here D is channel dim in input NxDxHxW
    int n = i / (W * H * D);    // n in NxDxHxW

    int A = D / num_classes;   // num_anchors = A
    int a = c / num_classes;   // current anchor out of A anchors in D = A * num_cls
    int d = c % num_classes;   // current class
    int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];   // target

    // check whether the class is true class or not.
    // The target classes are in range 1 - 81 and the d is in range 0-80
    // because we predict A*80 dim, so for comparison purpose, compare t and (d+1)
    float c1 = (t == (d + 1));
    float c2 = (t != -1 & t != (d + 1));

    float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
    float zn = (1.0 - alpha) / Np;
    float zp = alpha / Np;

    // p = 1. / 1. + expf(-x)
    float p = 1. / (1. + expf(-logits[i]));

    // (1 - p)**gamma * log(p) where
    float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN));
    // p**gamma * log(1 - p)
    float term2 =
        powf(p, gamma) *
        (-1. * logits[i] * (logits[i] >= 0) -
         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));

    losses[i] = 0.0;
    losses[i] += -c1 * term1 * zp;
    losses[i] += -c2 * term2 * zn;
  }
}

__global__ void SigmoidFocalLossGradientKernel(
    const int N, const int D, const int H, const int W, const float* logits,
    const int* targets, float* dX_data, const float* weight_pos,
    const float gamma, const float alpha, const int num_classes,
    const float* avg_loss) {
  CUDA_1D_KERNEL_LOOP(i, N * D * H * W) {
      float a_loss = avg_loss[0];
      int x = i % W;
      int y = (i / W) % H;
      int c = (i / (W * H)) % D;
      int n = i / (W * H * D);

      int A = D / num_classes;   // num_anchors
      int a = c / num_classes;   // current anchor
      int d = c % num_classes;   // current class

      float Np = c10::cuda::compat::max(weight_pos[0], static_cast<float>(1.0));
      float zn = (1.0 - alpha) / Np;
      float zp = alpha / Np;
      int t = targets[n * (H * W * A) + a * (H * W) + y * W + x];

      float c1 = (t == (d + 1));
      float c2 = (t != -1 & t != (d + 1));
      float p = 1. / (1. + expf(-logits[i]));

      // (1-p)**g * (1 - p - g*p*log(p))
      float term1 =
          powf((1. - p), gamma) *
          (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN))));
      // (p**g) * (g*(1-p)*log(1-p) - p)
      float term2 =
          powf(p, gamma) *
          ((-1. * logits[i] * (logits[i] >= 0) -
           logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
           (1. - p) * gamma - p);
      dX_data[i] = 0.0;
      dX_data[i] += -c1 * zp * term1;
      dX_data[i] += -c2 * zn * term2;
      dX_data[i] = dX_data[i] * a_loss;
  }
}
} // namespace

template<>
bool SigmoidFocalLossOp<float, CUDAContext>::RunOnDevice() {
  // Input logits, for example: N x (A * 80) x H x W in cls-agnostic
  auto& X = Input(0);
  // Target, for example: N x A x H x W
  auto& T = Input(1);
  // Number of positive examples: scalar
  auto& wp = Input(2);
  // output avg Sigmoid focal loss as mentioned in RetinaNet paper


  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* avg_loss = Output(0, vector<int64_t>(), at::dtype<float>());
  losses_.ResizeLike(X);
  float* avg_loss_data = avg_loss->mutable_data<float>();

  SigmoidFocalLossKernel<<<CAFFE_GET_BLOCKS(X.size()),
          CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
      N, D, H, W, X.data<float>(), T.data<int>(),
      wp.data<float>(), gamma_, alpha_, num_classes_,
      losses_.mutable_data<float>());

  math::Sum<float, CUDAContext>(
      losses_.size(), losses_.data<float>(), avg_loss_data, &context_);
  math::Scale<float, float, CUDAContext>(
      1, scale_, avg_loss_data, avg_loss_data, &context_);

  return true;
}


template<>
bool SigmoidFocalLossGradientOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  auto& T = Input(1);
  auto& wp = Input(2);
  auto& d_avg_loss = Input(InputSize() - 1);


  // get input shape
  int N = X.dim32(0);
  int D = X.dim32(1);
  int H = X.dim32(2);
  int W = X.dim32(3);

  auto* dX = Output(0, X.sizes(), at::dtype<float>());

  SigmoidFocalLossGradientKernel<<<CAFFE_GET_BLOCKS(X.size()),
          CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
      N, D, H, W, X.data<float>(), T.data<int>(), dX->mutable_data<float>(),
      wp.data<float>(), gamma_, alpha_, num_classes_,
      d_avg_loss.data<float>());
  math::Scale<float, float, CUDAContext>(
      dX->size(),
      scale_,
      dX->data<float>(),
      dX->mutable_data<float>(),
      &context_);

  return true;
}


REGISTER_CUDA_OPERATOR(SigmoidFocalLoss,
                       SigmoidFocalLossOp<float, CUDAContext>);
REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient,
                       SigmoidFocalLossGradientOp<float, CUDAContext>);
} // namespace caffe2