In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print one sample path per directory: only the
# first file of each directory is shown, which keeps the listing short for
# datasets that contain thousands of images.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames[:1]:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chess-object-detection/chess-pieces-dataset/valid/d114edc5cb4cae0ceb2f152afd15f57d_jpg.rf.fa2760e7c7663ed523265248c14b35ea.xml
/kaggle/input/chess-object-detection/chess-pieces-dataset/test/IMG_0159_JPG.rf.f0d34122f8817d538e396b04f2b70d33.jpg
/kaggle/input/chess-object-detection/chess-pieces-dataset/train/IMG_0167_JPG.rf.51e844ec48d744ea0d541c3978e68c8f.xml


In [2]:
# datasets/image

import imageio
from PIL import Image
import numpy as np

def _compute_scale_factor(original_width, original_height, min_dimension_pixels):
  if not min_dimension_pixels:
    return 1.0
  if original_width > original_height:
    scale_factor = min_dimension_pixels / original_height
  else:
    scale_factor = min_dimension_pixels / original_width
  return scale_factor

def _preprocess_vgg16(image_data):
  image_data = image_data[:, :, ::-1]           # RGB -> BGR
  image_data[:, :, 0] -= 103.939                # ImageNet B mean
  image_data[:, :, 1] -= 116.779                # ImageNet G mean
  image_data[:, :, 2] -= 123.680                # ImageNet R mean 
  return image_data

def load_image(url, min_dimension_pixels = None, horizontal_flip = False):
  """
  Loads and preprocesses an image for use with VGG-16, which consists of
  converting RGB to BGR and subtracting ImageNet dataset means from each
  component. The image can be resized so that the minimum dimension is a
  defined size, as recommended by Faster R-CNN.

  Parameters
  ----------
  url : str
    URL (local or remote file) to load.
  min_dimension_pixels : int
    If not None, specifies the size in pixels of the smaller side of the image.
    The other side is scaled proportionally.
  horizontal_flip : bool
    Whether to flip the image horizontally.

  Returns
  -------
  np.ndarray, PIL.Image, float, Tuple[int, int, int]
    Image pixels as float32, shaped (height, width, channels) in BGR order
    with the ImageNet means subtracted; the (flipped and resized) image object
    suitable for drawing and visualization; the scaling factor applied to the
    image dimensions; and the original image shape as
    (channels, original_height, original_width).
  """
  data = imageio.imread(url, pilmode = "RGB")
  image = Image.fromarray(data, mode = "RGB")
  # Remember the size before any resizing; flipping does not change it
  original_width, original_height = image.width, image.height
  if horizontal_flip:
    image = image.transpose(method = Image.FLIP_LEFT_RIGHT)
  if min_dimension_pixels is not None:
    scale_factor = _compute_scale_factor(original_width = image.width, original_height = image.height, min_dimension_pixels = min_dimension_pixels)
    width = int(image.width * scale_factor)
    height = int(image.height * scale_factor)
    image = image.resize((width, height), resample = Image.BILINEAR)
  else:
    scale_factor = 1.0
  image_data = np.array(image).astype(np.float32)   # (height, width, 3)
  image_data = _preprocess_vgg16(image_data = image_data)
  # The channel count lives on the last axis. The previous code reported
  # image_data.shape[0] here, which is the *scaled height*, not the channels.
  num_channels = image_data.shape[2]
  return image_data, image, scale_factor, (num_channels, original_height, original_width)


In [3]:
# datasets/training_sample

from dataclasses import dataclass
import numpy as np
from PIL import Image
from typing import List
from typing import Tuple


@dataclass
class Box:
  """
  A labeled bounding box.
  """
  class_index: int      # numeric class label
  class_name: str       # human-readable class label
  corners: np.ndarray   # box corners; the dataset loader stores them as (y_min, x_min, y_max, x_max)

  def __repr__(self):
    # Only the first four corner values are rendered
    first, second, third, fourth = (self.corners[i] for i in range(4))
    return "[class=%s (%f,%f,%f,%f)]" % (self.class_name, first, second, third, fourth)

  def __str__(self):
    # Same text as the repr
    return self.__repr__()

@dataclass
class TrainingSample:
  """
  Everything the training loop needs for a single image, as assembled by
  Dataset._generate_training_sample().
  """
  anchor_map:                 np.ndarray                # shape (feature_map_height,feature_map_width,num_anchors*4), with each anchor as [center_y,center_x,height,width]
  anchor_valid_map:           np.ndarray                # shape (feature_map_height,feature_map_width,num_anchors), indicating which anchors are valid (do not cross image boundaries)
  gt_rpn_map:                 np.ndarray                # RPN ground truth, indexed [y,x,k,channel]: channel 0 > 0 marks anchors used in training and channel 1 >= 1 marks object anchors (see show_anchors); remaining channels presumably hold box regression targets -- TODO confirm against generate_rpn_map
  gt_rpn_object_indices:      List[Tuple[int,int,int]]  # list of (y,x,k) coordinates of anchors in gt_rpn_map that are labeled as object
  gt_rpn_background_indices:  List[Tuple[int,int,int]]  # list of (y,x,k) coordinates of background anchors
  gt_boxes:                   List[Box]                 # list of ground-truth boxes, scaled
  image_data:                 np.ndarray                # pre-processed pixels returned by load_image(): shape (height,width,3), BGR order, scaled to size expected by model
  image:                      Image                     # PIL image data (for debug rendering), scaled
  filepath:                   str                       # file path of image


In [4]:
# datasets/voc.py

from dataclasses import dataclass
import numpy as np
import os
from pathlib import Path
import random
import xml.etree.ElementTree as ET
from typing import List
from typing import Tuple


class Dataset:
    """
    Iterator over one split (e.g. train, valid, test) of a VOC-style dataset in
    which every .jpg image has an .xml annotation file of the same name next to
    it. Iterating yields one TrainingSample per image.
    """

    # Fixed class mapping for the chess pieces dataset; index 0 is background
    num_classes = 15
    class_index_to_name = {
        0: 'background', 1: 'pieces', 2: 'bishop', 3: 'black-bishop', 4: 'black-king', 5: 'black-knight', 6: 'black-pawn',
        7: 'black-queen', 8: 'black-rook', 9: 'white-bishop', 10: 'white-king', 11: 'white-knight', 12: 'white-pawn',
        13: 'white-queen', 14: 'white-rook'
    }

    def __init__(self, split, dir="dataset/chess-pieces-dataset", feature_pixels=16, augment=True, shuffle=True,
                 allow_difficult=False, cache=True):
        """
        Parameters
        ----------
        split : str
          Dataset split to load. This is the exact name of the directory that
          holds the images and annotations of the split (e.g. "train").
        dir : str
          Root directory of dataset. Directories named after the split are
          searched for anywhere below it.
        feature_pixels : int
          Size of each cell in the Faster R-CNN feature map (i.e., VGG-16 feature
          extractor output) in image pixels. This is the separation distance
          between anchors.
        augment : bool
          Whether to randomly augment (horizontally flip) images during iteration
          with 50% probability.
        shuffle : bool
          Whether to shuffle the dataset each time it is iterated.
        allow_difficult : bool
          Whether to include ground truth boxes that are marked as "difficult".
        cache : bool
          Whether to cache training samples in memory after first being
          generated.

        Raises
        ------
        FileNotFoundError
          If the dataset root directory does not exist.
        """
        if not os.path.exists(dir):
            raise FileNotFoundError("Dataset directory does not exist: %s" % dir)
        self.split = split
        self._dir = dir
        # The class mapping is the hard-coded class attribute above rather than
        # one discovered from the dataset (see _get_classes for the VOC variant).
        self.class_name_to_index = {class_name: class_index for (class_index, class_name) in
                                    self.class_index_to_name.items()}
        self.num_classes = len(self.class_index_to_name)
        assert self.num_classes == Dataset.num_classes, "Dataset does not have the expected number of classes (found %d but expected %d)" % (
            self.num_classes, Dataset.num_classes)
        self._filepaths = self._get_filepaths()
        self.num_samples = len(self._filepaths)
        self._gt_boxes_by_filepath = self._get_ground_truth_boxes(filepaths=self._filepaths,
                                                                  allow_difficult=allow_difficult)
        self._i = 0
        self._iterable_filepaths = self._filepaths.copy()
        self._feature_pixels = feature_pixels
        self._augment = augment
        self._shuffle = shuffle
        self._cache = cache
        # Flipped and unflipped samples are cached separately
        self._unaugmented_cached_sample_by_filepath = {}
        self._augmented_cached_sample_by_filepath = {}

    def __iter__(self):
        """Restarts iteration, reshuffling the file order if shuffling is enabled."""
        self._i = 0
        if self._shuffle:
            random.shuffle(self._iterable_filepaths)
        return self

    def __next__(self):
        """Returns the TrainingSample for the next image, generating it if it is not cached."""
        if self._i >= len(self._iterable_filepaths):
            raise StopIteration

        # Next file to load
        filepath = self._iterable_filepaths[self._i]
        self._i += 1

        # Augment? Flip with 50% probability when augmentation is enabled
        flip = bool(random.randint(0, 1)) if self._augment else False
        cached_sample_by_filepath = self._augmented_cached_sample_by_filepath if flip else self._unaugmented_cached_sample_by_filepath

        # Load and, if caching, write back to cache
        sample = cached_sample_by_filepath.get(filepath)
        if sample is None:
            sample = self._generate_training_sample(filepath=filepath, flip=flip)
            if self._cache:
                cached_sample_by_filepath[filepath] = sample

        # Return the sample
        return sample

    def _generate_training_sample(self, filepath, flip):
        """
        Loads one image, scales (and optionally flips) its ground truth boxes to
        match, and builds the anchor and RPN ground truth maps.
        """
        # Load and preprocess the image
        scaled_image_data, scaled_image, scale_factor, original_shape = load_image(url=filepath,
                                                                                   min_dimension_pixels=600,
                                                                                   horizontal_flip=flip)
        _, original_height, original_width = original_shape

        # Scale ground truth boxes to new image size
        scaled_gt_boxes = []
        for box in self._gt_boxes_by_filepath[filepath]:
            if flip:
                # Mirror the x coordinates about the vertical center line; the
                # old x_max becomes the new x_min and vice versa
                corners = np.array([
                    box.corners[0],
                    original_width - 1 - box.corners[3],
                    box.corners[2],
                    original_width - 1 - box.corners[1]
                ])
            else:
                corners = box.corners
            scaled_box = Box(
                class_index=box.class_index,
                class_name=box.class_name,
                corners=corners * scale_factor
            )
            scaled_gt_boxes.append(scaled_box)

        # Generate anchor maps and RPN truth map
        anchor_map, anchor_valid_map = generate_anchor_maps(image_shape=scaled_image_data.shape,
                                                            feature_pixels=self._feature_pixels)
        gt_rpn_map, gt_rpn_object_indices, gt_rpn_background_indices = generate_rpn_map(anchor_map=anchor_map,
                                                                                        anchor_valid_map=anchor_valid_map,
                                                                                        gt_boxes=scaled_gt_boxes)

        # Return sample
        return TrainingSample(
            anchor_map=anchor_map,
            anchor_valid_map=anchor_valid_map,
            gt_rpn_map=gt_rpn_map,
            gt_rpn_object_indices=gt_rpn_object_indices,
            gt_rpn_background_indices=gt_rpn_background_indices,
            gt_boxes=scaled_gt_boxes,
            image_data=scaled_image_data,
            image=scaled_image,
            filepath=filepath
        )

    def _get_classes(self):
        """
        Derives the class mapping from the <class>_<split>.txt files of a
        standard VOC ImageSets/Main directory. Not used by __init__, which relies
        on the fixed class_index_to_name mapping instead.
        """
        imageset_dir = os.path.join(self._dir, "ImageSets", "Main")
        classes = {os.path.basename(path).split("_")[0] for path in Path(imageset_dir).glob("*_" + self.split + ".txt")}
        assert len(classes) > 0, "No classes found in ImageSets/Main for '%s' split" % self.split
        class_index_to_name = {class_index: class_name for class_index, class_name in enumerate(sorted(classes), start=1)}
        class_index_to_name[0] = "background"
        return class_index_to_name

    def _add_background_class(self, class_index_to_name: dict):
        """Returns a copy of the mapping with every index shifted up by one and index 0 set to "background"."""
        class_index_to_name = {(1 + k): v for k, v in class_index_to_name.items()}
        class_index_to_name[0] = "background"
        return class_index_to_name

    def _get_filepaths(self):
        """
        Finds every .jpg file located in (or below) a directory named after the
        split, anywhere under the dataset root.

        Returns
        -------
        List[str]
          Sorted list of image paths, without duplicates, so that the sample
          order does not depend on the file system's enumeration order.
        """
        image_paths = set()
        for root, dirnames, _ in os.walk(self._dir):
            for dirname in dirnames:
                if dirname != self.split:
                    continue
                split_dir = os.path.join(root, dirname)
                for folder, _, files in os.walk(split_dir):
                    for file in files:
                        if os.path.splitext(file)[1] == ".jpg":
                            # Join with the folder actually being walked so
                            # that images in sub-folders get their real path
                            image_paths.add(os.path.join(folder, file))
        return sorted(image_paths)

    def _get_ground_truth_boxes(self, filepaths, allow_difficult):
        """
        Parses the VOC XML annotation that sits next to each image.

        Parameters
        ----------
        filepaths : List[str]
          Image file paths. The annotation of an image is the file with the same
          path but an .xml extension.
        allow_difficult : bool
          Whether to keep objects flagged as "difficult".

        Returns
        -------
        Dict[str, List[Box]]
          Ground truth boxes by image file path, with corners stored as float32
          (y_min, x_min, y_max, x_max) in 0-based pixel coordinates.
        """
        gt_boxes_by_filepath = {}
        for filepath in filepaths:
            # Look for the annotation next to the image itself so that this
            # agrees with wherever _get_filepaths() found the image
            annotation_file = os.path.splitext(filepath)[0] + ".xml"
            tree = ET.parse(annotation_file)    # raises if the file is missing or malformed
            root = tree.getroot()
            assert len(root.findall("size")) == 1
            size = root.find("size")
            assert len(size.findall("depth")) == 1
            depth = int(size.find("depth").text)
            assert depth == 3
            boxes = []
            for obj in root.findall("object"):
                assert len(obj.findall("name")) == 1
                assert len(obj.findall("bndbox")) == 1
                assert len(obj.findall("difficult")) == 1
                is_difficult = int(obj.find("difficult").text) != 0
                if is_difficult and not allow_difficult:
                    continue  # ignore difficult examples unless asked to include them
                class_name = obj.find("name").text
                bndbox = obj.find("bndbox")
                assert len(bndbox.findall("xmin")) == 1
                assert len(bndbox.findall("ymin")) == 1
                assert len(bndbox.findall("xmax")) == 1
                assert len(bndbox.findall("ymax")) == 1
                x_min = int(bndbox.find("xmin").text) - 1  # convert to 0-based pixel coordinates
                y_min = int(bndbox.find("ymin").text) - 1
                x_max = int(bndbox.find("xmax").text) - 1
                y_max = int(bndbox.find("ymax").text) - 1
                corners = np.array([y_min, x_min, y_max, x_max]).astype(np.float32)
                box = Box(class_index=self.class_name_to_index[class_name], class_name=class_name, corners=corners)
                boxes.append(box)
            # Every image must contribute at least one usable box
            assert len(boxes) > 0, "No ground truth boxes found for %s" % filepath
            gt_boxes_by_filepath[filepath] = boxes
        return gt_boxes_by_filepath

In [5]:
# math_utils

def intersection_over_union(boxes1, boxes2):
  """
  Pairwise intersection-over-union (IoU) between two sets of boxes, computed
  with broadcasting rather than loops.

  Parameters
  ----------
  boxes1 : np.ndarray
    Box corners, shaped (N, 4), with each box as (y1, x1, y2, x2).
  boxes2 : np.ndarray
    Box corners, shaped (M, 4), in the same format.

  Returns
  -------
  np.ndarray
    IoU of every box in boxes1 with every box in boxes2, shaped (N, M).
  """
  # Corners of the overlap rectangle for every pair: (N,1,2) against (M,2) -> (N,M,2)
  overlap_min = np.maximum(boxes1[:,None,0:2], boxes2[:,0:2])
  overlap_max = np.minimum(boxes1[:,None,2:4], boxes2[:,2:4])

  # A pair only overlaps when the rectangle has positive extent along both axes
  has_overlap = np.all(overlap_min < overlap_max, axis = 2)           # (N,M)
  overlap_extent = overlap_max - overlap_min                          # (N,M,2)
  intersection_areas = has_overlap * np.prod(overlap_extent, axis = 2)

  # Individual box areas: height * width
  areas1 = np.prod(boxes1[:,2:4] - boxes1[:,0:2], axis = 1)           # (N,)
  areas2 = np.prod(boxes2[:,2:4] - boxes2[:,0:2], axis = 1)           # (M,)

  # (N,1) + (M,) - (N,M) -> (N,M)
  union_areas = areas1[:,None] + areas2 - intersection_areas

  # Small constant keeps the division defined when the union is zero
  epsilon = 1e-7
  return intersection_areas / (union_areas + epsilon)

def tf_intersection_over_union(boxes1, boxes2):
  """
  TensorFlow counterpart of intersection_over_union(): builds graph operations
  that compute the IoU of every pair of boxes. Adapted from Matterport's
  Mask R-CNN implementation: https://github.com/matterport/Mask_RCNN

  Parameters
  ----------
  boxes1: tf.Tensor
    Box corners, shaped (N,4), with each box as (y1, x1, y2, x2).
  boxes2: tf.Tensor
    Box corners, shaped (M,4).

  Returns
  -------
  tf.Tensor
    Tensor of shape (N, M) containing IoU score between each pair of boxes.
  """
  num_boxes1 = tf.shape(boxes1)[0]
  num_boxes2 = tf.shape(boxes2)[0]

  # Lay out all N*M pairs as two aligned (N*M, 4) tensors: each box of boxes1
  # is repeated M times in a row (tile + reshape stands in for np.repeat),
  # while boxes2 is tiled N times as a whole.
  repeated1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1), [1, 1, num_boxes2]), [-1, 4])
  tiled2 = tf.tile(boxes2, [num_boxes1, 1])

  # Overlap rectangle of each pair; negative extents are clamped to zero
  y1_a, x1_a, y2_a, x2_a = tf.split(repeated1, 4, axis=1)
  y1_b, x1_b, y2_b, x2_b = tf.split(tiled2, 4, axis=1)
  overlap_y1 = tf.maximum(y1_a, y1_b)
  overlap_x1 = tf.maximum(x1_a, x1_b)
  overlap_y2 = tf.minimum(y2_a, y2_b)
  overlap_x2 = tf.minimum(x2_a, x2_b)
  intersection = tf.maximum(overlap_x2 - overlap_x1, 0) * tf.maximum(overlap_y2 - overlap_y1, 0)

  # Union = area A + area B - intersection
  area_a = (y2_a - y1_a) * (x2_a - x1_a)
  area_b = (y2_b - y1_b) * (x2_b - x1_b)
  union = area_a + area_b - intersection

  # Note: unlike the NumPy version, no epsilon is added to the denominator
  iou = intersection / union
  return tf.reshape(iou, [num_boxes1, num_boxes2])

def convert_deltas_to_boxes(box_deltas, anchors, box_delta_means, box_delta_stds):
  """
  Decodes parameterized box deltas (ty, tx, th, tw), as defined by the Fast
  R-CNN and Faster R-CNN papers, into corner-format boxes (y1, x1, y2, x2).
  Each delta modifies the corresponding base box in anchors (e.g., an RPN
  anchor or a proposal).

  Parameters
  ----------
  box_deltas : np.ndarray
    Box deltas with shape (N, 4). Each row is (ty, tx, th, tw).
  anchors : np.ndarray
    Base boxes the deltas refer to, shaped (N, 4) with each row being
    (center_y, center_x, height, width).
  box_delta_means : np.ndarray
    Mean adjustment, (4,), added to the deltas after they have been scaled by
    the standard deviations.
  box_delta_stds : np.ndarray
    Standard deviation adjustment, (4,), that the deltas are multiplied by
    first.

  Returns
  -------
  np.ndarray
    Box coordinates, (N, 4), with each row being (y1, x1, y2, x2).
  """
  # Undo the normalization applied to the regression targets
  deltas = box_deltas * box_delta_stds + box_delta_means

  anchor_centers = anchors[:,0:2]   # (center_y, center_x)
  anchor_sizes = anchors[:,2:4]     # (height, width)

  # (center_y, center_x) = (anchor_height, anchor_width) * (ty, tx) + anchor center
  centers = anchor_sizes * deltas[:,0:2] + anchor_centers
  # (height, width) = (anchor_height, anchor_width) * exp((th, tw))
  half_sizes = 0.5 * (anchor_sizes * np.exp(deltas[:,2:4]))

  boxes = np.empty(deltas.shape)
  boxes[:,0:2] = centers - half_sizes   # y1, x1
  boxes[:,2:4] = centers + half_sizes   # y2, x2
  return boxes

def tf_convert_deltas_to_boxes(box_deltas, anchors, box_delta_means, box_delta_stds):
  """
  TensorFlow counterpart of convert_deltas_to_boxes(): builds graph operations
  that decode parameterized box deltas into corner-format boxes.

  Parameters
  ----------
  box_deltas : tf.Tensor or np.ndarray
    Box deltas with shape (N, 4). Each row is (ty, tx, th, tw).
  anchors : tf.Tensor or np.ndarray
    Base boxes the deltas refer to, shaped (N, 4) with each row being
    (center_y, center_x, height, width).
  box_delta_means : np.ndarray
    Mean adjustment, (4,), added to the deltas after they have been scaled by
    the standard deviations.
  box_delta_stds : np.ndarray
    Standard deviation adjustment, (4,), that the deltas are multiplied by
    first.

  Returns
  -------
  tf.Tensor
    Box coordinates, (N, 4), with each row being (y1, x1, y2, x2).
  """
  # Undo the normalization applied to the regression targets
  deltas = box_deltas * box_delta_stds + box_delta_means

  anchor_centers = anchors[:,0:2]   # (center_y, center_x)
  anchor_sizes = anchors[:,2:4]     # (height, width)

  # (center_y, center_x) = (anchor_height, anchor_width) * (ty, tx) + anchor center
  centers = anchor_sizes * deltas[:,0:2] + anchor_centers
  # (height, width) = (anchor_height, anchor_width) * exp((th, tw))
  half_sizes = 0.5 * (anchor_sizes * tf.math.exp(deltas[:,2:4]))

  top_left = centers - half_sizes       # y1, x1
  bottom_right = centers + half_sizes   # y2, x2
  return tf.concat([ top_left, bottom_right ], axis = 1)  # [ (N,2), (N,2) ] -> (N,4)


In [6]:
# statistics

from collections import defaultdict


class TrainingStatistics:
  """
  Tracks the running mean of each loss component over the training steps seen
  so far (intended to be re-created for every epoch).
  """
  def __init__(self):
    # Means are reported as infinity until the first training step arrives
    not_yet_known = float("inf")
    self.rpn_class_loss = not_yet_known
    self.rpn_regression_loss = not_yet_known
    self.detector_class_loss = not_yet_known
    self.detector_regression_loss = not_yet_known

    # Per-step history of every loss component
    self._rpn_class_losses = []
    self._rpn_regression_losses = []
    self._detector_class_losses = []
    self._detector_regression_losses = []

  def on_training_step(self, losses):
    """
    Call once per training iteration to aggregate losses.

    Parameters
    ----------
    losses : mapping
      Object indexable by the keys "rpn_class_loss", "rpn_regression_loss",
      "detector_class_loss" and "detector_regression_loss", holding the losses
      of the current step.
    """
    # Each public mean attribute carries the same name as its key in `losses`
    history_by_key = (
      ("rpn_class_loss", self._rpn_class_losses),
      ("rpn_regression_loss", self._rpn_regression_losses),
      ("detector_class_loss", self._detector_class_losses),
      ("detector_regression_loss", self._detector_regression_losses),
    )

    # Record all four values first, then refresh the running means
    for key, history in history_by_key:
      history.append(losses[key])
    for key, history in history_by_key:
      setattr(self, key, np.mean(history))

  def get_progbar_postfix(self):
    """
    Returns
    -------
    Dict[str, str]
      Labels and formatted running means, suitable for use as the postfix
      object of a tqdm progress bar.
    """
    total_loss = self.rpn_class_loss + self.rpn_regression_loss + self.detector_class_loss + self.detector_regression_loss
    postfix = {}
    postfix["rpn_class_loss"] = "%1.4f" % self.rpn_class_loss
    postfix["rpn_regr_loss"] = "%1.4f" % self.rpn_regression_loss
    postfix["detector_class_loss"] = "%1.4f" % self.detector_class_loss
    postfix["detector_regr_loss"] = "%1.4f" % self.detector_regression_loss
    postfix["total_loss"] = "%1.2f" % total_loss
    return postfix


class PrecisionRecallCurveCalculator:
  """
  Collects data over the course of a validation pass and then computes
  precision and recall (including mean average precision).
  """
  def __init__(self):
    # List of (confidence_score, correctness) by class for all images in dataset
    self._unsorted_predictions_by_class_index = defaultdict(list)

    # True number of objects by class for all images in dataset
    self._object_count_by_class_index = defaultdict(int)

  def _compute_correctness_of_predictions(self, scored_boxes_by_class_index, gt_boxes):
    """
    Decides, for a single image, which predicted boxes are true positives.

    Parameters
    ----------
    scored_boxes_by_class_index : Dict[int, sequence]
      Predicted boxes by class index, each as (y_min, x_min, y_max, x_max,
      score).
    gt_boxes : List[Box]
      All ground truth boxes in the image.

    Returns
    -------
    Dict[int, List[Tuple[float, bool]]], Dict[int, int]
      (score, is_true_positive) for every prediction, by class index; and the
      number of ground truth objects by class index.
    """
    unsorted_predictions_by_class_index = {}
    object_count_by_class_index = defaultdict(int)

    # Count objects by class. We do this here because in case there are no
    # predictions, we do not want to miscount the total number of objects.
    for gt_box in gt_boxes:
      object_count_by_class_index[gt_box.class_index] += 1

    for class_index, scored_boxes in scored_boxes_by_class_index.items():
      # Get the ground truth boxes corresponding to this class
      gt_boxes_this_class = [ gt_box for gt_box in gt_boxes if gt_box.class_index == class_index ]

      # Compute IoU of each box with each ground truth box and store as a list
      # of tuples (iou, box_index, gt_box_index) by descending IoU
      ious = []
      for gt_idx, gt_box in enumerate(gt_boxes_this_class):
        boxes2 = np.expand_dims(gt_box.corners, axis = 0)               # (4,) -> (1,4), as expected by parallel IoU function
        for box_idx in range(len(scored_boxes)):
          boxes1 = np.expand_dims(scored_boxes[box_idx][0:4], axis = 0)
          iou = float(intersection_over_union(boxes1 = boxes1, boxes2 = boxes2)[0, 0])  # (1,1) result -> scalar
          ious.append((iou, box_idx, gt_idx))
      # Sort descending by IoU. The key must read the tuple being sorted; the
      # previous key referenced the list itself, which left it unsorted.
      ious.sort(key = lambda entry: entry[0], reverse = True)

      # Vector that indicates whether a ground truth box has been detected
      gt_box_detected = [ False ] * len(gt_boxes_this_class)

      # Vector that indicates whether a prediction is a true positive (True) or
      # false positive (False)
      is_true_positive = [ False ] * len(scored_boxes)

      #
      # Construct a list of prediction descriptions: (score, correct)
      # Score is the confidence score of the predicted box and correct is
      # whether it is a true positive (True) or false positive (False).
      #
      # A true positive is a prediction that has an IoU of > 0.5 and is
      # also the highest-IoU prediction for a ground truth box. Predictions
      # with IoU <= 0.5 or that do not have the highest IoU for any ground
      # truth box are considered false positives.
      #
      iou_threshold = 0.5
      for iou, box_idx, gt_idx in ious:
        if iou <= iou_threshold:
          break     # sorted descending, so no remaining pair can qualify
        if is_true_positive[box_idx] or gt_box_detected[gt_idx]:
          # The prediction and/or ground truth box have already been matched
          continue
        # We've got a true positive
        is_true_positive[box_idx] = True
        gt_box_detected[gt_idx] = True

      # Construct the final array of prediction descriptions
      unsorted_predictions_by_class_index[class_index] = [ (scored_boxes[i][4], is_true_positive[i]) for i in range(len(scored_boxes)) ]

    return unsorted_predictions_by_class_index, object_count_by_class_index

  def add_image_results(self, scored_boxes_by_class_index, gt_boxes):
    """
    Adds a detection result to the running tally. Should be called only once per
    image in the dataset.

    Parameters
    ----------
    scored_boxes_by_class_index : Dict[int, Tuple[float, float, float, float, float]]
      Final detected boxes as lists of tuples, (y_min, x_min, y_max, x_max,
      score), by class index. The score is the softmax output and is
      interpreted as a confidence metric when sorting results for the mAP
      calculation.
    gt_boxes : List[datasets.training_sample.Box]
      A list of datasets.training_sample.Box objects describing all ground
      truth boxes in the image.
    """
    # Merge in results for this single image
    unsorted_predictions_by_class_index, object_count_by_class_index = self._compute_correctness_of_predictions(
      scored_boxes_by_class_index = scored_boxes_by_class_index,
      gt_boxes = gt_boxes)
    for class_index, predictions in unsorted_predictions_by_class_index.items():
      self._unsorted_predictions_by_class_index[class_index] += predictions
    for class_index, count in object_count_by_class_index.items():
      self._object_count_by_class_index[class_index] += count

  def _compute_average_precision(self, class_index, interpolated = True):
    """
    Computes the average precision (AP) and the precision/recall curve of one
    class.

    Parameters
    ----------
    class_index : int
      Class to evaluate. Classes without any recorded data yield an AP of 0.
    interpolated : bool
      Selects which precision curve is returned: the interpolated one (True)
      or the raw one (False). The AP itself is always computed from the
      interpolated curve.

    Returns
    -------
    float, List[float], List[float]
      Average precision; recall values; and precision values. Both lists
      include a leading and a trailing sentinel point.
    """
    # Use .get() so that querying a class never inserts a new key into the
    # defaultdicts (which would add an empty class to the mAP computation)
    predictions = self._unsorted_predictions_by_class_index.get(class_index, [])
    num_ground_truth_positives = self._object_count_by_class_index.get(class_index, 0)

    # Sort predictions in descending order of score
    sorted_predictions = sorted(predictions, key = lambda prediction: prediction[0], reverse = True)

    # Compute raw recall and precision arrays
    recall_array = []
    precision_array = []
    true_positives = 0  # running tally
    false_positives = 0 # ""
    for _, is_correct in sorted_predictions:
      if is_correct:
        true_positives += 1
      else:
        false_positives += 1
      # Without any ground truth objects nothing can be recalled
      recall = true_positives / num_ground_truth_positives if num_ground_truth_positives > 0 else 0.0
      precision = true_positives / (true_positives + false_positives)
      recall_array.append(recall)
      precision_array.append(precision)

    # Insert 0 at the beginning and end of the list. The 0 at the beginning won't
    # matter due to how interpolation works, below.
    recall_array = [ 0.0 ] + recall_array + [ 1.0 ]
    precision_array = [ 0.0 ] + precision_array + [ 0.0 ]

    # Interpolation means we compute the highest precision observed at a given
    # recall value. Specifically, it means taking the maximum value seen from
    # each point onward, i.e., a running maximum taken from the end. See URL below:
    # https://towardsdatascience.com/breaking-down-mean-average-precision-map-ae462f623a52#1a59
    interpolated_precision = np.maximum.accumulate(precision_array[::-1])[::-1]

    # Compute AP using simple rectangular integration under the curve
    average_precision = float(np.sum(np.diff(recall_array) * interpolated_precision[1:]))

    if interpolated:
      precision_array = interpolated_precision.tolist()
    return average_precision, recall_array, precision_array

  def _average_precisions_by_label(self, class_index_to_name):
    # Helper: list of (class name, average precision) for every class that has ground truth objects
    results = []
    for class_index in list(self._object_count_by_class_index):
      average_precision, _, _ = self._compute_average_precision(class_index = class_index)
      results.append((class_index_to_name[class_index], average_precision))
    return results

  def compute_mean_average_precision(self):
    """
    Calculates mAP (mean average precision) using all the data accumulated thus
    far. This should be called only after all image results have been
    processed.

    Returns
    -------
    np.float64
      Mean average precision over all classes that have ground truth objects
      (nan if no ground truth has been recorded at all).
    """
    average_precisions = []
    for class_index in list(self._object_count_by_class_index):
      average_precision, _, _ = self._compute_average_precision(class_index = class_index)
      average_precisions.append(average_precision)
    return np.mean(average_precisions)

  def plot_precision_vs_recall(self, class_index, class_name = None, interpolated = False):
    """
    Plots precision (y axis) vs. recall (x axis) using all the data accumulated
    thus far. This should be called only after all image results have been
    processed.

    Parameters
    ----------
    class_index : int
      The class index for which the curve is plotted.
    class_name : str
      If given, used as the class name on the plot label. Otherwise, the
      numeric class index is used directly.
    interpolated : bool
      Whether to plot the interpolated precision curve instead of the raw one.
    """
    average_precision, recall_array, precision_array = self._compute_average_precision(class_index = class_index, interpolated = interpolated)

    # Plot precision vs. recall
    import matplotlib.pyplot as plt
    label = "{0} AP={1:1.2f}".format("Class {}".format(class_index) if class_name is None else class_name, average_precision)
    plt.plot(recall_array, precision_array, label = label)
    if interpolated:
      plt.title("Precision (Interpolated) vs. Recall")
    else:
      plt.title("Precision vs. Recall")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.show()
    plt.clf()

  def plot_average_precisions(self, class_index_to_name):
    """
    Shows a horizontal bar chart of the average precision (in %) of every class
    that has ground truth objects. Does nothing if there are none.

    Parameters
    ----------
    class_index_to_name : Dict[int, str]
      Class names by class index, used as bar labels.
    """
    # Compute average precisions for each class
    results = self._average_precisions_by_label(class_index_to_name = class_index_to_name)
    if len(results) == 0:
      return  # nothing to plot

    # Sort in reverse alphabetical order by class name (barh() draws the first
    # entry at the bottom, so the chart reads alphabetically from the top)
    sorted_results = sorted(results, reverse = True, key = lambda pair: pair[0])
    labels, average_precisions = zip(*sorted_results) # unzip

    # Convert to %
    average_precisions = np.array(average_precisions) * 100.0

    # Bar plot
    import matplotlib.pyplot as plt
    plt.clf()
    plt.xlim([0, 100])
    plt.barh(labels, average_precisions)
    plt.title("Model Performance")
    plt.xlabel("Average Precision (%)")
    for index, value in enumerate(average_precisions):
      plt.text(value, index, "%1.1f" % value)
    plt.show()

  def print_average_precisions(self, class_index_to_name):
    """
    Prints the average precision (in %) of every class that has ground truth
    objects, best class first.

    Parameters
    ----------
    class_index_to_name : Dict[int, str]
      Class names by class index, used as row labels.
    """
    # Compute average precisions for each class
    results = self._average_precisions_by_label(class_index_to_name = class_index_to_name)

    # Sort by score (descending)
    sorted_results = sorted(results, reverse = True, key = lambda pair: pair[1])

    # Maximum width of any class name (for pretty printing)
    label_width = max((len(label) for label, _ in results), default = 0)

    # Pretty print
    print("Average Precisions")
    print("------------------")
    for (label, average_precision) in sorted_results:
      print("%s: %1.1f%%" % (label.ljust(label_width), average_precision * 100.0))
    print("------------------")


In [7]:
# visualize

from PIL import Image, ImageDraw, ImageFont, ImageColor

def _draw_rectangle(ctx, corners, color, thickness = 4):
  y_min, x_min, y_max, x_max = corners
  ctx.rectangle(xy = [(x_min, y_min), (x_max, y_max)], outline = color, width = thickness)

def _draw_text(image, text, position, color, scale = 1.0, offset_lines = 0):
  """
  Parameters
  ----------
  image : PIL.Image
    Image object to draw on.
  text : str
    Text to render.
  position : Tuple[float, float]
    Location of top-left corner of text string in pixels.
  offset_lines : float
    Number of lines to offset the vertical position by, where a line is the
    text height.
  """
  font = ImageFont.load_default()
  text_size = font.getsize(text)
  text_image = Image.new(mode = "RGBA", size = text_size, color = (0, 0, 0, 0))
  ctx = ImageDraw.Draw(text_image)
  ctx.text(xy = (0, 0), text = text, font = font, fill = color)
  scaled = text_image.resize((round(text_image.width * scale), round(text_image.height * scale)))
  position = (round(position[0]), round(position[1] + offset_lines * scaled.height))
  image.paste(im = scaled, box = position, mask = scaled)

def _class_to_color(class_index):
  return list(ImageColor.colormap.values())[class_index + 1]

def show_anchors(output_path, image, anchor_map, anchor_valid_map, gt_rpn_map, gt_boxes, display = False):
  """
  Draws the ground truth boxes and all object (positive) anchors on an image,
  then saves the result.

  Parameters
  ----------
  output_path : str
    Where the annotated image is saved.
  image : PIL.Image
    Image to draw on. It is modified in place.
  anchor_map : np.ndarray
    Anchors indexed as [y, x, k*4 + i], with i selecting (center_y, center_x,
    height, width) of anchor k.
  anchor_valid_map : np.ndarray
    Indexed as [y, x, k]; anchors with a value <= 0 are skipped.
  gt_rpn_map : np.ndarray
    Indexed as [y, x, k, c]; channel 0 must be > 0 and channel 1 must be >= 1
    for an anchor to be drawn.
  gt_boxes : iterable of objects with a `corners` attribute
    Ground truth boxes, each with corners (y_min, x_min, y_max, x_max).
  display : bool
    If true, the image is also shown after being saved.
  """
  ctx = ImageDraw.Draw(image, mode = "RGBA")

  # Ground truth boxes: green, default (thick) outline
  green = (0, 255, 0)
  for gt_box in gt_boxes:
    _draw_rectangle(ctx, corners = gt_box.corners, color = green)

  # Object anchors: yellow, slightly thinner outline
  yellow = (255, 255, 0)
  num_rows, num_cols, anchors_per_cell = anchor_valid_map.shape[0:3]
  for y in range(num_rows):
    for x in range(num_cols):
      for k in range(anchors_per_cell):
        # Anchors excluded from training and background anchors are not drawn
        excluded = anchor_valid_map[y,x,k] <= 0 or gt_rpn_map[y,x,k,0] <= 0
        if excluded or gt_rpn_map[y,x,k,1] < 1:
          continue

        # Anchor is stored as (center_y, center_x, height, width); convert to
        # corners for drawing
        cy, cx, height, width = anchor_map[y,x,k*4:k*4+4]
        corners = (cy - 0.5 * height, cx - 0.5 * width, cy + 0.5 * height, cx + 0.5 * width)
        _draw_rectangle(ctx, corners = corners, color = yellow, thickness = 3)

  image.save(output_path)
  if display:
    image.show()

def show_detections(output_path, show_image, image, scored_boxes_by_class_index, class_index_to_name):
  """
  Draws labeled detection boxes on an image, then optionally shows and/or
  saves it.

  Parameters
  ----------
  output_path : str or None
    If not None, the annotated image is saved here and a confirmation line is
    printed.
  show_image : bool
    If true, the annotated image is shown.
  image : PIL.Image
    Image to draw on. It is modified in place.
  scored_boxes_by_class_index : Dict[int, np.ndarray]
    For each class index, an array with one row per detection. Columns 0-3
    are the box corners (y_min, x_min, y_max, x_max) and column 4 is the
    value printed next to the class name.
  class_index_to_name : indexable by class index
    Provides the label text for each class that has at least one detection.
  """
  # Draw all results
  ctx = ImageDraw.Draw(image, mode = "RGBA")
  for class_index, scored_boxes in scored_boxes_by_class_index.items():
    if scored_boxes.shape[0] == 0:
      continue  # no detections for this class, so no name or color is needed

    # Name and color depend only on the class: look them up once per class
    class_name = class_index_to_name[class_index]
    color = _class_to_color(class_index = class_index)

    for scored_box in scored_boxes:
      text = "%s %1.2f" % (class_name, scored_box[4])
      _draw_rectangle(ctx = ctx, corners = scored_box[0:4], color = color, thickness = 2)
      # Label sits one text line above the box's top-left corner; position is
      # (x, y), hence the swapped indices
      _draw_text(image = image, text = text, position = (scored_box[1], scored_box[0]), color = color, scale = 1.5, offset_lines = -1)

  # Output
  if show_image:
    image.show()
  if output_path is not None:
    image.save(output_path)
    print("Wrote detection results to '%s'" % output_path)



In [8]:
# models/anchors

import itertools
from math import sqrt
import numpy as np


def _compute_anchor_sizes():
  #
  # Anchor scales and aspect ratios.
  #
  # x * y = area          x * (x_aspect * x) = x_aspect * x^2 = area
  # x_aspect * x = y  ->  x = sqrt(area / x_aspect)
  #                       y = x_aspect * sqrt(area / x_aspect)
  #
  areas = [ 128*128, 256*256, 512*512 ]   # pixels
  x_aspects = [ 0.5, 1.0, 2.0 ]           # x:1 ratio

  # Generate all 9 combinations of area and aspect ratio
  heights = np.array([ x_aspects[j] * sqrt(areas[i] / x_aspects[j]) for (i, j) in itertools.product(range(3), range(3)) ])
  widths = np.array([ sqrt(areas[i] / x_aspects[j]) for (i, j) in itertools.product(range(3), range(3)) ])

  # Return as (9,2) matrix of sizes
  return np.vstack([ heights, widths ]).T

def generate_anchor_maps(image_shape, feature_pixels): 
  """
  Builds the anchor maps for an input image of a given size. Every feature map
  cell receives one anchor per shape returned by _compute_anchor_sizes(), all
  centered on the middle of that cell.

  Parameters
  ----------
  image_shape : Tuple[int, int, int]
    Shape of the input image, (height, width, channels), at the scale it will
    be passed into the Faster R-CNN model.
  feature_pixels : int
    Distance in pixels between anchors, i.e., the size in input image space of
    one feature map cell.

  Returns
  -------
  np.ndarray, np.ndarray
    Two float32 maps whose height and width are those of the feature map
    (image size integer-divided by feature_pixels), not of the input image:
      1. Shape (height, width, num_anchors*4): all anchors, each stored as
         (center_y, center_x, anchor_height, anchor_width) in input image
         pixel space.
      2. Shape (height, width, num_anchors): 1 for anchors lying entirely
         inside the image, 0 for anchors that cross an image boundary (which
         must not be used during training).
  """

  assert len(image_shape) == 3

  #
  # Precision note carried over from the original implementation: anchor
  # labeling can be sensitive to floating point precision, and the float32
  # conversions below (cell centers and returned maps) are deliberate. They
  # were reported to match the implementation by Yun Chen, with a pronounced
  # effect on the positive anchors of image 2008_000028.jpg in VOC2012 when
  # the final conversion is removed.
  #

  # Per-anchor corner offsets relative to the anchor center, (num_anchors,4),
  # laid out as (y1,x1,y2,x2)
  sizes = _compute_anchor_sizes()
  anchor_count = sizes.shape[0]
  corner_offsets = np.hstack([ -0.5 * sizes, +0.5 * sizes ])

  # Feature map dimensions
  map_height = image_shape[0] // feature_pixels
  map_width = image_shape[1] // feature_pixels

  # (H,W,2) grid of cell coordinates in feature space, each entry being [y,x]
  ys, xs = np.meshgrid(np.arange(map_height), np.arange(map_width), indexing = "ij")
  cells = np.stack([ ys, xs ], axis = -1)

  # Image space (pixel) position of the *center* of each cell
  centers = cells * feature_pixels + 0.5 * feature_pixels

  # (H,W,2) -> (H,W,4*num_anchors): repeat (y,x) so that every anchor has a
  # (y,x,y,x) group to which its corner offsets can be added
  centers = np.tile(centers, reps = 2 * anchor_count)

  # Corners of every anchor, flattened to (H*W*num_anchors,4)
  corners = centers.astype(np.float32) + corner_offsets.ravel()
  corners = corners.reshape((-1, 4))

  # An anchor is valid only if it lies entirely within the image
  image_height, image_width = image_shape[0:2]
  top_left_inside = corners[:,0:2] >= [0,0]
  bottom_right_inside = corners[:,2:4] <= [image_height,image_width]
  valid = np.all(top_left_inside & bottom_right_inside, axis = 1)

  # Corners -> anchor format: (center_y, center_x, height, width)
  anchor_centers = 0.5 * (corners[:,0:2] + corners[:,2:4])
  anchor_extents = corners[:,2:4] - corners[:,0:2]
  anchor_map = np.concatenate([ anchor_centers, anchor_extents ], axis = 1)

  # Reshape maps and return
  anchor_map = anchor_map.reshape((map_height, map_width, anchor_count * 4))
  anchor_valid_map = valid.reshape((map_height, map_width, anchor_count))
  return anchor_map.astype(np.float32), anchor_valid_map.astype(np.float32)

def generate_rpn_map(anchor_map, anchor_valid_map, gt_boxes, object_iou_threshold = 0.7, background_iou_threshold = 0.3):
  """
  Generates a map containing ground truth data for training the region proposal
  network.

  Parameters
  ----------
  anchor_map : np.ndarray
    Map of shape (height, width, num_anchors*4) defining the anchors as
    (center_y, center_x, anchor_height, anchor_width) in input image space.
  anchor_valid_map : np.ndarray
    Map of shape (height, width, num_anchors) defining anchors that are valid
    and may be included in training.
  gt_boxes : List[training_sample.Box]
    List of ground truth boxes. Only the `corners` attribute, (y_min, x_min,
    y_max, x_max), is used. May be empty, in which case every valid anchor is
    labeled as background.
  object_iou_threshold : float
    IoU threshold between an anchor and a ground truth box at or above which
    an anchor is labeled as an object (positive) anchor.
  background_iou_threshold : float
    IoU threshold below which an anchor is labeled as background (negative).

  Returns
  -------
  np.ndarray, np.ndarray, np.ndarray
    RPN ground truth map, object (positive) anchor indices, and background
    (negative) anchor indices. Map height and width dimensions are in feature
    space.
    1. RPN ground truth map (float32) of shape (height, width, num_anchors, 6)
       where the last dimension is:
       - 0: Trainable anchor (1) or not (0). Only valid and non-neutral (that
            is, definitely positive or negative) anchors are trainable. This is
            the same as anchor_valid_map with additional invalid anchors caused
            by neutral samples
       - 1: For trainable anchors, whether the anchor is an object anchor (1)
            or background anchor (0). For non-trainable anchors, will be 0.
       - 2: Regression target for box center, ty.
       - 3: Regression target for box center, tx.
       - 4: Regression target for box size, th.
       - 5: Regression target for box size, tw.
    2. Map of shape (N, 3) of indices (y, x, k) of all N object anchors in the
       RPN ground truth map.
    3. Map of shape (M, 3) of indices of all M background anchors in the RPN
       ground truth map.
  """
  height, width, num_anchors = anchor_valid_map.shape

  # Flatten anchor boxes to (N,4) and convert to corners
  anchor_map = anchor_map.reshape((-1,4))
  anchors = np.empty(anchor_map.shape)
  anchors[:,0:2] = anchor_map[:,0:2] - 0.5 * anchor_map[:,2:4]  # y1, x1
  anchors[:,2:4] = anchor_map[:,0:2] + 0.5 * anchor_map[:,2:4]  # y2, x2
  n = anchors.shape[0]

  # All anchors start out as "ignore". RPN class: 0 = background, 1 =
  # foreground, -1 = ignore (these will be marked as invalid in the truth map).
  objectness_score = np.full(n, -1)

  # Regression targets default to zero and are filled in only when there is at
  # least one ground truth box to regress towards
  box_delta_targets = np.zeros((n, 4))

  if len(gt_boxes) == 0:
    # Without any ground truth there is nothing to overlap with: every anchor
    # is background (boundary-crossing anchors are still excluded below via
    # anchor_valid_map)
    objectness_score[:] = 0
  else:
    # Convert ground truth box corners to (M,4) tensor
    gt_box_corners = np.array([ box.corners for box in gt_boxes ])

    # Compute ground truth box center points and side lengths
    gt_box_centers = 0.5 * (gt_box_corners[:,0:2] + gt_box_corners[:,2:4])
    gt_box_sides = gt_box_corners[:,2:4] - gt_box_corners[:,0:2]

    # Compute IoU between each anchor and each ground truth box, (N,M).
    ious = intersection_over_union(boxes1 = anchors, boxes2 = gt_box_corners)

    # Need to remove anchors that are invalid (straddle image boundaries) from
    # consideration entirely and the easiest way to do this is to wipe out
    # their IoU scores
    ious[anchor_valid_map.flatten() == 0, :] = -1.0

    # Find the best IoU ground truth box for each anchor and the best IoU
    # anchor for each ground truth box.
    #
    # ious == max_iou_per_gt_box compares each of the N rows of ious against
    # the M elements of max_iou_per_gt_box, column-wise. np.where() then
    # returns the matching (row, column) indices as a tuple; the row indices
    # are anchors and the column indices are ground truth boxes.
    #
    # A box whose best IoU is not positive overlaps no valid anchor at all. It
    # must be excluded here, otherwise *every* anchor tied at that IoU (e.g.,
    # all anchors with zero overlap) would be labeled as an object.
    max_iou_per_anchor = np.max(ious, axis = 1)           # (N,)
    best_box_idx_per_anchor = np.argmax(ious, axis = 1)   # (N,)
    max_iou_per_gt_box = np.max(ious, axis = 0)           # (M,)
    is_best_anchor_for_box = (ious == max_iou_per_gt_box) & (max_iou_per_gt_box > 0)  # (N,M)
    highest_iou_anchor_idxs = np.where(is_best_anchor_for_box)[0] # (L,) indices of anchors that are the highest-overlapping anchors for at least one of the M boxes

    # Anchors below the minimum threshold are negative
    objectness_score[max_iou_per_anchor < background_iou_threshold] = 0

    # Anchors that meet the threshold IoU are positive
    objectness_score[max_iou_per_anchor >= object_iou_threshold] = 1

    # Anchors that overlap the most with ground truth boxes are positive
    objectness_score[highest_iou_anchor_idxs] = 1

    # Each anchor regresses towards its own highest-IoU ground truth box.
    # Targets are computed for every anchor, but only those of object anchors
    # are meaningful.
    gt_box_assignments = best_box_idx_per_anchor
    box_delta_targets[:,0:2] = (gt_box_centers[gt_box_assignments] - anchor_map[:,0:2]) / anchor_map[:,2:4] # ty = (box_center_y - anchor_center_y) / anchor_height, tx = (box_center_x - anchor_center_x) / anchor_width
    box_delta_targets[:,2:4] = np.log(gt_box_sides[gt_box_assignments] / anchor_map[:,2:4])                 # th = log(box_height / anchor_height), tw = log(box_width / anchor_width)

  # Anchors that are to be ignored will be marked invalid. Generate a mask to
  # multiply anchor_valid_map by (-1 -> 0, 0 or 1 -> 1). Then mark ignored
  # anchors as 0 in objectness score because the score can only really be 0 or
  # 1.
  enable_mask = (objectness_score >= 0).astype(np.float32)
  objectness_score[objectness_score < 0] = 0

  # Assemble RPN ground truth map
  rpn_map = np.zeros((height, width, num_anchors, 6))
  rpn_map[:,:,:,0] = anchor_valid_map * enable_mask.reshape((height,width,num_anchors))  # trainable anchors (object or background; excludes boundary-crossing invalid and neutral anchors)
  rpn_map[:,:,:,1] = objectness_score.reshape((height,width,num_anchors))
  rpn_map[:,:,:,2:6] = box_delta_targets.reshape((height,width,num_anchors,4))

  # Return map along with positive and negative anchors. np.argwhere() yields
  # one (y,x,k) row per matching anchor, in row-major order.
  trainable = rpn_map[:,:,:,0] > 0
  object_anchor_idxs = np.argwhere(trainable & (rpn_map[:,:,:,1] > 0))       # shape (N,3), where each row is the coordinate (y,x,k) of a positive sample
  background_anchor_idxs = np.argwhere(trainable & (rpn_map[:,:,:,1] == 0))  # shape (M,3), where each row is the coordinate (y,x,k) of a negative sample

  return rpn_map.astype(np.float32), object_anchor_idxs, background_anchor_idxs


In [9]:
# models/roi_pooling_layer.py

import numpy as np
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.layers import Layer


class RoIPoolingLayer(Layer):
  """
  Input shape:
    Two tensors [x_maps, x_rois] each with shape:
      x_maps: (samples, height, width, channels), representing the feature maps for this batch, of type tf.float32
      x_rois: (samples, num_rois, 4), where RoIs have the ordering (y, x, height, width), all tf.int32
  Output shape:
    (samples, num_rois, pool_size, pool_size, channels)
  """
  def __init__(self, pool_size, **kwargs):
    """
    Parameters
    ----------
    pool_size : int
      Height and width of the pooled output produced for each RoI.
    **kwargs
      Forwarded unchanged to the base Layer constructor.
    """
    super().__init__(**kwargs)
    self.pool_size = pool_size

  def get_config(self):
    """
    Returns the base layer configuration extended with this layer's
    "pool_size" entry.
    """
    merged = dict(super(RoIPoolingLayer, self).get_config())
    merged["pool_size"] = self.pool_size
    return merged

  def compute_output_shape(self, input_shape):
    """
    Derives the output shape from the shapes of the two inputs.

    Parameters
    ----------
    input_shape : pair of shapes
      (feature map shape, RoI shape), where the feature map shape has 4
      dimensions (samples, height, width, channels) and the RoI shape has 3
      dimensions (samples, num_rois, 4).

    Returns
    -------
    tuple
      (samples, num_rois, pool_size, pool_size, channels)
    """
    feature_map_shape, roi_list_shape = input_shape

    # Both inputs must have the expected rank, RoIs must carry 4 values each,
    # and both inputs must describe the same number of samples
    assert len(feature_map_shape) == 4 and len(roi_list_shape) == 3 and roi_list_shape[2] == 4
    assert feature_map_shape[0] == roi_list_shape[0]

    pooled_shape = (self.pool_size, self.pool_size)
    return (feature_map_shape[0], roi_list_shape[1]) + pooled_shape + (feature_map_shape[3],)

  def call(self, inputs):
    """
    Pools every RoI of every sample in the batch.

    Parameters
    ----------
    inputs : list
      [ x_maps, x_rois ], which must have the same batch size. See the class
      docstring for the expected layout of each.

    Returns
    -------
    tf.Tensor
      The float32 per-sample pooling results, stacked along the batch
      dimension by tf.map_fn().
    """
    #
    # Unused here but useful to know:
    #
    # While the model is being defined, the static batch dimension of the
    # inputs is None. tf.shape() creates a dynamic scalar tensor that is
    # evaluated once the batch size is known. See:
    # https://github.com/tensorflow/tensorflow/issues/31991
    #
    #   x_map = inputs[0]
    #   batch_size = tf.shape(x_map)[0]
    #

    # Choose the per-sample pooling function. Each one receives a single
    # [ x_map, x_rois ] pair and handles the iteration over its RoIs.
    feature_maps = inputs[0]
    if self.pool_size == 7 and feature_maps.shape[3] == 512:
      # Special case optimization: 7x7x512 pools, ~4-5x speed-up
      def pool_sample(input_pair):
        return RoIPoolingLayer._compute_pooled_rois_7x7x512(feature_map = input_pair[0], rois = input_pair[1])
    else:
      # Generic case capable of handling any pool shape
      def pool_sample(input_pair):
        return RoIPoolingLayer._compute_pooled_rois(feature_map = input_pair[0], rois = input_pair[1], pool_size = self.pool_size)

    #
    # Outer-most of a pair of map_fn() iterations: visits every sample in the
    # batch and stacks the results in the batch dimension, (samples, ...). The
    # output signature must be given explicitly, otherwise map_fn() infers its
    # element type incorrectly from the mixed-type inputs.
    #
    return tf.map_fn(
      fn = pool_sample,
      elems = inputs,
      fn_output_signature = tf.float32  # this is absolutely required else the fn type inference seems to fail spectacularly
    )

  @tf.function
  def _compute_pooled_rois(feature_map, rois, pool_size):
    """
    Pools all RoIs belonging to a single feature map. This is the second level
    of iteration and yields the num_rois dimension: (samples, num_rois, ...).

    Parameters
    ----------
    feature_map : tf.Tensor
      Feature map of one sample.
    rois : tf.Tensor
      RoIs of that sample, iterated along the first dimension.
    pool_size : int
      Height and width of each pooled output.
    """
    def pool_single_roi(roi):
      return RoIPoolingLayer._compute_pooled_roi(feature_map = feature_map, roi = roi, pool_size = pool_size)

    return tf.map_fn(fn = pool_single_roi, elems = rois, fn_output_signature = tf.float32)

  @tf.function
  def _compute_pooled_roi(feature_map, roi, pool_size):
    """
    Pools a single RoI of a feature map down to pool_size x pool_size cells,
    each cell being produced by _pool_one_cell().

    Parameters
    ----------
    feature_map : tf.Tensor
      Feature map of one sample, indexed as (y, x, channel).
    roi : tf.Tensor
      The RoI as (y, x, height, width), in feature map cells.
    pool_size : int
      Height and width of the pooled output.
    """
    # Crop out the region of interest from the feature map
    top, left, roi_height, roi_width = roi[0], roi[1], roi[2], roi[3]
    channels = feature_map.shape[2]
    cropped = tf.slice(feature_map, [top, left, 0], [roi_height, roi_width, channels])

    # Size of one pooling cell, in (fractional) RoI cells
    pool_size_float = tf.cast(pool_size, dtype = tf.float32)
    y_step = tf.cast(roi_height, dtype = tf.float32) / pool_size_float
    x_step = tf.cast(roi_width, dtype = tf.float32) / pool_size_float

    # Pooling cell indices 0..pool_size-1, used for both axes
    cell_indices = tf.cast(tf.range(pool_size), dtype = tf.float32)

    def pool_cell(y, x):
      return RoIPoolingLayer._pool_one_cell(cropped, pool_y_start = y, pool_x_start = x, y_step = y_step, x_step = x_step, region_height = roi_height, region_width = roi_width, pool_size = pool_size, num_channels = channels)

    def pool_row(y):
      # Inner, fast index: x
      return tf.map_fn(fn = lambda x: pool_cell(y, x), elems = cell_indices)

    # Outer, slow index: y. The result is therefore laid out as (y, x, ...).
    return tf.map_fn(fn = pool_row, elems = cell_indices)

  @tf.function
  def _pool_one_cell(region_of_interest, pool_y_start, pool_x_start, y_step, x_step, region_height, region_width, pool_size, num_channels):
    """
    Maps one pooling cell onto part of the RoI and returns the maximum of the
    RoI cells it covers. Both spatial axes are reduced, so each channel is
    pooled independently and the result has one value per channel.

    The covered range along each axis follows this logic (shown for x):

      x_start = int(x * x_step)
      x_end = int((x + 1) * x_step) if (x + 1) < pool_size else region_width

    Parameters
    ----------
    region_of_interest : tf.Tensor
      The cropped RoI, indexed as (y, x, channel).
    pool_y_start, pool_x_start : tf.Tensor
      Index of the pooling cell along each axis, as floats.
    y_step, x_step : tf.Tensor
      Size of one pooling cell along each axis, in RoI cells.
    region_height, region_width : tf.Tensor
      Size of the RoI, used as the end of the last pooling cell on each axis.
    pool_size : int
      Number of pooling cells along each axis.
    num_channels : int
      Number of channels to extract.
    """
    def cell_extent(cell_index, step, region_extent):
      # Start offset and size (at least 1) of this pooling cell along one axis
      start = tf.cast(cell_index * step, dtype = tf.int32)
      has_next_cell = (tf.cast(cell_index, dtype = tf.int32) + 1) < pool_size
      end = tf.cond(has_next_cell,
        lambda: tf.cast((cell_index + 1) * step, dtype = tf.int32),
        lambda: region_extent   # last cell always runs to the end of the RoI
      )
      # If the RoI is smaller than the pool area, end - start can be 0; we
      # want to sample at least one cell
      return start, tf.math.maximum(end - start, 1)

    y_start, y_size = cell_extent(pool_y_start, y_step, region_height)
    x_start, x_size = cell_extent(pool_x_start, x_step, region_width)

    # Extract this cell from the region and return the max
    pool_cell = tf.slice(region_of_interest, [y_start, x_start, 0], [y_size, x_size, num_channels])
    return tf.math.reduce_max(pool_cell, axis = (1,0))  # keep channels independent

  @tf.function
  def _compute_pooled_rois_7x7x512(feature_map, rois):
    # Special case: 7x7x512, unrolled pool width and height (7x7=49)
    return tf.map_fn(
      fn = lambda roi: tf.reshape(
        tf.stack([
          # y=0,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=0,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=1,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=2,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=3,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=4,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=5,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=0
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((0 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(0 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=1
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((1 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(1 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=2
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((2 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(2 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=3
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((3 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(3 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=4
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((4 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(4 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=5
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, tf.cast((5 + 1) * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32) - tf.cast(5 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
          # y=6,x=6
          tf.math.reduce_max(
            tf.slice(
              feature_map[ roi[0]:roi[0]+roi[2], roi[1]:roi[1]+roi[3], 0:512 ],
              [
                tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32),
                tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32),
                0
              ],
              [
                tf.math.maximum(1, roi[2] - tf.cast(6 * (tf.cast(roi[2], dtype = tf.float32) / 7), dtype = tf.int32)),
                tf.math.maximum(1, roi[3] - tf.cast(6 * (tf.cast(roi[3], dtype = tf.float32) / 7), dtype = tf.int32)),
                512
              ]
            ),
            axis = (1,0)
          ),
        ]),
        shape = (7,7,512)
      ),
      elems = rois,
      fn_output_signature = tf.float32
    )

In [10]:
# models/detector.py

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import models
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Lambda
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras import backend as K


class DetectorNetwork(tf.keras.Model):
  """
  Detector stage of Faster R-CNN. Pools a 7x7x512 feature patch for every
  proposal, passes it through two 4096-unit fully-connected layers, and
  produces per-proposal class scores and class-specific box delta regressions.
  """
  def __init__(self, num_classes, custom_roi_pool, activate_class_outputs, l2, dropout_probability):
    """
    Parameters
    ----------
    num_classes : int
      Number of classes, including the background class (class index 0).
    custom_roi_pool : bool
      If true, RoI pooling is performed by RoIPoolingLayer. Otherwise, call()
      approximates it with tf.image.crop_and_resize() followed by max pooling.
    activate_class_outputs : bool
      If true, softmax is applied to the classifier output, otherwise the
      classifier produces logits.
    l2 : float
      L2 penalty applied to the kernels of the two fully-connected layers.
    dropout_probability : float
      Dropout rate used after each fully-connected layer while training. A
      value of 0 disables dropout.
    """
    super().__init__()

    self._num_classes = num_classes
    self._activate_class_outputs = activate_class_outputs
    self._dropout_probability = dropout_probability

    regularizer = tf.keras.regularizers.l2(l2)
    class_initializer = tf.keras.initializers.RandomNormal(mean = 0.0, stddev = 0.01)
    regressor_initializer = tf.keras.initializers.RandomNormal(mean = 0.0, stddev = 0.001)

    # If custom_roi_pool flag is set, we use our custom implementation,
    # otherwise, tf operations that can approximate the operation will be used
    # in call(). The layer is created once here and reused on every call.
    self._roi_pool = RoIPoolingLayer(pool_size = 7, name = "custom_roi_pool") if custom_roi_pool else None

    # Fully-connected layers with optional dropout. Initial weights will be
    # loaded from pre-trained VGG-16 ImageNet model by parent Faster R-CNN
    # module. These layers act as classifiers as in VGG-16 and use the same
    # names as Keras' built-in implementation of VGG-16. TimeDistributed() is
    # used to iterate over the proposal dimension and apply the layer to each
    # of the proposals.
    self._flatten = TimeDistributed(Flatten())
    self._fc1 = TimeDistributed(name = "fc1", layer = Dense(units = 4096, activation = "relu", kernel_regularizer = regularizer))
    self._dropout1 = TimeDistributed(Dropout(dropout_probability))
    self._fc2 = TimeDistributed(name = "fc2", layer = Dense(units = 4096, activation = "relu", kernel_regularizer = regularizer))
    self._dropout2 = TimeDistributed(Dropout(dropout_probability))

    # Output: classifier
    class_activation = "softmax" if activate_class_outputs else None
    self._classifier = TimeDistributed(name = "classifier_class", layer = Dense(units = num_classes, activation = class_activation, kernel_initializer = class_initializer))

    # Output: box delta regressions. Unique regression weights for each
    # possible class excluding background class, hence the use of
    # (num_classes-1). Class index 1 regressions are therefore at
    # indices: 0*4:0*4+4.
    self._regressor = TimeDistributed(name = "classifier_boxes", layer = Dense(units = 4 * (num_classes - 1), activation = "linear", kernel_initializer = regressor_initializer))

  def call(self, inputs, training):
    """
    Runs the detector stage on a set of proposals.

    Parameters
    ----------
    inputs : list
      [ input_image, feature_map, proposals ]. feature_map must have 4
      dimensions (batches, height, width, channels). proposals holds one box
      per row in image-space, as (y1, x1, y2, x2).
    training : bool
      When true and the dropout probability is non-zero, dropout is applied
      after each fully-connected layer.

    Returns
    -------
    list
      [ classes, box_deltas ]: classifier output, shaped
      (1, N, num_classes), and box delta regressions, shaped
      (1, N, 4 * (num_classes - 1)), where N is the number of proposals.
    """
    # Unpack inputs
    input_image = inputs[0]
    feature_map = inputs[1]
    proposals = inputs[2]
    assert len(feature_map.shape) == 4

    # RoI pooling: creates a 7x7 map for each proposal (1, num_rois, 7, 7, 512)
    if self._roi_pool is not None:
      # Use our custom layer. Need to convert proposals from image-space
      # (y1, x1, y2, x2) to feature map space (y1, x1, height, width).
      proposals = tf.cast(proposals, dtype = tf.int32)                  # RoIs must be integral for RoIPoolingLayer
      map_dimensions = tf.shape(feature_map)[1:3]                       # (batches, height, width, channels) -> (height, width)
      map_limits = tf.tile(map_dimensions, multiples = [2]) - 1         # (height, width, height, width)
      roi_corners = tf.minimum(proposals // 16, map_limits)             # to feature map space and clamp against map edges
      roi_corners = tf.maximum(roi_corners, 0)
      roi_dimensions = roi_corners[:,2:4] - roi_corners[:,0:2] + 1
      rois = tf.concat([ roi_corners[:,0:2], roi_dimensions ], axis = 1)  # (N,4), where each row is (y1, x1, height, width) in feature map units
      rois = tf.expand_dims(rois, axis = 0)                             # (1,N,4), batch size of 1, as expected by RoIPoolingLayer
      # Reuse the layer built in __init__ rather than instantiating a fresh
      # layer object every time the model is called
      pool = self._roi_pool([feature_map, rois])
    else:
      # Crop the proposals, resize to 14x14 (with bilinear interpolation) and
      # max pool down to 7x7. This works just as well and is used in several
      # TensorFlow implementations of Faster R-CNN, such as:
      # https://github.com/kevinjliang/tf-Faster-RCNN/blob/master/Lib/roi_pool.py

      # Convert to normalized RoIs with each coordinate in [0,1]
      image_height = tf.shape(input_image)[1] # height in pixels
      image_width = tf.shape(input_image)[2]  # width in pixels
      rois = proposals / [ image_height, image_width, image_height, image_width ]

      # Crop, resize, pool
      num_rois = tf.shape(rois)[0]
      region = tf.image.crop_and_resize(image = feature_map, boxes = rois, box_indices = tf.zeros(num_rois, dtype = tf.int32), crop_size = [14, 14])
      pool = tf.nn.max_pool(region, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")
      pool = tf.expand_dims(pool, axis = 0) # (num_rois, 7, 7, 512) -> (1, num_rois, 7, 7, 512)

    # Pass through final layers. Dropout layers are only inserted into the
    # path when training with a non-zero dropout probability.
    apply_dropout = training and self._dropout_probability != 0
    out = self._fc1(self._flatten(pool))
    if apply_dropout:
      out = self._dropout1(out)
    out = self._fc2(out)
    if apply_dropout:
      out = self._dropout2(out)
    classes = self._classifier(out)
    box_deltas = self._regressor(out)

    return [ classes, box_deltas ]

  @staticmethod
  def class_loss(y_predicted, y_true, from_logits):
    """
    Computes detector network classification loss.

    Parameters
    ----------
    y_predicted : tf.Tensor
      Class predictions, shaped (1, N, num_classes), where N is the number of
      detections (i.e., the number of proposals fed into the detector network).
    y_true : tf.Tensor
      Ground truth, shaped (1, N, num_classes). One-hot-encoded labels.
    from_logits : bool
      If true, y_predicted is given as logits (that is, softmax was not
      applied), otherwise, as probability scores (softmax applied).

    Returns
    -------
    tf.Tensor
      Scalar loss: summed categorical cross-entropy divided by the number of
      proposals.
    """
    scale_factor = 1.0
    N = tf.cast(tf.shape(y_true)[1], dtype = tf.float32) + K.epsilon()  # number of proposals (epsilon guards against division by zero)
    if from_logits:
      return scale_factor * K.sum(K.categorical_crossentropy(target = y_true, output = y_predicted, from_logits = True)) / N
    else:
      return scale_factor * K.sum(K.categorical_crossentropy(y_true, y_predicted)) / N

  @staticmethod
  def regression_loss(y_predicted, y_true):
    """
    Computes detector network box delta regression loss.

    Parameters
    ----------
    y_predicted : tf.Tensor
      Predicted box delta regressions in parameterized form (ty, tx, th, tw).
      Shaped (1, N, 4 * (num_classes - 1)). Class 0 (background) obviously has
      no box associated with it.
    y_true : tf.Tensor
      Ground truth box delta regression targets, shaped
      (1, N, 2, 4 * (num_classes - 1)). Elements [:,:,0,:] are masks indicating
      which of the regression targets [:,:,1,:] to use for the given proposal.
      That is, [0,n,0,:] is an array of 1 or 0 indicating which of [0,n,1,:]
      are valid for inclusion in the loss. For non-background proposals, there
      will be 4 unmasked values corresponding to (ty, tx, th, tw).

    Returns
    -------
    tf.Tensor
      Scalar loss: sum of the unmasked robust L1 terms divided by the number
      of proposals.
    """
    scale_factor = 1.0
    sigma = 1.0
    sigma_squared = sigma * sigma

    # We want to unpack the regression targets and the mask of valid targets into
    # tensors each of the same shape as the predicted:
    #   (batch_size, num_proposals, 4*(num_classes-1))
    # y_true has shape:
    #   (batch_size, num_proposals, 2, 4*(num_classes-1))
    y_mask = y_true[:,:,0,:]
    y_true_targets = y_true[:,:,1,:]

    # Compute element-wise loss using robust L1 function for all 4 regression
    # targets: quadratic where |x| < 1/sigma^2, linear elsewhere. The branch
    # selector is treated as a constant when differentiating.
    x = y_true_targets - y_predicted
    x_abs = tf.math.abs(x)
    is_negative_branch = tf.stop_gradient(tf.cast(tf.less(x_abs, 1.0 / sigma_squared), dtype = tf.float32))
    R_negative_branch = 0.5 * x * x * sigma_squared
    R_positive_branch = x_abs - 0.5 / sigma_squared
    losses = is_negative_branch * R_negative_branch + (1.0 - is_negative_branch) * R_positive_branch

    # Accumulate the relevant terms and normalize by the number of proposals
    N = tf.cast(tf.shape(y_true)[1], dtype = tf.float32) + K.epsilon()  # N = number of proposals
    relevant_loss_terms = y_mask * losses
    return scale_factor * K.sum(relevant_loss_terms) / N


In [11]:
# models/rpn.py

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import models
from tensorflow.keras.layers import Conv2D
from tensorflow.keras import backend as K


class RegionProposalNetwork(tf.keras.Model):
  """
  Region proposal network (RPN) stage of Faster R-CNN. Predicts an objectness
  score and a box delta regression for each of the 9 anchors at every feature
  map location, and converts the predictions into a ranked, clipped, and
  non-maximum-suppressed list of proposal boxes.
  """
  def __init__(self, max_proposals_pre_nms_train, max_proposals_post_nms_train, max_proposals_pre_nms_infer, max_proposals_post_nms_infer, l2 = 0, allow_edge_proposals = False):
    """
    Parameters
    ----------
    max_proposals_pre_nms_train : int
      Number of top-scoring proposals kept before NMS when training.
    max_proposals_post_nms_train : int
      Maximum number of proposals returned by NMS when training.
    max_proposals_pre_nms_infer : int
      Number of top-scoring proposals kept before NMS during inference.
    max_proposals_post_nms_infer : int
      Maximum number of proposals returned by NMS during inference.
    l2 : float
      L2 penalty applied to the kernel of the 3x3 convolution.
    allow_edge_proposals : bool
      If true, proposals from all anchors are used. Otherwise, proposals at
      anchors not marked valid in the anchor valid map are dropped.
    """
    super().__init__()

    # Proposal budgets, before and after NMS, for each mode of operation
    self._max_proposals_pre_nms_train = max_proposals_pre_nms_train
    self._max_proposals_post_nms_train = max_proposals_post_nms_train
    self._max_proposals_pre_nms_infer = max_proposals_pre_nms_infer
    self._max_proposals_post_nms_infer = max_proposals_post_nms_infer
    self._allow_edge_proposals = allow_edge_proposals

    anchors_per_location = 9
    kernel_init = tf.keras.initializers.RandomNormal(mean = 0.0, stddev = 0.01, seed = None)
    weight_penalty = tf.keras.regularizers.l2(l2)

    # Settings shared by all three convolutions
    shared = dict(strides = 1, padding = "same", kernel_initializer = kernel_init)

    # 3x3 convolution over the input map producing a 512-d result at each
    # output. The center of each output is an anchor point (k anchors at each
    # point).
    self._rpn_conv1 = Conv2D(name = "rpn_conv1", kernel_size = (3,3), filters = 512, activation = "relu", kernel_regularizer = weight_penalty, **shared)

    # Classification head: predicts whether there is an object at the anchor
    # or not. A sigmoid is used, where > 0.5 indicates a positive result.
    self._rpn_class = Conv2D(name = "rpn_class", kernel_size = (1,1), filters = anchors_per_location, activation = "sigmoid", **shared)

    # Box delta regression head: 4 values per anchor
    self._rpn_boxes = Conv2D(name = "rpn_boxes", kernel_size = (1,1), filters = 4 * anchors_per_location, activation = None, **shared)

  def __call__(self, inputs, training):
    """
    Runs the RPN and generates proposals. Note that this class overrides
    __call__() itself rather than call().

    Parameters
    ----------
    inputs : list
      [ input_image, feature_map, anchor_map, anchor_valid_map ]. feature_map
      must have 4 dimensions.
    training : bool
      Selects the training or the inference proposal budgets.

    Returns
    -------
    list
      [ scores, box_delta_regressions, proposals ]: the raw objectness score
      map, the raw box delta regression map, and the proposals that survived
      NMS, one (y1, x1, y2, x2) box per row.
    """
    input_image = inputs[0]
    feature_map = inputs[1]
    anchor_map = inputs[2]
    anchor_valid_map = inputs[3]
    assert len(feature_map.shape) == 4

    # Shared 3x3 convolution feeding both 1x1 heads
    shared_features = self._rpn_conv1(feature_map)
    scores = self._rpn_class(shared_features)
    box_delta_regressions = self._rpn_boxes(shared_features)

    # How many proposals to keep depends on whether we are training or not
    if training:
      pre_nms_limit = self._max_proposals_pre_nms_train
      post_nms_limit = self._max_proposals_post_nms_train
    else:
      pre_nms_limit = self._max_proposals_pre_nms_infer
      post_nms_limit = self._max_proposals_post_nms_infer

    # Flatten the maps into per-anchor lists, optionally dropping invalid anchors
    anchors, objectness_scores, box_deltas = self._extract_valid(
      anchor_map = anchor_map,
      anchor_valid_map = anchor_valid_map,
      objectness_score_map = scores,
      box_delta_map = box_delta_regressions,
      allow_edge_proposals = self._allow_edge_proposals
    )

    # Apply the predicted deltas to the anchors to obtain box corners
    proposals = tf_convert_deltas_to_boxes(
      box_deltas = box_deltas,
      anchors = anchors,
      box_delta_means = [ 0.0, 0.0, 0.0, 0.0 ],
      box_delta_stds = [ 1.0, 1.0, 1.0, 1.0 ]
    )

    # Rank all proposals by score and retain the best ones. We deliberately do
    # not require that a proposal be labeled as an object (score > 0.5):
    # restricting them has a strong adverse impact on training performance.
    best_first = tf.argsort(objectness_scores)[::-1]  # argsort is ascending, so reverse it
    top_ranked = best_first[0:pre_nms_limit]
    proposals = tf.gather(proposals, indices = top_ranked)
    objectness_scores = tf.gather(objectness_scores, indices = top_ranked)

    # Clip boxes to the image. Image shape is (batches, height, width, channels).
    image_shape = tf.shape(input_image)
    image_height = tf.cast(image_shape[1], dtype = tf.float32)
    image_width = tf.cast(image_shape[2], dtype = tf.float32)
    clipped_y1x1 = tf.maximum(proposals[:,0:2], 0.0)
    clipped_y2 = tf.minimum(proposals[:,2:3], image_height) # range slice keeps the [N,1] shape
    clipped_x2 = tf.minimum(proposals[:,3:4], image_width)
    proposals = tf.concat([ clipped_y1x1, clipped_y2, clipped_x2 ], axis = 1) # [N,4]

    # Discard boxes smaller than 16 pixels in either dimension
    box_heights = proposals[:,2] - proposals[:,0]
    box_widths = proposals[:,3] - proposals[:,1]
    large_enough = tf.where(tf.logical_and(box_heights >= 16, box_widths >= 16))
    proposals = tf.gather_nd(proposals, indices = large_enough)
    objectness_scores = tf.gather_nd(objectness_scores, indices = large_enough)

    # Non-maximum suppression
    survivors = tf.image.non_max_suppression(
      boxes = proposals,
      scores = objectness_scores,
      max_output_size = post_nms_limit,
      iou_threshold = 0.7
    )
    proposals = tf.gather(proposals, indices = survivors)

    return [ scores, box_delta_regressions, proposals ]

  def _extract_valid(self, anchor_map, anchor_valid_map, objectness_score_map, box_delta_map, allow_edge_proposals):
    """
    Flattens the per-location maps into per-anchor tensors and, unless
    allow_edge_proposals is set, keeps only entries whose anchor valid map
    value is greater than 0.

    Returns
    -------
    tf.Tensor, tf.Tensor, tf.Tensor
      Anchors ([N,4]), objectness scores ([N,]), and box deltas ([N,4]).
    """
    # anchor_valid_map shape is (batch,height,width,num_anchors)
    valid_map_shape = tf.shape(anchor_valid_map)
    total_anchors = valid_map_shape[1] * valid_map_shape[2] * valid_map_shape[3]

    anchors = tf.reshape(anchor_map, shape = (total_anchors, 4))                                  # [N,4], all anchors
    box_deltas = tf.reshape(box_delta_map, shape = (total_anchors, 4))                            # [N,4], predicted box delta regression targets
    anchors_valid = tf.squeeze(tf.reshape(anchor_valid_map, shape = (total_anchors, 1)))          # [N,], whether anchors are valid (i.e., do not cross image boundaries)
    scores = tf.squeeze(tf.reshape(objectness_score_map, shape = (total_anchors, 1)))             # [N,], predicted objectness scores

    if allow_edge_proposals:
      # Every anchor contributes a proposal
      return anchors, scores, box_deltas

    # Filter out those proposals generated at invalid anchors. Invalid anchors
    # are really just those that cross image boundaries and, counter-
    # intuitively, given that the Faster R-CNN paper (Section 3.3) says that
    # these anchors are ignored during loss calculation, they should in fact
    # be included when generating proposals. Good performance requires
    # evaluating lots of proposals, so even if cross-boundary anchors do not
    # contribute to RPN loss, they can still feed samples into the detector
    # stage. It is therefore not recommended to exclude edge proposals but the
    # option exists here for educational purposes.
    valid_idxs = tf.where(anchors_valid > 0)
    return tf.gather_nd(anchors, indices = valid_idxs), tf.gather_nd(scores, indices = valid_idxs), tf.gather_nd(box_deltas, indices = valid_idxs)

  @staticmethod
  def class_loss(y_predicted, gt_rpn_map):
    """
    Computes RPN class loss.

    Parameters
    ----------
    y_predicted : tf.Tensor
      A tensor of shape (batch_size, height, width, num_anchors) containing
      objectness scores (0 = background, 1 = object).
    gt_rpn_map : tf.Tensor
      Ground truth tensor of shape (batch_size, height, width, num_anchors, 6).
      Channel 0 flags anchors included in the mini-batch and channel 1 holds
      the object/background label.

    Returns
    -------
    tf.Tensor
      Scalar loss.
    """
    predicted_shape = tf.shape(y_predicted)

    # Ground truth labels and mini-batch membership mask, both reshaped to
    # match the predictions: (batch_size, height, width, num_anchors)
    y_true_class = tf.reshape(gt_rpn_map[:,:,:,:,1], shape = predicted_shape)
    y_mask = tf.reshape(gt_rpn_map[:,:,:,:,0], shape = predicted_shape)

    # Number of anchors actually used in the mini-batch (e.g., typically 256)
    N_cls = tf.cast(tf.math.count_nonzero(y_mask), dtype = tf.float32) + K.epsilon()

    # Element-wise loss over every anchor, with anchors outside of the
    # mini-batch zeroed out by the mask
    per_anchor_loss = K.binary_crossentropy(y_true_class, y_predicted)
    masked_loss = y_mask * per_anchor_loss

    # Total loss normalized by the number of anchors used
    return K.sum(masked_loss) / N_cls

  @staticmethod
  def regression_loss(y_predicted, gt_rpn_map):
    """
    Computes RPN box delta regression loss.

    Parameters
    ----------
    y_predicted : tf.Tensor
      A tensor of shape (batch_size, height, width, num_anchors * 4) containing
      RoI box delta regressions for each anchor, stored as: ty, tx, th, tw.
    gt_rpn_map : tf.Tensor
      Ground truth tensor of shape (batch_size, height, width, num_anchors, 6).
      Channel 0 flags anchors included in the mini-batch, channel 1 flags
      positive anchors, and channels 2-5 hold the regression targets.

    Returns
    -------
    tf.Tensor
      Scalar loss.
    """
    scale_factor = 1.0  # hyper-parameter that controls magnitude of regression loss and is chosen to make regression term comparable to class term
    sigma = 3.0         # see: https://github.com/rbgirshick/py-faster-rcnn/issues/89
    sigma_squared = sigma * sigma

    y_true_regression = tf.reshape(gt_rpn_map[:,:,:,:,2:6], shape = tf.shape(y_predicted))

    # Only anchors that are both in the mini-batch and positive (i.e.,
    # correspond to objects) contribute. Maps are (batch_size, height, width,
    # num_anchors).
    per_anchor_shape = tf.shape(gt_rpn_map)[0:4]
    y_included = tf.reshape(gt_rpn_map[:,:,:,:,0], shape = per_anchor_shape)
    y_positive = tf.reshape(gt_rpn_map[:,:,:,:,1], shape = per_anchor_shape)

    # The mask has one value per (y,x,k) position but we need 4 (one for each
    # of the regression variables). For example, y_predicted might be
    # (1,37,50,36) whereas the mask is (1,37,50,9), so each element of the last
    # dimension is repeated 4 times.
    y_mask = tf.repeat(y_included * y_positive, repeats = 4, axis = 3)

    # The paper normalizes by dividing by a quantity called N_reg, which is
    # equal to the total number of anchors (~2400) and then multiplying by
    # lambda=10. This does not make sense to me because we are summing over a
    # mini-batch at most, so we use N_cls here. I might be misunderstanding
    # what is going on but 10/2400 = 1/240 which is pretty close to 1/256 and
    # the paper mentions that training is relatively insensitive to choice of
    # normalization.
    N_cls = tf.cast(tf.math.count_nonzero(y_included), dtype = tf.float32) + K.epsilon()

    # Element-wise robust L1 loss for all 4 regression components: quadratic
    # below the 1/sigma^2 threshold and linear above it
    x = y_true_regression - y_predicted
    x_abs = tf.math.abs(x)
    use_quadratic = tf.stop_gradient(tf.cast(tf.less(x_abs, 1.0 / sigma_squared), dtype = tf.float32))
    quadratic_term = 0.5 * x * x * sigma_squared
    linear_term = x_abs - 0.5 / sigma_squared
    per_element_loss = use_quadratic * quadratic_term + (1.0 - use_quadratic) * linear_term

    # Zero out the terms that should not have been included
    masked_loss = y_mask * per_element_loss
    return scale_factor * K.sum(masked_loss) / N_cls


In [12]:
# models/vgg16.py

import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import models
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.initializers import glorot_normal


class FeatureExtractor(tf.keras.Model):
  """
  VGG-16 convolutional layers, from block1_conv1 through block5_conv3, used as
  the feature extractor. Layer names match those used here so that weights
  can be matched by name. Four stride-2 max pooling layers are applied, one
  after each of the first four blocks; block 5 is not followed by pooling.
  """
  def __init__(self, l2 = 0):
    """
    Parameters
    ----------
    l2 : float
      L2 penalty applied to the kernels of the convolutional layers in blocks
      3, 4, and 5.
    """
    super().__init__()

    # A single initializer and a single regularizer object are shared by all layers
    shared_initializer = glorot_normal()
    weight_penalty = tf.keras.regularizers.l2(l2)

    def conv3x3(layer_name, num_filters, **extra):
      # 3x3, stride 1, same-padded convolution with ReLU activation
      return Conv2D(name = layer_name, kernel_size = (3,3), strides = 1, filters = num_filters, padding = "same", activation = "relu", kernel_initializer = shared_initializer, **extra)

    def frozen_conv(layer_name, num_filters, **extra):
      # Non-trainable convolution without weight penalty
      return conv3x3(layer_name, num_filters, trainable = False, **extra)

    def regularized_conv(layer_name, num_filters):
      # Trainable convolution with L2 kernel penalty
      return conv3x3(layer_name, num_filters, kernel_regularizer = weight_penalty)

    def downsample():
      return MaxPooling2D(pool_size = 2, strides = 2)

    # Blocks 1 and 2 are frozen (not trainable)
    self._block1_conv1 = frozen_conv("block1_conv1", 64, input_shape = (None, None, 3))
    self._block1_conv2 = frozen_conv("block1_conv2", 64)
    self._block1_maxpool = downsample()

    self._block2_conv1 = frozen_conv("block2_conv1", 128)
    self._block2_conv2 = frozen_conv("block2_conv2", 128)
    self._block2_maxpool = downsample()

    # Weight decay begins from these layers onward: https://github.com/rbgirshick/py-faster-rcnn/blob/master/models/pascal_voc/VGG16/faster_rcnn_end2end/train.prototxt
    self._block3_conv1 = regularized_conv("block3_conv1", 256)
    self._block3_conv2 = regularized_conv("block3_conv2", 256)
    self._block3_conv3 = regularized_conv("block3_conv3", 256)
    self._block3_maxpool = downsample()

    self._block4_conv1 = regularized_conv("block4_conv1", 512)
    self._block4_conv2 = regularized_conv("block4_conv2", 512)
    self._block4_conv3 = regularized_conv("block4_conv3", 512)
    self._block4_maxpool = downsample()

    self._block5_conv1 = regularized_conv("block5_conv1", 512)
    self._block5_conv2 = regularized_conv("block5_conv2", 512)
    self._block5_conv3 = regularized_conv("block5_conv3", 512)

  def call(self, input_image):
    """
    Applies all layers in sequence to input_image and returns the output of
    block5_conv3.
    """
    pipeline = (
      self._block1_conv1, self._block1_conv2, self._block1_maxpool,
      self._block2_conv1, self._block2_conv2, self._block2_maxpool,
      self._block3_conv1, self._block3_conv2, self._block3_conv3, self._block3_maxpool,
      self._block4_conv1, self._block4_conv2, self._block4_conv3, self._block4_maxpool,
      self._block5_conv1, self._block5_conv2, self._block5_conv3
    )
    y = input_image
    for layer in pipeline:
      y = layer(y)
    return y


In [13]:
# models/faster_rcnn.py
#
# TensorFlow/Keras implementation of Faster R-CNN training and inference
# models. Here, all stages of Faster R-CNN are instantiated, ground truth
# labels from RPN proposal boxes (RoIs) for the detector stage are generated,
# and proposals are sampled.
#

#
# Weight Decay
# ------------
# Keras does not provide a weight decay option but rather an L2 penalty. Weight
# decay can be converted to L2 by dividing by 2. This is because the L2 penalty
# is added to the loss and then differentiated with respect to the weights
# (introducing a factor of 2 that must be canceled out). See:
# https://bbabenko.github.io/weight-decay/
#
# Pro-Tip
# -------
#
# To log the output of Keras layers using tf.print, wrap the print in a Keras Lambda layer as below:
#
#   def do_log1(x):
#     tf.print("best_ious=", x, output_stream = "file:///projects/frcnn/tf2/out.txt", summarize = -1)
#     return x
#   best_ious = Lambda(do_log1)(best_ious)
#
#   def do_log(x):
#     y_predicted = x[0]
#     y_true = x[1]
#     loss = K.mean(K.categorical_crossentropy(target = y_true, output = y_predicted, from_logits = True))
#     tf.print("loss=", loss, "y_predicted=", y_predicted, output_stream = "file:///projects/frcnn/tf2/out.txt", summarize = -1)
#     return y_predicted
#   y_predicted = Lambda(do_log)((y_predicted, y_true))
#
# output_stream may also be a file stream like sys.stdout.
#

import numpy as np
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras import Model
from tensorflow.keras.layers import Lambda


class FasterRCNNModel(tf.keras.Model):
  def __init__(self, num_classes, allow_edge_proposals, custom_roi_pool, activate_class_outputs, l2 = 0, dropout_probability = 0):
    """
    Instantiates the three stages of the model: feature extractor, region
    proposal network, and detector network.

    Parameters
    ----------
    num_classes : int
      Number of classes, passed on to the detector network.
    allow_edge_proposals : bool
      Passed on to the region proposal network.
    custom_roi_pool : bool
      Passed on to the detector network.
    activate_class_outputs : bool
      Passed on to the detector network. When false, detector class outputs
      are treated as logits by the loss and by post-processing.
    l2 : float
      L2 penalty, passed on to all three stages.
    dropout_probability : float
      Dropout probability, passed on to the detector network.
    """
    super().__init__()
    self._num_classes = num_classes
    self._activate_class_outputs = activate_class_outputs

    # Stage 1: feature extractor
    self._stage1_feature_extractor = FeatureExtractor(l2 = l2)

    # Stage 2: region proposal network, with separate proposal budgets for
    # training and inference
    proposal_limits = dict(
      max_proposals_pre_nms_train = 12000,
      max_proposals_post_nms_train = 2000,
      max_proposals_pre_nms_infer = 6000,
      max_proposals_post_nms_infer = 300
    )
    self._stage2_region_proposal_network = RegionProposalNetwork(l2 = l2, allow_edge_proposals = allow_edge_proposals, **proposal_limits)

    # Stage 3: detector network
    self._stage3_detector_network = DetectorNetwork(
      num_classes = num_classes,
      custom_roi_pool = custom_roi_pool,
      activate_class_outputs = activate_class_outputs,
      l2 = l2,
      dropout_probability = dropout_probability
    )

  def call(self, inputs, training = False):
    """
    Runs all three stages. When training, proposals are additionally labeled
    and sampled to produce detector ground truth, and the four losses are
    computed and registered with the model as both losses and metrics.

    Parameters
    ----------
    inputs : list
      Input maps. Always present:
        - Input image: (1, height_pixels, width_pixels, 3)
        - Anchor map: (1, height, width, num_anchors * 4)
        - Anchor valid map: (1, height, width, num_anchors)
      Additionally required when training:
        - Ground truth RPN map: (1, height, width, num_anchors, 6)
        - Ground truth box class indices: (1, num_gt_boxes)
        - Ground truth box corners: (1, num_gt_boxes, 4)
    training : bool
      Whether the model is being run in training mode.

    Returns
    -------
    list
      [ rpn_scores, rpn_box_deltas, detector_classes, detector_box_deltas,
      proposals, rpn_class_loss, rpn_regression_loss, detector_class_loss,
      detector_regression_loss ]. In inference mode, the four losses are
      float("inf") and should be ignored.
    """
    rpn = self._stage2_region_proposal_network
    detector = self._stage3_detector_network

    # Unpack inputs. Ground truth maps are only supplied when training.
    input_image, anchor_map, anchor_valid_map = inputs[0], inputs[1], inputs[2]
    if training:
      gt_rpn_map, gt_box_class_idxs_map, gt_box_corners_map = inputs[3], inputs[4], inputs[5]

    # Stage 1: Extract features
    feature_map = self._stage1_feature_extractor(input_image = input_image, training = training)

    # Stage 2: Generate object proposals using RPN
    rpn_outputs = rpn(inputs = [ input_image, feature_map, anchor_map, anchor_valid_map ], training = training)
    rpn_scores, rpn_box_deltas, proposals = rpn_outputs

    if training:
      # The detector's ground truth must be derived from the RPN's proposals:
      # label each proposal, then take a random sample for detector training.
      # Only a batch size of 1 is handled for now, hence the [0] indexing.
      proposals, gt_classes, gt_box_deltas = self._label_proposals(
        proposals = proposals,
        gt_box_class_idxs = gt_box_class_idxs_map[0],
        gt_box_corners = gt_box_corners_map[0],
        min_background_iou_threshold = 0.0,
        min_object_iou_threshold = 0.5
      )
      proposals, gt_classes, gt_box_deltas = self._sample_proposals(
        proposals = proposals,
        gt_classes = gt_classes,
        gt_box_deltas = gt_box_deltas,
        max_proposals = 128,
        positive_fraction = 0.25
      )

      # Add the batch dimension expected by the loss functions and make sure
      # proposals and ground truth are treated as constants (no gradients):
      #   gt_classes:    (N,num_classes) -> (1,N,num_classes)
      #   gt_box_deltas: (N,2,(num_classes-1)*4) -> (1,N,2,(num_classes-1)*4)
      proposals = tf.stop_gradient(proposals)
      gt_classes = tf.stop_gradient(tf.expand_dims(gt_classes, axis = 0))
      gt_box_deltas = tf.stop_gradient(tf.expand_dims(gt_box_deltas, axis = 0))

    # Stage 3: Detector
    detector_classes, detector_box_deltas = detector(inputs = [ input_image, feature_map, proposals ], training = training)

    # Losses
    if training:
      rpn_class_loss = rpn.class_loss(y_predicted = rpn_scores, gt_rpn_map = gt_rpn_map)
      rpn_regression_loss = rpn.regression_loss(y_predicted = rpn_box_deltas, gt_rpn_map = gt_rpn_map)
      detector_class_loss = detector.class_loss(y_predicted = detector_classes, y_true = gt_classes, from_logits = not self._activate_class_outputs)
      detector_regression_loss = detector.regression_loss(y_predicted = detector_box_deltas, y_true = gt_box_deltas)

      # Register every loss with the model and also expose it as a metric
      named_losses = [
        ("rpn_class_loss", rpn_class_loss),
        ("rpn_regression_loss", rpn_regression_loss),
        ("detector_class_loss", detector_class_loss),
        ("detector_regression_loss", detector_regression_loss)
      ]
      for _, loss_value in named_losses:
        self.add_loss(loss_value)
      for metric_name, loss_value in named_losses:
        self.add_metric(loss_value, name = metric_name)
    else:
      # Losses cannot be computed during inference and should be ignored
      rpn_class_loss = rpn_regression_loss = detector_class_loss = detector_regression_loss = float("inf")

    # Return outputs
    return [
      rpn_scores,
      rpn_box_deltas,
      detector_classes,
      detector_box_deltas,
      proposals,
      rpn_class_loss,
      rpn_regression_loss,
      detector_class_loss,
      detector_regression_loss
    ]

  def predict_on_batch(self, x, score_threshold):
    """
    Runs inference on a single image and returns scored boxes. This overrides
    the default Keras implementation, which is still used internally to obtain
    the raw model outputs.

    Parameters
    ----------
    x : List[np.ndarray]
      List of input maps, each of batch size 1:
        - Input image: (1, height_pixels, width_pixels, 3)
        - Anchor map: (1, height, width, num_anchors * 4)
        - Anchor valid map: (1, height, width, num_anchors)
    score_threshold : float
      Minimum class score for detections. Detections scoring below this value
      are discarded.

    Returns
    -------
    Dict[int, Tuple[float, float, float, float, float]]
      Scored boxes by class index. Each box is a tuple of
      (y_min, x_min, y_max, x_max, score).
    """
    # The model produces 9 outputs. Only the detector outputs and the
    # proposals are needed here; RPN maps and the four loss placeholders are
    # discarded.
    (
      _rpn_scores, _rpn_box_deltas,
      detector_classes, detector_box_deltas, proposals,
      _rpn_class_loss, _rpn_regression_loss, _detector_class_loss, _detector_regression_loss
    ) = super().predict_on_batch(x = x)

    return self._predictions_to_scored_boxes(
      input_image = x[0],
      classes = detector_classes,
      box_deltas = detector_box_deltas,
      proposals = proposals,
      score_threshold = score_threshold
    )

  def load_imagenet_weights(self):
    """
    Load weights from Keras VGG-16 model pre-trained on ImageNet into the
    feature extractor convolutional layers as well as the two fully connected
    layers in the detector stage.

    Layers are matched by name: for every layer of the Keras model that has
    weights, the first layer with the same name among the feature extractor's
    and detector network's layers receives those weights. Keras layers with no
    same-named counterpart are skipped.
    """
    keras_model = tf.keras.applications.VGG16(weights = "imagenet")

    # The set of candidate destination layers does not change while weights
    # are being copied, so it is assembled once rather than per source layer
    vgg16_layers = self._stage1_feature_extractor.layers + self._stage3_detector_network.layers

    for keras_layer in keras_model.layers:
      weights = keras_layer.get_weights()
      if len(weights) == 0:
        continue  # nothing to copy from this layer

      # First of our layers sharing the Keras layer's name, if any
      our_layer = next((layer for layer in vgg16_layers if layer.name == keras_layer.name), None)
      if our_layer is None:
        continue

      print("Loading VGG-16 ImageNet weights into layer: %s" % our_layer.name)
      our_layer.set_weights(weights)

  def _predictions_to_scored_boxes(self, input_image, classes, box_deltas, proposals, score_threshold):
    """
    Converts the detector outputs for one image into final detections: box
    deltas are decoded relative to the proposals, boxes are clipped to the
    image, low-scoring boxes are dropped, and non-maximum suppression is run
    separately for each class.

    Parameters
    ----------
    input_image : np.ndarray
      Image with a leading batch dimension of 1; axes 1 and 2 are used as
      height and width for clipping.
    classes : np.ndarray
      Class outputs, (1, N, num_classes). Treated as logits and passed through
      softmax when the model does not activate its class outputs.
    box_deltas : np.ndarray
      Box deltas, (1, N, 4 * (num_classes - 1)), as (ty, tx, th, tw) per
      non-background class.
    proposals : np.ndarray
      Proposal corners, (N, 4), as (y1, x1, y2, x2).
    score_threshold : float
      Only boxes scoring strictly above this value are kept.

    Returns
    -------
    Dict[int, np.ndarray]
      For each class index from 1 upward, an (M, 5) array with rows
      (y1, x1, y2, x2, score).
    """
    # Strip the batch axis
    image = np.squeeze(input_image, axis = 0)
    class_scores = np.squeeze(classes, axis = 0)
    deltas = np.squeeze(box_deltas, axis = 0)

    # Logits mode: turn the raw outputs into a probability distribution
    if not self._activate_class_outputs:
      class_scores = tf.nn.softmax(class_scores, axis = 1).numpy()

    # Proposals expressed as (center_y, center_x, height, width)
    proposal_anchors = np.empty(proposals.shape)
    proposal_anchors[:,0:2] = 0.5 * (proposals[:,0:2] + proposals[:,2:4])
    proposal_anchors[:,2:4] = proposals[:,2:4] - proposals[:,0:2]

    max_y = image.shape[0] - 1
    max_x = image.shape[1] - 1
    num_proposals = proposals.shape[0]

    # Handle each foreground class in turn (class 0 is background)
    scored_boxes_by_class_idx = {}
    for class_idx in range(1, class_scores.shape[1]):
      # The 4 regression parameters for this class, for every proposal
      first_col = 4 * (class_idx - 1)
      class_boxes = convert_deltas_to_boxes(
        box_deltas = deltas[:, first_col : first_col + 4],
        anchors = proposal_anchors,
        box_delta_means = [0.0, 0.0, 0.0, 0.0],
        box_delta_stds = [0.1, 0.1, 0.2, 0.2]
      )

      # Keep boxes inside the image: even columns are y, odd columns are x
      class_boxes[:,0::2] = np.clip(class_boxes[:,0::2], 0, max_y)
      class_boxes[:,1::2] = np.clip(class_boxes[:,1::2], 0, max_x)

      # Discard everything that does not score above the threshold
      class_box_scores = class_scores[:,class_idx]
      keep = np.where(class_box_scores > score_threshold)[0]
      class_boxes = class_boxes[keep]
      class_box_scores = class_box_scores[keep]

      # Non-maximum suppression among the survivors of this class
      nms_idxs = tf.image.non_max_suppression(
        boxes = class_boxes,
        scores = class_box_scores,
        max_output_size = num_proposals,
        iou_threshold = 0.3
      ).numpy()

      # Rows of (y1, x1, y2, x2, score)
      score_column = class_box_scores[nms_idxs].reshape(-1, 1)
      scored_boxes_by_class_idx[class_idx] = np.hstack([ class_boxes[nms_idxs], score_column ])

    return scored_boxes_by_class_idx

  def _label_proposals(self, proposals, gt_box_class_idxs, gt_box_corners, min_background_iou_threshold, min_object_iou_threshold):
    """
    Determines which proposals generated by the RPN stage overlap with ground
    truth boxes and creates ground truth labels for the subsequent detector
    stage.

    Parameters
    ----------
    proposals : tf.Tensor
      Proposal corners, shaped (N, 4), where each corner is:
      (y_min, x_min, y_max, x_max).
    gt_box_class_idxs : tf.Tensor
      The class index for each ground truth box, shaped (M,), where M is the
      number of ground truth boxes.
    gt_box_corners: tf.Tensor
      Ground truth box corners, shaped (M, 4).
    min_background_iou_threshold : float
      Minimum IoU with ground truth boxes below which proposals are ignored
      entirely. Proposals with an IoU in the range
      [min_background_iou_threshold, min_object_iou_threshold) are labeled as
      background. This value can be greater than 0, which has the effect of
      selecting more difficult background examples that have some degree of
      overlap with ground truth boxes.
    min_object_iou_threshold : float
      Minimum IoU threshold for a proposal to be labeled as an object.

    Returns
    -------
    tf.Tensor, tf.Tensor, tf.Tensor
      Proposals, (N, 4), labeled as either objects or background (depending on
      IoU thresholds, some proposals can end up as neither and are excluded
      here); one-hot encoded class labels, (N, num_classes), for each proposal;
      and box delta regression targets, (N, 2, (num_classes - 1) * 4), for each
      proposal. Note that N here is the number of retained proposals, which
      differs from the input count: the ground truth boxes are appended as
      extra proposals and proposals below the background threshold are removed.
      Regression target values are present at locations [:,1,:] and
      consist of (ty, tx, th, tw) for the class that the box corresponds to.
      The entries for all other classes and the background classes should be
      ignored. A mask is written to locations [:,0,:]. For each proposal
      assigned a non-background class, there will be 4 consecutive elements
      marked with 1 indicating the corresponding regression target values are
      to be used. There are no regression targets for background proposals and
      the mask is entirely 0 for those proposals.
    """
    # Let's be crafty and create some fake proposals that match the ground
    # truth boxes exactly. This isn't strictly necessary and the model should
    # work without it but it will help training and will ensure that there are
    # always some positive examples to train on.
    proposals = tf.concat([ proposals, gt_box_corners ], axis = 0)

    # Compute IoU between each proposal (N,4) and each ground truth box (M,4)
    # -> (N, M)
    ious = tf_intersection_over_union(boxes1 = proposals, boxes2 = gt_box_corners)

    # Find the best IoU for each proposal, the class of the ground truth box
    # associated with it, and the box corners
    best_ious = tf.math.reduce_max(ious, axis = 1)  # (N,) of maximum IoUs for each of the N proposals
    box_idxs = tf.math.argmax(ious, axis = 1)       # (N,) of ground truth box index for each proposal
    gt_box_class_idxs = tf.gather(gt_box_class_idxs, indices = box_idxs)  # (N,) of class indices of highest-IoU box for each proposal
    gt_box_corners = tf.gather(gt_box_corners, indices = box_idxs)        # (N,4) of box corners of highest-IoU box for each proposal

    # Remove all proposals whose best IoU is less than the minimum threshold
    # for a negative (background) sample. Only this threshold comparison is
    # performed; there is no separate IoU > 0 test.
    # NOTE(review): with a threshold of 0, a zero-area proposal (e.g., produced
    # by earlier clipping) would pass this filter and then divide by zero in
    # the regression target computation below -- confirm that such proposals
    # cannot reach this point.
    idxs = tf.where(best_ious >= min_background_iou_threshold)  # keep proposals w/ sufficiently high IoU
    proposals = tf.gather_nd(proposals, indices = idxs)
    best_ious = tf.gather_nd(best_ious, indices = idxs)
    gt_box_class_idxs = tf.gather_nd(gt_box_class_idxs, indices = idxs)
    gt_box_corners = tf.gather_nd(gt_box_corners, indices = idxs)

    # IoUs less than min_object_iou_threshold will be labeled as background
    # (class index 0) by multiplying their class index with a 0 mask value
    retain_mask = tf.cast(best_ious >= min_object_iou_threshold, dtype = gt_box_class_idxs.dtype) # (N,), with 0 wherever best_iou < threshold, else 1
    gt_box_class_idxs = gt_box_class_idxs * retain_mask

    # One-hot encode class labels
    num_classes = self._num_classes
    gt_classes = tf.one_hot(indices = gt_box_class_idxs, depth = num_classes) # (N,num_classes)

    # Convert proposals and ground truth boxes into "anchor" format (center
    # points and side lengths). For the detector stage, the proposals serve as
    # the anchors relative to which the final box predictions will be
    # regressed.
    proposal_centers = 0.5 * (proposals[:,0:2] + proposals[:,2:4])          # center_y, center_x
    proposal_sides = proposals[:,2:4] - proposals[:,0:2]                    # height, width
    gt_box_centers = 0.5 * (gt_box_corners[:,0:2] + gt_box_corners[:,2:4])  # center_y, center_x
    gt_box_sides = gt_box_corners[:,2:4] - gt_box_corners[:,0:2]            # height, width

    # Compute regression targets (ty, tx, th, tw) for each proposal based on
    # the best box selected. The same mean and standard deviation values are
    # used by _predictions_to_scored_boxes when decoding predictions.
    detector_box_delta_means = tf.constant([0, 0, 0, 0], dtype = tf.float32)
    detector_box_delta_stds = tf.constant([0.1, 0.1, 0.2, 0.2], dtype = tf.float32)
    tyx = (gt_box_centers - proposal_centers) / proposal_sides  # ty = (gt_center_y - proposal_center_y) / proposal_height, tx = (gt_center_x - proposal_center_x) / proposal_width
    thw = tf.math.log(gt_box_sides / proposal_sides)            # th = log(gt_height / proposal_height), tw = log(gt_width / proposal_width)
    box_delta_targets = tf.concat([ tyx, thw ], axis = 1)       # (N,4) box delta regression targets tensor
    box_delta_targets = (box_delta_targets - detector_box_delta_means) / detector_box_delta_stds  # mean and standard deviation adjustment

    # Convert regression targets into a map of shape (N,2,4*(C-1)) where C is
    # the number of classes and [:,0,:] specifies a mask for the corresponding
    # target components at [:,1,:]. Targets are ordered (ty, tx, th, tw).
    # Background class 0 is not present at all.
    gt_box_deltas_mask = tf.repeat(gt_classes, repeats = 4, axis = 1)[:,4:]             # create masks using interleaved repetition, remembering to discard class 0
    gt_box_deltas_values = tf.tile(box_delta_targets, multiples = [1, num_classes - 1]) # populate regression targets with straightforward repetition of each row (only those columns corresponding to class will be masked on)
    gt_box_deltas_mask = tf.expand_dims(gt_box_deltas_mask, axis = 0)     # (N,4*(C-1)) -> (1,N,4*(C-1))
    gt_box_deltas_values = tf.expand_dims(gt_box_deltas_values, axis = 0) # (N,4*(C-1)) -> (1,N,4*(C-1))
    gt_box_deltas = tf.concat([ gt_box_deltas_mask, gt_box_deltas_values ], axis = 0) # (2,N,4*(C-1))
    gt_box_deltas = tf.transpose(gt_box_deltas, perm = [ 1, 0, 2])        # (N,2,4*(C-1))

    return proposals, gt_classes, gt_box_deltas

  def _sample_proposals(self, proposals, gt_classes, gt_box_deltas, max_proposals, positive_fraction):
    """
    Randomly selects a subset of labeled proposals, balancing positive
    (object) and negative (background) examples.

    Parameters
    ----------
    proposals : tf.Tensor
      Labeled proposals, one per row. Presumably the (N, 4) output of
      _label_proposals -- confirm against the caller.
    gt_classes : tf.Tensor
      One-hot class labels, (N, num_classes). A proposal whose highest-scoring
      column is 0 is treated as background.
    gt_box_deltas : tf.Tensor
      Box delta regression targets, one entry per proposal along axis 0.
    max_proposals : int
      Maximum number of proposals to return. If <= 0, sampling is disabled and
      the inputs are returned unchanged.
    positive_fraction : float
      Desired fraction of the returned samples that are positive.

    Returns
    -------
    tf.Tensor, tf.Tensor, tf.Tensor
      The sampled rows of proposals, gt_classes, and gt_box_deltas, in that
      order. Positive samples come first, followed by negative samples.
    """
    if max_proposals <= 0:
      return proposals, gt_classes, gt_box_deltas

    # Get positive and negative (background) proposals
    class_indices = tf.argmax(gt_classes, axis = 1) # (N,num_classes) -> (N,), where each element is the class index (highest score from its row)
    positive_indices = tf.squeeze(tf.where(class_indices > 0), axis = 1)  # (P,), tensor of P indices (the positive, non-background classes in class_indices)
    negative_indices = tf.squeeze(tf.where(class_indices <= 0), axis = 1) # (N,), tensor of N indices (the negative, background classes in class_indices)
    num_positive_proposals = tf.size(positive_indices)
    num_negative_proposals = tf.size(negative_indices)

    # Select positive and negative samples, if there are enough. Note that the
    # number of positive samples can be either the positive fraction of the
    # *actual* number of proposals *or* the *desired* number (max_proposals).
    # In practice, these yield virtually identical results but the latter
    # method will yield slightly more positive samples in the rare cases when
    # the number of proposals is below the desired number. Here, we use the
    # former method but others, such as Yun Chen, use the latter. To implement
    # it, replace num_samples with max_proposals in the line that computes
    # desired_positive_samples. I am not sure what the original Faster R-CNN
    # implementation does.
    num_samples = tf.minimum(max_proposals, tf.size(class_indices))
    desired_positive_samples = tf.cast(tf.math.round(tf.cast(num_samples, dtype = float) * positive_fraction), dtype = num_samples.dtype)
    num_positive_samples = tf.minimum(desired_positive_samples, num_positive_proposals)
    num_negative_samples = tf.minimum(num_samples - num_positive_samples, num_negative_proposals)

    # Sample randomly
    positive_sample_indices = tf.random.shuffle(positive_indices)[:num_positive_samples]
    negative_sample_indices = tf.random.shuffle(negative_indices)[:num_negative_samples]
    indices = tf.concat([ positive_sample_indices, negative_sample_indices ], axis = 0)

    # The samples are always returned, even if there are no negative samples
    # among them, which occurs very rarely. An earlier variant (and the
    # author's PyTorch version) returned empty tensors via tf.cond when either
    # the positive or negative group was empty; per the author's notes, it was
    # dropped because tf.cond did not work with KerasTensor under tf.function.
    # Positive samples are guaranteed to exist because _label_proposals inserts
    # the ground truth boxes as fake proposals to boost learning.
    return tf.gather(proposals, indices = indices), tf.gather(gt_classes, indices = indices), tf.gather(gt_box_deltas, indices = indices)


In [26]:
class Options:
    """
    Container for the notebook's run settings. A single instance, `options`,
    is created right after the class and read by the other cells.
    """
    def __init__(self):
        # Run modes
        self.train = True
        self.eval = False
        self.predict = False
        self.predict_to_file = ""
        self.predict_all = ""

        # Dataset
        self.dataset_dir = "/kaggle/input/chess-object-detection/chess-pieces-dataset/"
        self.train_split = "train"
        self.eval_split = "test"
        self.cache_images = False
        self.no_augment = True  # True disables augmentation

        # Weights files
        # NOTE(review): load_from, save_to, and save_best_to all name the same
        # file, so whichever is written last wins -- confirm this is intended.
        self.load_from = "fasterrcnn_tf2.h5"
        self.save_to = "fasterrcnn_tf2.h5"
        self.save_best_to = "fasterrcnn_tf2.h5"
        self.checkpoint_dir = False  # falsy disables checkpoints

        # Logging and evaluation during training
        self.periodic_eval_samples = False
        self.plot = False
        self.log_csv = "log.csv"

        # Optimizer: "sgd" or "adam" (see create_optimizer)
        self.optimizer = "adam"
        self.learning_rate = 1e-3
        self.epochs = 10
        self.clipnorm = 0.0  # gradient norm clipping is enabled only when > 0
        self.momentum = 0.9  # SGD only
        self.beta1 = 0.9     # Adam only
        self.beta2 = 0.999   # Adam only
        self.weight_decay = 5e-4
        self.dropout = 0.0

        # Model variants
        self.custom_roi_pool = 0.0  # used as a flag: falsy selects crop-and-resize w/ max pool
        self.detector_logits = False
        self.exclude_edge_proposals = True

        # Debugging aids. dump_anchors is the output directory used by
        # render_anchors(), so it must be a path string; empty disables it
        # (a bare True cannot be used as a directory name).
        self.dump_anchors = ""
        self.debug_dir = ""


options = Options()

In [15]:
# Run-time environment
cuda_available = tf.test.is_built_with_cuda()
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the replacement recommended by TensorFlow
gpu_available = len(tf.config.list_physical_devices("GPU")) > 0
print("CUDA Available : %s" % ("yes" if cuda_available else "no"))
print("GPU Available  : %s" % ("yes" if gpu_available else "no"))
print("Eager Execution: %s" % ("yes" if tf.executing_eagerly() else "no"))

CUDA Available : yes
GPU Available  : yes
Eager Execution: yes


2022-04-24 04:52:53.101337: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-24 04:52:53.156667: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-24 04:52:53.245568: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-24 04:52:53.246316: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [16]:
def render_anchors():
    """
    Passes every sample of the training split to show_anchors(), writing one
    "anchors_<image file name>.png" output per sample into the directory named
    by options.dump_anchors. The directory is created if it does not exist.

    Raises
    ------
    TypeError
      If options.dump_anchors is not a path (str, bytes, or os.PathLike).
    """
    # Fail fast, before loading the dataset, if the setting is not a path
    output_dir = os.fspath(options.dump_anchors)

    training_data = Dataset(dir=options.dataset_dir, split=options.train_split, augment=False, shuffle=False)

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    print("Rendering anchors from '%s' to set '%s'..." % (options.train_split, options.dump_anchors))
    for sample in training_data:
        output_path = os.path.join(output_dir, "anchors_" + os.path.basename(sample.filepath) + ".png")
        show_anchors(
            output_path=output_path,
            image=sample.image,
            anchor_map=sample.anchor_map,
            anchor_valid_map=sample.anchor_valid_map,
            gt_rpn_map=sample.gt_rpn_map,
            gt_boxes=sample.gt_boxes
        )

In [17]:
# Optional procedure, currently disabled: dump anchor renderings to disk.
# if options.dump_anchors:
#     render_anchors()

# When a debug directory is configured, have TensorFlow dump debug info there
if options.debug_dir:
    tf.debugging.experimental.enable_dump_debug_info(
        options.debug_dir,
        tensor_debug_mode="FULL_HEALTH",
        circular_buffer_size=-1
    )

In [18]:
class CSVLog:
  """
  Logs rows of key/value items to a CSV file. The first call to log() on an
  instance truncates the file and writes a header row taken from the item
  keys; subsequent calls append one row each.
  """
  def __init__(self, filename):
    self._filename = filename
    self._header_written = False

  def log(self, items):
    """
    Writes one row to the log file.

    Parameters
    ----------
    items : Dict[str, Any]
      Column name -> value. Values are converted with str(). The keys of the
      first call become the header; later calls are expected to supply the
      same keys in the same order (this is not checked).
    """
    import csv  # local import keeps this cell self-contained

    file_mode = "a" if self._header_written else "w"
    # newline = "" lets the csv module control line endings, which is required
    # for fields with embedded newlines to round-trip correctly
    with open(self._filename, file_mode, newline = "") as fp:
      # csv.writer quotes fields containing commas, quotes, or newlines, which
      # a plain ",".join() would silently turn into a corrupt row
      writer = csv.writer(fp, lineterminator = "\n")
      if not self._header_written:
        writer.writerow(list(items.keys()))
        self._header_written = True
      writer.writerow([ str(value) for value in items.values() ])

class BestWeightsTracker:
  """
  Keeps a copy of the model weights from the epoch with the highest mean
  average precision (mAP) seen so far, so that they can later be restored
  into the model and saved to a file.
  """
  def __init__(self, filepath):
    self._best_mAP = 0
    self._best_weights = None
    self._filepath = filepath

  def on_epoch_end(self, model, mAP):
    """
    Snapshots the model's current weights if mAP is strictly greater than the
    best value recorded so far (initially 0).
    """
    is_improvement = mAP > self._best_mAP
    if not is_improvement:
      return
    self._best_weights = model.get_weights()
    self._best_mAP = mAP

  def restore_and_save_best_weights(self, model):
    """
    Loads the best recorded weights into the model and writes them to the
    tracker's file in h5 format. Does nothing if no weights were recorded.
    """
    best_weights = self._best_weights
    if best_weights is None:
      return
    model.set_weights(best_weights)
    model.save_weights(filepath = self._filepath, overwrite = True, save_format = "h5")
    print("Saved best model weights (Mean Average Precision = %1.2f%%) to '%s'" % (self._best_mAP, self._filepath))



In [19]:
from tqdm import tqdm
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam

def create_optimizer():
    """
    Builds the Keras optimizer selected by options.optimizer.

    "sgd" uses options.learning_rate and options.momentum; "adam" uses
    options.learning_rate, options.beta1, and options.beta2. Gradient norm
    clipping is passed along only when options.clipnorm is greater than 0.

    Raises
    ------
    ValueError
      If options.optimizer is neither "sgd" nor "adam".
    """
    clipping_args = {"clipnorm": options.clipnorm} if options.clipnorm > 0 else {}
    optimizer_name = options.optimizer

    if optimizer_name == "sgd":
        return SGD(learning_rate=options.learning_rate, momentum=options.momentum, **clipping_args)
    if optimizer_name == "adam":
        return Adam(learning_rate=options.learning_rate, beta_1=options.beta1, beta_2=options.beta2, **clipping_args)
    raise ValueError("Optimizer must be 'sgd' for stochastic gradient descent or 'adam' for Adam")

def _sample_rpn_minibatch(rpn_map, object_indices, background_indices, rpn_minibatch_size):
    """
    Selects anchors for training and produces a copy of the RPN ground truth
    map with only those anchors marked as trainable.

    Parameters
    ----------
    rpn_map : np.ndarray
      RPN ground truth map of shape
      (batch_size, height, width, num_anchors, 6).
    object_indices : List[np.ndarray]
      For each image in the batch, a map of shape (N, 3) of indices (y, x, k)
      of all N object anchors in the RPN ground truth map.
    background_indices : List[np.ndarray]
      For each image in the batch, a map of shape (M, 3) of indices of all M
      background anchors in the RPN ground truth map.

    Returns
    -------
    np.ndarray
      A copy of the RPN ground truth map with index 0 of the last dimension
      recomputed to include only anchors in the minibatch.
    """
    assert rpn_map.shape[0] == 1, "Batch size must be 1"
    assert len(object_indices) == 1, "Batch size must be 1"
    assert len(background_indices) == 1, "Batch size must be 1"
    positive_anchors = object_indices[0]
    negative_anchors = background_indices[0]
    assert len(positive_anchors) + len(
        negative_anchors) >= rpn_minibatch_size, "Image has insufficient anchors for RPN minibatch size of %d" % rpn_minibatch_size
    assert len(positive_anchors) > 0, "Image does not have any positive anchors"
    assert rpn_minibatch_size % 2 == 0, "RPN minibatch size must be evenly divisible"

    # Sample, producing indices into the index maps
    num_positive_anchors = len(positive_anchors)
    num_negative_anchors = len(negative_anchors)
    num_positive_samples = min(rpn_minibatch_size // 2,
                               num_positive_anchors)  # up to half the samples should be positive, if possible
    num_negative_samples = rpn_minibatch_size - num_positive_samples  # the rest should be negative
    positive_anchor_idxs = random.sample(range(num_positive_anchors), num_positive_samples)
    negative_anchor_idxs = random.sample(range(num_negative_anchors), num_negative_samples)

    # Construct index expressions into RPN map
    positive_anchors = positive_anchors[positive_anchor_idxs]
    negative_anchors = negative_anchors[negative_anchor_idxs]
    trainable_anchors = np.concatenate([positive_anchors, negative_anchors])
    batch_idxs = np.zeros(len(trainable_anchors), dtype=int)
    trainable_idxs = (batch_idxs, trainable_anchors[:, 0], trainable_anchors[:, 1], trainable_anchors[:, 2], 0)

    # Create a copy of the RPN map with samples set as trainable
    rpn_minibatch_map = rpn_map.copy()
    rpn_minibatch_map[:, :, :, :, 0] = 0
    rpn_minibatch_map[trainable_idxs] = 1

    return rpn_minibatch_map

In [20]:
def _convert_training_sample_to_model_input(sample, mode):
    """
    Converts a training sample obtained from the dataset into an input vector
    that can be passed to the model.

    Parameters
    ----------
    sample : datasets.training_sample.TrainingSample
      Training sample obtained from dataset.
    mode : str
      "train" if the input vector will be fed into a training model otherwise
      "infer".

    Returns
    -------
    List[np.ndarray], np.ndarray, np.ndarray
      Input vector for model (see model definition for details), image data,
      and ground truth RPN minibatch map. All maps are converted to a batch
      size of 1 as expected by Keras model.
    """

    # Ground truth boxes to NumPy arrays
    gt_box_corners = np.array([box.corners for box in sample.gt_boxes]).astype(
        np.float32)  # (num_boxes,4), where each row is (y1,x1,y2,x2)
    gt_box_class_idxs = np.array([box.class_index for box in sample.gt_boxes]).astype(
        np.int32)  # (num_boxes,), where each is an index [1,num_classes)

    # Expand all maps to a batch size of 1
    image_data = np.expand_dims(sample.image_data, axis=0)
    image_shape_map = np.array(
        [[image_data.shape[1], image_data.shape[2], image_data.shape[3]]])  # (1,3), with (height,width,channels)
    anchor_map = np.expand_dims(sample.anchor_map, axis=0)
    anchor_valid_map = np.expand_dims(sample.anchor_valid_map, axis=0)
    gt_rpn_map = np.expand_dims(sample.gt_rpn_map, axis=0)
    gt_rpn_object_indices = [sample.gt_rpn_object_indices]
    gt_rpn_background_indices = [sample.gt_rpn_background_indices]
    gt_box_corners = np.expand_dims(gt_box_corners, axis=0)
    gt_box_class_idxs = np.expand_dims(gt_box_class_idxs, axis=0)

    # Create a RPN minibatch: sample anchors randomly and create a new ground
    # truth RPN map
    gt_rpn_minibatch_map = _sample_rpn_minibatch(
        rpn_map=gt_rpn_map,
        object_indices=gt_rpn_object_indices,
        background_indices=gt_rpn_background_indices,
        rpn_minibatch_size=256
    )

    # Input vector to model
    if mode == "train":
        x = [image_data, anchor_map, anchor_valid_map, gt_rpn_minibatch_map, gt_box_class_idxs, gt_box_corners]
    else:  # "infer"
        x = [image_data, anchor_map, anchor_valid_map]

    # Return all plus some unpacked elements for convenience
    return x, image_data, gt_rpn_minibatch_map


In [21]:
def evaluate(model, eval_data=None, num_samples=None, plot=False, print_average_precisions=False):
    """
    Runs the model over an evaluation dataset and reports the mean average
    precision.

    Parameters
    ----------
    model
      Model whose predict_on_batch(x, score_threshold) returns scored boxes by
      class index.
    eval_data : Dataset, optional
      Dataset to evaluate. If None, the split named by options.eval_split is
      loaded from options.dataset_dir without augmentation or shuffling.
    num_samples : int, optional
      Stop after this many samples. Defaults to the dataset's num_samples. The
      limit is tested after each sample is processed.
    plot : bool
      If True, plot the average precisions after evaluation.
    print_average_precisions : bool
      If True, print the average precisions per class.

    Returns
    -------
    float
      Mean average precision as a percentage.
    """
    dataset = eval_data
    if dataset is None:
        dataset = Dataset(dir=options.dataset_dir, split=options.eval_split, augment=False, shuffle=False)
    sample_limit = dataset.num_samples if num_samples is None else num_samples

    pr_calculator = PrecisionRecallCurveCalculator()
    print("Evaluating '%s'..." % dataset.split)
    progress = tqdm(iterable=iter(dataset), total=sample_limit)
    for num_processed, sample in enumerate(progress, start=1):
        model_input, _, _ = _convert_training_sample_to_model_input(sample=sample, mode="infer")
        # A low score threshold is used for evaluation
        detections = model.predict_on_batch(x=model_input, score_threshold=0.05)
        pr_calculator.add_image_results(
            scored_boxes_by_class_index=detections,
            gt_boxes=sample.gt_boxes
        )
        if num_processed >= sample_limit:
            break

    if print_average_precisions:
        pr_calculator.print_average_precisions(class_index_to_name=Dataset.class_index_to_name)
    mean_average_precision = 100.0 * pr_calculator.compute_mean_average_precision()
    print("Mean Average Precision = %1.2f%%" % mean_average_precision)
    if plot:
        pr_calculator.plot_average_precisions(class_index_to_name=Dataset.class_index_to_name)
    return mean_average_precision

In [22]:
def _print_training_parameters():
    """Print the banner summarising the training configuration held in `options`."""
    print("Training Parameters")
    print("-------------------")
    print("Initial weights           : %s" % (
        options.load_from if options.load_from else "Keras VGG-16 ImageNet weights"))
    print("Dataset                   : %s" % options.dataset_dir)
    print("Training split            : %s" % options.train_split)
    print("Evaluation split          : %s" % options.eval_split)
    print("Epochs                    : %d" % options.epochs)
    print("Optimizer                 : %s" % options.optimizer)
    print("Learning rate             : %f" % options.learning_rate)
    print("Gradient norm clipping    : %s" % ("disabled" if options.clipnorm <= 0 else ("%f" % options.clipnorm)))
    print("SGD momentum              : %f" % options.momentum)
    print("Adam Beta-1               : %f" % options.beta1)
    print("Adam Beta-2               : %f" % options.beta2)
    print("Weight decay              : %f" % options.weight_decay)
    print("Dropout                   : %f" % options.dropout)
    print("RoI pooling implementation: %s" % ("custom" if options.custom_roi_pool else "crop-and-resize w/ max pool"))
    print("Detector output           : %s" % ("logits" if options.detector_logits else "probabilities"))
    print("Augmentation              : %s" % ("disabled" if options.no_augment else "enabled"))
    print("Edge proposals            : %s" % ("excluded" if options.exclude_edge_proposals else "included"))
    print("CSV log                   : %s" % ("none" if not options.log_csv else options.log_csv))
    print("Checkpoints               : %s" % ("disabled" if not options.checkpoint_dir else options.checkpoint_dir))
    print("Final weights file        : %s" % ("none" if not options.save_to else options.save_to))
    print("Best weights file         : %s" % ("none" if not options.save_best_to else options.save_best_to))


def train(model):
    """
    Train `model` for `options.epochs` epochs, evaluating after every epoch.

    Per epoch: every sample of the training split is fed to `model.train_on_batch`,
    running statistics are shown on the progress bar, and the model is evaluated on
    `options.periodic_eval_samples` samples of the evaluation split. Depending on
    `options`, the epoch may also write a checkpoint (`checkpoint_dir`), append a CSV
    row (`log_csv`) and update the best-weights tracker (`save_best_to`).

    After the last epoch the final weights are saved to `options.save_to` (if set), the
    best weights are restored and saved (if `options.save_best_to` is set), and the
    resulting model is evaluated on the whole evaluation split.

    NOTE(review): when `save_to` and `save_best_to` name the same file, the best-weights
    save runs last and presumably overwrites the final weights -- confirm this is intended.

    Parameters
    ----------
    model
        Compiled model exposing `train_on_batch`, `predict_on_batch` and `save_weights`.
    """
    _print_training_parameters()

    # Image caching is hard-coded off for both splits; an earlier revision passed
    # options.cache_images for the training split.
    training_data = Dataset(dir=options.dataset_dir, split=options.train_split, augment=not options.no_augment,
                            shuffle=True, cache=False)
    eval_data = Dataset(dir=options.dataset_dir, split=options.eval_split, augment=False, shuffle=False,
                        cache=False)

    if options.checkpoint_dir:
        os.makedirs(options.checkpoint_dir, exist_ok=True)

    # Optional collaborators are bound to None when disabled so the names always exist.
    # The logger is not called `csv` to avoid shadowing the standard-library module name.
    csv_log = CSVLog(options.log_csv) if options.log_csv else None
    best_weights_tracker = BestWeightsTracker(filepath=options.save_best_to) if options.save_best_to else None

    for epoch in range(1, 1 + options.epochs):
        print("Epoch %d/%d" % (epoch, options.epochs))
        stats = TrainingStatistics()
        progbar = tqdm(iterable=iter(training_data), total=training_data.num_samples,
                       postfix=stats.get_progbar_postfix())
        for sample in progbar:
            x, image_data, gt_rpn_minibatch_map = _convert_training_sample_to_model_input(sample=sample, mode="train")
            losses = model.train_on_batch(x=x, y=gt_rpn_minibatch_map, return_dict=True)
            stats.on_training_step(losses=losses)
            progbar.set_postfix(stats.get_progbar_postfix())

        # Periodic evaluation on a (possibly reduced) number of samples
        mean_average_precision = evaluate(
            model=model,
            eval_data=eval_data,
            num_samples=options.periodic_eval_samples,
            plot=False,
            print_average_precisions=False
        )

        if options.checkpoint_dir:
            checkpoint_file = os.path.join(options.checkpoint_dir,
                                           "checkpoint-epoch-%d-mAP-%1.1f.h5" % (epoch, mean_average_precision))
            model.save_weights(filepath=checkpoint_file, overwrite=True, save_format="h5")
            print("Saved model checkpoint to '%s'" % checkpoint_file)

        if csv_log is not None:
            log_items = {
                "epoch": epoch,
                "learning_rate": options.learning_rate,
                "clipnorm": options.clipnorm,
                "momentum": options.momentum,
                "beta1": options.beta1,
                "beta2": options.beta2,
                "weight_decay": options.weight_decay,
                "dropout": options.dropout,
                "mAP": mean_average_precision
            }
            log_items.update(stats.get_progbar_postfix())  # add this epoch's loss statistics
            csv_log.log(log_items)

        if best_weights_tracker is not None:
            best_weights_tracker.on_epoch_end(model=model, mAP=mean_average_precision)

    if options.save_to:
        model.save_weights(filepath=options.save_to, overwrite=True, save_format="h5")
        print("Saved final model weights to '%s'" % options.save_to)
    if best_weights_tracker is not None:
        best_weights_tracker.restore_and_save_best_weights(model=model)

    print("Evaluating %s model on all samples in '%s'..." % (
        ("best" if options.save_best_to else "final"),
        options.eval_split))  # evaluate final or best model on full dataset
    evaluate(
        model=model,
        eval_data=eval_data,
        num_samples=eval_data.num_samples,
        plot=options.plot,
        print_average_precisions=True
    )

In [27]:
# Build the Faster R-CNN network from the notebook options, then initialise its weights
model = FasterRCNNModel(
    dropout_probability=options.dropout,
    l2=0.5 * options.weight_decay,
    activate_class_outputs=not options.detector_logits,
    custom_roi_pool=options.custom_roi_pool,
    allow_edge_proposals=not options.exclude_edge_proposals,
    num_classes=Dataset.num_classes
)

# Declare the shape of each model input, in order; None marks a dimension that varies per sample
model.build(
    input_shape=[
        # input_image: (1, height_pixels, width_pixels, 3)
        (1, None, None, 3),
        # anchor_map: (1, height, width, num_anchors * 4)
        (1, None, None, 9 * 4),
        # anchor_valid_map: (1, height, width, num_anchors)
        (1, None, None, 9),
        # gt_rpn_map: (1, height, width, num_anchors, 6)
        (1, None, None, 9, 6),
        # gt_box_class_idxs_map: (1, num_gt_boxes)
        (1, None),
        # gt_box_corners_map: (1, num_gt_boxes, 4)
        (1, None, 4)
    ]
)

# Only an optimizer is supplied: the losses were already baked in at model construction
model.compile(optimizer=create_optimizer())

# Start from ImageNet weights unless a weights file was requested
if not options.load_from:
    model.load_imagenet_weights()
    print("Initialized VGG-16 layers to Keras ImageNet weights")
else:
    model.load_weights(filepath=options.load_from, by_name=True)
    print("Loaded initial weights from '%s'" % options.load_from)

Loaded initial weights from 'fasterrcnn_tf2.h5'


In [28]:
# Override the configured number of epochs for this notebook run; train() reads options.epochs.
options.epochs = 5
# Perform mutually exclusive procedures: only the first option that is set is acted on,
# so the order of the branches below defines their priority.
if options.train:
    # Train, then evaluate the resulting model
    train(model=model)
elif options.eval:
    # Evaluate only, printing per-class average precisions
    evaluate(model=model, plot=options.plot, print_average_precisions=True)
elif options.predict:
    # Predict on a single image and display it
    predict_one(model=model, url=options.predict, show_image=True, output_path=None)
elif options.predict_to_file:
    # Predict on a single image and write the result to predictions.png instead of displaying it
    predict_one(model=model, url=options.predict_to_file, show_image=False, output_path="predictions.png")
elif options.predict_all:
    # Predict on every sample of the named split
    predict_all(model=model, split=options.predict_all)
elif not options.dump_anchors:
    # No procedure selected; presumably --dump-anchors is handled elsewhere, so stay quiet
    # when it is set -- TODO confirm
    print("Nothing to do. Did you mean to use --train or --predict?")

Training Parameters
-------------------
Initial weights           : fasterrcnn_tf2.h5
Dataset                   : /kaggle/input/chess-object-detection/chess-pieces-dataset/
Training split            : train
Evaluation split          : test
Epochs                    : 5
Optimizer                 : adam
Learning rate             : 0.001000
Gradient norm clipping    : disabled
SGD momentum              : 0.900000
Adam Beta-1               : 0.900000
Adam Beta-2               : 0.999000
Weight decay              : 0.000500
Dropout                   : 0.000000
RoI pooling implementation: crop-and-resize w/ max pool
Detector output           : probabilities
Augmentation              : disabled
Edge proposals            : excluded
CSV log                   : log.csv
Checkpoints               : disabled
Final weights file        : fasterrcnn_tf2.h5
Best weights file         : fasterrcnn_tf2.h5
Epoch 1/5


100%|██████████| 606/606 [01:33<00:00,  6.49it/s, rpn_class_loss=0.1084, rpn_regr_loss=0.0562, detector_class_loss=0.1641, detector_regr_loss=0.1220, total_loss=0.45]


Evaluating 'test'...


0it [00:00, ?it/s]


Mean Average Precision = 84.72%
Epoch 2/5


100%|██████████| 606/606 [01:23<00:00,  7.24it/s, rpn_class_loss=0.0931, rpn_regr_loss=0.0501, detector_class_loss=0.1382, detector_regr_loss=0.1097, total_loss=0.39]


Evaluating 'test'...


0it [00:00, ?it/s]


Mean Average Precision = 81.60%
Epoch 3/5


100%|██████████| 606/606 [01:23<00:00,  7.24it/s, rpn_class_loss=0.0869, rpn_regr_loss=0.0473, detector_class_loss=0.1249, detector_regr_loss=0.1091, total_loss=0.37]


Evaluating 'test'...


0it [00:00, ?it/s]


Mean Average Precision = 63.11%
Epoch 4/5


100%|██████████| 606/606 [01:23<00:00,  7.27it/s, rpn_class_loss=0.0865, rpn_regr_loss=0.0474, detector_class_loss=0.1313, detector_regr_loss=0.1082, total_loss=0.37]


Evaluating 'test'...


0it [00:00, ?it/s]


Mean Average Precision = 74.31%
Epoch 5/5


100%|██████████| 606/606 [01:23<00:00,  7.27it/s, rpn_class_loss=0.0814, rpn_regr_loss=0.0452, detector_class_loss=0.1201, detector_regr_loss=0.1025, total_loss=0.35]


Evaluating 'test'...


0it [00:00, ?it/s]


Mean Average Precision = 89.58%
Saved final model weights to 'fasterrcnn_tf2.h5'
Saved best model weights (Mean Average Precision = 89.58%) to 'fasterrcnn_tf2.h5'
Evaluating best model on all samples in 'test'...
Evaluating 'test'...


 96%|█████████▋| 27/28 [00:02<00:00,  9.49it/s]


Average Precisions
------------------
black-king  : 100.0%
white-king  : 99.6%
white-queen : 99.5%
black-pawn  : 96.1%
white-pawn  : 93.0%
black-queen : 92.9%
white-knight: 90.4%
black-bishop: 88.1%
black-knight: 87.5%
white-bishop: 86.1%
white-rook  : 77.0%
black-rook  : 74.2%
------------------
Mean Average Precision = 90.36%


In [None]:
# Report the versions of the core libraries used by this notebook.
print("Tensorflow", tensorflow.__version__)
# np.version is the version *module*; printing it shows the module repr, not the version string.
print("Numpy", np.__version__)
print("Pandas", pd.__version__)

In [25]:
# Copy the pre-trained weights file from the read-only input dataset into the working directory.
# NOTE(review): this cell sits at the end of the notebook but its execution count (25) is lower
# than that of the cells which load 'fasterrcnn_tf2.h5'; presumably it has to run before them --
# consider moving it above the model construction cell so Restart & Run All works.
!cp ../input/chess-vision-model/chess-vision-model/fasterrcnn_tf2.h5 ./fasterrcnn_tf2.h5

Download best model:

<a href="./fasterrcnn_tf2.h5"> Download File </a>