From f2d0cc8ef7070b779cbb339ad73f62d67909fbd0 Mon Sep 17 00:00:00 2001
From: Lucy Qiu <lfq@meta.com>
Date: Thu, 29 Aug 2024 22:53:20 -0700
Subject: [PATCH] Preprocess C++ (#4987)

Summary:

Preprocess C++ runner calculations.

Mirror the following torchtune helper functions in C++, with tests:

- find_supported_resolutions
- get_canvas_best_fit
- get_inscribed_size

Functions: https://github.com/pytorch/torchtune/tree/main/torchtune/modules/transforms/vision_utils

Reviewed By: mergennachin

Differential Revision: D61833480
---
 .../models/flamingo/preprocess/preprocess.cpp | 118 ++++++++++++++++++
 .../models/flamingo/preprocess/preprocess.h   |  41 ++++++
 .../flamingo/preprocess/preprocess_test.cpp   | 113 +++++++++++++++++
 .../models/flamingo/preprocess/targets.bzl    |  20 +++
 4 files changed, 292 insertions(+)
 create mode 100644 examples/models/flamingo/preprocess/preprocess.cpp
 create mode 100644 examples/models/flamingo/preprocess/preprocess.h
 create mode 100644 examples/models/flamingo/preprocess/preprocess_test.cpp
 create mode 100644 examples/models/flamingo/preprocess/targets.bzl
diff --git a/examples/models/flamingo/preprocess/preprocess.cpp b/examples/models/flamingo/preprocess/preprocess.cpp
new file mode 100644
index 00000000000..ff46070f669
--- /dev/null
+++ b/examples/models/flamingo/preprocess/preprocess.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "executorch/examples/models/flamingo/preprocess/preprocess.h"
+
+#include <algorithm>
+#include <cassert>
+
+// Returns every positive divisor of n in ascending order (empty when n < 1).
+std::vector<int> _get_factors(int n) {
+  std::vector<int> divisors;
+  for (int candidate = 1; candidate <= n; ++candidate) {
+    const bool divides_evenly = (n % candidate) == 0;
+    if (divides_evenly) divisors.push_back(candidate);
+  }
+  return divisors;
+}
+
+// Lists every {height, width} canvas (multiples of tile_size) that uses at most
+// max_num_tiles tiles, from the largest tile count down to one tile.
+std::vector<std::vector<int>> find_supported_resolutions(
+    int max_num_tiles,
+    int tile_size) {
+  std::vector<std::vector<int>> supported_resolutions;
+  for (int num_tiles = max_num_tiles; num_tiles > 0; num_tiles--) {
+    for (const int rows : _get_factors(num_tiles)) {
+      const int cols = num_tiles / rows; // rows * cols == num_tiles
+      supported_resolutions.push_back({rows * tile_size, cols * tile_size});
+    }
+  }
+  return supported_resolutions;
+}
+
+// Picks the canvas in possible_resolutions that image_size ({height, width})
+// fits without distortion. When some canvas allows upscaling (scale >= 1), the
+// largest such scale wins if resize_to_max_canvas, else the smallest; when none
+// does, the least downscaling wins. Empty input yields an empty vector.
+std::vector<int> get_canvas_best_fit(
+    std::vector<int> image_size,
+    std::vector<std::vector<int>> possible_resolutions,
+    bool resize_to_max_canvas) {
+  assert(image_size.size() == 2);
+  const int image_h = image_size[0];
+  const int image_w = image_size[1];
+
+  float best_scale = -0.1; // Sentinel: nothing selected yet.
+  std::vector<int> best_resolution;
+  int best_area = 0;
+
+  for (const std::vector<int>& resolution : possible_resolutions) {
+    assert(resolution.size() == 2);
+    const float scale_h = resolution[0] / static_cast<float>(image_h);
+    const float scale_w = resolution[1] / static_cast<float>(image_w);
+    // Get limiting side scaling -> no distortion
+    const float scale = scale_h < scale_w ? scale_h : scale_w;
+
+    bool is_candidate = false;
+    if (scale >= 1.0) {
+      if (resize_to_max_canvas) {
+        is_candidate = scale >= best_scale;
+      } else {
+        // Smallest upscale wins. best_scale < 1 means only the sentinel or a
+        // downscaling option is held, which any upscaling option replaces.
+        is_candidate = (scale <= best_scale) || (best_scale < 1.0);
+      }
+    } else {
+      // If no upscaling options, find the minimum downscaling (max scale for
+      // scales < 1)
+      is_candidate = ((scale >= best_scale) || (best_resolution.size() == 0));
+    }
+    if (is_candidate) {
+      const int area = resolution[0] * resolution[1];
+      if (scale == best_scale) {
+        // If there are multiple resolutions, get the one with minimum area to
+        // reduce padding.
+        if (scale >= 1.0 && area < best_area) {
+          best_resolution = resolution;
+          best_area = area;
+        }
+      } else {
+        best_resolution = resolution;
+        best_scale = scale;
+        best_area = area;
+      }
+    }
+  }
+  return best_resolution;
+}
+
+// Computes the {height, width} of the image once resized to be inscribed within
+// target_size; a positive max_size first clamps the target per dimension.
+std::vector<int> get_inscribed_size(
+    std::vector<int> image_size,
+    std::vector<int> target_size,
+    int max_size) {
+  assert(image_size.size() == 2);
+  assert(target_size.size() == 2);
+  const int image_h = image_size[0];
+  const int image_w = image_size[1];
+  const bool has_cap = max_size > 0;
+  const int target_h = has_cap
+      ? std::min(std::max(image_h, max_size), target_size[0])
+      : target_size[0];
+  const int target_w = has_cap
+      ? std::min(std::max(image_w, max_size), target_size[1])
+      : target_size[1];
+
+  const int scaled_h =
+      static_cast<int>(image_h * (target_w / static_cast<float>(image_w)));
+  const int scaled_w =
+      static_cast<int>(image_w * (target_h / static_cast<float>(image_h)));
+  return {std::min(scaled_h, target_h), std::min(scaled_w, target_w)};
+}
diff --git a/examples/models/flamingo/preprocess/preprocess.h b/examples/models/flamingo/preprocess/preprocess.h
new file mode 100644
index 00000000000..f6c7b813e95
--- /dev/null
+++ b/examples/models/flamingo/preprocess/preprocess.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <vector>
+
+// C++ implementation of the python functions in torchtune:
+// https://github.com/pytorch/torchtune/tree/main/torchtune/modules/transforms/vision_utils
+
+// Returns every positive factor of n in ascending order (empty when n < 1).
+std::vector<int> _get_factors(int n);
+
+// Computes all resolutions, in multiples of tile_size, that contain at most
+// max_num_tiles tiles. Useful when dividing an image into tiles. For example,
+// with at most 2 tiles per image the supported resolutions are
+// (1x2, 2x1, 1x1) * tile_size. Returns a vector of {height, width} pairs,
+// ordered from the largest tile count down to a single tile.
+std::vector<std::vector<int>> find_supported_resolutions(
+    int max_num_tiles,
+    int tile_size);
+
+// Picks from possible_resolutions the canvas to resize image_size into without
+// distortion; resize_to_max_canvas picks the largest upscale, else smallest.
+std::vector<int> get_canvas_best_fit(
+    std::vector<int> image_size,
+    std::vector<std::vector<int>> possible_resolutions,
+    bool resize_to_max_canvas);
+
+// Calculates the {height, width} of an image resized to be inscribed within
+// target_size: one side equals the target and the other is less than or equal
+// to it. A positive max_size first clamps the target; otherwise it is unused.
+std::vector<int> get_inscribed_size(
+    std::vector<int> image_size,
+    std::vector<int> target_size,
+    int max_size);
diff --git a/examples/models/flamingo/preprocess/preprocess_test.cpp b/examples/models/flamingo/preprocess/preprocess_test.cpp
new file mode 100644
index 00000000000..deede877223
--- /dev/null
+++ b/examples/models/flamingo/preprocess/preprocess_test.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/examples/models/flamingo/preprocess/preprocess.h>
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+
+// Mirror the torchtune python testing:
+// https://github.com/pytorch/torchtune/tree/main/tests/torchtune/modules/transforms
+
+// Checks that find_supported_resolutions(max_num_tiles, tile_size) returns
+// exactly expected_resolutions ({height, width} entries, in the same order).
+void test_find_supported_resolutions(
+    int max_num_tiles,
+    int tile_size,
+    std::vector<std::vector<int>> expected_resolutions) {
+  const std::vector<std::vector<int>> resolutions =
+      find_supported_resolutions(max_num_tiles, tile_size);
+
+  // Fatal on mismatch so the loop never indexes past expected_resolutions.
+  ASSERT_EQ(resolutions.size(), expected_resolutions.size());
+  for (size_t i = 0; i < resolutions.size(); i++) {
+    EXPECT_EQ(resolutions[i], expected_resolutions[i]) << "at index " << i;
+  }
+}
+
+TEST(PreprocessTest, TestFindSupportedResolution) {
+  test_find_supported_resolutions(1, 224, {{224, 224}}); // single tile
+  test_find_supported_resolutions(2, 100, {{100, 200}, {200, 100}, {100, 100}});
+  test_find_supported_resolutions(
+      3, 50, {{50, 150}, {150, 50}, {50, 100}, {100, 50}, {50, 50}});
+  test_find_supported_resolutions(
+      4, // max_num_tiles
+      300, // tile_size
+      {
+          {300, 1200}, // 4 tiles: 1x4, 2x2, 4x1 (rows x columns)
+          {600, 600},
+          {1200, 300},
+          {300, 900}, // 3 tiles: 1x3, 3x1
+          {900, 300},
+          {300, 600}, // 2 tiles: 1x2, 2x1
+          {600, 300},
+          {300, 300}, // 1 tile
+      });
+}
+
+// Expects get_canvas_best_fit(...) == expected_best_resolution ({height, width}).
+void test_get_canvas_best_fit(
+    std::vector<int> image_size,
+    std::vector<std::vector<int>> possible_resolutions,
+    bool resize_to_max_canvas,
+    std::vector<int> expected_best_resolution) {
+  const std::vector<int> best_resolution = get_canvas_best_fit(
+      image_size, possible_resolutions, resize_to_max_canvas);
+  EXPECT_EQ(best_resolution, expected_best_resolution); // safe if empty
+}
+
+TEST(PreprocessTest, TestGetCanvasBestFit_200x300_F) {
+  std::vector<std::vector<int>> possible_resolutions = {
+      {224, 896}, // each entry is {height, width}
+      {448, 448},
+      {224, 224},
+      {896, 224},
+      {224, 672},
+      {672, 224},
+      {224, 448},
+      {448, 224},
+  };
+  test_get_canvas_best_fit(
+      {200, 300}, // image_size
+      possible_resolutions,
+      false, // resize_to_max_canvas
+      {224, 448}); // expected best canvas
+
+  test_get_canvas_best_fit(
+      {200, 500}, // image_size
+      possible_resolutions,
+      true, // resize_to_max_canvas
+      {224, 672}); // expected best canvas
+  test_get_canvas_best_fit(
+      {200, 200}, // image_size
+      possible_resolutions,
+      false, // resize_to_max_canvas
+      {224, 224}); // expected best canvas
+  test_get_canvas_best_fit(
+      {200, 100}, // image_size
+      possible_resolutions,
+      true, // resize_to_max_canvas
+      {448, 224}); // expected best canvas
+}
+
+// Expects get_inscribed_size(image_size, target_size, max_size) to equal
+// expected_target_size, compared as a whole {height, width} vector.
+void test_get_inscribed_size(
+    std::vector<int> image_size,
+    std::vector<int> target_size,
+    int max_size,
+    std::vector<int> expected_target_size) {
+  const auto result = get_inscribed_size(image_size, target_size, max_size);
+  EXPECT_EQ(result, expected_target_size);
+}
+TEST(PreprocessTest, GetInscribedSize) { // (image, target, max_size, expected)
+  test_get_inscribed_size({200, 100}, {1000, 1200}, 600, {600, 300});
+  test_get_inscribed_size({2000, 200}, {1000, 1200}, 2000, {1000, 100});
+  test_get_inscribed_size({400, 200}, {1000, 1200}, -1, {1000, 500}); // no cap
+  test_get_inscribed_size({1000, 500}, {400, 300}, -1, {400, 200}); // no cap
+}
diff --git a/examples/models/flamingo/preprocess/targets.bzl b/examples/models/flamingo/preprocess/targets.bzl
new file mode 100644
index 00000000000..fd60d94a907
--- /dev/null
+++ b/examples/models/flamingo/preprocess/targets.bzl
@@ -0,0 +1,20 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines the preprocess library and its test, shared by fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "preprocess",
+        srcs = ["preprocess.cpp"],
+        exported_headers = ["preprocess.h"],
+    )
+
+    runtime.cxx_test(
+        name = "preprocess_test",
+        srcs = ["preprocess_test.cpp"],
+        deps = [":preprocess"],
+    )