From d37922c85fe90f68eace795ea83c25e758f511e8 Mon Sep 17 00:00:00 2001
From: taox <taox@fb.com>
Date: Thu, 15 Oct 2020 00:25:23 -0700
Subject: [PATCH] [Metal] Enable optimize_for_mobile on Linux

Currently, the optimize_for_mobile binary only works on macOS, which makes it inconvenient to use. This diff introduces a new Buck target that separates out the Objective-C code, so that models can be exported for Metal on Linux machines.

Differential Revision: [D24322017](https://our.internmc.facebook.com/intern/diff/D24322017/)

**NOTE FOR REVIEWERS**: This PR has internal Facebook specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D24322017/)!

[ghstack-poisoned]
---
 aten/src/ATen/native/metal/MetalConvolution.h |   5 +-
 .../src/ATen/native/metal/MetalConvolution.mm |   4 -
 .../ATen/native/metal/MetalPrepackOpContext.h |  38 ++---
 .../native/metal/MetalPrepackOpContext.mm     |  71 ---------
 .../native/metal/MetalPrepackOpRegister.cpp   | 140 ++++++++++++++++++
 .../native/metal/MetalPrepackOpRegister.mm    |  55 -------
 .../src/ATen/native/metal/mpscnn/MPSCNNOps.mm |   9 +-
 7 files changed, 158 insertions(+), 164 deletions(-)
 delete mode 100644 aten/src/ATen/native/metal/MetalPrepackOpContext.mm
 create mode 100644 aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp
 delete mode 100644 aten/src/ATen/native/metal/MetalPrepackOpRegister.mm
diff --git a/aten/src/ATen/native/metal/MetalConvolution.h b/aten/src/ATen/native/metal/MetalConvolution.h
index 6e811a34267c..7a7bdfbd21c2 100644
--- a/aten/src/ATen/native/metal/MetalConvolution.h
+++ b/aten/src/ATen/native/metal/MetalConvolution.h
@@ -1,6 +1,5 @@
 #import <ATen/native/metal/MetalPrepackOpContext.h>
-#import <ATen/native/metal/MetalUtils.h>
-#import <ATen/native/metal/mpscnn/MPSCNNOp.h>
+
 #include <torch/script.h>
 
 namespace at {
@@ -49,8 +48,6 @@ struct Conv2DParams final {
 
 NeuronType neuronType(const Conv2dOpContext& context);
 
-Tensor conv2d_prepack_run_impl(Conv2dOpContext& context, const Tensor& input);
-
 } // namespace metal
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/metal/MetalConvolution.mm b/aten/src/ATen/native/metal/MetalConvolution.mm
index 178f202e9530..1d316e2144d2 100644
--- a/aten/src/ATen/native/metal/MetalConvolution.mm
+++ b/aten/src/ATen/native/metal/MetalConvolution.mm
@@ -60,10 +60,6 @@ NeuronType neuronType(const Conv2dOpContext& context) {
   }
 }
 
-Tensor conv2d_prepack_run_impl(Conv2dOpContext& context, const Tensor& input) {
-  return mpscnn::conv2d(input, context);
-}
-
 } // namespace metal
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.h b/aten/src/ATen/native/metal/MetalPrepackOpContext.h
index 6a07af68e049..4e20e242183a 100644
--- a/aten/src/ATen/native/metal/MetalPrepackOpContext.h
+++ b/aten/src/ATen/native/metal/MetalPrepackOpContext.h
@@ -1,4 +1,4 @@
-#import <Foundation/Foundation.h>
+#pragma once
 
 #include <ATen/Tensor.h>
 #include <torch/custom_class.h>
@@ -49,6 +49,13 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder {
         output_min(output_min),
         output_max(output_max) {}
 
+  void release_resources() override {
+    // Nothing to do unless a release callback has been installed.
+    if (!releaseCallback) return;
+    releaseCallback(conv2dOp); // the callback receives the raw op handle
+    conv2dOp = nullptr; // later calls would pass nullptr to the callback
+  }
+
   Tensor weight;
   c10::optional<Tensor> bias;
   std::vector<int64_t> stride;
@@ -57,35 +64,10 @@ class Conv2dOpContext : public torch::jit::CustomClassHolder {
   int64_t groups;
   c10::optional<Scalar> output_min;
   c10::optional<Scalar> output_max;
-  id extra = nil;
+  void* conv2dOp = nullptr; // opaque handle to the cached MPSCNNConvOp; stays null until mpscnn::conv2d stores one
+  std::function<void(void*)> releaseCallback = nullptr; // called with conv2dOp from release_resources(); null means nothing to free
 };
 
-c10::intrusive_ptr<Conv2dOpContext> unpack(
-    Tensor&& weight,
-    c10::optional<Tensor>&& bias,
-    std::vector<int64_t>&& stride,
-    std::vector<int64_t>&& padding,
-    std::vector<int64_t>&& dilation,
-    int64_t groups,
-    c10::optional<Scalar> output_min,
-    c10::optional<Scalar> output_max);
-
-c10::intrusive_ptr<Conv2dOpContext> conv2d_prepack(
-    Tensor&& weight,
-    c10::optional<Tensor>&& bias,
-    std::vector<int64_t>&& stride,
-    std::vector<int64_t>&& padding,
-    std::vector<int64_t>&& dilation,
-    int64_t groups,
-    c10::optional<Scalar> output_min,
-    c10::optional<Scalar> output_max);
-
-Tensor conv2d_prepack_run(
-    const Tensor& input,
-    const c10::intrusive_ptr<Conv2dOpContext>& op_context);
-
-Tensor copy_to_host(const Tensor& input);
-
 } // namespace metal
 } // namespace native
 } // namespace at
diff --git a/aten/src/ATen/native/metal/MetalPrepackOpContext.mm b/aten/src/ATen/native/metal/MetalPrepackOpContext.mm
deleted file mode 100644
index f51c8ad824a0..000000000000
--- a/aten/src/ATen/native/metal/MetalPrepackOpContext.mm
+++ /dev/null
@@ -1,71 +0,0 @@
-#import <ATen/native/metal/MetalConvolution.h>
-#import <ATen/native/metal/MetalPrepackOpContext.h>
-#import <ATen/native/metal/MetalUtils.h>
-#import <ATen/native/metal/mpscnn/MPSCNNOps.h>
-
-#include <torch/script.h>
-
-namespace at {
-namespace native {
-namespace metal {
-
-c10::intrusive_ptr<Conv2dOpContext> conv2d_prepack(
-    at::Tensor&& weight,
-    c10::optional<at::Tensor>&& bias,
-    std::vector<int64_t>&& stride,
-    std::vector<int64_t>&& padding,
-    std::vector<int64_t>&& dilation,
-    const int64_t groups,
-    c10::optional<Scalar> output_min,
-    c10::optional<Scalar> output_max) {
-  TORCH_CHECK(weight.dim() == 4);
-  return c10::make_intrusive<Conv2dOpContext>(
-      std::move(weight),
-      std::move(bias),
-      stride,
-      padding,
-      dilation,
-      groups,
-      output_min,
-      output_max);
-}
-
-c10::intrusive_ptr<Conv2dOpContext> unpack(
-    Tensor&& weight,
-    c10::optional<Tensor>&& bias,
-    std::vector<int64_t>&& stride,
-    std::vector<int64_t>&& padding,
-    std::vector<int64_t>&& dilation,
-    int64_t groups,
-    c10::optional<Scalar> output_min,
-    c10::optional<Scalar> output_max) {
-  const Tensor weightContig = weight.contiguous();
-  const auto ws = weightContig.sizes();
-  auto packed_buffer = permuteWeights(weightContig.data_ptr<float>(), ws.vec());
-  auto packedWeight = at::empty(ws);
-  int64_t size_bytes = at::prod_intlist(ws) * sizeof(float);
-  memcpy(packedWeight.data_ptr(), packed_buffer.data(), size_bytes);
-  return c10::make_intrusive<Conv2dOpContext>(
-      std::move(packedWeight),
-      std::move(bias),
-      stride,
-      padding,
-      dilation,
-      groups,
-      output_min,
-      output_max);
-}
-
-Tensor conv2d_prepack_run(
-    const Tensor& input,
-    const c10::intrusive_ptr<Conv2dOpContext>& op_context) {
-  return conv2d_prepack_run_impl(*op_context, input);
-}
-
-Tensor copy_to_host(const Tensor& input) {
-  return mpscnn::copy_to_host(input);
-}
-
-} // namespace metal
-} // namespace native
-} // namespace at
diff --git a/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp
new file mode 100644
index 000000000000..60cddbd7eb13
--- /dev/null
+++ b/aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp
@@ -0,0 +1,140 @@
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/native/metal/MetalPrepackOpContext.h>
+
+#if defined(C10_IOS)
+#import <ATen/native/metal/MetalUtils.h>
+#import <ATen/native/metal/mpscnn/MPSCNNOps.h>
+#endif
+
+namespace at {
+namespace native {
+namespace metal {
+
+// Rebuilds a Conv2dOpContext from its serialized fields; this is the function
+// the __setstate__ lambda registered through def_pickle forwards to.
+// On iOS the float weight data is run through permuteWeights and copied into
+// a freshly allocated tensor before the context is created. On every other
+// platform the TORCH_CHECK below always fails, so the statements after it
+// are unreachable there and exist only to keep both branches well-formed.
+c10::intrusive_ptr<Conv2dOpContext> unpack(
+    Tensor&& weight,
+    c10::optional<Tensor>&& bias,
+    std::vector<int64_t>&& stride,
+    std::vector<int64_t>&& padding,
+    std::vector<int64_t>&& dilation,
+    int64_t groups,
+    c10::optional<Scalar> output_min,
+    c10::optional<Scalar> output_max) {
+#if defined(C10_IOS)
+  const Tensor weightContig = weight.contiguous();
+  const auto ws = weightContig.sizes();
+  auto packed_buffer = permuteWeights(weightContig.data_ptr<float>(), ws.vec());
+  // Copy the permuted buffer into a tensor with the same sizes as the input.
+  auto packedWeight = at::empty(ws);
+  int64_t size_bytes = at::prod_intlist(ws) * sizeof(float);
+  memcpy(packedWeight.data_ptr(), packed_buffer.data(), size_bytes);
+#else
+  TORCH_CHECK(false, "unpack can only be invoked on iOS");
+  Tensor packedWeight = std::move(weight);
+#endif
+  // One construction site shared by both preprocessor branches.
+  return c10::make_intrusive<Conv2dOpContext>(
+      std::move(packedWeight),
+      std::move(bias),
+      stride,
+      padding,
+      dilation,
+      groups,
+      output_min,
+      output_max);
+}
+
+TORCH_LIBRARY(metal, m) { // "metal" library: registers the Conv2dOpContext class and one op schema
+  m.class_<Conv2dOpContext>("Conv2dOpContext")
+      .def_pickle(
+          [](const c10::intrusive_ptr<Conv2dOpContext>& op_context)
+              -> SerializationTypeConv2dPrePack { // __getstate__: delegates to pack()
+            return op_context->pack();
+          },
+          [](SerializationTypeConv2dPrePack state)
+              -> c10::intrusive_ptr<Conv2dOpContext> { // __setstate__: forwards the 8 tuple fields to unpack()
+            return unpack(
+                std::move(std::get<0>(state)), // weight
+                std::move(std::get<1>(state)), // bias
+                std::move(std::get<2>(state)), // stride
+                std::move(std::get<3>(state)), // padding
+                std::move(std::get<4>(state)), // dilation
+                std::move(std::get<5>(state)), // groups
+                std::move(std::get<6>(state)), // output_min
+                std::move(std::get<7>(state))); // output_max
+          });
+  m.def("copy_to_host(Tensor X) -> Tensor Y"); // schema only; the kernel is attached by TORCH_LIBRARY_IMPL(metal, Metal, m)
+}
+
+TORCH_LIBRARY(metal_prepack, m) { // schemas only; kernels are attached by the TORCH_LIBRARY_IMPL blocks in this file
+  m.def( // prepack: returns a Conv2dOpContext built from the conv parameters
+      "conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
+      "int[2] padding, int[2] dilation, int groups, "
+      "Scalar? output_min=None, Scalar? output_max=None) "
+      "-> __torch__.torch.classes.metal.Conv2dOpContext");
+  m.def( // run: takes an input tensor plus a Conv2dOpContext and returns a tensor
+      "conv2d_run(Tensor X, "
+      "__torch__.torch.classes.metal.Conv2dOpContext W_prepack) -> Tensor Y");
+}
+
+// Packages convolution parameters into a Conv2dOpContext without touching
+// the weight data. The weight must be 4-dimensional; anything else fails
+// the TORCH_CHECK below.
+c10::intrusive_ptr<Conv2dOpContext> conv2d_prepack(
+    Tensor&& weight,
+    c10::optional<Tensor>&& bias,
+    std::vector<int64_t>&& stride,
+    std::vector<int64_t>&& padding,
+    std::vector<int64_t>&& dilation,
+    int64_t groups,
+    c10::optional<Scalar> output_min,
+    c10::optional<Scalar> output_max) {
+  TORCH_CHECK(weight.dim() == 4);
+  // Only weight and bias are moved; the remaining arguments are passed as
+  // lvalues.
+  auto context = c10::make_intrusive<Conv2dOpContext>(
+      std::move(weight), std::move(bias), stride, padding, dilation, groups,
+      output_min, output_max);
+  return context;
+}
+
+Tensor conv2d_prepack_run( // runs the prepacked conv; only functional on iOS builds
+    const Tensor& input,
+    const c10::intrusive_ptr<Conv2dOpContext>& op_context) {
+#if !defined(C10_IOS)
+  TORCH_CHECK(false, "conv2d_prepack_run can only be invoked on iOS");
+  return input; // unreachable: the check above always fails
+#else
+  return mpscnn::conv2d(input, *op_context); // iOS: hand off to the MPSCNN conv
+#endif
+}
+
+Tensor copy_to_host(const Tensor& input) { // forwards to mpscnn::copy_to_host on iOS
+#if !defined(C10_IOS)
+  TORCH_CHECK(false, "copy_to_host can only be invoked on iOS");
+  return input; // unreachable: the check above always fails
+#else
+  return mpscnn::copy_to_host(input);
+#endif
+}
+
+TORCH_LIBRARY_IMPL(metal_prepack, CPU, m) { // CPU dispatch key: binds the prepack schema to conv2d_prepack
+  m.impl("conv2d_prepack", TORCH_FN(conv2d_prepack));
+}
+
+TORCH_LIBRARY_IMPL(metal_prepack, Metal, m) { // Metal dispatch key: binds conv2d_run to conv2d_prepack_run
+  m.impl("conv2d_run", conv2d_prepack_run);
+}
+
+TORCH_LIBRARY_IMPL(metal, Metal, m) { // Metal dispatch key: binds the copy_to_host schema to copy_to_host
+  m.impl("copy_to_host", copy_to_host);
+}
+
+} // namespace metal
+} // namespace native
+} // namespace at
diff --git a/aten/src/ATen/native/metal/MetalPrepackOpRegister.mm b/aten/src/ATen/native/metal/MetalPrepackOpRegister.mm
deleted file mode 100644
index d1872398ddc9..000000000000
--- a/aten/src/ATen/native/metal/MetalPrepackOpRegister.mm
+++ /dev/null
@@ -1,55 +0,0 @@
-#include <ATen/core/op_registration/op_registration.h>
-#import <ATen/native/metal/MetalPrepackOpContext.h>
-
-namespace at {
-namespace native {
-namespace metal {
-
-TORCH_LIBRARY(metal, m) {
-  m.class_<Conv2dOpContext>("Conv2dOpContext")
-      .def_pickle(
-          [](const c10::intrusive_ptr<Conv2dOpContext>& op_context)
-              -> SerializationTypeConv2dPrePack { // __getstate__
-            return op_context->pack();
-          },
-          [](SerializationTypeConv2dPrePack state)
-              -> c10::intrusive_ptr<Conv2dOpContext> { // __setstate__
-            return unpack(
-                std::move(std::get<0>(state)),
-                std::move(std::get<1>(state)),
-                std::move(std::get<2>(state)),
-                std::move(std::get<3>(state)),
-                std::move(std::get<4>(state)),
-                std::move(std::get<5>(state)),
-                std::move(std::get<6>(state)),
-                std::move(std::get<7>(state)));
-          });
-  m.def("copy_to_host(Tensor X) -> Tensor Y");
-}
-
-TORCH_LIBRARY(metal_prepack, m) {
-  m.def(
-      "conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
-      "int[2] padding, int[2] dilation, int groups, "
-      "Scalar? output_min=None, Scalar? output_max=None) "
-      "-> __torch__.torch.classes.metal.Conv2dOpContext");
-  m.def(
-      "conv2d_run(Tensor X, "
-      "__torch__.torch.classes.metal.Conv2dOpContext W_prepack) -> Tensor Y");
-}
-
-TORCH_LIBRARY_IMPL(metal_prepack, CPU, m) {
-  m.impl("conv2d_prepack", TORCH_FN(conv2d_prepack));
-}
-
-TORCH_LIBRARY_IMPL(metal_prepack, Metal, m) {
-  m.impl("conv2d_run", conv2d_prepack_run);
-}
-
-TORCH_LIBRARY_IMPL(metal, Metal, m) {
-  m.impl("copy_to_host", copy_to_host);
-}
-
-} // namespace metal
-} // namespace native
-} // namespace at
diff --git a/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm b/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm
index fc883337d401..d2b54f0dcb3e 100644
--- a/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm
+++ b/aten/src/ATen/native/metal/mpscnn/MPSCNNOps.mm
@@ -88,14 +88,19 @@ Tensor conv2d(const Tensor& input, Conv2dOpContext& context) {
                       context.stride,
                       context.dilation,
                       context.groups};
-  MPSCNNConvOp* op = (MPSCNNConvOp*)context.extra;
+  MPSCNNConvOp* op = (__bridge MPSCNNConvOp*)(context.conv2dOp);
   NeuronType nt = neuronType(context);
   if (!op) {
     float* w = context.weight.data_ptr<float>();
     float* b = context.bias.has_value() ? ((*context.bias).data_ptr<float>())
                                         : nullptr;
     op = [MPSCNNConvOp conv2d:params weights:w bias:b neuronFilter:nt];
-    context.extra = op;
+    context.conv2dOp = (void*)CFBridgingRetain(op);
+    context.releaseCallback = ^(void* res) {
+      if (res) {
+        CFBridgingRelease(res);
+      }
+    };
   }
 
   auto outputSize = params.output_sizes();