From 239e41cea985fe72dc35571b622bb7432cb08ca3 Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 15 Nov 2024 10:18:24 -0800 Subject: [PATCH 1/2] [Executorch][Portable] Don't upcast to double for sigmoid Upcasting to double for compute precision may not be ATen compliant. Reason for internal test change: Apparently running on broadwell CPU vs test runner with Cooper lake gives different results for this change. Without this change: Both broadwell and Cooper lake will produce "Once upon a time, there was a little" With this change: Broadwell still produces "Once upon a time, there was a little", while Cooperlake produces "Once upon a time, there was a girl". So one possibility is that some XNNPACK kernel for Cooper lake produces a slightly different numerical result that propagates through. Still landing this change since upcasting to double for compute does not seem necessary. Differential Revision: [D65928920](https://our.internmc.facebook.com/intern/diff/D65928920/) [ghstack-poisoned] --- kernels/portable/cpu/op_sigmoid.cpp | 42 ++++++++++++------- .../kernels/portable/op_registration_util.bzl | 3 ++ 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 84c4ea2f542..fcac0955083 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -35,21 +36,32 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { out, "Failed to resize output tensor."); - ScalarType in_type = in.scalar_type(); - ScalarType out_type = out.scalar_type(); - ET_SWITCH_REALHB_TYPES(in_type, ctx, "sigmoid.out", CTYPE_IN, [&]() { - ET_SWITCH_FLOATH_TYPES(out_type, ctx, "sigmoid.out", CTYPE_OUT, [&]() { - apply_unary_map_fn( - [](const CTYPE_IN val_in) { - // perform math in double to preserve precision - double in_casted = static_cast(val_in); - double out_val = 1.0 / (1.0 + 
exp(-in_casted)); - return static_cast(out_val); - }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); - }); + ScalarType common_type = in.scalar_type(); + ScalarType compute_type = utils::get_compute_type(common_type); + // For integer types, we need to promote to the next higher float type + if (compute_type != ScalarType::Float && compute_type != ScalarType::Double) { + compute_type = ScalarType::Float; + } + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sigmoid.out"; + + ET_KERNEL_CHECK( + ctx, executorch::runtime::isRealType(compute_type), InvalidArgument, out); + + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + utils::apply_unitensor_elementwise_fn( + [](const CTYPE_COMPUTE val_in) { + CTYPE_COMPUTE in_casted = static_cast(val_in); + CTYPE_COMPUTE out_val = static_cast(1.0) / + (static_cast(1.0) + exp(-in_casted)); + return out_val; + }, + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::REALHBBF16); }); return out; diff --git a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl index f63932d4840..b89c4ac1a95 100644 --- a/shim/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1074,6 +1074,9 @@ ATEN_OPS = ( name = "op_sigmoid", deps = [ "//executorch/kernels/portable/cpu/util:functional_util", + "//executorch/kernels/portable/cpu/util:elementwise_util", + "//executorch/kernels/portable/cpu/util:broadcast_util", + "//executorch/kernels/portable/cpu/util:dtype_util", ], ), op_target( From 8eabd3ebb09f7fb6d11fbfbd05919488e889410a Mon Sep 17 00:00:00 2001 From: Kimish Patel Date: Fri, 15 Nov 2024 12:21:13 -0800 Subject: [PATCH 2/2] Update on "[Executorch][Portable] Dont upcast to double for sigmoid" Upcasting to double for compute precision may not be aten compliant. 
Reason for internal test change: Apparently running on broadwell CPU vs test runner with Cooper lake gives different results for this change. Without this change: Both broadwell and Cooper lake will produce "Once upon a time, there was a little" With this change: Broadwell still produces "Once upon a time, there was a little", while Cooperlake produces "Once upon a time, there was a girl". So one possibility is that some XNNPACK kernel for Cooper lake produces a slightly different numerical result that propagates through. Still landing this change since upcasting to double for compute does not seem necessary. Differential Revision: [D65928920](https://our.internmc.facebook.com/intern/diff/D65928920/) [ghstack-poisoned] --- kernels/portable/cpu/op_sigmoid.cpp | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index fcac0955083..34b2ec60dec 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -36,32 +36,26 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { out, "Failed to resize output tensor."); - ScalarType common_type = in.scalar_type(); - ScalarType compute_type = utils::get_compute_type(common_type); - // For integer types, we need to promote to the next higher float type - if (compute_type != ScalarType::Float && compute_type != ScalarType::Double) { - compute_type = ScalarType::Float; - } + ScalarType compute_type = + executorch::runtime::isFloatingType(in.scalar_type()) ? 
in.scalar_type() + : ScalarType::Float; + compute_type = utils::get_compute_type(compute_type); // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "sigmoid.out"; - ET_KERNEL_CHECK( - ctx, executorch::runtime::isRealType(compute_type), InvalidArgument, out); - - ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { utils::apply_unitensor_elementwise_fn( [](const CTYPE_COMPUTE val_in) { - CTYPE_COMPUTE in_casted = static_cast(val_in); CTYPE_COMPUTE out_val = static_cast(1.0) / - (static_cast(1.0) + exp(-in_casted)); + (static_cast(1.0) + exp(-val_in)); return out_val; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, out, - utils::SupportedTensorDtypes::REALHBBF16); + utils::SupportedTensorDtypes::FLOATHBF16); }); return out;