Commit cceef97

Merge branch 'main' into Arm-backend-Add-testing/support-for-Inception_v4-for-Ethos-U85
2 parents b5b012a + 17b933f

34 files changed: +528 −184 lines

.github/scripts/extract_benchmark_results.py

Lines changed: 5 additions & 1 deletion

@@ -349,7 +349,10 @@ def transform(
     # Overwrite the device name here with the job name as it has more information about
     # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung
     for r in benchmark_results:
-        r["deviceInfo"]["device"] = job_name
+        is_private_device = job_report.get("is_private_instance", False)
+        r["deviceInfo"]["device"] = (
+            f"{job_name} (private)" if is_private_device else job_name
+        )

     # From https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
     return [
@@ -363,6 +366,7 @@ def transform(
                 "benchmark_config": json.dumps(benchmark_config),
                 "job_conclusion": "SUCCESS",
                 "job_arn": job_report.get("arn", ""),
+                "instance_arn": job_report.get("instance_arn", ""),
             },
         },
         "model": {

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 7 additions & 9 deletions

@@ -1,18 +1,16 @@
 name: apple-perf (private devices)

 on:
-  # TODO (huydhn): Disable the schedule run until we land the change to add device pool and device name
-  # to separate between public and private iOS devices
-  # schedule:
-  #   - cron: 0 0,4,8,12,16,20 * * *
+  schedule:
+    - cron: 0 0,4,8,12,16,20 * * *
   pull_request:
     paths:
       - .github/workflows/apple-perf-private-device-experiment.yml
-  # push:
-  #   branches:
-  #     - main
-  #   paths:
-  #     - .github/workflows/apple-perf-private-device-experiment.yml
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/apple-perf-private-device-experiment.yml

   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
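
For reference, the re-enabled cron expression fires every four hours on the hour (UTC). A quick way to sanity-check it locally, assuming the third-party croniter package (pip install croniter):

# Print the next few firing times of the workflow's cron schedule.
from datetime import datetime
from croniter import croniter

it = croniter("0 0,4,8,12,16,20 * * *", datetime(2025, 1, 1, 1, 30))
for _ in range(3):
    print(it.get_next(datetime))  # 04:00, 08:00, 12:00 on 2025-01-01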

.lintrunner.toml

Lines changed: 15 additions & 15 deletions

@@ -10,7 +10,7 @@ exclude_patterns = [
     'exir/serde/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -19,7 +19,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -41,7 +41,7 @@ exclude_patterns = [
     'exir/serde/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -50,7 +50,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -83,7 +83,7 @@ exclude_patterns = [
     'runtime/core/portable_type/c10/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -94,7 +94,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -116,7 +116,7 @@ exclude_patterns = [
     '**/third-party/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -126,7 +126,7 @@ command = [
     '@{{PATHSFILE}}',
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -150,7 +150,7 @@ exclude_patterns = [
     '**/third-party/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -191,7 +191,7 @@ exclude_patterns = [
     'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -226,7 +226,7 @@ exclude_patterns = [
     'util/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -275,7 +275,7 @@ exclude_patterns = [
     'util/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -325,7 +325,7 @@ exclude_patterns = [
     'backends/arm/test/**',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -337,7 +337,7 @@ command = [
     '@{{PATHSFILE}}'
 ]
 init_command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
@@ -356,7 +356,7 @@ exclude_patterns = [
     '.lintrunner.toml',
 ]
 command = [
-    'python',
+    'python3',
     '-m',
     'lintrunner_adapters',
     'run',
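
A plausible motivation for the python -> python3 switch: on systems without a python alias (e.g. Debian/Ubuntu without the python-is-python3 package), invoking 'python' fails even though Python 3 is installed. A quick way to check what lintrunner would find on PATH:

# Check which interpreter names resolve on this machine's PATH.
import shutil

print(shutil.which("python"))   # may be None on distros without a 'python' alias
print(shutil.which("python3"))  # e.g. /usr/bin/python3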

README.md

Lines changed: 2 additions & 2 deletions

@@ -49,8 +49,8 @@ Key value propositions of ExecuTorch are:
 ## Getting Started
 To get started you can:

-- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/main/index) to get things running locally and deploy a model to a device
-- Use this [Colab Notebook](https://pytorch.org/executorch/main/getting-started-setup#quick-setup-colab-jupyter-notebook-prototype) to start playing around right away
+- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
+- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
 - Jump straight into LLM use cases by following specific instructions for [Llama](examples/models/llama/README.md) and [Llava](examples/models/llava/README.md)

 ## Feedback and Engagement

backends/arm/operators/op_avg_pool2d.py

Lines changed: 24 additions & 10 deletions

@@ -85,8 +85,12 @@ def define_node(
     ) -> None:
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore

-        input_tensor = inputs[0]
-        assert input_tensor.dtype == ts.DType.INT8
+        supported_dtypes = [ts.DType.INT8]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )

         accumulator_type = ts.DType.INT32

@@ -118,9 +122,12 @@ def define_node(
     ) -> None:
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore

-        assert (
-            inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.FP32
-        ), "Only FP32 and INT8 supported"
+        supported_dtypes = [ts.DType.INT8, ts.DType.FP32]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )

         if inputs[0].dtype == ts.DType.INT8:
             super().define_node(node, tosa_graph, inputs, output)
@@ -205,8 +212,12 @@ def define_node(
     ) -> None:
         import serializer.tosa_serializer as ts  # type: ignore

-        input_tensor = inputs[0]
-        assert input_tensor.dtype == ts.DType.INT8
+        supported_dtypes = [ts.DType.INT8]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )

         accumulator_type = ts.DType.INT32

@@ -241,9 +252,12 @@ def define_node(
     ) -> None:
         import serializer.tosa_serializer as ts  # type: ignore

-        assert (
-            inputs[0].dtype == ts.DType.INT8 or inputs[0].dtype == ts.DType.FP32
-        ), "Only FP32 and INT8 supported"
+        supported_dtypes = [ts.DType.INT8, ts.DType.FP32]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f"IO data type needs to be one of {supported_dtypes}, got "
+                f'"{inputs[0].dtype}"'
+            )

         if inputs[0].dtype == ts.DType.INT8:
             super().define_node(node, tosa_graph, inputs, output)
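
The same validation pattern recurs in all four hunks. A standalone sketch of it, with plain strings standing in for the ts.DType enum values used in the real code:

# Sketch of the assert -> TypeError pattern introduced above. Unlike a bare
# assert, the explicit raise survives `python -O` and names the bad dtype.
INT8, FP32 = "INT8", "FP32"  # stand-ins for ts.DType members

def check_io_dtype(dtype, supported_dtypes=(INT8, FP32)):
    if dtype not in supported_dtypes:
        raise TypeError(
            f"IO data type needs to be one of {list(supported_dtypes)}, got "
            f'"{dtype}"'
        )

check_io_dtype(INT8)      # passes silently
# check_io_dtype("FP16")  # would raise TypeError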

backends/cadence/aot/tests/test_fusion_ops_passes.py

Lines changed: 0 additions & 1 deletion

@@ -328,7 +328,6 @@ def forward(self, x):
         model = M()
         graph_module = export_to_edge(model, (inputs,)).exported_program().graph_module
         graph_module = FuseQuantDequantToRequantizePass()(graph_module).graph_module
-        graph_module.print_readable()

         self.check_op_counts(
             graph_module,

backends/cadence/aot/tests/test_memory_passes.py

Lines changed: 0 additions & 2 deletions

@@ -711,7 +711,6 @@ def forward(self, x) -> torch.Tensor:
             .exported_program()
             .graph_module
         )
-        graph_module.print_readable()
         self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 1)
         self.assertEqual(
             count_node(graph_module, torch.ops.aten._slice_copy_nop.Tensor_out), 0
@@ -741,7 +740,6 @@ def forward(self, x) -> torch.Tensor:
             .exported_program()
             .graph_module
         )
-        graph_module.print_readable()
         self.assertEqual(count_node(graph_module, torch.ops.aten._cat_nop.out), 2)
         self.assertEqual(count_node(graph_module, torch.ops.aten.cat.out), 0)
         self.verify_nop_memory_alloc(graph_module)

backends/cadence/aot/tests/test_remove_ops_passes.py

Lines changed: 0 additions & 2 deletions

@@ -100,7 +100,6 @@ def forward(self, t: torch.Tensor):
         p = RemoveNopAddOpPass()

         graph_after_passes = cast(PassResult, p(graph_module)).graph_module
-        graph_module.print_readable()
         self.assertEqual(
             count_node(graph_after_passes, exir_ops.edge.aten.add.Tensor),
             0,
@@ -140,7 +139,6 @@ def forward(self, t: torch.Tensor):
         p = RemoveNopMulOpPass()

         graph_after_passes = cast(PassResult, p(graph_module)).graph_module
-        graph_module.print_readable()
         self.assertEqual(
             count_node(graph_after_passes, exir_ops.edge.aten.mul.Tensor),
             0,

backends/cadence/hifi/operators/op_bmm.cpp

Lines changed: 5 additions & 5 deletions

@@ -16,8 +16,8 @@ using exec_aten::ScalarType;
 using executorch::runtime::KernelRuntimeContext;
 using executorch::runtime::kTensorDimensionLimit;
 using executorch::runtime::resize_tensor;
-using executorch::runtime::tensors_have_same_dim_order;
 using executorch::runtime::tensor_is_default_dim_order;
+using executorch::runtime::tensors_have_same_dim_order;
 using torch::executor::check_bmm_args;
 using torch::executor::Error;
 using torch::executor::get_bmm_out_target_size;
@@ -78,16 +78,16 @@ Tensor& bmm_out(
   WORD32 out_stride = p;

   WORD32* __restrict__ tmp =
-  (WORD32* __restrict__)kernels::allocate_temp_memory(
-      ctx, (batch_size * m * p) * sizeof(float));
+      (WORD32* __restrict__)kernels::allocate_temp_memory(
+          ctx, (batch_size * m * p) * sizeof(float));

   ET_KERNEL_CHECK(ctx, tmp != nullptr, MemoryAllocationFailed, out);

   tmp[batch_size * m * p] = {0};

   WORD32* __restrict__ p_o =
-  (WORD32* __restrict__)kernels::allocate_temp_memory(
-      ctx, (batch_size * m * p) * sizeof(WORD32));
+      (WORD32* __restrict__)kernels::allocate_temp_memory(
+          ctx, (batch_size * m * p) * sizeof(WORD32));

   ET_KERNEL_CHECK(ctx, p_o != nullptr, MemoryAllocationFailed, out);

backends/cadence/hifi/operators/op_mm.cpp

Lines changed: 3 additions & 3 deletions

@@ -76,8 +76,8 @@ Tensor& mm_out(
   WORD32 out_stride = p;

   WORD32* __restrict__ p_o =
-  (WORD32* __restrict__)kernels::allocate_temp_memory(
-      ctx, (n * p) * sizeof(WORD32));
+      (WORD32* __restrict__)kernels::allocate_temp_memory(
+          ctx, (n * p) * sizeof(WORD32));

   WORD32 p_inp_shape[2];
   p_inp_shape[0] = n;
@@ -146,4 +146,4 @@ Tensor& mm_out(
 } // namespace native
 } // namespace HiFi
 } // namespace impl
-} // namespace cadence
\ No newline at end of file
+} // namespace cadence
