diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
index 4237ae7b3a7..338d997297d 100644
--- a/extension/llm/export/builder.py
+++ b/extension/llm/export/builder.py
@@ -16,6 +16,7 @@
 from executorch.backends.transforms.duplicate_dynamic_quant_chain import (
     DuplicateDynamicQuantChainPass,
 )
+from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass
 from executorch.exir import EdgeProgramManager
 from executorch.exir.backend.partitioner import Partitioner
@@ -382,6 +383,10 @@ def to_executorch(self) -> "LLMEdgeManager":
             ExecutorchBackendConfig(
                 extract_delegate_segments=True,
                 passes=[
+                    # If there are Linear operations left in the graph, let's execute
+                    # them with the optimized op_linear rather than materializing a
+                    # transpose followed by a regular op_mm.
+                    ConvertToLinearPass(),
                     QuantFusionPass(),
                 ],
                 memory_planning_pass=MemoryPlanningPass(
diff --git a/kernels/optimized/optimized-oss.yaml b/kernels/optimized/optimized-oss.yaml
index f79d652b91d..797744f3bd4 100644
--- a/kernels/optimized/optimized-oss.yaml
+++ b/kernels/optimized/optimized-oss.yaml
@@ -45,6 +45,11 @@
     - arg_meta: null
      kernel_name: torch::executor::opt_le_tensor_out
 
+- op: linear.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_linear_out
+
 - op: mul.out
   kernels:
     - arg_meta: null
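
For context, here is a minimal usage sketch (not part of the diff) of how the new pass participates in lowering outside of `LLMEdgeManager`. It assumes an `EdgeProgramManager` named `edge_manager` produced by `to_edge()`; the config mirrors the builder.py change above.

```python
# Sketch only: `edge_manager` is assumed to be an EdgeProgramManager returned by
# executorch.exir.to_edge(); the pass and config mirror the builder.py change above.
from executorch.backends.xnnpack.passes.convert_to_linear import ConvertToLinearPass
from executorch.exir import ExecutorchBackendConfig

executorch_program = edge_manager.to_executorch(
    ExecutorchBackendConfig(
        extract_delegate_segments=True,
        passes=[
            # Fold any remaining transpose + mm pattern back into a single linear op
            # so the runtime dispatches to the optimized opt_linear_out kernel
            # registered in optimized-oss.yaml.
            ConvertToLinearPass(),
        ],
    )
)
```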