15 changes: 0 additions & 15 deletions backends/vulkan/_passes/TARGETS
@@ -34,20 +34,6 @@ runtime.python_library(
],
)

runtime.python_library(
name = "int4_weight_only_quantizer",
srcs = [
"int4_weight_only_quantizer.py",
],
visibility = [
"//executorch/backends/...",
],
deps = [
"//executorch/backends/vulkan:custom_ops_lib",
"//pytorch/ao:torchao",
]
)

runtime.python_library(
name = "squeeze_unsqueeze_inputs",
srcs = [
@@ -161,7 +147,6 @@ runtime.python_library(
":fuse_patterns",
":fuse_quantized_ops",
":insert_prepack_nodes",
":int4_weight_only_quantizer",
":remove_asserts",
":remove_local_scalar_dense",
":remove_redundant_ops",
4 changes: 0 additions & 4 deletions backends/vulkan/_passes/__init__.py
@@ -12,9 +12,6 @@
FuseQuantizedOpsTransform,
)
from executorch.backends.vulkan._passes.insert_prepack_nodes import insert_prepack_nodes
from executorch.backends.vulkan._passes.int4_weight_only_quantizer import (
VkInt4WeightOnlyQuantizer,
)
from executorch.backends.vulkan._passes.remove_asserts import (
remove_asserts,
RemoveAssertsTransform,
@@ -35,7 +32,6 @@
"FusePatternsPass",
"FuseQuantizedOpsTransform",
"insert_prepack_nodes",
"VkInt4WeightOnlyQuantizer",
"remove_asserts",
"RemoveAssertsTransform",
"RemoveLocalScalarDenseOpsTransform",
281 changes: 0 additions & 281 deletions backends/vulkan/_passes/fuse_quantized_ops.py
@@ -210,278 +210,6 @@ def fuse_into_linear_qcnw_node(
graph_module.graph.erase_node(dq_weight_node)


#########################
## linear_qta8a_qga4w ##
#########################


def _is_dequantize_affine_node(node: torch.fx.Node) -> bool:
"""Check if a node is a dequantize_affine operation."""
return (
node.op == "call_function"
and node.target is not None
and hasattr(node.target, "__name__")
and "dequantize_affine" in getattr(node.target, "__name__", "")
)


def _is_view_copy_node(node: torch.fx.Node) -> bool:
"""Check if a node is a view_copy operation."""
return (
node.op == "call_function"
and node.target is not None
and hasattr(node.target, "__name__")
and "view_copy" in getattr(node.target, "__name__", "")
)


def _validate_qta8a_qga4w_nodes(
input_node: torch.fx.node.Argument, weight_node: torch.fx.node.Argument
) -> Optional[torch.fx.Node]:
"""
Validate input and weight nodes for QTA8A_QGA4W pattern.
Returns the actual input node (after handling view operations) or None if invalid.
"""
# Type checking - ensure we have torch.fx.Node objects
if not isinstance(weight_node, torch.fx.Node) or not isinstance(
input_node, torch.fx.Node
):
return None

# Input may be preprocessed with a view node
actual_input_node = input_node
if _is_view_copy_node(input_node):
actual_input_node = input_node.args[0]
if not isinstance(actual_input_node, torch.fx.Node):
return None

# Check if input is dequantized with dequantize_affine (from dynamic quantization)
if not _is_dequantize_affine_node(actual_input_node):
return None

# Check if weight is dequantized with dequantize_affine
if not _is_dequantize_affine_node(weight_node):
return None

return actual_input_node


def _extract_weight_params(
program: ExportedProgram, weight_node: torch.fx.Node
) -> Optional[Tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node]]:
"""Extract and validate weight parameters from dequantize_affine node."""
# Get the original quantized weight and quantization parameters
if len(weight_node.args) < 4:
return None

orig_weight = weight_node.args[0]
weight_scales = weight_node.args[2]
weight_zeros = weight_node.args[3]

# Type checking
if not isinstance(orig_weight, torch.fx.Node) or not is_param_node(
program, orig_weight
):
return None
if not isinstance(weight_scales, torch.fx.Node) or not is_param_node(
program, weight_scales
):
return None
if not isinstance(weight_zeros, torch.fx.Node) or not is_param_node(
program, weight_zeros
):
return None

return orig_weight, weight_scales, weight_zeros


def _validate_4bit_quantization(weight_tensor: torch.Tensor) -> bool:
"""Check if weight tensor is quantized to 4 bits (values in [-8, 7] range)."""
quant_min = weight_tensor.min().item()
quant_max = weight_tensor.max().item()
return quant_min >= -8 and quant_max <= 7


def _calculate_group_size(
orig_weight_tensor: torch.Tensor, weight_scales_tensor: torch.Tensor
) -> Optional[int]:
"""Calculate and validate group size from weight and scales tensors."""
out_features, in_features = orig_weight_tensor.shape

if len(weight_scales_tensor.shape) != 2:
return None

scales_out_features, num_groups = weight_scales_tensor.shape

if scales_out_features != out_features:
return None

group_size = in_features // num_groups
if in_features % group_size != 0:
return None

return group_size


def matches_linear_qta8a_qga4w_pattern(
program: ExportedProgram, node: torch.fx.Node
) -> Optional[Tuple[int, int]]:
"""
Checks if the nodes surrounding a linear node match the pattern for dynamic
activation + grouped weight quantized linear (QTA8A_QGA4W).

This pattern involves:
1. Dynamic quantization of input activations (8-bit)
2. Grouped quantization of weights (4-bit with group size)

The expected pattern from Int8DynActInt4WeightQuantizer is:
scale, zero_point = choose_qparams_affine(input)
quantized_input = quantize_affine(input, scale, zero_point)
dequantized_input = dequantize_affine(quantized_input, ...)
dequantized_weight = dequantize_affine(weight, weight_scales, weight_zeros)
output = linear(dequantized_input, dequantized_weight)

If the pattern matches, return (group_size, weight_bits), otherwise None.
"""
if not utils.is_linear_node(node):
return None

input_node = node.args[0]
weight_node = node.args[1]

# Validate nodes and get actual input node
actual_input_node = _validate_qta8a_qga4w_nodes(input_node, weight_node)
if actual_input_node is None:
return None

# Extract weight parameters
if not isinstance(weight_node, torch.fx.Node):
return None
weight_params = _extract_weight_params(program, weight_node)
if weight_params is None:
return None

orig_weight, weight_scales, weight_zeros = weight_params

# Get tensors to analyze the quantization scheme
orig_weight_tensor = get_param_tensor(program, orig_weight)
weight_scales_tensor = get_param_tensor(program, weight_scales)
weight_zeros_tensor = get_param_tensor(program, weight_zeros)

if not isinstance(orig_weight_tensor, torch.Tensor):
return None
if not isinstance(weight_scales_tensor, torch.Tensor):
return None
if not isinstance(weight_zeros_tensor, torch.Tensor):
return None

# Check if weight is quantized to 4 bits
if not _validate_4bit_quantization(orig_weight_tensor):
return None

# Calculate group size
group_size = _calculate_group_size(orig_weight_tensor, weight_scales_tensor)
if group_size is None:
return None

# Verify this is 4-bit grouped quantization
weight_bits = 4

return group_size, weight_bits


def fuse_into_linear_qta8a_qga4w_node(
program: ExportedProgram,
graph_module: torch.fx.GraphModule,
linear_node: torch.fx.Node,
group_size: int,
weight_bits: int,
) -> None:
"""
Fuse the dynamic activation + grouped weight quantized linear pattern into
a single linear_qta8a_qga4w operator.

The pattern:
dequantized_input = dequantize_affine(quantized_input, block_size, scale, zero_point, ...)
dequantized_weight = dequantize_affine(weight, block_size, weight_scales, weight_zeros, ...)
output = linear(dequantized_input, dequantized_weight)

Becomes:
output = linear_qta8a_qga4w(quantized_input, input_scale, input_zero_point,
weight, group_size, weight_scales, weight_zeros)
"""
dq_input_node = linear_node.args[0]
dq_weight_node = linear_node.args[1]

assert isinstance(dq_input_node, torch.fx.Node)

input_view_node = None
# Input may be preprocessed with a view node
if (
dq_input_node.op == "call_function"
and dq_input_node.target is not None
and hasattr(dq_input_node.target, "__name__")
and "view_copy" in getattr(dq_input_node.target, "__name__", "")
):
input_view_node = dq_input_node
dq_input_node = dq_input_node.args[0]
assert isinstance(dq_input_node, torch.fx.Node)

assert isinstance(dq_input_node, torch.fx.Node)
assert isinstance(dq_weight_node, torch.fx.Node)

# Get the quantized input and quantization parameters from the input dequantize_affine node
# Args: (input, block_size, scale, zero_point, input_dtype, quant_min, quant_max, output_dtype)
quantized_input = dq_input_node.args[0]
input_scale = dq_input_node.args[2] # scale is the 3rd argument
input_zero_point = dq_input_node.args[3] if len(dq_input_node.args) > 3 else None

# Get the weight and its quantization parameters from dequantize_affine
# Args: (weight, block_size, weight_scales, weight_zeros, input_dtype, quant_min, quant_max, output_dtype)
orig_weight = dq_weight_node.args[0]
weight_scales = dq_weight_node.args[2]
weight_zeros = dq_weight_node.args[3]

# Pack the 4-bit weight tensor for efficient storage
assert isinstance(orig_weight, torch.fx.Node)
orig_weight_tensor = get_param_tensor(program, orig_weight)
assert isinstance(orig_weight_tensor, torch.Tensor)
packed_weight_tensor = pack_4bit_weight_tensor(orig_weight_tensor)
utils.update_program_state_dict(
program,
orig_weight.name,
packed_weight_tensor,
)
# Update the metadata to reflect the new packed shape
orig_weight.meta["val"] = orig_weight.meta["val"][:, ::2].to(torch.uint8)

# Create the linear_qta8a_qga4w node
with graph_module.graph.inserting_before(linear_node):
linear_qta8a_qga4w_node = graph_module.graph.create_node(
"call_function",
exir_ops.edge.et_vk.linear_qta8a_qga4w.default,
(
quantized_input, # quantized input (int8)
input_scale, # mat1_scale
input_zero_point, # mat1_zero_point
orig_weight, # mat2_data (packed 4-bit weights)
group_size, # group_size (int)
weight_scales, # weight_scales
weight_zeros, # weight_zeros
),
)

# Replace the linear node with the new fused node
linear_node.replace_all_uses_with(linear_qta8a_qga4w_node)

# Erase nodes in the correct order (users first, then dependencies)
graph_module.graph.erase_node(linear_node)
if input_view_node is not None:
graph_module.graph.erase_node(input_view_node)
graph_module.graph.erase_node(dq_weight_node)
graph_module.graph.erase_node(dq_input_node)


class FuseQuantizedOpsTransform(ExportPass):
def __init__(self, exported_program: ExportedProgram) -> None:
super().__init__()
@@ -498,15 +226,6 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
)
continue

# Check for linear_qta8a_qga4w pattern (dynamic activation + grouped weight quantization)
qta8a_qga4w_details = matches_linear_qta8a_qga4w_pattern(self.program, node)
if qta8a_qga4w_details is not None:
group_size, weight_bits = qta8a_qga4w_details
fuse_into_linear_qta8a_qga4w_node(
self.program, graph_module, node, group_size, weight_bits
)
continue

graph_module.recompile()
dead_code_elimination_pass(graph_module)

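Note on the removed packing step: the deleted fusion calls pack_4bit_weight_tensor before swapping in linear_qta8a_qga4w, which is why the weight metadata is updated to a [:, ::2] uint8 view. The helper's implementation is not part of this diff, so the sketch below is only illustrative: it assumes two values per byte, the even column in the low nibble, and a +8 offset mapping [-8, 7] into [0, 15]. Those layout choices are assumptions, not necessarily what the removed helper did.

import torch

def pack_4bit_weight_tensor_sketch(weight: torch.Tensor) -> torch.Tensor:
    # Expect a 2D weight whose values are already quantized into [-8, 7].
    assert weight.dim() == 2 and weight.size(1) % 2 == 0
    assert weight.min().item() >= -8 and weight.max().item() <= 7
    # Map signed 4-bit values into unsigned nibbles in [0, 15].
    w = (weight + 8).to(torch.uint8)
    low = w[:, ::2]    # even columns -> low nibble (assumed layout)
    high = w[:, 1::2]  # odd columns -> high nibble (assumed layout)
    # Two nibbles per byte halves the last dimension, matching the [:, ::2] metadata update.
    return (high << 4) | low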