diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index 9cf6c22c6ca..87aa86154ee 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -25,3 +25,6 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
+    - NAME: conv2d_dw_output_tile_3x3_b1x1
+      BATCH_SIZE_X: 1
+      BATCH_SIZE_Y: 1
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl
new file mode 100644
index 00000000000..aee920bd84a
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.glsl
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+
+#define TILE_M4 ${TILE_M4}
+#define TILE_K4 ${TILE_K4}
+#define TILE_N4 ${TILE_N4}
+
+#define TILE_M ${TILE_M}
+#define TILE_K ${TILE_K4 * 4}
+#define TILE_N ${TILE_N4 * 4}
+
+${define_required_extensions(STORAGE, DTYPE)}
+
+layout(std430) buffer;
+
+#include "common.glslh"
+
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "texture2d")}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, "texture2d")}
+
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+${layout_declare_ubo(B, "ivec4", "out_sizes")}
+
+layout(push_constant) uniform restrict Block {
+  int stride_h;
+  int stride_w;
+  int padding_h;
+  int padding_w;
+  float out_min;
+  float out_max;
+};
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+${layout_declare_spec_const(C, "int", "stride_1_padding_0", "0")}
+${layout_declare_spec_const(C, "int", "activation_type", "0")}
+
+#include "linear_fp_input_tile.glslh"
+#include "linear_fp_packed_weight_tile_load.glslh"
+#include "linear_fp_output_tile_fp_compute.glslh"
+
+void load_input_tile_with_checks(
+    out FPInputTile tile,
+    const int k4_start,
+    const int m_start,
+    const int K4,
+    const int M,
+    const int W_out,
+    const int W_in,
+    const int H_in) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int k4 = 0; k4 < TILE_K4; ++k4) {
+      if (k4_start + k4 < K4 && m_start + m < M) {
+        if (stride_1_padding_0 != 0) {
+          const int spatial = m_start + m;
+          tile.data[m][k4] =
+              texelFetch(t_in, ivec3(spatial % W_out, spatial / W_out, k4_start + k4), 0);
+        } else {
+          const int out_spatial = m_start + m;
+          const int out_x = out_spatial % W_out;
+          const int out_y = out_spatial / W_out;
+          const int in_x = out_x * stride_w - padding_w;
+          const int in_y = out_y * stride_h - padding_h;
+          if (in_x >= 0 && in_x < W_in && in_y >= 0 && in_y < H_in) {
+            tile.data[m][k4] =
+                texelFetch(t_in, ivec3(in_x, in_y, k4_start + k4), 0);
+          } else {
+            tile.data[m][k4] = VEC4_T(0.0);
+          }
+        }
+      } else {
+        tile.data[m][k4] = VEC4_T(0.0);
+      }
+    }
+  }
+}
+
+void store_output_tile_with_checks(
+    const FPOutTile out_tile,
+    const int n4_start,
+    const int m_start,
+    const int N4,
+    const int M,
+    const int W_out) {
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      if (m_start + m < M && n4_start + n4 < N4) {
+        const int spatial = m_start + m;
+        VEC4_T texel = out_tile.data[m][n4];
+        if (activation_type == 1) {
+          texel = max(texel, VEC4_T(0.0));
+        } else if (activation_type == 2) {
+          texel = clamp(texel, VEC4_T(out_min), VEC4_T(out_max));
+        }
+        imageStore(t_out, ivec3(spatial % W_out, spatial / W_out, n4_start + n4), texel);
+      }
+    }
+  }
+}
+
+void main() {
+  const int tile_idx_n = int(gl_GlobalInvocationID.x);
+  const int tile_idx_m = int(gl_GlobalInvocationID.y);
+
+  const int n4_start = tile_idx_n * TILE_N4;
+  const int m_start = tile_idx_m * TILE_M;
+
+  const int W_in = in_sizes.x;
+  const int H_in = in_sizes.y;
+  const int K = in_sizes.z;
+  const int K4 = div_up_4(K);
+
+  const int W_out = out_sizes.x;
+  const int H_out = out_sizes.y;
+  const int M = W_out * H_out;
+  const int N = out_sizes.z;
+  const int N4 = div_up_4(N);
+
+  if (n4_start >= N4 || m_start >= M) {
+    return;
+  }
+
+  FPOutTile out_tile;
+  initialize(out_tile);
+
+  FPInputTile in_tile;
+  FPWeightTile w_tile;
+
+  for (int k4 = 0; k4 < K4; k4++) {
+    load_input_tile_with_checks(in_tile, k4, m_start, K4, M, W_out, W_in, H_in);
+    load_packed_weight_tile_with_checks(w_tile, n4_start, k4, 0, N4, K4);
+    fp_accumulate_with_fp_weight(out_tile, in_tile, w_tile);
+  }
+
+  // Apply bias
+  [[unroll]] for (int m = 0; m < TILE_M; ++m) {
+    [[unroll]] for (int n4 = 0; n4 < TILE_N4; ++n4) {
+      if (n4_start + n4 < N4) {
+        out_tile.data[m][n4] +=
+            texelFetch(t_bias, ivec2(n4_start + n4, 0), 0);
+      }
+    }
+  }
+
+  store_output_tile_with_checks(out_tile, n4_start, m_start, N4, M, W_out);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml
new file mode 100644
index 00000000000..037dfc35c89
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_pw_tiled.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_pw_tiled:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+    TILE_M4: 1
+    TILE_K4: 1
+    TILE_N4: 1
+    TILE_M: 4
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: conv2d_pw_tiled
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
new file mode 100644
index 00000000000..a9d8483b2e2
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dDW.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+
+namespace vkcompute {
+
+//
+// Weight prepack
+//
+
+ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse = */ true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      // Specialization constants
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+//
+// Shader selection
+//
+
+std::string pick_conv2d_dw_shader(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(kShaderNameReserve);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (stride_equals_dilation && graph.device_is_mali()) {
+      kernel_name += "_b1x1";
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+//
+// Workgroup size
+//
+
+utils::uvec3 conv2d_dw_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      // sned output_tile shaders: no batch division, just flatten W*H
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    // stride==dilation output_tile shaders: apply batch division
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  // Base conv2d_dw shader: fully linearized dispatch
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+utils::uvec3 conv2d_dw_local_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+//
+// Dispatch node
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+void add_conv2d_dw_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef arg_weight,
+    const ValueRef arg_bias,
+    const ValueRef weight_data,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& kernel_name,
+    const Kernel2dParams& kernel_params,
+    const Conv2dDWParams& extra_params,
+    const OutputParams& out_params) {
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  // transposed is always false for depthwise, output_padding unused
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size,
+      conv2d_dw_local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      // Shader params buffers
+      param_buffers,
+      // Push Constants
+      push_constants,
+      // Specialization Constants
+      {},
+      // Resize Args
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      // Resizing Logic
+      resize_conv2d_node));
+}
+
+//
+// High level operator impl
+//
+
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /* transposed = */ false,
+      /* storage_type = */ utils::kTexture2D,
+      /* memory_layout = */ utils::kWidthPacked);
+
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  if (in_sizes.at(0) > 1) {
+    VK_THROW("conv2d: input batch size > 1 is not supported yet!");
+  }
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only = */ false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {out_min_val, out_max_val};
+
+  std::string kernel_name = pick_conv2d_dw_shader(
+      graph, weight_data, out, stride_equals_dilation, clamp_out);
+
+  add_conv2d_dw_node(
+      graph,
+      in,
+      arg_weight,
+      arg_bias,
+      weight_data,
+      stride,
+      padding,
+      dilation,
+      out,
+      kernel_name,
+      kernel_params,
+      extra_params,
+      out_params);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp
new file mode 100644
index 00000000000..2863d80aa0e
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Conv2dPW.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+//
+// Shader dispatch utilities
+//
+
+void resize_conv2d_pw_tiled_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  const ValueRef out = args.at(0).refs.at(0);
+  const ValueRef self = args.at(1).refs.at(0);
+
+  std::vector<int64_t> self_sizes = graph->sizes_of(self);
+  TensorRefPtr weight_ref = graph->get_tref(extra_args.at(0));
+  const auto& weight_sizes = weight_ref->sizes;
+
+  const auto stride_list = graph->get_int_list(extra_args.at(1));
+  const auto padding_list = graph->get_int_list(extra_args.at(2));
+
+  const int64_t stride_h = stride_list->at(0);
+  const int64_t stride_w = stride_list->at(1);
+  const int64_t padding_h = padding_list->at(0);
+  const int64_t padding_w = padding_list->at(1);
+
+  const int64_t in_h = self_sizes.at(self_sizes.size() - 2);
+  const int64_t in_w = self_sizes.at(self_sizes.size() - 1);
+
+  // For 1x1 kernel with dilation=1: out = (in + 2*padding - 1) / stride + 1
+  const int64_t out_h = (in_h + 2 * padding_h - 1) / stride_h + 1;
+  const int64_t out_w = (in_w + 2 * padding_w - 1) / stride_w + 1;
+
+  std::vector<int64_t> new_out_sizes = self_sizes;
+  new_out_sizes.at(self_sizes.size() - 3) = weight_sizes.at(0);
+  new_out_sizes.at(self_sizes.size() - 2) = out_h;
+  new_out_sizes.at(self_sizes.size() - 1) = out_w;
+
+  graph->virtual_resize(out, new_out_sizes);
+}
+
+vkapi::ShaderInfo pick_conv2d_pw_tiled_shader(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  std::string kernel_name = "conv2d_pw_tiled";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, graph->dtype_of(out));
+  return VK_KERNEL_FROM_STR(kernel_name);
+}
+
+utils::uvec3 pick_conv2d_pw_tiled_global_wg_size(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)shader;
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+  uint32_t W = graph->size_at<uint32_t>(-1, out);
+  uint32_t H = graph->size_at<uint32_t>(-2, out);
+  uint32_t C_out = graph->size_at<uint32_t>(-3, out);
+  uint32_t M = H * W;
+  uint32_t N4 = utils::div_up_4(C_out);
+  // TILE_N4=1, TILE_M=4
+  return {N4, utils::div_up(M, 4u), 1};
+}
+
+//
+// Prepack nodes
+//
+
+struct PackParams {
+  int32_t N;
+  int32_t K;
+  int32_t B;
+  int32_t is_transposed;
+};
+
+ValueRef prepack_conv2d_pw_weight(
+    ComputeGraph& graph,
+    const ValueRef weight_data) {
+  const std::vector<int64_t> weight_sizes = graph.sizes_of(weight_data);
+  const int64_t N = weight_sizes.at(0); // C_out
+  const int64_t K = weight_sizes.at(1); // C_in
+  const int64_t N4 = utils::div_up(N, int64_t(4));
+  const int64_t K4 = utils::div_up(K, int64_t(4));
+
+  const int64_t output_height = K4;
+  const int64_t output_width = N4 * 4 * 4;
+
+  utils::StorageType weight_storage = utils::kTexture2D;
+  uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim();
+  if (output_width / 4 > max_extent ||
+      static_cast<uint32_t>(output_height) > max_extent) {
+    weight_storage = utils::kBuffer;
+  }
+
+  ValueRef packed_weight = graph.add_tensor(
+      {output_height, output_width},
+      graph.dtype_of(weight_data),
+      weight_storage,
+      utils::kWidthPacked);
+
+  utils::uvec3 global_wg_size = {
+      utils::safe_downcast<uint32_t>(N4),
+      utils::safe_downcast<uint32_t>(K4),
+      1u};
+
+  PackParams pack_params{
+      utils::safe_downcast<int32_t>(N), utils::safe_downcast<int32_t>(K), 1, 1};
+
+  std::string pack_kernel_name = "pack_fp_linear_weight";
+  add_storage_type_suffix(pack_kernel_name, weight_storage);
+  add_dtype_suffix(pack_kernel_name, graph.dtype_of(weight_data));
+
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(pack_kernel_name),
+      global_wg_size,
+      graph.create_local_wg_size(global_wg_size),
+      weight_data,
+      packed_weight,
+      {},
+      {},
+      {PushConstantDataInfo(&pack_params, sizeof(PackParams))}));
+
+  return packed_weight;
+}
+
+//
+// Dispatch nodes
+//
+
+void add_conv2d_pw_tiled_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef packed_weight,
+    const ValueRef packed_bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const ValueRef weight_data,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  int32_t stride_h, stride_w, padding_h, padding_w;
+  {
+    const auto stride_list = graph.get_int_list(stride);
+    const auto padding_list = graph.get_int_list(padding);
+    stride_h = utils::safe_downcast<int32_t>(stride_list->at(0));
+    stride_w = utils::safe_downcast<int32_t>(stride_list->at(1));
+    padding_h = utils::safe_downcast<int32_t>(padding_list->at(0));
+    padding_w = utils::safe_downcast<int32_t>(padding_list->at(1));
+  }
+
+  bool s1p0 =
+      stride_h == 1 && stride_w == 1 && padding_h == 0 && padding_w == 0;
+
+  utils::ivec4 stride_padding{stride_h, stride_w, padding_h, padding_w};
+
+  struct ClampParams final {
+    float out_min;
+    float out_max;
+  };
+  ClampParams clamp_params{out_min_val, out_max_val};
+
+  ValueRef clamp_out_ref = graph.add_scalar(clamp_out);
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      pick_conv2d_pw_tiled_shader,
+      pick_conv2d_pw_tiled_global_wg_size,
+      pick_hw_square_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::kWrite}, {{in, packed_weight, packed_bias}, vkapi::kRead}},
+      // Shader params buffers
+      {graph.sizes_ubo(in), graph.sizes_ubo(out)},
+      // Push Constants
+      {PushConstantDataInfo(&stride_padding, sizeof(stride_padding)),
+       PushConstantDataInfo(&clamp_params, sizeof(clamp_params))},
+      // Specialization Constants
+      // activation_type: 0=none, 1=relu, 2=clamp
+      {s1p0 ? 1 : 0, clamp_out ? 2 : 0},
+      // Resize Args
+      {weight_data, stride, padding, clamp_out_ref},
+      // Resizing Logic
+      resize_conv2d_pw_tiled_node));
+}
+
+//
+// High level operator impl
+//
+
+void conv2d_pw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const bool transposed_val,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val) {
+  ValueRef packed_weight = prepack_conv2d_pw_weight(graph, weight_data);
+
+  ValueRef packed_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      transposed_val,
+      utils::kTexture2D,
+      utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  const std::vector<int64_t> in_sizes = graph.sizes_of(in);
+  if (in_sizes.at(0) > 1) {
+    VK_THROW("conv2d: input batch size > 1 is not supported yet!");
+  }
+
+  add_conv2d_pw_tiled_node(
+      graph,
+      in,
+      packed_weight,
+      packed_bias,
+      stride,
+      padding,
+      out,
+      weight_data,
+      clamp_out,
+      out_min_val,
+      out_max_val);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 2bf3f8f726d..077ce285cfc 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+
 #include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
 
 #include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
@@ -137,18 +139,6 @@ vkapi::ShaderInfo get_conv2d_shader(
   switch (method) {
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
-      if (!prepack_weights) {
-        if (!stride_equals_dilation) {
-          kernel_name += "_sned";
-        }
-        const auto& weight_sizes = graph.get_tref(weight)->sizes;
-        if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-          kernel_name += "_output_tile_3x3";
-        }
-        if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-          kernel_name += "_output_tile_5x5";
-        }
-      }
       break;
     case Conv2dMethod::Pointwise:
       if (prepack_weights) {
@@ -294,17 +284,6 @@ Conv2dMethod get_conv2d_method(
   return Conv2dMethod::SlidingWindow;
 }
 
-utils::uvec2 get_conv2d_dw_dispatch_divisor(
-    const std::vector<int64_t>& weight_sizes) {
-  if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
-    return {4u, 2u};
-  }
-  if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) {
-    return {4u, 2u};
-  }
-  return {4u, 2u};
-}
-
 utils::uvec3 create_conv2d_global_wg_size(
     ComputeGraph& graph,
     const Conv2dMethod method,
@@ -317,14 +296,6 @@ utils::uvec3 create_conv2d_global_wg_size(
         utils::div_up(image_extents[0u], 1u),
         utils::div_up(image_extents[1u], 4u),
         image_extents[2u]};
-  } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) {
-    const utils::uvec3 image_extents = graph.create_global_wg_size(out);
-    const utils::uvec2 div =
-        get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes);
-    return {
-        utils::div_up(image_extents[0], div[0]),
-        utils::div_up(image_extents[1], div[1]),
-        image_extents[2]};
   } else {
     return graph.create_global_wg_size(out);
   }
@@ -341,10 +312,7 @@ utils::uvec3 conv2d_global_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     // Check if it's pointwise by examining weight sizes
@@ -367,21 +335,7 @@ utils::uvec3 conv2d_global_wg_size(
   utils::uvec3 wg_size = create_conv2d_global_wg_size(
       *graph, method, out, weight_data, stride_equals_dilation);
 
-  if (method == Conv2dMethod::Depthwise) {
-    // The output_tile shaders (conv2d_dw_output_tile,
-    // conv2d_dw_sned_output_tile) use a 2D dispatch: (x_tile, y_tile) packed
-    // into glb_x, channel in glb_y. The base conv2d_dw shader uses a 1D
-    // dispatch: all (x, y, channel) packed into glb_x. For the base shader, we
-    // must use {W*H*C_packed, 1, 1}.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-    if (uses_output_tile) {
-      wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-    } else {
-      const utils::uvec3 base_extents = graph->create_global_wg_size(out);
-      wg_size = {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
-    }
-  } else if (method == Conv2dMethod::Pointwise) {
+  if (method == Conv2dMethod::Pointwise) {
     wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
 
     if (shader.kernel_name.find("s1p0") != std::string::npos) {
@@ -404,10 +358,7 @@ utils::uvec3 conv2d_local_wg_size(
 
   // Determine method from shader name
   Conv2dMethod method;
-  if (shader.kernel_name.find("conv2d_dw") != std::string::npos) {
-    method = Conv2dMethod::Depthwise;
-  } else if (
-      shader.kernel_name.find("conv2d_pw") != std::string::npos ||
+  if (shader.kernel_name.find("conv2d_pw") != std::string::npos ||
       (shader.kernel_name.find("conv2d") != std::string::npos &&
        shader.kernel_name.find("conv_transpose2d") == std::string::npos)) {
     method = Conv2dMethod::Pointwise;
@@ -425,8 +376,6 @@ utils::uvec3 conv2d_local_wg_size(
       local_wg_size_y = 2;
     }
     return {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    return {64, 1, 1};
   } else {
     return graph->create_local_wg_size(global_workgroup_size);
   }
@@ -481,6 +430,37 @@ void add_conv2d_node(
   const Conv2dMethod method =
       get_conv2d_method(graph, weight_data, groups_val, transposed_val);
 
+  // Use tiled path for all pointwise conv2d
+  if (method == Conv2dMethod::Pointwise) {
+    return conv2d_pw_impl(
+        graph,
+        in,
+        weight_data,
+        bias,
+        stride,
+        padding,
+        out,
+        transposed_val,
+        clamp_out,
+        out_min_val,
+        out_max_val);
+  }
+
+  if (method == Conv2dMethod::Depthwise) {
+    return conv2d_dw_impl(
+        graph,
+        in,
+        weight_data,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        clamp_out,
+        out_min_val,
+        out_max_val);
+  }
+
   ValueRef arg_weight = prepack_weights(graph, weight_data, method);
   ValueRef arg_bias = prepack_biases(
       graph,
@@ -529,101 +509,13 @@ void add_conv2d_node(
       stride_equals_dilation,
       stride_1_padding_0);
 
-  utils::uvec3 wg_size = create_conv2d_global_wg_size(
-      graph, method, out, weight_data, stride_equals_dilation);
-
-  utils::uvec3 local_wg_size;
-  if (method == Conv2dMethod::Depthwise || method == Conv2dMethod::Pointwise) {
-    wg_size = {wg_size[0] * wg_size[1], wg_size[2], 1};
-  }
-
-  if (method == Conv2dMethod::Pointwise) {
-    uint32_t local_wg_size_y = 1;
-    if (wg_size[1] % 8 == 0) {
-      local_wg_size_y = 8;
-    } else if (wg_size[1] % 4 == 0) {
-      local_wg_size_y = 4;
-    } else if (wg_size[1] % 2 == 0) {
-      local_wg_size_y = 2;
-    }
-    local_wg_size = {64 / local_wg_size_y, local_wg_size_y, 1};
-  } else if (method == Conv2dMethod::Depthwise) {
-    local_wg_size = {64, 1, 1};
-  } else {
-    local_wg_size = graph.create_local_wg_size(wg_size);
-  }
-
-  vkapi::ParamsBindList param_buffers;
-  std::vector<PushConstantDataInfo> push_constants;
-  if (method == Conv2dMethod::Pointwise) {
-    const utils::ivec4 kernel_param_stride_pad = {
-        kernel_params.stride[0],
-        kernel_params.stride[1],
-        kernel_params.padding[0],
-        kernel_params.padding[1],
-    };
-
-    struct Conv2dPWParams final {
-      int in_group_size;
-      int dummy_padding;
-      OutputParams out_params;
-    } param{extra_params.in_group_size, 0, out_params};
-
-    push_constants = {
-        graph.logical_limits_pc_of(out),
-        PushConstantDataInfo(
-            &kernel_param_stride_pad, sizeof(kernel_param_stride_pad)),
-        PushConstantDataInfo(&param, sizeof(param)),
-    };
-  } else if (method == Conv2dMethod::Depthwise) {
-    // output_tile variants use push constants; the base conv2d_dw shader uses
-    // UBOs. Distinguish by checking if "_output_tile" is in the shader name.
-    const bool uses_output_tile =
-        shader.kernel_name.find("_output_tile") != std::string::npos;
-
-    if (uses_output_tile) {
-      const utils::ivec4 kernel_param_size_stride = {
-          kernel_params.kernel_size[0],
-          kernel_params.kernel_size[1],
-          kernel_params.stride[0],
-          kernel_params.stride[1]};
-
-      const utils::ivec4 kernel_param_pad_dial = {
-          kernel_params.padding[0],
-          kernel_params.padding[1],
-          kernel_params.dilation[0],
-          kernel_params.dilation[1]};
-
-      push_constants = {
-          graph.logical_limits_pc_of(out),
-          graph.sizes_pc_of(in),
-          PushConstantDataInfo(
-              &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
-          PushConstantDataInfo(
-              &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
-          PushConstantDataInfo(
-              &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
-          PushConstantDataInfo(&out_params, sizeof(out_params)),
-      };
-    } else {
-      // Base conv2d_dw shader uses UBOs, same as SlidingWindow case
-      param_buffers = {
-          graph.logical_limits_ubo(out),
-          graph.sizes_ubo(in),
-          graph.create_params_buffer(kernel_params),
-          graph.create_params_buffer(extra_params),
-          graph.create_params_buffer(out_params),
-      };
-    }
-  } else {
-    param_buffers = {
-        graph.logical_limits_ubo(out),
-        graph.sizes_ubo(in),
-        graph.create_params_buffer(kernel_params),
-        graph.create_params_buffer(extra_params),
-        graph.create_params_buffer(out_params),
-    };
-  }
+  vkapi::ParamsBindList param_buffers = {
+      graph.logical_limits_ubo(out),
+      graph.sizes_ubo(in),
+      graph.create_params_buffer(kernel_params),
+      graph.create_params_buffer(extra_params),
+      graph.create_params_buffer(out_params),
+  };
 
   graph.execute_nodes().emplace_back(new DynamicDispatchNode(
       graph,
@@ -635,7 +527,7 @@ void add_conv2d_node(
       // Shader params buffers
       param_buffers,
       // Push Constants
-      push_constants,
+      {},
       // Specialization Constants
       {utils::safe_downcast<int32_t>(groups_val)},
       // Resize Args
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.h b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
new file mode 100644
index 00000000000..f49e7efcfe7
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+ValueRef prepack_biases(
+    ComputeGraph& graph,
+    const ValueRef vref,
+    const ValueRef weight,
+    const bool transposed,
+    const utils::StorageType storage_type,
+    const utils::GPUMemoryLayout memory_layout);
+
+void check_conv_args(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef out);
+
+void conv2d_pw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef out,
+    const bool transposed_val,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val);
+
+void conv2d_dw_impl(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const bool clamp_out,
+    const float out_min_val,
+    const float out_max_val);
+
+void resize_conv2d_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args);
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
new file mode 100644
index 00000000000..f7454b6b93a
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dDw.cpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Convolution.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+//
+// Local copies of Conv2dDW internals, extended with impl_selector support.
+// These mirror the logic in Conv2dDW.cpp but allow forcing a specific tile size
+// variant via the impl_selector string.
+//
+
+struct Conv2dDWParams final {
+  utils::ivec2 overlay_region;
+  int in_group_size;
+};
+
+struct OutputParams final {
+  float out_min;
+  float out_max;
+};
+
+static std::string pick_conv2d_dw_shader_with_selector(
+    ComputeGraph& graph,
+    const ValueRef weight_data,
+    const ValueRef out,
+    const bool stride_equals_dilation,
+    const bool clamp_out,
+    const std::string& impl_selector) {
+  std::string kernel_name = "conv2d_dw";
+  kernel_name.reserve(40);
+
+  const auto& weight_sizes = graph.get_tref(weight_data)->sizes;
+  const bool is_3x3 = weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3;
+  const bool is_5x5 = weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5;
+
+  if (!stride_equals_dilation) {
+    kernel_name += "_sned";
+  }
+
+  if (is_3x3) {
+    kernel_name += "_output_tile_3x3";
+    if (impl_selector == "b1x1") {
+      kernel_name += "_b1x1";
+    } else if (impl_selector == "b4x2") {
+      // b4x2 is the default (no suffix)
+    } else {
+      // Auto-selection: use b1x1 on Mali
+      if (stride_equals_dilation && graph.device_is_mali()) {
+        kernel_name += "_b1x1";
+      }
+    }
+  } else if (is_5x5) {
+    kernel_name += "_output_tile_5x5";
+    // No b1x1 variant for 5x5; impl_selector is ignored for batch size
+  }
+
+  if (clamp_out) {
+    kernel_name += "_clamp";
+  }
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  return kernel_name;
+}
+
+static utils::uvec3 conv2d_dw_global_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)resize_args;
+  const ValueRef out = args.at(0).refs.at(0);
+
+  const bool uses_output_tile =
+      shader.kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const bool is_sned = shader.kernel_name.find("_sned") != std::string::npos;
+    const utils::uvec3 image_extents = graph->create_global_wg_size(out);
+
+    if (is_sned) {
+      return {image_extents[0] * image_extents[1], image_extents[2], 1};
+    }
+
+    uint32_t batch_x = 4u;
+    uint32_t batch_y = 2u;
+    if (shader.kernel_name.find("_b1x1") != std::string::npos) {
+      batch_x = 1u;
+      batch_y = 1u;
+    }
+
+    uint32_t scaled_x = utils::div_up(image_extents[0], batch_x);
+    uint32_t scaled_y = utils::div_up(image_extents[1], batch_y);
+    return {scaled_x * scaled_y, image_extents[2], 1};
+  }
+
+  const utils::uvec3 base_extents = graph->create_global_wg_size(out);
+  return {base_extents[0] * base_extents[1] * base_extents[2], 1, 1};
+}
+
+static utils::uvec3 conv2d_dw_local_wg_size_fn(
+    ComputeGraph* graph,
+    const vkapi::ShaderInfo& shader,
+    const utils::uvec3& global_workgroup_size,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  (void)graph;
+  (void)shader;
+  (void)global_workgroup_size;
+  (void)args;
+  (void)resize_args;
+  return {64, 1, 1};
+}
+
+static ValueRef prepack_dw_weights(ComputeGraph& graph, const ValueRef vref) {
+  const auto original_sizes = graph.sizes_of(vref);
+
+  int64_t out_channels_padded =
+      utils::align_up_4(utils::val_at(-4, original_sizes));
+  int64_t height = utils::val_at(-2, original_sizes);
+  int64_t width = utils::val_at(-1, original_sizes);
+
+  const std::vector<int64_t> final_sizes = {
+      4, out_channels_padded / 4, height * width};
+
+  ValueRef v = graph.add_tensor(
+      final_sizes,
+      graph.dtype_of(vref),
+      utils::kTexture2D,
+      utils::kChannelsPacked);
+
+  std::string kernel_name = "conv2d_dw_prepack_weights";
+  add_dtype_suffix(kernel_name, graph.dtype_of(v));
+  add_dtype_suffix(kernel_name, graph.get_staging_dtype_for(vref));
+
+  const auto original_sizes_pc =
+      utils::make_ivec4(original_sizes, /*reverse=*/true);
+  graph.prepack_nodes().emplace_back(new PrepackNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      graph.create_global_wg_size(v),
+      graph.create_local_wg_size(v),
+      vref,
+      v,
+      {},
+      {graph.packed_dim_of(v)},
+      {graph.sizes_pc_of(v),
+       PushConstantDataInfo(&original_sizes_pc, sizeof(original_sizes_pc))}));
+
+  return v;
+}
+
+static void conv2d_dw_with_selector(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef weight_data,
+    const ValueRef bias,
+    const ValueRef stride,
+    const ValueRef padding,
+    const ValueRef dilation,
+    const ValueRef out,
+    const std::string& impl_selector) {
+  ValueRef arg_weight = prepack_dw_weights(graph, weight_data);
+  ValueRef arg_bias = prepack_biases(
+      graph,
+      bias,
+      weight_data,
+      /*transposed=*/false,
+      /*storage_type=*/utils::kTexture2D,
+      /*memory_layout=*/utils::kWidthPacked);
+
+  check_conv_args(graph, in, out);
+
+  Kernel2dParams kernel_params = create_kernel2d_params(
+      graph,
+      weight_data,
+      /*kernel_size_only=*/false,
+      stride,
+      padding,
+      dilation);
+
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
+  const auto& overlay_region = utils::make_ivec2({
+      kernel_params.kernel_size[0] +
+          (kernel_params.kernel_size[0] - 1) * (kernel_params.dilation[0] - 1),
+      kernel_params.kernel_size[1] +
+          (kernel_params.kernel_size[1] - 1) * (kernel_params.dilation[1] - 1),
+  });
+  const auto weight_sizes = graph.sizes_of(weight_data);
+  const int32_t in_group_size =
+      utils::safe_downcast<int32_t>(utils::align_up_4(weight_sizes.at(1)));
+  Conv2dDWParams extra_params = {overlay_region, in_group_size};
+
+  OutputParams out_params = {
+      std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()};
+
+  std::string kernel_name = pick_conv2d_dw_shader_with_selector(
+      graph,
+      weight_data,
+      out,
+      stride_equals_dilation,
+      /*clamp_out=*/false,
+      impl_selector);
+
+  vkapi::ShaderInfo shader = VK_KERNEL_FROM_STR(kernel_name);
+
+  vkapi::ParamsBindList param_buffers;
+  std::vector<PushConstantDataInfo> push_constants;
+
+  const bool uses_output_tile =
+      kernel_name.find("_output_tile") != std::string::npos;
+
+  if (uses_output_tile) {
+    const utils::ivec4 kernel_param_size_stride = {
+        kernel_params.kernel_size[0],
+        kernel_params.kernel_size[1],
+        kernel_params.stride[0],
+        kernel_params.stride[1]};
+
+    const utils::ivec4 kernel_param_pad_dial = {
+        kernel_params.padding[0],
+        kernel_params.padding[1],
+        kernel_params.dilation[0],
+        kernel_params.dilation[1]};
+
+    push_constants = {
+        graph.logical_limits_pc_of(out),
+        graph.sizes_pc_of(in),
+        PushConstantDataInfo(
+            &kernel_param_size_stride, sizeof(kernel_param_size_stride)),
+        PushConstantDataInfo(
+            &kernel_param_pad_dial, sizeof(kernel_param_pad_dial)),
+        PushConstantDataInfo(
+            &extra_params, sizeof(extra_params), sizeof(utils::ivec4)),
+        PushConstantDataInfo(&out_params, sizeof(out_params)),
+    };
+  } else {
+    param_buffers = {
+        graph.logical_limits_ubo(out),
+        graph.sizes_ubo(in),
+        graph.create_params_buffer(kernel_params),
+        graph.create_params_buffer(extra_params),
+        graph.create_params_buffer(out_params),
+    };
+  }
+
+  ValueRef transposed_ref = graph.add_scalar(false);
+  ValueRef output_padding = graph.add_none();
+
+  graph.execute_nodes().emplace_back(new DynamicDispatchNode(
+      graph,
+      shader,
+      conv2d_dw_global_wg_size_fn,
+      conv2d_dw_local_wg_size_fn,
+      {{out, vkapi::kWrite}, {{in, arg_weight, arg_bias}, vkapi::kRead}},
+      param_buffers,
+      push_constants,
+      {},
+      {weight_data, stride, padding, dilation, transposed_ref, output_padding},
+      resize_conv2d_node));
+}
+
+void test_conv2d_dw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[0] = input [N, C, H, W]
+  // args[1] = weight [C, 1, K_h, K_w] (constant)
+  // args[2] = bias (constant, or none)
+  // args[3] = stride_h (int)
+  // args[4] = stride_w (int)
+  // args[5] = padding_h (int)
+  // args[6] = padding_w (int)
+  // args[7] = dilation_h (int)
+  // args[8] = dilation_w (int)
+  // args[9] = impl_selector (string)
+  // args[10] = output
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+  const int64_t stride_h = graph.extract_scalar<int64_t>(args.at(3));
+  const int64_t stride_w = graph.extract_scalar<int64_t>(args.at(4));
+  const int64_t padding_h = graph.extract_scalar<int64_t>(args.at(5));
+  const int64_t padding_w = graph.extract_scalar<int64_t>(args.at(6));
+  const int64_t dilation_h = graph.extract_scalar<int64_t>(args.at(7));
+  const int64_t dilation_w = graph.extract_scalar<int64_t>(args.at(8));
+  const std::string impl_selector = graph.extract_string(args.at(9));
+  const ValueRef out = args.at(10);
+
+  ValueRef stride =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{stride_h, stride_w});
+  ValueRef padding = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{padding_h, padding_w});
+  ValueRef dilation = graph.add_scalar_list<int64_t>(
+      std::vector<int64_t>{dilation_h, dilation_w});
+
+  if (impl_selector.empty()) {
+    // Auto-selection: delegate to aten.convolution.default
+    const int64_t channels = graph.sizes_of(input).at(1);
+    ValueRef transposed = graph.add_scalar<bool>(false);
+    ValueRef output_padding =
+        graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+    ValueRef groups = graph.add_scalar<int64_t>(channels);
+
+    VK_GET_OP_FN("aten.convolution.default")
+    (graph,
+     {input,
+      weight,
+      bias,
+      stride,
+      padding,
+      dilation,
+      transposed,
+      output_padding,
+      groups,
+      out});
+  } else {
+    // Forced variant: build the dispatch directly with impl_selector
+    conv2d_dw_with_selector(
+        graph,
+        input,
+        weight,
+        bias,
+        stride,
+        padding,
+        dilation,
+        out,
+        impl_selector);
+  }
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv2d_dw.default, test_conv2d_dw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp
new file mode 100644
index 00000000000..7c640d7a1ac
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/impl/TestConv2dPw.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+
+namespace vkcompute {
+
+void test_conv2d_pw(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // args[0] = input [N, C_in, H, W]
+  // args[1] = weight [C_out, C_in, 1, 1] (constant)
+  // args[2] = bias (constant, or none)
+  // args[3] = impl_selector (string)
+  // args[4] = output [N, C_out, H, W]
+  const ValueRef input = args.at(0);
+  const ValueRef weight = args.at(1);
+  const ValueRef bias = args.at(2);
+  const ValueRef impl_selector_str = args.at(3);
+  const ValueRef out = args.at(4);
+
+  std::string impl_selector = graph.extract_string(impl_selector_str);
+  (void)impl_selector; // Reserved for future use
+
+  // Create fixed pointwise conv parameters
+  ValueRef stride = graph.add_scalar_list<int64_t>(std::vector<int64_t>{1, 1});
+  ValueRef padding = graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+  ValueRef dilation =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{1, 1});
+  ValueRef transposed = graph.add_scalar<bool>(false);
+  ValueRef output_padding =
+      graph.add_scalar_list<int64_t>(std::vector<int64_t>{0, 0});
+  ValueRef groups = graph.add_scalar<int64_t>(1);
+
+  // Call aten.convolution.default with all 10 args:
+  // input, weight, bias, stride, padding, dilation, transposed,
+  // output_padding, groups, output
+  VK_GET_OP_FN("aten.convolution.default")
+  (graph,
+   {input,
+    weight,
+    bias,
+    stride,
+    padding,
+    dilation,
+    transposed,
+    output_padding,
+    groups,
+    out});
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(test_etvk.test_conv2d_pw.default, test_conv2d_pw);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl
index fef8994718f..84432bce30b 100644
--- a/backends/vulkan/test/custom_ops/targets.bzl
+++ b/backends/vulkan/test/custom_ops/targets.bzl
@@ -100,3 +100,5 @@ def define_common_targets(is_fbcode = False):
     define_custom_op_test_binary("test_q8ta_linear")
     define_custom_op_test_binary("test_q8ta_conv2d_transposed")
     define_custom_op_test_binary("test_mm")
+    define_custom_op_test_binary("test_conv2d_pw")
+    define_custom_op_test_binary("test_conv2d_dw")
diff --git a/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
new file mode 100644
index 00000000000..9c48c320d62
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d_dw.cpp
@@ -0,0 +1,435 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "conv2d_utils.h"
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct InputDims {
+  int64_t N;
+  int64_t C;
+  int64_t H;
+  int64_t W;
+
+  InputDims(int64_t n, int64_t c, int64_t h, int64_t w)
+      : N(n), C(c), H(h), W(w) {}
+};
+
+struct Conv2dDwConfig {
+  InputDims dims;
+  KernelSize kernel;
+  Stride stride;
+  Padding padding;
+  Dilation dilation;
+  bool has_bias;
+};
+
+static int64_t calc_out_size(
+    int64_t in_size,
+    int64_t kernel_size,
+    int64_t stride,
+    int64_t padding,
+    int64_t dilation) {
+  return (in_size + 2 * padding - dilation * (kernel_size - 1) - 1) / stride +
+      1;
+}
+
+static TestCase create_conv2d_dw_test_case(
+    const Conv2dDwConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout,
+    const std::string& impl_selector = "") {
+  TestCase test_case;
+
+  bool is_perf = config.dims.C > kRefDimSizeLimit ||
+      config.dims.H > kRefDimSizeLimit || config.dims.W > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string layout_str = layout_abbrev(memory_layout);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  int64_t H_out = calc_out_size(
+      config.dims.H,
+      config.kernel.h,
+      config.stride.h,
+      config.padding.h,
+      config.dilation.h);
+  int64_t W_out = calc_out_size(
+      config.dims.W,
+      config.kernel.w,
+      config.stride.w,
+      config.padding.w,
+      config.dilation.w);
+
+  std::string shape = "[" + std::to_string(config.dims.N) + "," +
+      std::to_string(config.dims.C) + "," + std::to_string(config.dims.H) +
+      "," + std::to_string(config.dims.W) + "] k" +
+      std::to_string(config.kernel.h) + "x" + std::to_string(config.kernel.w) +
+      " s" + std::to_string(config.stride.h) + " p" +
+      std::to_string(config.padding.h) + " d" +
+      std::to_string(config.dilation.h) + "->[" +
+      std::to_string(config.dims.N) + "," + std::to_string(config.dims.C) +
+      "," + std::to_string(H_out) + "," + std::to_string(W_out) + "]";
+
+  std::string selector_str =
+      impl_selector.empty() ? "" : " [" + impl_selector + "]";
+
+  std::string name = prefix + "  conv2d_dw" + bias_str + " " + shape + "  " +
+      storage_str + "(" + layout_str + ") " + dtype_str + selector_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv2d_dw.default");
+
+  // Input tensor [N, C, H, W]
+  ValueSpec input(
+      {config.dims.N, config.dims.C, config.dims.H, config.dims.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+
+  // Weight tensor [C, 1, K_h, K_w] - constant
+  ValueSpec weight(
+      {config.dims.C, 1, config.kernel.h, config.kernel.w},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  // Bias (or none)
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.dims.C},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // stride_h, stride_w, padding_h, padding_w, dilation_h, dilation_w
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.stride.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.padding.w)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.h)));
+  test_case.add_input_spec(ValueSpec(static_cast<int32_t>(config.dilation.w)));
+
+  // impl_selector string
+  test_case.add_input_spec(ValueSpec::make_string(impl_selector));
+
+  // Output tensor [N, C, H_out, W_out]
+  ValueSpec output(
+      {config.dims.N, config.dims.C, H_out, W_out},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation for depthwise conv2d
+static void conv2d_dw_reference_impl(TestCase& test_case) {
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto input_sizes = input.get_tensor_sizes();
+  auto weight_sizes = weight.get_tensor_sizes();
+  auto output_sizes = output.get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C = input_sizes[1];
+  int64_t H_in = input_sizes[2];
+  int64_t W_in = input_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+
+  int64_t stride_h = test_case.inputs()[3].get_int_value();
+  int64_t stride_w = test_case.inputs()[4].get_int_value();
+  int64_t padding_h = test_case.inputs()[5].get_int_value();
+  int64_t padding_w = test_case.inputs()[6].get_int_value();
+  int64_t dilation_h = test_case.inputs()[7].get_int_value();
+  int64_t dilation_w = test_case.inputs()[8].get_int_value();
+
+  auto& input_data = input.get_float_data();
+  auto& weight_data = weight.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C * H_out * W_out, 0.0f);
+
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t c = 0; c < C; ++c) {
+      for (int64_t oh = 0; oh < H_out; ++oh) {
+        for (int64_t ow = 0; ow < W_out; ++ow) {
+          float sum = 0.0f;
+          for (int64_t kh = 0; kh < K_h; ++kh) {
+            for (int64_t kw = 0; kw < K_w; ++kw) {
+              int64_t ih = oh * stride_h - padding_h + kh * dilation_h;
+              int64_t iw = ow * stride_w - padding_w + kw * dilation_w;
+              if (ih >= 0 && ih < H_in && iw >= 0 && iw < W_in) {
+                float in_val = input_data
+                    [n * (C * H_in * W_in) + c * (H_in * W_in) + ih * W_in +
+                     iw];
+                // weight is [C, 1, K_h, K_w]
+                float w_val = weight_data[c * (K_h * K_w) + kh * K_w + kw];
+                sum += in_val * w_val;
+              }
+            }
+          }
+          if (!bias_spec.is_none()) {
+            auto& bias_data = bias_spec.get_float_data();
+            sum += bias_data[c];
+          }
+          ref_data
+              [n * (C * H_out * W_out) + c * (H_out * W_out) + oh * W_out +
+               ow] = sum;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_dw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (small enough for float reference validation)
+  std::vector<Conv2dDwConfig> accuracy_configs = {
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       true},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 8, 16, 16),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // Non-multiple-of-4 channels
+      {InputDims(1, 11, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 3, 16, 16),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // EdgeTAM depthwise shapes (from profiling data)
+  std::vector<Conv2dDwConfig> perf_configs = {
+      // Backbone stem and early stages
+      {InputDims(1, 24, 512, 512),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 48, 256, 256),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 192, 64, 64),
+       KernelSize(3, 3),
+       Stride(2, 2),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 384, 32, 32),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      // 5x5 kernels
+      {InputDims(1, 48, 256, 256),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 96, 128, 128),
+       KernelSize(5, 5),
+       Stride(1, 1),
+       Padding(2, 2),
+       Dilation(1, 1),
+       false},
+      // FPN/Neck
+      {InputDims(1, 256, 256, 256),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+      {InputDims(1, 256, 128, 128),
+       KernelSize(3, 3),
+       Stride(1, 1),
+       Padding(1, 1),
+       Dilation(1, 1),
+       false},
+  };
+
+  // Generate accuracy test cases (float only)
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(
+          create_conv2d_dw_test_case(config, vkapi::kFloat, st, layout));
+    }
+  }
+
+  // Generate performance test cases (float and half)
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        // Auto-selection (empty impl_selector)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout));
+
+        // Force b4x2 variant
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b4x2"));
+
+        // Force b1x1 variant (only for 3x3 kernels; for 5x5 it falls back
+        // to default, but we still generate it to test the fallback path)
+        test_cases.push_back(
+            create_conv2d_dw_test_case(config, dtype, st, layout, "b1x1"));
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_dw_flop_calculator(const TestCase& test_case) {
+  auto input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto weight_sizes = test_case.inputs()[1].get_tensor_sizes();
+  auto output_sizes = test_case.outputs()[0].get_tensor_sizes();
+
+  int64_t N = output_sizes[0];
+  int64_t C = output_sizes[1];
+  int64_t H_out = output_sizes[2];
+  int64_t W_out = output_sizes[3];
+  int64_t K_h = weight_sizes[2];
+  int64_t K_w = weight_sizes[3];
+
+  // Each output element: K_h * K_w multiplies + (K_h * K_w - 1) adds
+  return 2 * N * C * H_out * W_out * K_h * K_w;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_dw_reference_impl(test_case);
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Depthwise Conv2d Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv2d_dw_test_cases,
+      conv2d_dw_flop_calculator,
+      "Conv2dDW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}
diff --git a/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp
new file mode 100644
index 00000000000..1e8eec2a6c2
--- /dev/null
+++ b/backends/vulkan/test/custom_ops/test_conv2d_pw.cpp
@@ -0,0 +1,268 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <iostream>
+#include <vector>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+#include "utils.h"
+
+using namespace executorch::vulkan::prototyping;
+using namespace vkcompute;
+
+static constexpr int64_t kRefDimSizeLimit = 64;
+
+struct Conv2dPwConfig {
+  int64_t N;
+  int64_t C_in;
+  int64_t C_out;
+  int64_t H;
+  int64_t W;
+  bool has_bias;
+};
+
+static TestCase create_conv2d_pw_test_case(
+    const Conv2dPwConfig& config,
+    vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
+    utils::GPUMemoryLayout memory_layout) {
+  TestCase test_case;
+
+  bool is_perf = config.C_in > kRefDimSizeLimit ||
+      config.C_out > kRefDimSizeLimit || config.H > kRefDimSizeLimit ||
+      config.W > kRefDimSizeLimit;
+
+  std::string prefix = is_perf ? "PERF" : "ACCU";
+  std::string storage_str = storage_type_abbrev(storage_type);
+  std::string layout_str = layout_abbrev(memory_layout);
+  std::string dtype_str = (dtype == vkapi::kHalf) ? "f16" : "f32";
+  std::string bias_str = config.has_bias ? "+bias" : "";
+
+  std::string shape = "[" + std::to_string(config.N) + "," +
+      std::to_string(config.C_in) + "," + std::to_string(config.H) + "," +
+      std::to_string(config.W) + "]->[" + std::to_string(config.N) + "," +
+      std::to_string(config.C_out) + "," + std::to_string(config.H) + "," +
+      std::to_string(config.W) + "]";
+
+  std::string name = prefix + "  conv2d_pw" + bias_str + " " + shape + "  " +
+      storage_str + "(" + layout_str + ") " + dtype_str;
+
+  test_case.set_name(name);
+  test_case.set_operator_name("test_etvk.test_conv2d_pw.default");
+
+  // Input tensor [N, C_in, H, W]
+  ValueSpec input(
+      {config.N, config.C_in, config.H, config.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+
+  // Weight tensor [C_out, C_in, 1, 1] - constant
+  ValueSpec weight(
+      {config.C_out, config.C_in, 1, 1},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::RANDOM);
+  weight.set_constant(true);
+
+  test_case.add_input_spec(input);
+  test_case.add_input_spec(weight);
+
+  // Bias (or none)
+  if (config.has_bias) {
+    ValueSpec bias(
+        {config.C_out},
+        dtype,
+        storage_type,
+        memory_layout,
+        DataGenType::RANDOM);
+    bias.set_constant(true);
+    test_case.add_input_spec(bias);
+  } else {
+    ValueSpec none_bias(static_cast<int32_t>(0));
+    none_bias.set_none(true);
+    test_case.add_input_spec(none_bias);
+  }
+
+  // impl_selector
+  ValueSpec impl_selector_spec = ValueSpec::make_string("default");
+  test_case.add_input_spec(impl_selector_spec);
+
+  // Output tensor [N, C_out, H, W]
+  ValueSpec output(
+      {config.N, config.C_out, config.H, config.W},
+      dtype,
+      storage_type,
+      memory_layout,
+      DataGenType::ZEROS);
+  test_case.add_output_spec(output);
+
+  if (dtype == vkapi::kHalf) {
+    test_case.set_abs_tolerance(1e-1f);
+    test_case.set_rel_tolerance(1e-2f);
+  } else {
+    test_case.set_abs_tolerance(1e-3f);
+    test_case.set_rel_tolerance(1e-3f);
+  }
+
+  test_case.set_shader_filter({"nchw_to", "to_nchw", "view_copy"});
+
+  return test_case;
+}
+
+// Reference implementation: pointwise conv2d is essentially a matmul
+// output[n][c_out][h][w] = bias[c_out] +
+//   sum_over_c_in(input[n][c_in][h][w] * weight[c_out][c_in][0][0])
+static void conv2d_pw_reference_impl(TestCase& test_case) {
+  // input[0], weight[1], bias[2], impl_selector[3]
+  const ValueSpec& input = test_case.inputs()[0];
+  const ValueSpec& weight = test_case.inputs()[1];
+  const ValueSpec& bias_spec = test_case.inputs()[2];
+  ValueSpec& output = test_case.outputs()[0];
+
+  if (input.dtype != vkapi::kFloat) {
+    throw std::invalid_argument("Reference only supports float");
+  }
+
+  auto input_sizes = input.get_tensor_sizes();
+  auto weight_sizes = weight.get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t H = input_sizes[2];
+  int64_t W = input_sizes[3];
+  int64_t C_out = weight_sizes[0];
+
+  auto& input_data = input.get_float_data();
+  auto& weight_data = weight.get_float_data();
+  auto& ref_data = output.get_ref_float_data();
+  ref_data.resize(N * C_out * H * W, 0.0f);
+
+  for (int64_t n = 0; n < N; ++n) {
+    for (int64_t co = 0; co < C_out; ++co) {
+      for (int64_t h = 0; h < H; ++h) {
+        for (int64_t w = 0; w < W; ++w) {
+          float sum = 0.0f;
+          for (int64_t ci = 0; ci < C_in; ++ci) {
+            float in_val =
+                input_data[n * (C_in * H * W) + ci * (H * W) + h * W + w];
+            // weight is [C_out, C_in, 1, 1]
+            float w_val = weight_data[co * C_in + ci];
+            sum += in_val * w_val;
+          }
+          if (!bias_spec.is_none()) {
+            auto& bias_data = bias_spec.get_float_data();
+            sum += bias_data[co];
+          }
+          ref_data[n * (C_out * H * W) + co * (H * W) + h * W + w] = sum;
+        }
+      }
+    }
+  }
+}
+
+static std::vector<TestCase> generate_conv2d_pw_test_cases() {
+  std::vector<TestCase> test_cases;
+
+  // Conv2d shaders are texture-only and require channels-packed layout
+  std::vector<utils::StorageType> storage_types = {utils::kTexture3D};
+  utils::GPUMemoryLayout layout = utils::kChannelsPacked;
+
+  // Accuracy shapes (small enough for float reference validation)
+  std::vector<Conv2dPwConfig> accuracy_configs = {
+      {1, 16, 32, 8, 8, false},
+      {1, 32, 16, 8, 8, false},
+      {1, 16, 32, 8, 8, true},
+      {1, 48, 96, 16, 16, false},
+      {1, 96, 48, 16, 16, false},
+      // Non-multiple-of-4 channels
+      {1, 13, 27, 8, 8, false},
+      {1, 33, 17, 8, 8, false},
+  };
+
+  // EdgeTAM performance shapes
+  std::vector<Conv2dPwConfig> perf_configs = {
+      // EdgeTAM backbone stages
+      {1, 48, 96, 256, 256, false},
+      {1, 96, 48, 256, 256, false},
+      {1, 96, 192, 128, 128, false},
+      {1, 192, 96, 128, 128, false},
+      {1, 192, 384, 64, 64, false},
+      {1, 384, 192, 64, 64, false},
+      {1, 384, 768, 32, 32, false},
+      {1, 768, 384, 32, 32, false},
+      // EdgeTAM FPN/Neck
+      {1, 48, 256, 256, 256, false},
+      {1, 256, 32, 256, 256, false},
+      {1, 96, 256, 128, 128, false},
+      {1, 256, 64, 128, 128, false},
+  };
+
+  // Generate accuracy test cases (float only)
+  for (const auto& config : accuracy_configs) {
+    for (auto st : storage_types) {
+      test_cases.push_back(
+          create_conv2d_pw_test_case(config, vkapi::kFloat, st, layout));
+    }
+  }
+
+  // Generate performance test cases (float and half)
+  for (const auto& config : perf_configs) {
+    std::vector<vkapi::ScalarType> dtypes = {vkapi::kFloat, vkapi::kHalf};
+    for (auto dtype : dtypes) {
+      for (auto st : storage_types) {
+        test_cases.push_back(
+            create_conv2d_pw_test_case(config, dtype, st, layout));
+      }
+    }
+  }
+
+  return test_cases;
+}
+
+static int64_t conv2d_pw_flop_calculator(const TestCase& test_case) {
+  auto input_sizes = test_case.inputs()[0].get_tensor_sizes();
+  auto weight_sizes = test_case.inputs()[1].get_tensor_sizes();
+
+  int64_t N = input_sizes[0];
+  int64_t C_in = input_sizes[1];
+  int64_t H = input_sizes[2];
+  int64_t W = input_sizes[3];
+  int64_t C_out = weight_sizes[0];
+
+  return 2 * N * C_out * C_in * H * W;
+}
+
+static void reference_impl(TestCase& test_case) {
+  conv2d_pw_reference_impl(test_case);
+}
+
+int main(int argc, char* argv[]) {
+  set_debugging(false);
+  set_print_output(false);
+  set_print_latencies(false);
+  set_use_gpu_timestamps(true);
+
+  print_performance_header();
+  std::cout << "Pointwise Conv2d (1x1) Benchmark" << std::endl;
+  print_separator();
+
+  ReferenceComputeFunc ref_fn = reference_impl;
+
+  auto results = execute_test_cases(
+      generate_conv2d_pw_test_cases,
+      conv2d_pw_flop_calculator,
+      "Conv2dPW",
+      3,
+      10,
+      ref_fn);
+
+  return 0;
+}