From 9a8fc97820872aff3677863a7a3fef169da86174 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 13 Jan 2025 09:24:05 -0800
Subject: [PATCH 1/3] [ET-VK] Fixing conv2d dw incorrect output when stride !=
 dilation issue.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7595

This diff turns the current conv2d dw implementation in the Vulkan backend of ExecuTorch into a special case that is used only when stride equals dilation, since that is the only time this kind of caching is possible.

If stride does not equal dilation, the old implementation is used.

Additional test cases are added to ensure computation is correct when stride != dilation.
ghstack-source-id: 261183385
@exported-using-ghexport

Differential Revision: [D67908916](https://our.internmc.facebook.com/intern/diff/D67908916/)
---
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 43 +++++++++++++++++++
 .../graph/ops/glsl/conv2d_dw_output_tile.yaml | 13 ++++++
 .../runtime/graph/ops/impl/Convolution.cpp    | 43 +++++++++++++++----
 backends/vulkan/test/op_tests/cases.py        | 33 ++++++++++++++
 4 files changed, 123 insertions(+), 9 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index ad4ff245a17..cd385718ce0 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -14,6 +14,8 @@
 
 #define TILE_SIZE ${TILE_SIZE}
 
+#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION}
+
 #define BATCH_SIZE_X ${BATCH_SIZE_X}
 
 #define BATCH_SIZE_Y ${BATCH_SIZE_Y}
@@ -40,6 +42,8 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * Computes a depthwise convolution. Each shader invocation calculates the
  * output at a single output location.
  */
+
+#if STRIDE_EQ_DILATION
 void main() {
   // x and y are divided by batch size to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
@@ -121,3 +125,42 @@ void main() {
     }
   }
 }
+
+#else
+void main() {
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
+  const ivec3 pos = ivec3(
+    gl_GlobalInvocationID.x % out_limits.x,
+    div_by_x % out_limits.y,
+    div_by_x / out_limits.y);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  // Compute the index of the top-left element of the overlay region. Negative
+  // indices indicate that the top-left element is in a region added by padding.
+  const ivec2 ipos = pos.xy * stride - padding;
+
+  // Compute the start and end of the input indices to load. Padding is assumed
+  // to be constant 0 padding, so any reads from the padding region is skipped.
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + overlay_region.xy;
+
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
+  int kx = 0;
+  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
+    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
+      // The weight kernel was rearranged such that every NxN filter is
+      // flattened to fit in one row. Each filter was then stacked on top of
+      // each other vertically.
+      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
+      kx++;
+    }
+  }
+
+  imageStore(t_out, pos, op(sum, out_min, out_max));
+}
+
+#endif
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index 9cf6c22c6ca..d3672f5ec2e 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -12,6 +12,7 @@ conv2d_dw_output_tile:
     TILE_SIZE: 3
     BATCH_SIZE_X: 4
     BATCH_SIZE_Y: 2
+    STRIDE_EQ_DILATION: 0
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -25,3 +26,15 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
+    - NAME: conv2d_dw_sed_output_tile_3x3
+      STRIDE_EQ_DILATION: 1
+    - NAME: conv2d_dw_sed_output_tile_3x3_clamp
+      OPERATOR: clamp(X, A, B)
+      STRIDE_EQ_DILATION: 1
+    - NAME: conv2d_dw_sed_output_tile_5x5
+      TILE_SIZE: 5
+      STRIDE_EQ_DILATION: 1
+    - NAME: conv2d_dw_sed_output_tile_5x5_clamp
+      OPERATOR: clamp(X, A, B)
+      TILE_SIZE: 5
+      STRIDE_EQ_DILATION: 1
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index 64c145fb7e7..a7c11cc8535 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -126,13 +126,17 @@ vkapi::ShaderInfo get_conv2d_shader(
     const bool prepack_weights,
     const Conv2dMethod method,
     const ValueRef weight,
-    const bool clamp_out = false) {
+    const bool clamp_out = false,
+    const bool stride_equals_dilation = false) {
   std::string kernel_name;
   kernel_name.reserve(kShaderNameReserve);
   switch (method) {
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
       if (!prepack_weights) {
+        if (stride_equals_dilation) {
+          kernel_name += "_sed";
+        }
         const auto& weight_sizes = graph.get_tref(weight)->sizes;
         if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {
           kernel_name += "_output_tile_3x3";
@@ -286,22 +290,37 @@ Conv2dMethod get_conv2d_method(
   return Conv2dMethod::SlidingWindow;
 }
 
+utils::uvec2 get_conv2d_dw_dispatch_divisor( // {x, y} divisors for dw dispatch
+    const std::vector<int64_t>& weight_sizes) {
+  if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) { // 3x3 kernel
+    return {4u, 2u}; // same as BATCH_SIZE_X / BATCH_SIZE_Y in the shader yaml
+  }
+  if (weight_sizes.at(2) == 5 && weight_sizes.at(3) == 5) { // 5x5 kernel
+    return {4u, 2u}; // the 5x5 variants do not override the batch sizes
+  }
+  return {4u, 2u}; // NOTE(review): every branch returns the same value today
+}
+
 utils::uvec3 create_conv2d_global_wg_size(
     ComputeGraph& graph,
     const Conv2dMethod method,
-    const ValueRef out) {
+    const ValueRef out,
+    const ValueRef weight_data,
+    const bool stride_equals_dilation) {
   if (method == Conv2dMethod::Pointwise) {
     const utils::uvec3 image_extents = graph.logical_limits_of(out);
     return {
         utils::div_up(image_extents[0u], 2u),
         utils::div_up(image_extents[1u], 2u),
         image_extents[2u]};
-  } else if (method == Conv2dMethod::Depthwise) {
-    const utils::uvec3 image_extents = graph.logical_limits_of(out);
+  } else if (method == Conv2dMethod::Depthwise && stride_equals_dilation) {
+    const utils::uvec3 image_extents = graph.create_global_wg_size(out);
+    const utils::uvec2 div =
+        get_conv2d_dw_dispatch_divisor(graph.get_tref(weight_data)->sizes);
     return {
-        utils::div_up(image_extents[0u], 4u),
-        utils::div_up(image_extents[1u], 2u),
-        image_extents[2u]};
+        utils::div_up(image_extents[0], div[0]),
+        utils::div_up(image_extents[1], div[1]),
+        image_extents[2]};
   } else {
     return graph.create_global_wg_size(out);
   }
@@ -364,6 +383,10 @@ void add_conv2d_node(
   Conv2dParams extra_params =
       create_conv2d_params(graph, weight_data, kernel_params, transposed_val);
 
+  const bool stride_equals_dilation =
+      (kernel_params.stride[0] == kernel_params.dilation[0] &&
+       kernel_params.stride[1] == kernel_params.dilation[1]);
+
   OutputParams out_params = {out_min_val, out_max_val};
 
   check_conv2d_params(kernel_params, transposed_val);
@@ -374,9 +397,11 @@ void add_conv2d_node(
       /*prepack_weights = */ false,
       method,
       weight_data,
-      clamp_out);
+      clamp_out,
+      stride_equals_dilation);
 
-  utils::uvec3 wg_size = create_conv2d_global_wg_size(graph, method, out);
+  utils::uvec3 wg_size = create_conv2d_global_wg_size(
+      graph, method, out, weight_data, stride_equals_dilation);
 
   if (method == Conv2dMethod::Pointwise || method == Conv2dMethod::Depthwise) {
     wg_size = {wg_size[0] * wg_size[1] * wg_size[2], 1, 1};
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
index 85732d77011..d32fa715734 100644
--- a/backends/vulkan/test/op_tests/cases.py
+++ b/backends/vulkan/test/op_tests/cases.py
@@ -348,6 +348,39 @@ def get_conv_inputs():
                 [0, 0],
                 1,
             ),
+            (
+                (1, 4, 234, 234),
+                (4, 1, 3, 3),
+                (4,),
+                [2, 1],
+                [1, 1],
+                [1, 1],
+                False,
+                [0, 0],
+                4,
+            ),
+            (
+                (1, 4, 234, 234),
+                (4, 1, 3, 3),
+                (4,),
+                [1, 2],
+                [1, 1],
+                [1, 1],
+                False,
+                [0, 0],
+                4,
+            ),
+            (
+                (1, 4, 234, 234),
+                (4, 1, 3, 3),
+                (4,),
+                [2, 2],
+                [1, 1],
+                [1, 1],
+                False,
+                [0, 0],
+                4,
+            ),
         ]
     )
     return test_suite

From f23e17f787f906259f923fe7dadb337eb3880c40 Mon Sep 17 00:00:00 2001
From: Vivek Trivedi <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 13 Jan 2025 09:24:06 -0800
Subject: [PATCH 2/3] [ET-VK] Making stride equals dilation the default mode
 for conv2d dw.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7596

This diff makes stride equals dilation the default mode for the conv2d dw output tile op.
It adds a separate source file to handle the case where stride does not equal dilation.
ghstack-source-id: 261183386

Differential Revision: [D67979760](https://our.internmc.facebook.com/intern/diff/D67979760/)
---
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 42 -----------
 .../graph/ops/glsl/conv2d_dw_output_tile.yaml | 13 ----
 .../ops/glsl/conv2d_dw_sned_output_tile.glsl  | 74 +++++++++++++++++++
 .../ops/glsl/conv2d_dw_sned_output_tile.yaml  | 25 +++++++
 .../runtime/graph/ops/impl/Convolution.cpp    |  4 +-
 5 files changed, 101 insertions(+), 57 deletions(-)
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
 create mode 100644 backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index cd385718ce0..86e1e037261 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -14,8 +14,6 @@
 
 #define TILE_SIZE ${TILE_SIZE}
 
-#define STRIDE_EQ_DILATION ${STRIDE_EQ_DILATION}
-
 #define BATCH_SIZE_X ${BATCH_SIZE_X}
 
 #define BATCH_SIZE_Y ${BATCH_SIZE_Y}
@@ -43,7 +41,6 @@ layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
  * output at a single output location.
  */
 
-#if STRIDE_EQ_DILATION
 void main() {
   // x and y are divided by batch size to determine 3d position
   // since work size is calculated by x * ((y + B_Y - 1) / B_Y) * z
@@ -125,42 +122,3 @@ void main() {
     }
   }
 }
-
-#else
-void main() {
-  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    div_by_x % out_limits.y,
-    div_by_x / out_limits.y);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // Compute the index of the top-left element of the overlay region. Negative
-  // indices indicate that the top-left element is in a region added by padding.
-  const ivec2 ipos = pos.xy * stride - padding;
-
-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
-  const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
-
-  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
-  int kx = 0;
-  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
-    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
-      // The weight kernel was rearranged such that every NxN filter is
-      // flattened to fit in one row. Each filter was then stacked on top of
-      // each other vertically.
-      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
-      kx++;
-    }
-  }
-
-  imageStore(t_out, pos, op(sum, out_min, out_max));
-}
-
-#endif
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
index d3672f5ec2e..9cf6c22c6ca 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.yaml
@@ -12,7 +12,6 @@ conv2d_dw_output_tile:
     TILE_SIZE: 3
     BATCH_SIZE_X: 4
     BATCH_SIZE_Y: 2
-    STRIDE_EQ_DILATION: 0
   generate_variant_forall:
     DTYPE:
       - VALUE: half
@@ -26,15 +25,3 @@ conv2d_dw_output_tile:
     - NAME: conv2d_dw_output_tile_5x5_clamp
       OPERATOR: clamp(X, A, B)
       TILE_SIZE: 5
-    - NAME: conv2d_dw_sed_output_tile_3x3
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_3x3_clamp
-      OPERATOR: clamp(X, A, B)
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_5x5
-      TILE_SIZE: 5
-      STRIDE_EQ_DILATION: 1
-    - NAME: conv2d_dw_sed_output_tile_5x5_clamp
-      OPERATOR: clamp(X, A, B)
-      TILE_SIZE: 5
-      STRIDE_EQ_DILATION: 1
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
new file mode 100644
index 00000000000..d0fc6707bff
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.glsl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+#define TILE_SIZE ${TILE_SIZE}
+
+#define op(X, A, B) ${OPERATOR}
+
+#include "indexing_utils.h"
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "t_kernel", DTYPE, "texture2d")}
+${layout_declare_tensor(3, "r", "t_bias", DTYPE, "texture2d")}
+${layout_declare_ubo(4, "ivec3", "out_limits")}
+${layout_declare_ubo(5, "ivec4", "in_sizes")}
+${layout_declare_ubo(6, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
+${layout_declare_ubo(7, "ivec2", "overlay_region", "int", "in_group_size")}
+${layout_declare_ubo(8, "float", "out_min", "float", "out_max")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+/*
+ * Computes a depthwise convolution. Each shader invocation calculates the
+ * output at a single output location.
+ */
+
+void main() {
+  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
+  const ivec3 pos = ivec3( // output texel position unpacked from flat 1D id
+    gl_GlobalInvocationID.x % out_limits.x,
+    div_by_x % out_limits.y,
+    div_by_x / out_limits.y);
+
+  if (any(greaterThanEqual(pos, out_limits))) { // past the output extents
+    return;
+  }
+
+  // Input coordinate of the window's top-left tap for this output position.
+  // Negative values mean the tap lies in the region added by padding.
+  const ivec2 ipos = pos.xy * stride - padding;
+
+  // Window bounds in input coordinates. NOTE(review): `end` is never read and
+  // no fetch below is skipped; presumably OOB texelFetch yields 0 - confirm.
+  const ivec2 start = ipos;
+  const ivec2 end = ipos + overlay_region.xy;
+
+  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
+  int kx = 0;
+  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
+    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
+      // t_kernel is indexed as (kx, pos.z): kx walks the TILE_SIZE*TILE_SIZE
+      // taps in row-major order. Presumably each flattened filter occupies
+      // one row, with filters stacked vertically - confirm vs weight packing.
+      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
+      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
+      kx++;
+    }
+  }
+
+  imageStore(t_out, pos, op(sum, out_min, out_max)); // op: identity or clamp
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml
new file mode 100644
index 00000000000..f2ece8fa0f9
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_sned_output_tile.yaml
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+conv2d_dw_sned_output_tile:
+  parameter_names_with_default_values:
+    OPERATOR: X
+    NDIM: 3
+    DTYPE: float
+    TILE_SIZE: 3
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: conv2d_dw_sned_output_tile_3x3
+    - NAME: conv2d_dw_sned_output_tile_3x3_clamp
+      OPERATOR: clamp(X, A, B)
+    - NAME: conv2d_dw_sned_output_tile_5x5
+      TILE_SIZE: 5
+    - NAME: conv2d_dw_sned_output_tile_5x5_clamp
+      OPERATOR: clamp(X, A, B)
+      TILE_SIZE: 5
diff --git a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
index a7c11cc8535..8c369914c1b 100644
--- a/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
+++ b/backends/vulkan/runtime/graph/ops/impl/Convolution.cpp
@@ -134,8 +134,8 @@ vkapi::ShaderInfo get_conv2d_shader(
     case Conv2dMethod::Depthwise:
       kernel_name = "conv2d_dw";
       if (!prepack_weights) {
-        if (stride_equals_dilation) {
-          kernel_name += "_sed";
+        if (!stride_equals_dilation) {
+          kernel_name += "_sned";
         }
         const auto& weight_sizes = graph.get_tref(weight)->sizes;
         if (weight_sizes.at(2) == 3 && weight_sizes.at(3) == 3) {

From 090101c3127d70ed5db537c10e8e832866339eab Mon Sep 17 00:00:00 2001
From: trivedivivek <5340687+trivedivivek@users.noreply.github.com>
Date: Mon, 13 Jan 2025 16:44:32 -0600
Subject: [PATCH 3/3] Update conv2d_dw_output_tile.glsl

---
 .../graph/ops/glsl/conv2d_dw_output_tile.glsl | 39 -------------------
 1 file changed, 39 deletions(-)

diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
index f0a7e814cfc..48afd3a9a7c 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
+++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl
@@ -124,42 +124,3 @@ void main() {
     }
   }
 }
-
-#else
-void main() {
-  const uint div_by_x = gl_GlobalInvocationID.x / out_limits.x;
-  const ivec3 pos = ivec3(
-    gl_GlobalInvocationID.x % out_limits.x,
-    div_by_x % out_limits.y,
-    div_by_x / out_limits.y);
-
-  if (any(greaterThanEqual(pos, out_limits))) {
-    return;
-  }
-
-  // Compute the index of the top-left element of the overlay region. Negative
-  // indices indicate that the top-left element is in a region added by padding.
-  const ivec2 ipos = pos.xy * stride - padding;
-
-  // Compute the start and end of the input indices to load. Padding is assumed
-  // to be constant 0 padding, so any reads from the padding region is skipped.
-  const ivec2 start = ipos;
-  const ivec2 end = ipos + overlay_region.xy;
-
-  VEC4_T sum = texelFetch(t_bias, ivec2(pos.z, 0), 0);
-  int kx = 0;
-  for (int y = start.y, i = 0; i < TILE_SIZE; y += dilation.y, i++) {
-    for (int x = start.x, j = 0; j < TILE_SIZE; x += dilation.x, j++) {
-      // The weight kernel was rearranged such that every NxN filter is
-      // flattened to fit in one row. Each filter was then stacked on top of
-      // each other vertically.
-      const vec4 in_texel = texelFetch(t_in, ivec3(x, y, pos.z), 0);
-      sum = fma(in_texel, texelFetch(t_kernel, ivec2(kx, pos.z), 0), sum);
-      kx++;
-    }
-  }
-
-  imageStore(t_out, pos, op(sum, out_min, out_max));
-}
-
-#endif