pytorch · nathanaelsee · Oct 19, 2024
@@ -5,6 +5,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# pyre-unsafe
+
 import argparse
 import array
 import codecs
@@ -42,6 +44,10 @@
     # layout binding index when declaring layout bindings. Note that a container
     # type is used because integers are immutable in Python.
     "B": [0],
+    # C is shorthand for "constant_id". This is used to automatically increment the
+    # constant_id index for specialization constants.
+    # Note that it starts at 3, as 0-2 are reserved for local workgroup size ids.
+    "C": [3],
 }
 
 # Establishes relationships between different tensor types and different GLSL types
@@ -300,14 +306,32 @@ def layout_declare_ubo(
 layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{
 """
     for type_name, var_name in var_list:
-        out_str += f"{type_name} {var_name};\n"
+        out_str += f"  {type_name} {var_name};\n"
     out_str += "};"
 
     if isinstance(slot, list):
         slot[0] = slot[0] + 1
     return out_str
 
 
+def layout_declare_spec_const(
+    slot: Union[int, List[int]],
+    type_name: str,
+    var_name: str,
+    initial_val: Optional[str] = None,
+) -> str:
+    assert type_name in ["int", "uint", "float", "bool"]
+
+    out_str = f"layout(constant_id = {get_slot_val(slot)}) const {type_name} {var_name}"
+    if initial_val is not None:
+        out_str += f" = {initial_val}"
+    out_str += ";"
+
+    if isinstance(slot, list):
+        slot[0] = slot[0] + 1
+    return out_str
+
+
 def define_active_storage_type(storage_type: str):
     if storage_type.lower() == "buffer":
         return "#define USING_BUFFER"
@@ -361,6 +385,7 @@ def define_required_extensions(dtypes: Union[str, List[str]]):
     "layout_declare_sampler": layout_declare_sampler,
     "layout_declare_tensor": layout_declare_tensor,
     "layout_declare_ubo": layout_declare_ubo,
+    "layout_declare_spec_const": layout_declare_spec_const,
     "define_active_storage_type": define_active_storage_type,
     "define_required_extensions": define_required_extensions,
 }

@@ -232,6 +232,13 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
   imageStore(im, lpos_to_pos(lpos, axis_map), texel)
 #endif
 
+// Converts hashed axis mapping and packed dim to a ivec4
+// e.g. 0x000102, 2 -> ivec4(0, 1, 2, 2)
+// e.g. 0x010200, 1 -> ivec4(1, 2, 0, 1)
+#define UNHASH_AXIS_MAP(hash, packed_dim) \
+  ivec4(hash >> 16, (hash >> 8) & 0xFF, hash & 0xFF, packed_dim)
+#define DEFAULT_AXIS_MAP_HASH 0x000102
+
 /************************
  * Deprecated Functions *
  ************************/

@@ -17,32 +17,32 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_mean;
-layout(set = 0, binding = 2, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_rstd;
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_mean", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_rstd", DTYPE, STORAGE)}
 
-layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
-layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
-layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE)}
 
-layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
+${layout_declare_ubo(B, "ivec3", "out_limits")}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "float", "epsilon")}
 
-layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
-layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
-  float epsilon;
-};
+${layout_declare_spec_const(C, "int", "in_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")}
+${layout_declare_spec_const(C, "int", "in_packed_dim", "C_DIM")}
+const ivec4 in_axis_map = UNHASH_AXIS_MAP(in_axis_map_hash, in_packed_dim);
 
-layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+${layout_declare_spec_const(C, "int", "out_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")}
+${layout_declare_spec_const(C, "int", "out_packed_dim", "C_DIM")}
+const ivec4 out_axis_map = UNHASH_AXIS_MAP(out_axis_map_hash, out_packed_dim);
 
 void main() {
-  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+  const ivec3 lpos = ivec3(gl_GlobalInvocationID);
 
-  if (any(greaterThanEqual(pos, out_limits))) {
+  if (any(greaterThanEqual(lpos, out_limits))) {
     return;
   }
 
@@ -55,8 +55,10 @@ void main() {
 
   // Use Welford's online algorithm to compute mean and variance in one pass
   // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+  ivec3 in_pos = lpos_to_pos(lpos, in_axis_map);
   for (int w = 0; w < width; ++w) {
-    VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
+    in_pos[in_axis_map.x] = w;
+    VEC4_T v = load_texel(t_in, in_pos);
     delta = v - mean;
     mean += delta / (w + 1);
     delta2 = v - mean;
@@ -68,14 +70,15 @@ void main() {
   VEC4_T offset = -rstd * mean;
 
   for (int w = 0; w < width; ++w) {
-    VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
+    in_pos[in_axis_map.x] = w;
+    VEC4_T v = load_texel(t_in, in_pos);
     // broadcasting
-    VEC4_T weight = texelFetch(weight_in, ivec3(w, 0, 0), 0).xxxx;
-    VEC4_T bias = texelFetch(bias_in, ivec3(w, 0, 0), 0).xxxx;
+    VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)).xxxx;
+    VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)).xxxx;
     VEC4_T outtex = (v * rstd + offset) * weight + bias;
-    imageStore(image_out, ivec3(w, pos.y, pos.z), outtex);
+    write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map);
   }
 
-  imageStore(image_mean, pos, mean);
-  imageStore(image_rstd, pos, rstd);
+  write_texel(t_mean, lpos, mean);
+  write_texel(t_rstd, lpos, rstd);
 }
@@ -6,9 +6,8 @@
 
 native_layer_norm:
   parameter_names_with_default_values:
-    NDIM: 3
     DTYPE: float
-    PACKING: C_packed
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half

@@ -106,11 +106,18 @@ void add_native_layer_norm_node(
         vkapi::MemoryAccessType::WRITE},
        {{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
       // Shader params buffers
-      {t_out->logical_limits_ubo(),
-       t_out->sizes_ubo(),
-       graph.create_params_buffer(epsilon)},
+      {
+          t_out->logical_limits_ubo(),
+          t_out->sizes_ubo(),
+          graph.create_params_buffer(epsilon),
+      },
       // Specialization Constants
-      {},
+      {
+          hash_axis_map(t_input->axis_map()),
+          t_input->packed_dim(),
+          hash_axis_map(t_out->axis_map()),
+          t_out->packed_dim(),
+      },
       // Resizing Logic
       resize_native_layer_norm_node,
       {normalized_shape}));

@@ -79,4 +79,18 @@ T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) {
   return ndim - 1 - nchw_dim;
 }
 
+//
+// Tensor axis map utilities
+//
+
+// Converts ivec4 axis map to a single int32_t, to be able to pass it as a
+// specialization constant instead of a ubo. This allows for the spir-v to
+// bytecode compilation to perform compile-time folding on the axis map.
+// Only converts the first 3 indices, as the last index is the packed dim,
+// which is passed separately.
+// Example: ivec4(0, 1, 2, 2) -> 0x000102
+inline int32_t hash_axis_map(const std::vector<int64_t>& axis_map) {
+  return (axis_map.at(0) << 16) + (axis_map.at(1) << 8) + axis_map.at(2);
+}
+
 } // namespace vkcompute