Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion backends/vulkan/runtime/gen_vulkan_spv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import argparse
import array
import codecs
Expand Down Expand Up @@ -42,6 +44,10 @@
# layout binding index when declaring layout bindings. Note that a container
# type is used because integers are immutable in Python.
"B": [0],
# C is shorthand for "constant_id". This is used to automatically increment the
# constant_id index for specialization constants.
# Note that it starts at 3, as 0-2 are reserved for local workgroup size ids.
"C": [3],
}

# Establishes relationships between different tensor types and different GLSL types
Expand Down Expand Up @@ -300,14 +306,32 @@ def layout_declare_ubo(
layout(set = 0, binding = {get_slot_val(slot)}) uniform {precision} restrict readonly {ubo_name}UBO {{
"""
for type_name, var_name in var_list:
out_str += f"{type_name} {var_name};\n"
out_str += f" {type_name} {var_name};\n"
out_str += "};"

if isinstance(slot, list):
slot[0] = slot[0] + 1
return out_str


def layout_declare_spec_const(
    slot: Union[int, List[int]],
    type_name: str,
    var_name: str,
    initial_val: Optional[str] = None,
) -> str:
    """Build a GLSL specialization constant declaration.

    The result has the form
    ``layout(constant_id = <id>) const <type_name> <var_name>[ = <initial_val>];``
    where ``<id>`` is whatever ``get_slot_val(slot)`` yields.

    Args:
        slot: Either a plain index, or a single-element list acting as a
            mutable counter. When a list is passed, its first element is
            incremented after the declaration has been built, so consecutive
            calls sharing the same list receive consecutive values.
        type_name: GLSL scalar type; must be "int", "uint", "float" or "bool".
        var_name: Identifier of the declared constant.
        initial_val: Optional default value expression. When omitted, no
            initializer is emitted.

    Returns:
        The declaration as a single line of GLSL, terminated by ``;``.
    """
    assert type_name in ["int", "uint", "float", "bool"]

    # The initializer clause is only present when a default was supplied.
    initializer = "" if initial_val is None else f" = {initial_val}"
    declaration = (
        f"layout(constant_id = {get_slot_val(slot)}) "
        f"const {type_name} {var_name}{initializer};"
    )

    # A list slot is a shared counter: advance it for the next declaration.
    if isinstance(slot, list):
        slot[0] += 1
    return declaration


def define_active_storage_type(storage_type: str):
if storage_type.lower() == "buffer":
return "#define USING_BUFFER"
Expand Down Expand Up @@ -361,6 +385,7 @@ def define_required_extensions(dtypes: Union[str, List[str]]):
"layout_declare_sampler": layout_declare_sampler,
"layout_declare_tensor": layout_declare_tensor,
"layout_declare_ubo": layout_declare_ubo,
"layout_declare_spec_const": layout_declare_spec_const,
"define_active_storage_type": define_active_storage_type,
"define_required_extensions": define_required_extensions,
}
Expand Down
7 changes: 7 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,13 @@ ivec3 lpos_to_pos(const ivec3 lpos, const ivec4 axis_map) {
imageStore(im, lpos_to_pos(lpos, axis_map), texel)
#endif

// Converts a hashed axis mapping and a packed dim to an ivec4.
// The hash holds one axis index per byte: bits 16-23 become .x, bits 8-15
// become .y and bits 0-7 become .z; packed_dim is passed through as .w.
// e.g. 0x000102, 2 -> ivec4(0, 1, 2, 2)
// e.g. 0x010200, 1 -> ivec4(1, 2, 0, 1)
// Both arguments are parenthesized in the expansion so that expression
// arguments (e.g. `a | b`) are evaluated as a unit, and every byte is masked
// with 0xFF so that bits above bit 23 cannot leak into the x component.
#define UNHASH_AXIS_MAP(hash, packed_dim) \
  ivec4(                                  \
      ((hash) >> 16) & 0xFF,              \
      ((hash) >> 8) & 0xFF,               \
      (hash) & 0xFF,                      \
      (packed_dim))
// Hash of the identity axis map (0, 1, 2).
#define DEFAULT_AXIS_MAP_HASH 0x000102

/************************
* Deprecated Functions *
************************/
Expand Down
53 changes: 28 additions & 25 deletions backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,32 @@

layout(std430) buffer;

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_mean;
layout(set = 0, binding = 2, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_rstd;
${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(B, "w", "t_mean", DTYPE, STORAGE)}
${layout_declare_tensor(B, "w", "t_rstd", DTYPE, STORAGE)}

layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;
${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_weight", DTYPE, STORAGE)}
${layout_declare_tensor(B, "r", "t_bias", DTYPE, STORAGE)}

layout(set = 0, binding = 6) uniform PRECISION restrict OutLimits {
ivec3 out_limits;
};
${layout_declare_ubo(B, "ivec3", "out_limits")}
${layout_declare_ubo(B, "ivec4", "sizes")}
${layout_declare_ubo(B, "float", "epsilon")}

layout(set = 0, binding = 7) uniform PRECISION restrict Sizes {
ivec4 sizes;
};
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

layout(set = 0, binding = 8) uniform PRECISION restrict Epsilon {
float epsilon;
};
${layout_declare_spec_const(C, "int", "in_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")}
${layout_declare_spec_const(C, "int", "in_packed_dim", "C_DIM")}
const ivec4 in_axis_map = UNHASH_AXIS_MAP(in_axis_map_hash, in_packed_dim);

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
${layout_declare_spec_const(C, "int", "out_axis_map_hash", "DEFAULT_AXIS_MAP_HASH")}
${layout_declare_spec_const(C, "int", "out_packed_dim", "C_DIM")}
const ivec4 out_axis_map = UNHASH_AXIS_MAP(out_axis_map_hash, out_packed_dim);

void main() {
const ivec3 pos = ivec3(gl_GlobalInvocationID);
const ivec3 lpos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(pos, out_limits))) {
if (any(greaterThanEqual(lpos, out_limits))) {
return;
}

Expand All @@ -55,8 +55,10 @@ void main() {

// Use Welford's online algorithm to compute mean and variance in one pass
// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
ivec3 in_pos = lpos_to_pos(lpos, in_axis_map);
for (int w = 0; w < width; ++w) {
VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
in_pos[in_axis_map.x] = w;
VEC4_T v = load_texel(t_in, in_pos);
delta = v - mean;
mean += delta / (w + 1);
delta2 = v - mean;
Expand All @@ -68,14 +70,15 @@ void main() {
VEC4_T offset = -rstd * mean;

for (int w = 0; w < width; ++w) {
VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
in_pos[in_axis_map.x] = w;
VEC4_T v = load_texel(t_in, in_pos);
// broadcasting
VEC4_T weight = texelFetch(weight_in, ivec3(w, 0, 0), 0).xxxx;
VEC4_T bias = texelFetch(bias_in, ivec3(w, 0, 0), 0).xxxx;
VEC4_T weight = load_texel(t_weight, ivec3(w, 0, 0)).xxxx;
VEC4_T bias = load_texel(t_bias, ivec3(w, 0, 0)).xxxx;
VEC4_T outtex = (v * rstd + offset) * weight + bias;
imageStore(image_out, ivec3(w, pos.y, pos.z), outtex);
write_texel_lpos(t_out, ivec3(w, lpos.y, lpos.z), outtex, out_axis_map);
}

imageStore(image_mean, pos, mean);
imageStore(image_rstd, pos, rstd);
write_texel(t_mean, lpos, mean);
write_texel(t_rstd, lpos, rstd);
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@

native_layer_norm:
parameter_names_with_default_values:
NDIM: 3
DTYPE: float
PACKING: C_packed
STORAGE: texture3d
generate_variant_forall:
DTYPE:
- VALUE: half
Expand Down
15 changes: 11 additions & 4 deletions backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,18 @@ void add_native_layer_norm_node(
vkapi::MemoryAccessType::WRITE},
{{in, arg_weight, arg_bias}, vkapi::MemoryAccessType::READ}},
// Shader params buffers
{t_out->logical_limits_ubo(),
t_out->sizes_ubo(),
graph.create_params_buffer(epsilon)},
{
t_out->logical_limits_ubo(),
t_out->sizes_ubo(),
graph.create_params_buffer(epsilon),
},
// Specialization Constants
{},
{
hash_axis_map(t_input->axis_map()),
t_input->packed_dim(),
hash_axis_map(t_out->axis_map()),
t_out->packed_dim(),
},
// Resizing Logic
resize_native_layer_norm_node,
{normalized_shape}));
Expand Down
14 changes: 14 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,18 @@ T nchw_dim_to_whcn_dim(const T& nchw_dim, const int64_t ndim) {
return ndim - 1 - nchw_dim;
}

//
// Tensor axis map utilities
//

// Converts an ivec4 axis map to a single int32_t, so that it can be passed as
// a specialization constant instead of a UBO. This allows the SPIR-V to
// bytecode compilation to perform compile-time folding on the axis map.
// Only the first 3 indices are encoded, one per byte: axis_map[0] occupies
// bits 16-23, axis_map[1] bits 8-15 and axis_map[2] bits 0-7. The last index
// is the packed dim, which is passed separately.
// Each entry is masked to 8 bits so that an out-of-range entry cannot spill
// into a neighbouring field.
// Throws std::out_of_range (via std::vector::at) if axis_map has fewer than
// 3 entries.
// Example: ivec4(0, 1, 2, 2) -> 0x000102
inline int32_t hash_axis_map(const std::vector<int64_t>& axis_map) {
  const int64_t packed = ((axis_map.at(0) & 0xFF) << 16) |
      ((axis_map.at(1) & 0xFF) << 8) | (axis_map.at(2) & 0xFF);
  // The masked value fits in 24 bits, so the narrowing cast is lossless.
  return static_cast<int32_t>(packed);
}

} // namespace vkcompute
Loading