Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1009,6 +1009,7 @@ jobs:
./cmake-out/backends/vulkan/test/custom_ops/q8csw_conv2d
./cmake-out/backends/vulkan/test/custom_ops/q4gsw_linear
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
./cmake-out/backends/vulkan/test/custom_ops/qdq8ta_conv2d_activations

# "Classic" Operator tests
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
Expand Down
24 changes: 24 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/common.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,30 @@ struct TensorIndex4D {
ivec4 data;
};

// Sign-extends the low 8 bits of `val` to a full int: when bit 7 is set, every
// bit above the low byte is set as well; otherwise `val` is returned unchanged.
int sign_extend_8bit(const int val) {
  const bool sign_bit_set = (val & 0x80) != 0;
  return sign_bit_set ? (val | (~0xFF)) : val;
}

// Returns byte `i` of `packed` as a sign-extended int, where i == 0 selects the
// least-significant byte (little-endian byte order).
int extract_8bit_from_packed_int_le(const int packed, const int i) {
  const int shift = 8 * i;
  const int raw_byte = (packed >> shift) & 0xFF;
  return sign_extend_8bit(raw_byte);
}

// Packs the low 8 bits of four ints into one 32-bit int. val0 lands in the
// least-significant byte and val3 in the most-significant byte.
int pack_4xqint_into_int32(
    const int val0,
    const int val1,
    const int val2,
    const int val3) {
  const int lane0 = val0 & 0xFF;
  const int lane1 = (val1 & 0xFF) << 8;
  const int lane2 = (val2 & 0xFF) << 16;
  const int lane3 = (val3 & 0xFF) << 24;

  return lane0 | lane1 | lane2 | lane3;
}

#ifdef DEBUG_MODE

#extension GL_EXT_debug_printf : require
Expand Down
42 changes: 42 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/conv2d_common.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,48 @@ struct Conv2DParams {
int K4;
};

// Position of a texel in a conv2d tensor.
struct Conv2dTensorIndex {
// 3D coordinate; callers pass it directly to texelFetch / imageStore.
ivec3 data;
// Set to 0 by block_idx_to_tensor_idx.
// NOTE(review): presumably selects an element within the texel - confirm.
int texel_i;
};

// Position of a block in the block grid. block_idx_to_tensor_idx maps
// data.x through mul_4() and copies data.y / data.z unchanged.
struct Conv2dBlockIndex {
ivec3 data;
};

// Converts a block index to the tensor index of the block's first texel:
// x is passed through mul_4(), y and z carry over, and texel_i starts at 0.
Conv2dTensorIndex block_idx_to_tensor_idx(const Conv2dBlockIndex block_idx) {
  Conv2dTensorIndex first_texel;
  first_texel.data = ivec3(mul_4(block_idx.data.x), block_idx.data.yz);
  first_texel.texel_i = 0;
  return first_texel;
}

// Size of the block grid, as produced by make_block_extents.
struct Conv2dBlockExtents {
// Number of blocks along each axis.
ivec3 data;
// Cached data.x * data.z; used as the per-y stride when a block index is
// flattened into a buffer offset.
int data_xz;
};

// Builds the block-grid extents for a tensor: the x and z sizes are passed
// through div_up_4(), the y size is used as-is, and data_xz caches x * z.
Conv2dBlockExtents make_block_extents(const ivec4 tensor_sizes) {
  Conv2dBlockExtents extents;
  extents.data = ivec3(
      div_up_4(tensor_sizes.x), tensor_sizes.y, div_up_4(tensor_sizes.z));
  extents.data_xz = extents.data.x * extents.data.z;
  return extents;
}

// Returns true when any component of the block index is greater than or equal
// to the corresponding block extent.
bool block_idx_out_of_bounds(
    const Conv2dBlockIndex block_idx,
    const Conv2dBlockExtents block_extents) {
  return any(greaterThanEqual(block_idx.data, block_extents.data));
}

#ifdef DEBUG_MODE

void printConv2DParams(const Conv2DParams params) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#ifndef CONV2D_FP_INPUT_TILE_LOAD
#define CONV2D_FP_INPUT_TILE_LOAD

#extension GL_EXT_control_flow_attributes : require

#include "linear_fp_input_tile.glslh"

// Fetches one texel from t_fp_input at the coordinate held in `tidx` (mip
// level 0).
VEC4_T load_fp_input_texel(const Conv2dTensorIndex tidx) {
  const ivec3 coord = tidx.data;
  return texelFetch(t_fp_input, coord, 0);
}

// Fills `tile` with TILE_M texels that are consecutive along x, starting at
// the first texel of `block_idx`. Only TILE_M == 4 with TILE_K4 == 1 is
// implemented; any other configuration fails to compile.
void load_fp_input_tile(
    out FPInputTile tile,
    const Conv2dBlockIndex block_idx) {
#if TILE_M == 4 && TILE_K4 == 1
  const Conv2dTensorIndex first_tidx = block_idx_to_tensor_idx(block_idx);
  [[unroll]] for (int w = 0; w < TILE_M; w++) {
    // Offset from the block's first texel instead of mutating a running index.
    Conv2dTensorIndex cur_tidx = first_tidx;
    cur_tidx.data.x += w;
    tile.data[w][0] = load_fp_input_texel(cur_tidx);
  }
#else
  not_implemented;
#endif
}

#endif // CONV2D_FP_INPUT_TILE_LOAD
13 changes: 0 additions & 13 deletions backends/vulkan/runtime/graph/ops/glsl/linear_common.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,6 @@

#include "common.glslh"

// Treats the low byte of `val` as a signed 8-bit quantity: if its top bit
// (0x80) is clear the input is returned as-is, otherwise all bits above the
// low byte are filled with ones.
int sign_extend_8bit(const int val) {
  if ((val & 0x80) == 0) {
    return val;
  }
  const int high_bits = ~0xFF;
  return val | high_bits;
}

// Extracts the i-th byte of `packed`, counting from the least-significant
// byte (little endian), and sign-extends it to an int.
int extract_8bit_from_packed_int_le(const int packed, const int i) {
  const int shifted = packed >> (8 * i);
  return sign_extend_8bit(shifted & 0xFF);
}

// Extract a 4-bit value from a packed int (little endian)
// It is assumed that the 4-bit value is in the range [0, 15]
int extract_4bit_from_packed_int_le(const int packed, const int col) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)}
#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)}

// Tile configuration. load_fp_input_tile() only implements
// TILE_M == 4 && TILE_K4 == 1; any other combination reaches its
// `not_implemented` branch and fails to compile.

// corresponds to the input width dim
#define TILE_M4 1
// corresponds to the input channels dim
#define TILE_K4 1

#define TILE_M 4

$if OUTPUT_STORAGE == "buffer":
#define OUTPUT_BUFFER
$if INPUT_STORAGE == "buffer":
#define INPUT_BUFFER

${define_required_extensions(DTYPE)}

layout(std430) buffer;

#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_packed_int8_input", "int", OUTPUT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_fp_input", DTYPE, INPUT_STORAGE, is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "input_sizes")}

// Quantization parameters supplied as push constants; main() forwards both
// to quantize_and_pack().
layout(push_constant) uniform restrict Block {
// NOTE(review): presumably the reciprocal of the quantization scale - confirm
// against quantize_and_pack().
float inv_scale;
// NOTE(review): presumably the quantization zero point - confirm.
int zp;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

#include "conv2d_fp_input_tile_load.glslh"
#include "linear_int8_input_block.glslh"

// Writes one packed int8 block to t_packed_int8_input.
// Buffer storage: the block index is flattened as
//   y * (extent.x * extent.z) + x * extent.z + z.
// Texture storage: the block index is used directly as the image coordinate.
void store_packed_int8_block(
    const Conv2dBlockIndex block_idx,
    const Conv2dBlockExtents block_extents,
    const Int8InputBlock packed_int8_block) {
#ifdef OUTPUT_BUFFER
  const ivec3 pos = block_idx.data;
  const int y_offset = pos.y * block_extents.data_xz;
  const int x_offset = pos.x * block_extents.data.z;
  t_packed_int8_input[y_offset + x_offset + pos.z] = packed_int8_block.data;
#else
  imageStore(t_packed_int8_input, block_idx.data, packed_int8_block.data);
#endif
}

// Entry point: each invocation handles the block addressed by its global
// invocation ID. It loads the floating-point tile, quantizes and packs it,
// then stores the packed block. Invocations outside the block grid do nothing.
void main() {
  Conv2dBlockIndex blk;
  blk.data = ivec3(gl_GlobalInvocationID);

  const Conv2dBlockExtents extents = make_block_extents(input_sizes);

  if (!block_idx_out_of_bounds(blk, extents)) {
    FPInputTile src_tile;
    load_fp_input_tile(src_tile, blk);

    Int8InputBlock quantized_block;
    quantize_and_pack(quantized_block, src_tile, inv_scale, zp);

    store_packed_int8_block(blk, extents, quantized_block);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Variant configuration for the quantize_and_pack_q8ta_conv2d_input shader.
quantize_and_pack_q8ta_conv2d_input:
# Template parameters referenced by the shader source, with their defaults.
parameter_names_with_default_values:
DTYPE: float
OUTPUT_STORAGE: texture3d
INPUT_STORAGE: texture3d
generate_variant_forall:
combination:
# Each combo below lists values in this order: (OUTPUT_STORAGE, INPUT_STORAGE).
parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
combos:
- parameter_values: [texture3d, texture3d]
# Buffer output with texture input.
- parameter_values: [buffer, texture3d]
DTYPE:
- VALUE: float
shader_variants:
- NAME: quantize_and_pack_q8ta_conv2d_input
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_load_type(DTYPE, INPUT_STORAGE)}
#define T ${texel_load_component_type(DTYPE, INPUT_STORAGE)}

// Tile configuration.
// NOTE(review): unpack_and_dequantize() and store_fp_tile() in this shader
// iterate a literal 4 rather than TILE_M; keep them in sync if TILE_M changes.

// corresponds to the output width dim
#define TILE_M4 1
// corresponds to the output channels dim
#define TILE_K4 1

#define TILE_M 4

$if OUTPUT_STORAGE == "buffer":
#define OUTPUT_BUFFER
$if INPUT_STORAGE == "buffer":
#define INPUT_BUFFER

${define_required_extensions(DTYPE)}

layout(std430) buffer;

// DEBUG_MODE is intentionally NOT defined in the checked-in shader:
// common.glslh gates `#extension GL_EXT_debug_printf : require` on it, which
// would make this shader depend on a debug-only extension, and nothing in this
// shader uses debug printing. Define DEBUG_MODE above this include only while
// debugging locally.
#include "conv2d_common.glslh"

${layout_declare_tensor(B, "w", "t_fp_output", DTYPE, OUTPUT_STORAGE, is_scalar_array=False)}
${layout_declare_tensor(B, "r", "t_packed_int8_output", "int", INPUT_STORAGE, is_scalar_array=False)}

${layout_declare_ubo(B, "ivec4", "output_sizes")}

// Dequantization parameters supplied as push constants. main() forwards both
// to unpack_and_dequantize(), which computes (q - zp) * scale per element.
layout(push_constant) uniform restrict Block {
// Multiplier applied after the zero point is subtracted.
float scale;
// Value subtracted from each unpacked 8-bit element before scaling.
int zp;
};

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

#include "linear_fp_input_tile.glslh"
#include "linear_int8_input_tile.glslh"

// Reads the packed int8 texel for `block_idx` from t_packed_int8_output into
// int8_tile.data[0][0].
// Buffer storage: the block index is flattened as
//   y * (extent.x * extent.z) + x * extent.z + z.
// Texture storage: the block index is used directly as the texel coordinate.
void load_packed_int8_tile(
    out Int8InputTile int8_tile,
    const Conv2dBlockIndex block_idx,
    const Conv2dBlockExtents block_extents) {
#ifdef INPUT_BUFFER
  const ivec3 pos = block_idx.data;
  const int y_offset = pos.y * block_extents.data_xz;
  const int x_offset = pos.x * block_extents.data.z;
  int8_tile.data[0][0] = t_packed_int8_output[y_offset + x_offset + pos.z];
#else
  int8_tile.data[0][0] = texelFetch(t_packed_int8_output, block_idx.data, 0);
#endif
}

// Dequantizes four integer values at once: subtracts the zero point from each
// component, converts to VEC4_T, then multiplies by the scale.
VEC4_T dequantize_8bit(
    const ivec4 val,
    const float q_scale,
    const int q_zero_point) {
  const ivec4 centered = val - q_zero_point;
  return VEC4_T(centered) * q_scale;
}

// Expands the packed texel in int8_tile.data[0][0] into four floating-point
// texels. Component w of the packed texel holds four 8-bit values (least
// significant byte first); they are sign-extended, dequantized, and written
// to fp_tile.data[w][0].
void unpack_and_dequantize(
    out FPInputTile fp_tile,
    const Int8InputTile int8_tile,
    const float q_scale,
    const int q_zero_point) {
  [[unroll]] for (int w = 0; w < 4; ++w) {
    const int word = int8_tile.data[0][0][w];

    const int q0 = extract_8bit_from_packed_int_le(word, 0);
    const int q1 = extract_8bit_from_packed_int_le(word, 1);
    const int q2 = extract_8bit_from_packed_int_le(word, 2);
    const int q3 = extract_8bit_from_packed_int_le(word, 3);

    fp_tile.data[w][0] =
        dequantize_8bit(ivec4(q0, q1, q2, q3), q_scale, q_zero_point);
  }
}

// Writes `out_texel` to t_fp_output at the coordinate held in `tidx`.
void store_fp_output_texel(
    const Conv2dTensorIndex tidx,
    const VEC4_T out_texel) {
  const ivec3 coord = tidx.data;
  imageStore(t_fp_output, coord, out_texel);
}

// Stores the four texels of `block` to t_fp_output at consecutive x positions,
// starting at the first texel of `block_idx`.
//
// The block grid is sized with div_up_4(output_sizes.x), so when the width is
// not a multiple of 4 the last block along x covers positions past the end of
// the tensor. Those texels are skipped so no store lands outside the image.
void store_fp_tile(
    const FPInputTile block,
    const Conv2dBlockIndex block_idx) {
  Conv2dTensorIndex store_tidx = block_idx_to_tensor_idx(block_idx);
  [[unroll]] for (int w = 0; w < 4; w++) {
    // Skip padding positions beyond the tensor's x extent.
    if (store_tidx.data.x < output_sizes.x) {
      store_fp_output_texel(store_tidx, block.data[w][0]);
    }
    store_tidx.data.x++;
  }
}

// Entry point: each invocation handles the block addressed by its global
// invocation ID. It loads the packed int8 texel, unpacks and dequantizes it
// into a floating-point tile, then stores that tile. Invocations outside the
// block grid do nothing.
void main() {
  Conv2dBlockIndex blk;
  blk.data = ivec3(gl_GlobalInvocationID);

  const Conv2dBlockExtents extents = make_block_extents(output_sizes);

  if (!block_idx_out_of_bounds(blk, extents)) {
    Int8InputTile quantized_tile;
    load_packed_int8_tile(quantized_tile, blk, extents);

    FPInputTile dequantized_tile;
    unpack_and_dequantize(dequantized_tile, quantized_tile, scale, zp);

    store_fp_tile(dequantized_tile, blk);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Variant configuration for the unpack_and_dequantize_q8ta_conv2d_output shader.
unpack_and_dequantize_q8ta_conv2d_output:
# Template parameters referenced by the shader source, with their defaults.
parameter_names_with_default_values:
DTYPE: float
OUTPUT_STORAGE: texture3d
INPUT_STORAGE: texture3d
generate_variant_forall:
combination:
# Each combo below lists values in this order: (OUTPUT_STORAGE, INPUT_STORAGE).
parameter_names: [OUTPUT_STORAGE, INPUT_STORAGE]
combos:
- parameter_values: [texture3d, texture3d]
# Texture output with buffer input.
- parameter_values: [texture3d, buffer]
DTYPE:
- VALUE: float
shader_variants:
- NAME: unpack_and_dequantize_q8ta_conv2d_output
Loading
Loading