Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 18 additions & 29 deletions backends/vulkan/runtime/graph/ops/glsl/q_4w_linear.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,14 @@ layout(std430) buffer;

${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
${layout_declare_tensor(1, "r", "t_mat1", DTYPE, STORAGE)}
${layout_declare_tensor(2, "r", "t_mat2", "int8", STORAGE)}
${layout_declare_tensor(2, "r", "t_mat2", "int8", "buffer")}
${layout_declare_tensor(3, "r", "t_scales_and_zeros", DTYPE, STORAGE)}

$if STORAGE == "texture3d":
${layout_declare_ubo(4, "ivec4", "out_sizes")}
${layout_declare_ubo(5, "ivec4", "mat1_sizes")}
${layout_declare_ubo(6, "ivec4", "scales_strides")}
${layout_declare_ubo(6, "ivec4", "mat2_strides")}
${layout_declare_ubo(7, "ivec4", "scales_strides")}
$else:
${layout_declare_ubo(4, "ivec4", "out_sizes")}
${layout_declare_ubo(5, "ivec4", "out_strides")}
Expand Down Expand Up @@ -64,9 +65,9 @@ void main() {

float rc = 0.0;
int k = 0;
const uint k_block = (K + group_size - 1) / group_size;

#ifdef USING_BUFFER
const uint k_block = (K + group_size - 1) / group_size;
ivec4 mat1_pos = ivec4(0, m, out_pos.z, out_pos.w);
ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w);
ivec4 scale_pos = ivec4(0, n, 0, out_pos.w);
Expand Down Expand Up @@ -101,42 +102,30 @@ void main() {
t_out[out_bufi] = FLOAT_T(rc);

#else // Using texture
const uint texel_group_size = group_size / FOUR;
const uint k_block = (K + texel_group_size - 1) / texel_group_size;
ivec3 mat1_pos = ivec3(0, m, out_pos.z);
ivec3 mat2_pos = ivec3(0, n, out_pos.z);
ivec3 scale_pos = ivec3(0, n, 0);
ivec3 zero_pos = ivec3(0, n, 1);
ivec4 mat2_pos = ivec4(0, n, out_pos.z, out_pos.w);
ivec3 scale_zero_pos = ivec3(0, n, 0);
uint K_texel = K / FOUR;

for (int kb = 0; kb < k_block; kb++) {
const int texel_kb = kb / FOUR;
const int kb_offset = kb % FOUR;

scale_pos.x = texel_kb;
const VEC4_T scale_texel = load_texel(t_scales_and_zeros, scale_pos);
const float scale = float(scale_texel[kb_offset]);
scale_zero_pos.x = kb;
const vec4 scale_zero = load_texel(t_scales_and_zeros, scale_zero_pos);
const float scale = scale_zero.x;
const float zero = scale_zero.y - scale * 8.0;

zero_pos.x = texel_kb;
const VEC4_T zero_texel = load_texel(t_scales_and_zeros, zero_pos);
const float zero = float(zero_texel[kb_offset]) - scale * 8.0;

for(uint idx = 0; idx < texel_group_size && k < K; idx++, k++) {
for(uint idx = 0; idx < group_size && k < K_texel; idx += FOUR, k++) {
mat1_pos.x = k;
const VEC4_T mat1_tex = load_texel(t_mat1, mat1_pos);

mat2_pos.x = k / 2;
const i8vec4 mat2_tex = i8vec4(load_texel(t_mat2, mat2_pos));
mat2_pos.x = k * 2; // k * FOUR / 2
const int mat2_id = tidx_to_bufi(mat2_pos, mat2_strides);

// Every two texels of mat1 correspond to one texel of mat2
// Even mat1 indices correspond to first half of mat2 texel and
// odd indices correspond to second half
const int mat2_offset = (k & 1) == 0 ? 0 : 2;
for (int texel_idx = 0; texel_idx < FOUR; texel_idx++){
for (int texel_pos = 0; texel_pos < FOUR; texel_pos++) {
// Bitwise op treats sign bit from int8 as a value bit instead,
// since there is no uint8_t datatype
uint mat2_val = (mat2_tex[mat2_offset + texel_idx / 2] & 0xFF);
mat2_val = (texel_idx & 1) == 0 ? mat2_val & mask : (mat2_val >> 4);
rc += mat1_tex[texel_idx] * (scale * float(mat2_val) + zero);
uint mat2_val = (t_mat2[mat2_id + texel_pos / 2] & 0xFF);
mat2_val = (texel_pos & 1) == 0 ? mat2_val & mask : (mat2_val >> 4);
rc += mat1_tex[texel_pos] * (scale * float(mat2_val) + zero);
}
}
}
Expand Down
18 changes: 10 additions & 8 deletions backends/vulkan/runtime/graph/ops/impl/QuantizedMatMul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,13 @@ void check_q_matmul_args(
using namespace WHCN;
VK_CHECK_COND(graph.packed_dim_of(mat1) == kWidthDim);
VK_CHECK_COND(graph.packed_dim_of(mat2_data) == kWidthDim);
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);
// VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);

if (graph.storage_type_of(scales_and_zeros) == utils::kBuffer) {
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kWidthDim);
} else {
VK_CHECK_COND(graph.packed_dim_of(scales_and_zeros) == kChannelsDim);
}

if (graph.storage_type_of(out) == utils::kBuffer) {
VK_CHECK_COND(graph.packed_dim_of(out) == kWidthDim);
Expand Down Expand Up @@ -106,13 +112,8 @@ void add_q_matmul_node(
const ValueRef out) {
auto storage_type = graph.storage_type_of(out);

ValueRef mat2;

if (storage_type == utils::kBuffer) {
mat2 = prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);
} else {
mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);
}
ValueRef mat2 =
prepack_buffer_if_tensor_ref(graph, mat2_data, utils::kWidthPacked);

ValueRef scales_and_zeros =
prepack_if_tensor_ref(graph, scales_and_zeros_data, utils::kWidthPacked);
Expand All @@ -135,6 +136,7 @@ void add_q_matmul_node(
} else {
ubos.append(graph.sizes_ubo(out));
ubos.append(graph.sizes_ubo(mat1));
ubos.append(graph.strides_ubo(mat2));
ubos.append(graph.strides_ubo(scales_and_zeros));
}

Expand Down
18 changes: 14 additions & 4 deletions backends/vulkan/test/vulkan_compute_api_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2932,16 +2932,26 @@ void test_int4pack_mm(
int4mm_pack_weights(mat2_size, B_quant_data.data());

IOValueRef B_int4 =
graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, storage_type);
graph.add_input_tensor(mat2_q_size, vkapi::kQInt8, utils::kBuffer);
graph.copy_into_staging(
B_int4.staging, B_int4_data.data(), B_int4_data.size());

const int k_groups = K / group_size;

// Random scales and zeros. Keep scales small to avoid overflow and keep zeros
// in the int4 range
IOValueRef scales_and_zeros =
graph.add_input_tensor({2, N, k_groups}, vkapi::kFloat, storage_type);
IOValueRef scales_and_zeros;

if (storage_type == utils::kBuffer) {
scales_and_zeros.value = graph.add_tensor(
{2, N, k_groups}, vkapi::kFloat, storage_type, utils::kWidthPacked);
} else {
scales_and_zeros.value = graph.add_tensor(
{2, N, k_groups}, vkapi::kFloat, storage_type, utils::kChannelsPacked);
}

scales_and_zeros.staging = graph.set_input_tensor(scales_and_zeros.value);

std::vector<float> s_data(graph.numel_of(scales_and_zeros.value));
const int zeros_stride = s_data.size() / 2;
for (size_t i = 0; i < zeros_stride; i++) {
Expand Down Expand Up @@ -3003,7 +3013,7 @@ void test_int4pack_mm(
out_deq.staging, out_deq_data.data(), out_deq_data.size());

for (int i = 0; i < out_int4_data.size(); i++) {
CHECK_VALUE(out_int4_data, i, out_deq_data[i]);
EXPECT_TRUE(check_close(out_int4_data[i], out_deq_data[i]));
}
}

Expand Down
Loading