21 commits
4b6378d
Add interface is_model_splitted() to check whether the cgraph is split or not
zhaixuejun1993 Mar 6, 2026
1b39f34
Infer and propagate dynamic-dimension indices for all tensors in the …
zhaixuejun1993 Mar 17, 2026
3aad3f1
Only do this for the fallback subgraph
zhaixuejun1993 Mar 19, 2026
3a10860
Move dynamic dims compute in graph mismatch
zhaixuejun1993 Mar 23, 2026
d6b80bc
ggml-openvino: fix tensor data handling for PERMUTE/VIEW ops in split…
zhaixuejun1993 Mar 19, 2026
64ee289
ggml-openvino: add comments
zhaixuejun1993 Mar 19, 2026
7ab30fc
ggml-openvino: override VIEW op_case to 0 for split model inputs
zhaixuejun1993 Mar 19, 2026
7887089
openvino backend: Handle unsupported VIEW shape-mismatch in OpenVINO …
zhaixuejun1993 Mar 19, 2026
cf6f541
Enable additional mul_mat tests and add tensor data saving function (…
zhaixuejun1993 Mar 23, 2026
3bca21b
ggml-openvino: fix CONT/TRANSPOSE mapping and improve dynamic-dimensi…
zhaixuejun1993 Mar 26, 2026
dafc05a
OpenVINO: add NORM/TANH support and rework SOFT_MAX translation
zhaixuejun1993 Mar 28, 2026
c947188
ggml-openvino: extend VIEW handling
zhaixuejun1993 Mar 30, 2026
e712a8e
Enable -fa off (#118)
wine99 Apr 2, 2026
02acad1
Enable --context-shift
wine99 Apr 10, 2026
3f0d4c7
Fix llm param compute error for normal softmax, not the softmax in att…
zhaixuejun1993 Apr 13, 2026
c6e06ee
OpenVINO backend: fix attention size compute error in llm param
zhaixuejun1993 Apr 13, 2026
cb9ca0a
use tensor->extra in infer_request i/o
wine99 Apr 27, 2026
900d7c9
OpenVINO backend: refactor the compute_llm_params() func, add get_atte…
zhaixuejun1993 Apr 29, 2026
c72768b
OpenVINO backend: clean unused code
zhaixuejun1993 Apr 29, 2026
e119aea
added translate_1to1_match_1_input function and updated gelu and tanh…
May 5, 2026
c1a5208
Remove unused translation function calls
May 5, 2026
495 changes: 446 additions & 49 deletions ggml/src/ggml-openvino/ggml-decoder.cpp

Large diffs are not rendered by default.

35 changes: 23 additions & 12 deletions ggml/src/ggml-openvino/ggml-decoder.h
@@ -1,6 +1,7 @@
#pragma once

#include "ggml-quants.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "openvino/decoder.h"

@@ -9,16 +10,13 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>

struct ModelParams {
int ctx = -1;
int ctx_swa = -1;
int ctx_per_seq = -1;
int ctx_per_seq_swa = -1;
int n_seq = 1;
int n_heads = -1;
int n_heads_kv = -1;
int head_size = -1;
int32_t rope_params[15];
@@ -69,6 +67,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool model_is_splitted = false,
bool is_prefill = false,
int prefill_chunk_size = 256);

@@ -106,10 +105,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual ov::element::Type get_output_type(int node_idx) const override;

virtual std::vector<size_t> get_output_stride(int node_idx) const override;

virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;

virtual int32_t * get_output_op_params(int node_idx) const override;

virtual size_t get_output_op_offset(int node_idx) const override;

virtual std::vector<std::string> get_output_names(int node_idx) const override;

virtual const std::string & get_op_type() const override;
Expand All @@ -120,6 +123,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual const std::string & get_op_name(int node_idx) const override;

virtual int32_t get_op_dynamic_dim(int node_idx) const override;

virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;

ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
@@ -150,8 +155,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual int get_ctx_size() const { return m_model_params.ctx; }

virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }

virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }

virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
@@ -175,7 +178,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual bool is_stateful() const override { return m_is_stateful; }

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
virtual bool is_splited_model() const override {
return m_model_is_splitted;
}

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;

static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@@ -205,6 +212,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
bool m_is_prefill = false;
bool m_naive = false;
int m_prefill_chunk_size = 0;
bool m_model_is_splitted = false; // whether the cgraph has been split

static ov::Shape get_shape(const ggml_tensor * tensor);
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
@@ -227,15 +235,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
}

inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
(op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
}

inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_ROPE && tensor == op->src[2];
}

inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY ||
(op->op == GGML_OP_SET_ROWS && op->src[2] == tensor);
}

inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -256,9 +266,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
if (is_inp_emb(tensor, op)) {
return "embd";
}
if (is_output_idx(tensor, op)) {
return "inp_out_ids";
}
if (is_inp_mask(tensor, op)) {
return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
}
@@ -272,6 +279,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
void compute_model_inputs();
void compute_model_outputs();

// Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
void compute_node_dynamic_dims();

void validate_cgraph() const;

ggml_cgraph * m_cgraph = nullptr;
@@ -284,6 +294,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<std::string> m_model_output_names;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims;

ModelParams m_model_params;
ComputeParams m_compute_params;
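The new m_node_dynamic_dims map is populated by compute_node_dynamic_dims(), whose body lives in ggml-decoder.cpp (that diff is not rendered above). A minimal sketch of the propagation idea, assuming each node inherits the dynamic-dim index of its first source tensor that has one; the real code must also remap the index across shape-changing ops such as PERMUTE and RESHAPE:

// Sketch only; the propagation rule and loop structure are assumptions,
// not the code from this PR.
void GgmlOvDecoder::compute_node_dynamic_dims() {
    for (int i = 0; i < m_cgraph->n_nodes; ++i) {
        ggml_tensor * node = m_cgraph->nodes[i];
        int dyn = -1;  // -1 means no dynamic dimension
        for (ggml_tensor * src : node->src) {
            if (!src) {
                continue;
            }
            auto it = m_node_dynamic_dims.find(src);
            if (it != m_node_dynamic_dims.end() && it->second != -1) {
                dyn = it->second;  // inherit the dynamic-dim index from an input
                break;
            }
        }
        m_node_dynamic_dims[node] = dyn;
    }
}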
30 changes: 13 additions & 17 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -823,15 +823,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
return true;
}
float scale = 1.0f;
float max_bias = 0.0f;
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
if (max_bias > 0) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
return true;
}
break;
}
case GGML_OP_FLASH_ATTN_EXT: {
@@ -883,9 +874,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
return true;
}
if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
return true;
}
if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
// MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
// triggers a bug in ov matmul_shape_inference.hpp
@@ -909,7 +897,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// op->src[0]->ne[0]);
return true;
}
if (op->type != GGML_TYPE_F32) {
if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true;
}
@@ -930,6 +918,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
}
break;
}
case GGML_OP_TRANSPOSE: {
// BF16 is not supported for TRANSPOSE
if (op->type == GGML_TYPE_BF16) {
// GGML_LOG_WARN("OpenVINO backend does not support TRANSPOSE with BF16 type\n");
return true;
}
break;
}
default:
break;
}
@@ -951,14 +947,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};

static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
/*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
// softmax is not updated due to replaced by flash_attn_ext
// GGML_OP_SOFT_MAX,
GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_NORM,
GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_TANH,
};
static const std::set<ggml_glu_op> supported_glu_ops{
GGML_GLU_OP_SWIGLU,
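For context, these sets feed the backend's supports_op hook. A simplified sketch of the gating logic (the real ggml_backend_openvino_device_supports_op also checks tensor types against supported_types and handles more cases; this is only an illustration):

// Simplified sketch, not the actual function body.
static bool supports_op_sketch(const ggml_tensor * op) {
    if (op->op == GGML_OP_UNARY) {
        return supported_unary_ops.count(ggml_get_unary_op(op)) > 0;
    }
    if (op->op == GGML_OP_GLU) {
        return supported_glu_ops.count(ggml_get_glu_op(op)) > 0;
    }
    return supported_ops.count(op->op) > 0 && !is_op_unsupported_case(op);
}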
8 changes: 8 additions & 0 deletions ggml/src/ggml-openvino/openvino/decoder.h
@@ -35,10 +35,14 @@ class GgmlDecoder : public DecoderBase {

virtual element::Type get_output_type(const int node_idx) const = 0;

virtual std::vector<size_t> get_output_stride(int node_idx) const = 0;

virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;

virtual int32_t * get_output_op_params(int node_idx) const = 0;

virtual size_t get_output_op_offset(int node_idx) const = 0;

virtual std::vector<std::string> get_output_names(int node_idx) const = 0;

virtual const std::string& get_op_type() const = 0;
@@ -66,7 +70,11 @@

virtual bool is_stateful() const = 0;

virtual bool is_splited_model() const = 0;

virtual int is_swa_layer(int layer) const = 0;

virtual int32_t get_op_dynamic_dim(int node_idx) const = 0;
};

} // namespace ggml
10 changes: 10 additions & 0 deletions ggml/src/ggml-openvino/openvino/node_context.h
@@ -59,12 +59,22 @@ class NodeContext : public frontend::NodeContext {
return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
}

int32_t get_op_dynamic_dim() const {
return m_decoder->get_op_dynamic_dim(m_node_idx);
}

int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }

size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); }

ov::element::Type get_output_type() const {
return m_decoder->get_output_type(m_node_idx);
}

std::vector<size_t> get_output_stride() const {
return m_decoder->get_output_stride(m_node_idx);
}

Output<Node> get_input(int idx) const override {
return m_tensor_map->at(m_input_names[idx]);
}
22 changes: 6 additions & 16 deletions ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -18,27 +18,17 @@ namespace op {
OutputVector translate_cont(const NodeContext & context) {
num_inputs_check(context, 1, 1);

int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");

auto src_shape = context.get_input_shape(0).to_shape();
auto dst_shape = context.get_output_shape().to_shape();
ov::Output<Node> res;

if (op_case == 1) {
// The input comes from a PERMUTE
throw std::runtime_error("Code of this case might be outdated");
dst_shape[1] = -1;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
} else if (op_case == 2) {
// The input comes from a TRANSPOSE
return {context.get_input(0)};
} else {
// The input comes from a VIEW
res = process_view_input(context, 0);
if (context.get_op_dynamic_dim() != -1) {
dst_shape[3 - context.get_op_dynamic_dim()] = -1;
}

ov::Output<Node> res;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);

return rename_outputs_with_suffix({res}, context.get_name());
}

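The 3 - get_op_dynamic_dim() flip deserves a note: GGML orders dimensions ne[0..3] fastest-varying first, while the OV shape used here is in the reverse order, so GGML dim d maps to OV axis 3 - d. A standalone illustration with made-up values:

#include <cstdint>
#include <vector>

// Example: GGML ne = {128, N, 32, 1} with dim 1 (the token count N) dynamic
// corresponds to the OV-ordered shape {1, 32, N, 128}; axis 3 - 1 == 2 is set
// to -1 so the Reshape target keeps that dimension dynamic.
std::vector<int64_t> mark_dynamic(std::vector<int64_t> dst_shape, int32_t ggml_dyn_dim) {
    if (ggml_dyn_dim != -1) {
        dst_shape[3 - ggml_dyn_dim] = -1;  // OV axes are the GGML dims reversed
    }
    return dst_shape;
}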
18 changes: 7 additions & 11 deletions ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -34,23 +34,19 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});

ov::Output<ov::Node> mask_sliced, res;
ov::Output<ov::Node> res;

// For stateful
std::string mask_name = "KQ_mask_sliced";
if (context.get_input_names()[3].find("swa") != std::string::npos) {
mask_name = "KQ_mask_swa_sliced";
}
if (context.has_input(mask_name)) {
mask_sliced = context.get_input(mask_name);
} else {
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto token_len = get_dimensions(q, {2});
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
mask = context.get_input(mask_name);
}

if (mask_sliced.get_element_type() != ov::element::f16) {
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
if (mask.get_element_type() != ov::element::f16) {
mask = std::make_shared<ov::op::v0::Convert>(mask, ov::element::f16);
}

auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
Expand All @@ -77,7 +73,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);

auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask, scale_node, false);
res = std::make_shared<ov::op::v1::Transpose>(sdpa,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
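The tile_kv lambda above implements grouped-query attention: when the query has more heads than K/V, each KV head is repeated num_heads / num_heads_kv times via unsqueeze, broadcast, and reshape. A standalone sketch of the same trick; shapes and names are assumptions, not code from this PR:

// Assumes kv has shape [1, H_kv, S, D] with S possibly dynamic,
// and H is a multiple of H_kv.
static ov::Output<ov::Node> tile_kv_sketch(const ov::Output<ov::Node> & kv,
                                           int64_t H, int64_t H_kv, int64_t D) {
    // [1, H_kv, S, D] -> [1, H_kv, 1, S, D]
    auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
    auto unsq = std::make_shared<ov::op::v0::Unsqueeze>(kv, axis);
    // repeat the new axis by the replication factor H / H_kv
    auto bshape = ov::op::v0::Constant::create(ov::element::i64, {5},
                                               std::vector<int64_t>{1, 1, H / H_kv, 1, 1});
    auto bcast = std::make_shared<ov::op::v3::Broadcast>(unsq, bshape,
                                                         ov::op::BroadcastType::BIDIRECTIONAL);
    // [1, H_kv, H/H_kv, S, D] -> [1, H, S, D]; -1 lets Reshape infer dynamic S
    auto nshape = ov::op::v0::Constant::create(ov::element::i64, {4},
                                               std::vector<int64_t>{1, H, -1, D});
    return std::make_shared<ov::op::v1::Reshape>(bcast, nshape, false);
}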
12 changes: 7 additions & 5 deletions ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -34,10 +34,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
ov::Output<ov::Node> A = context.get_input(1);

bool transpose_b = true;
if (op_case == 2) {
B = B.get_node_shared_ptr()->input_value(0);
transpose_b = false;
} else if (op_case == 3) {
if (op_case == 3) {
B = process_view_input(context, 0);
A = process_view_input(context, 1);
}
@@ -55,6 +52,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto batch_small = A_batch_larger ? B_batch : A_batch;

Output<Node> Z = A_batch_larger ? B : A;
auto Z_shape = A_batch_larger ? B_shape : A_shape;
int64_t factor = batch_large / batch_small;
if (factor > 1 && batch_small > 1) {
auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
@@ -67,7 +65,11 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto broadcast_shape = ov::op::v0::Constant::create(
ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]});
if (op_case == 2) {
new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1});
}

auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
ov::op::BroadcastType::BIDIRECTIONAL);
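The broadcast path above aligns mismatched batch (head) counts between the two matmul operands. A worked trace with assumed shapes (not taken from the diff):

// A: [1, 32, M, K], B: [1, 8, N, K]  ->  batch_large = 32, batch_small = 8
// factor = 32 / 8 = 4
// Z (= B) unsqueezed:           [1, 8, 1, N, K]
// broadcast by {1, 1, 4, 1, 1}: [1, 8, 4, N, K]
// reshape {0, 32, -1, K}:       [1, 32, N, K]   (0 keeps the input's dim 0)
// op_case == 2 instead pins Z_shape[2] and lets -1 infer the last dim,
// matching the alternate layout handled in that case.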