21 commits
4b6378d
Add interface is_model_splitted() to check whether the cgraph is split or not
zhaixuejun1993 Mar 6, 2026
1b39f34
Infer and propagate dynamic-dimension indices for all tensors in the …
zhaixuejun1993 Mar 17, 2026
3aad3f1
Only do this for the fallback subgraph
zhaixuejun1993 Mar 19, 2026
3a10860
Move dynamic dims compute in graph mismatch
zhaixuejun1993 Mar 23, 2026
d6b80bc
ggml-openvino: fix tensor data handling for PERMUTE/VIEW ops in split…
zhaixuejun1993 Mar 19, 2026
64ee289
ggml-openvino: add comments
zhaixuejun1993 Mar 19, 2026
7ab30fc
ggml-openvino: override VIEW op_case to 0 for split model inputs
zhaixuejun1993 Mar 19, 2026
7887089
openvino backend: Handle unsupported VIEW shape-mismatch in OpenVINO …
zhaixuejun1993 Mar 19, 2026
cf6f541
Enable additional mul_mat tests and add tensor data saving function (…
zhaixuejun1993 Mar 23, 2026
3bca21b
ggml-openvino: fix CONT/TRANSPOSE mapping and improve dynamic-dimensi…
zhaixuejun1993 Mar 26, 2026
dafc05a
OpenVINO: add NORM/TANH support and rework SOFT_MAX translation
zhaixuejun1993 Mar 28, 2026
c947188
ggml-openvino: extend VIEW handling
zhaixuejun1993 Mar 30, 2026
e712a8e
Enable -fa off (#118)
wine99 Apr 2, 2026
02acad1
Enable --context-shift
wine99 Apr 10, 2026
3f0d4c7
Fix llm param compute error for normal softmax, not the softmax in att…
zhaixuejun1993 Apr 13, 2026
c6e06ee
OpenVINO backend: fix attention size compute error in llm param
zhaixuejun1993 Apr 13, 2026
cb9ca0a
use tensor->extra in infer_request i/o
wine99 Apr 27, 2026
900d7c9
OpenVINO backend: refactor the compute_llm_params() func, add get_atte…
zhaixuejun1993 Apr 29, 2026
c72768b
OpenVINO backend: clean unused code
zhaixuejun1993 Apr 29, 2026
e119aea
added translate_1to1_match_1_input function and updated gelu and tanh…
May 5, 2026
c1a5208
Remove unused translation function calls
May 5, 2026
495 changes: 446 additions & 49 deletions ggml/src/ggml-openvino/ggml-decoder.cpp

Large diffs are not rendered by default.

35 changes: 23 additions & 12 deletions ggml/src/ggml-openvino/ggml-decoder.h
@@ -1,6 +1,7 @@
#pragma once

#include "ggml-quants.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml.h"
#include "openvino/decoder.h"

@@ -9,16 +10,13 @@
#include <map>
#include <memory>
#include <openvino/core/partial_shape.hpp>
#include <optional>
#include <vector>

struct ModelParams {
int ctx = -1;
int ctx_swa = -1;
int ctx_per_seq = -1;
int ctx_per_seq_swa = -1;
int n_seq = 1;
int n_heads = -1;
int n_heads_kv = -1;
int head_size = -1;
int32_t rope_params[15];
@@ -69,6 +67,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool model_is_splitted = false,
bool is_prefill = false,
int prefill_chunk_size = 256);

@@ -106,10 +105,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual ov::element::Type get_output_type(int node_idx) const override;

virtual std::vector<size_t> get_output_stride(int node_idx) const override;

virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override;

virtual int32_t * get_output_op_params(int node_idx) const override;

virtual size_t get_output_op_offset(int node_idx) const override;

virtual std::vector<std::string> get_output_names(int node_idx) const override;

virtual const std::string & get_op_type() const override;
Expand All @@ -120,6 +123,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual const std::string & get_op_name(int node_idx) const override;

virtual int32_t get_op_dynamic_dim(int node_idx) const override;

virtual void visit_subgraph(std::function<void(std::shared_ptr<GgmlDecoder>, int node_idx)> node_visitor) const override;

ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); }
@@ -150,8 +155,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual int get_ctx_size() const { return m_model_params.ctx; }

virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; }

virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; }

virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; }
@@ -175,7 +178,11 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

virtual bool is_stateful() const override { return m_is_stateful; }

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const;
virtual bool is_splited_model() const override {
return m_model_is_splitted;
}

ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input, int dynamic_dim_index=-1) const;

static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename);

@@ -205,6 +212,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
bool m_is_prefill = false;
bool m_naive = false;
int m_prefill_chunk_size = 0;
bool m_model_is_splitted = false; // whether the cgraph has been split

static ov::Shape get_shape(const ggml_tensor * tensor);
static std::vector<size_t> get_stride(const ggml_tensor * tensor);
@@ -227,15 +235,17 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
}

inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]);
return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) ||
(op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]);
}

inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_ROPE && tensor == op->src[2];
}

inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) {
return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor;
return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY ||
(op->op == GGML_OP_SET_ROWS && op->src[2] == tensor);
}

inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) {
@@ -256,9 +266,6 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
if (is_inp_emb(tensor, op)) {
return "embd";
}
if (is_output_idx(tensor, op)) {
return "inp_out_ids";
}
if (is_inp_mask(tensor, op)) {
return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa";
}
@@ -272,6 +279,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
void compute_model_inputs();
void compute_model_outputs();

// Infer and propagate dynamic-dimension indices for all tensors in the GGML graph.
void compute_node_dynamic_dims();

void validate_cgraph() const;

ggml_cgraph * m_cgraph = nullptr;
@@ -284,6 +294,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
std::map<std::string, ggml_tensor *> m_model_outputs;
std::vector<std::string> m_model_output_names;
std::vector<NodeInfo> m_node_info_list;
std::map<ggml_tensor *, int> m_node_dynamic_dims;

ModelParams m_model_params;
ComputeParams m_compute_params;
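The new m_node_dynamic_dims map is populated by compute_node_dynamic_dims(), whose body lives in ggml-decoder.cpp (that diff is not rendered above). A minimal sketch of the propagation idea, assuming each node inherits the dynamic-dim index of its first source tensor that has one; the real code must also remap the index across shape-changing ops such as PERMUTE and RESHAPE:

// Sketch only; the propagation rule and loop structure are assumptions,
// not the code from this PR.
void GgmlOvDecoder::compute_node_dynamic_dims() {
    for (int i = 0; i < m_cgraph->n_nodes; ++i) {
        ggml_tensor * node = m_cgraph->nodes[i];
        int dyn = -1;  // -1 means no dynamic dimension
        for (ggml_tensor * src : node->src) {
            if (!src) {
                continue;
            }
            auto it = m_node_dynamic_dims.find(src);
            if (it != m_node_dynamic_dims.end() && it->second != -1) {
                dyn = it->second;  // inherit the dynamic-dim index from an input
                break;
            }
        }
        m_node_dynamic_dims[node] = dyn;
    }
}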
30 changes: 13 additions & 17 deletions ggml/src/ggml-openvino/ggml-openvino.cpp
@@ -823,15 +823,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n");
return true;
}
float scale = 1.0f;
float max_bias = 0.0f;
const auto * op_params = op->op_params;
memcpy(&scale, (const float *) op_params + 0, sizeof(float));
memcpy(&max_bias, (const float *) op_params + 1, sizeof(float));
if (max_bias > 0) {
// GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n");
return true;
}
break;
}
case GGML_OP_FLASH_ATTN_EXT: {
@@ -883,9 +874,6 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) {
return true;
}
if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) {
return true;
}
if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) {
// MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1)
// triggers a bug in ov matmul_shape_inference.hpp
@@ -909,7 +897,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
// op->src[0]->ne[0]);
return true;
}
if (op->type != GGML_TYPE_F32) {
if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) {
// GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type));
return true;
}
@@ -930,6 +918,14 @@ static bool is_op_unsupported_case(const ggml_tensor * op) {
}
break;
}
case GGML_OP_TRANSPOSE: {
// BF16 is not supported for TRANSPOSE
if (op->type == GGML_TYPE_BF16) {
// GGML_LOG_WARN("OpenVINO backend does not support TRANSPOSE with BF16 type\n");
return true;
}
break;
}
default:
break;
}
@@ -951,14 +947,14 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con
GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K};

static const std::set<ggml_op> supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW,
/*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE,
// softmax is not updated due to replaced by flash_attn_ext
// GGML_OP_SOFT_MAX,
GGML_OP_CONT, GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, GGML_OP_NORM,
GGML_OP_SOFT_MAX,
GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY};
static const std::set<ggml_unary_op> supported_unary_ops{
GGML_UNARY_OP_GELU,
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_TANH,
};
static const std::set<ggml_glu_op> supported_glu_ops{
GGML_GLU_OP_SWIGLU,
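For context, these sets feed the backend's supports_op hook. A simplified sketch of the gating logic (the real ggml_backend_openvino_device_supports_op also checks tensor types against supported_types and handles more cases; this is only an illustration):

// Simplified sketch, not the actual function body.
static bool supports_op_sketch(const ggml_tensor * op) {
    if (op->op == GGML_OP_UNARY) {
        return supported_unary_ops.count(ggml_get_unary_op(op)) > 0;
    }
    if (op->op == GGML_OP_GLU) {
        return supported_glu_ops.count(ggml_get_glu_op(op)) > 0;
    }
    return supported_ops.count(op->op) > 0 && !is_op_unsupported_case(op);
}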
8 changes: 8 additions & 0 deletions ggml/src/ggml-openvino/openvino/decoder.h
@@ -35,10 +35,14 @@ class GgmlDecoder : public DecoderBase {

virtual element::Type get_output_type(const int node_idx) const = 0;

virtual std::vector<size_t> get_output_stride(int node_idx) const = 0;

virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0;

virtual int32_t * get_output_op_params(int node_idx) const = 0;

virtual size_t get_output_op_offset(int node_idx) const = 0;

virtual std::vector<std::string> get_output_names(int node_idx) const = 0;

virtual const std::string& get_op_type() const = 0;
@@ -66,7 +70,11 @@

virtual bool is_stateful() const = 0;

virtual bool is_splited_model() const = 0;

virtual int is_swa_layer(int layer) const = 0;

virtual int32_t get_op_dynamic_dim(int node_idx) const = 0;
};

} // namespace ggml
10 changes: 10 additions & 0 deletions ggml/src/ggml-openvino/openvino/node_context.h
@@ -59,12 +59,22 @@ class NodeContext : public frontend::NodeContext {
return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]);
}

int32_t get_op_dynamic_dim() const {
return m_decoder->get_op_dynamic_dim(m_node_idx);
}

int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); }

size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); }

ov::element::Type get_output_type() const {
return m_decoder->get_output_type(m_node_idx);
}

std::vector<size_t> get_output_stride() const {
return m_decoder->get_output_stride(m_node_idx);
}

Output<Node> get_input(int idx) const override {
return m_tensor_map->at(m_input_names[idx]);
}
22 changes: 6 additions & 16 deletions ggml/src/ggml-openvino/openvino/op/cont.cpp
@@ -18,27 +18,17 @@ namespace op {
OutputVector translate_cont(const NodeContext & context) {
num_inputs_check(context, 1, 1);

int op_case = context.get_op_case();
FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case");

auto src_shape = context.get_input_shape(0).to_shape();
auto dst_shape = context.get_output_shape().to_shape();
ov::Output<Node> res;

if (op_case == 1) {
// The input comes from a PERMUTE
throw std::runtime_error("Code of this case might be outdated");
dst_shape[1] = -1;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);
} else if (op_case == 2) {
// The input comes from a TRANSPOSE
return {context.get_input(0)};
} else {
// The input comes from a VIEW
res = process_view_input(context, 0);
if (context.get_op_dynamic_dim() != -1) {
dst_shape[3 - context.get_op_dynamic_dim()] = -1;
}

ov::Output<Node> res;
res = std::make_shared<ov::op::v1::Reshape>(
context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false);

return rename_outputs_with_suffix({res}, context.get_name());
}

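The 3 - get_op_dynamic_dim() flip deserves a note: GGML orders dimensions ne[0..3] fastest-varying first, while the OV shape used here is in the reverse order, so GGML dim d maps to OV axis 3 - d. A standalone illustration with made-up values:

#include <cstdint>
#include <vector>

// Example: GGML ne = {128, N, 32, 1} with dim 1 (the token count N) dynamic
// corresponds to the OV-ordered shape {1, 32, N, 128}; axis 3 - 1 == 2 is set
// to -1 so the Reshape target keeps that dimension dynamic.
std::vector<int64_t> mark_dynamic(std::vector<int64_t> dst_shape, int32_t ggml_dyn_dim) {
    if (ggml_dyn_dim != -1) {
        dst_shape[3 - ggml_dyn_dim] = -1;  // OV axes are the GGML dims reversed
    }
    return dst_shape;
}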
18 changes: 7 additions & 11 deletions ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp
@@ -34,23 +34,19 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
auto q = std::make_shared<ov::op::v0::Convert>(q_f32, ov::element::f16);
auto scale_node = std::make_shared<ov::op::v0::Constant>(ov::element::f16, ov::Shape{}, std::vector<float>{scale});

ov::Output<ov::Node> mask_sliced, res;
ov::Output<ov::Node> res;

// For stateful
std::string mask_name = "KQ_mask_sliced";
if (context.get_input_names()[3].find("swa") != std::string::npos) {
mask_name = "KQ_mask_swa_sliced";
}
if (context.has_input(mask_name)) {
mask_sliced = context.get_input(mask_name);
} else {
auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0});
auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1});
auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
auto token_len = get_dimensions(q, {2});
mask_sliced = std::make_shared<ov::op::v8::Slice>(mask, zero, token_len, one, two);
mask = context.get_input(mask_name);
}

if (mask_sliced.get_element_type() != ov::element::f16) {
mask_sliced = std::make_shared<ov::op::v0::Convert>(mask_sliced, ov::element::f16);
if (mask.get_element_type() != ov::element::f16) {
mask = std::make_shared<ov::op::v0::Convert>(mask, ov::element::f16);
}

auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output<Node> kv) {
Expand All @@ -77,7 +73,7 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) {
k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k);
v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v);

auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask_sliced, scale_node, false);
auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(q, k, v, mask, scale_node, false);
res = std::make_shared<ov::op::v1::Transpose>(sdpa,
ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}));
res = std::make_shared<ov::op::v0::Convert>(res, ov::element::f32);
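The tile_kv lambda above implements grouped-query attention: when the query has more heads than K/V, each KV head is repeated num_heads / num_heads_kv times via unsqueeze, broadcast, and reshape. A standalone sketch of the same trick; shapes and names are assumptions, not code from this PR:

// Assumes kv has shape [1, H_kv, S, D] with S possibly dynamic,
// and H is a multiple of H_kv.
static ov::Output<ov::Node> tile_kv_sketch(const ov::Output<ov::Node> & kv,
                                           int64_t H, int64_t H_kv, int64_t D) {
    // [1, H_kv, S, D] -> [1, H_kv, 1, S, D]
    auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {2});
    auto unsq = std::make_shared<ov::op::v0::Unsqueeze>(kv, axis);
    // repeat the new axis by the replication factor H / H_kv
    auto bshape = ov::op::v0::Constant::create(ov::element::i64, {5},
                                               std::vector<int64_t>{1, 1, H / H_kv, 1, 1});
    auto bcast = std::make_shared<ov::op::v3::Broadcast>(unsq, bshape,
                                                         ov::op::BroadcastType::BIDIRECTIONAL);
    // [1, H_kv, H/H_kv, S, D] -> [1, H, S, D]; -1 lets Reshape infer dynamic S
    auto nshape = ov::op::v0::Constant::create(ov::element::i64, {4},
                                               std::vector<int64_t>{1, H, -1, D});
    return std::make_shared<ov::op::v1::Reshape>(bcast, nshape, false);
}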
12 changes: 7 additions & 5 deletions ggml/src/ggml-openvino/openvino/op/mulmat.cpp
@@ -34,10 +34,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
ov::Output<ov::Node> A = context.get_input(1);

bool transpose_b = true;
if (op_case == 2) {
B = B.get_node_shared_ptr()->input_value(0);
transpose_b = false;
} else if (op_case == 3) {
if (op_case == 3) {
B = process_view_input(context, 0);
A = process_view_input(context, 1);
}
@@ -55,6 +52,7 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto batch_small = A_batch_larger ? B_batch : A_batch;

Output<Node> Z = A_batch_larger ? B : A;
auto Z_shape = A_batch_larger ? B_shape : A_shape;
int64_t factor = batch_large / batch_small;
if (factor > 1 && batch_small > 1) {
auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector<int64_t>{batch_large});
@@ -67,7 +65,11 @@ OutputVector translate_mulmat(const NodeContext & context) {
auto broadcast_shape = ov::op::v0::Constant::create(
ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1});
auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]});
{(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]});
if (op_case == 2) {
new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4},
{(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1});
}

auto Z_broadcasted = std::make_shared<ov::op::v3::Broadcast>(Z_unsqueezed, broadcast_shape,
ov::op::BroadcastType::BIDIRECTIONAL);
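The broadcast path above aligns mismatched batch (head) counts between the two matmul operands. A worked trace with assumed shapes (not taken from the diff):

// A: [1, 32, M, K], B: [1, 8, N, K]  ->  batch_large = 32, batch_small = 8
// factor = 32 / 8 = 4
// Z (= B) unsqueezed:           [1, 8, 1, N, K]
// broadcast by {1, 1, 4, 1, 1}: [1, 8, 4, N, K]
// reshape {0, 32, -1, K}:       [1, 32, N, K]   (0 keeps the input's dim 0)
// op_case == 2 instead pins Z_shape[2] and lets -1 infer the last dim,
// matching the alternate layout handled in that case.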