63 changes: 39 additions & 24 deletions convert_hf_to_gguf.py
@@ -1445,7 +1445,7 @@ class MmprojModel(ModelBase):
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "width.clip-l-14-224.layers", "sam_vit_b.layers"]
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "layers"]

has_vision_encoder: bool = True # by default
has_audio_encoder: bool = False
@@ -1494,8 +1494,8 @@ def __init__(self, *args, **kwargs):
# FIXME: DeepseekOCRVisionModel specific hack
if self.block_count is None:
if isinstance(self, DeepseekOCRVisionModel):
clip_block_count = self.hparams['width']['clip-l-14-224']['layers']
sam_block_count = self.hparams['width']['sam_vit_b']['layers']
clip_block_count = self.hparams['layers']
sam_block_count = self.hparams['sam']['layers']
if clip_block_count is not None:
self.block_count = clip_block_count
if sam_block_count is not None:
@@ -5793,6 +5793,16 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

proc_fname = self.dir_model / "processor_config.json"

if proc_fname.is_file():
with open(proc_fname, "r") as f:
self.preprocessor_config = json.load(f)


def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
@@ -5811,10 +5821,25 @@ def set_gguf_parameters(self):
# in this case, we are converting a test model
self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor)

# SAM configuration
sam_hparams = hparams['sam']
self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])

def get_vision_config(self) -> dict[str, Any]:
orig_vision_config = self.global_config.get("vision_config")
vision_config: dict[str, Any] | None = self.global_config.get("vision_config")

if not vision_config:
raise ValueError("DeepseekOCR model requires 'vision_config' in the model configuration, but it was not found")

vision_config['sam'] = vision_config['width']['sam_vit_b']
vision_config.update(vision_config['width']['clip-l-14-224'])
vision_config['hidden_size'] = vision_config['width']
vision_config['num_heads'] = vision_config['heads']
vision_config['intermediate_size'] = vision_config['hidden_size'] * 4  # CLIP-L uses a 4x MLP ratio

return vision_config

super().get_vision_config()

def tensor_force_quant(self, name, new_name, bid, n_dims):
# related to https://github.com/ggml-org/llama.cpp/issues/13025
@@ -5825,27 +5850,17 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

if "vision_model.head." in name:
return [] # skip redundant tensors for tinygemma3

if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
# process vision tensors
name = name.replace("_weight", ".weight")

# correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector
# the other norm values are part of SigLIP model, and they are already correct
# ref code: Gemma3RMSNorm
if "soft_emb_norm.weight" in name:
logger.info(f"Correcting norm value for '{name}'")
data_torch = data_torch + 1

return [(self.map_tensor_name(name), data_torch)]
# Only process vision-related tensors, skip language model tensors
# Vision components: sam_model, vision_model, projector, image_newline, view_seperator
# Language model components to skip: lm_head, embed_tokens, layers, norm
if name.startswith(("lm_head.", "model.embed_tokens.", "model.layers.", "model.norm.")):
return []

return [] # skip other tensors
if ".attn.rel_pos_h" in name or ".attn.rel_pos_w" in name:
return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]

return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
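
For reference, a minimal sketch of what get_vision_config() above does to DeepSeek-OCR's nested vision_config. The key layout mirrors the code in the diff; the numeric values are illustrative stand-ins (standard CLIP-L/14 and SAM ViT-B sizes), not values read from a real checkpoint.

nested = {
    "width": {
        "clip-l-14-224": {"layers": 24, "width": 1024, "heads": 16},
        "sam_vit_b": {"layers": 12, "width": 768},
    },
}

vision_config = dict(nested)
vision_config["sam"] = nested["width"]["sam_vit_b"]      # SAM sub-config kept under its own key
vision_config.update(nested["width"]["clip-l-14-224"])   # hoist CLIP fields; this overwrites "width" with CLIP's scalar width
vision_config["hidden_size"] = vision_config["width"]    # 1024, the name MmprojModel expects
vision_config["num_heads"] = vision_config["heads"]      # 16
vision_config["intermediate_size"] = vision_config["hidden_size"] * 4  # 4096, CLIP-L's 4x MLP ratio

print(vision_config["hidden_size"], vision_config["num_heads"], vision_config["sam"]["layers"])
# 1024 16 12
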
50 changes: 36 additions & 14 deletions gguf-py/gguf/constants.py
@@ -287,6 +287,10 @@ class Attention:
class Projector:
SCALE_FACTOR = "clip.vision.projector.scale_factor"

class SAM:
BLOCK_COUNT = "clip.vision.sam.block_count"
EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"

class ClipAudio:
NUM_MEL_BINS = "clip.audio.num_mel_bins"
EMBEDDING_LENGTH = "clip.audio.embedding_length"
@@ -664,20 +668,21 @@ class MODEL_TENSOR(IntEnum):
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
# DeepSeek-OCR sam_model
V_SAM_POS_EMBD = auto()
V_SAM_PATCH_EMBD = auto()
V_SAM_PRE_NORM = auto()
V_SAM_POST_NORM = auto()
V_SAM_ATTN_POS_H = auto()
V_SAM_ATTN_POS_W = auto()
V_SAM_ATTN_QKV = auto()
V_SAM_ATTN_OUT = auto()
V_SAM_MLP_LIN_1 = auto()
V_SAM_MLP_LIN_2 = auto()
V_SAM_NECK = auto()
V_SAM_NET_2 = auto()
V_SAM_NET_3 = auto()
V_SAM_POS_EMBD = auto() # Deepseek-OCR
V_SAM_PATCH_EMBD = auto() # Deepseek-OCR
V_SAM_PRE_NORM = auto() # Deepseek-OCR
V_SAM_POST_NORM = auto() # Deepseek-OCR
V_SAM_ATTN_POS_H = auto() # Deepseek-OCR
V_SAM_ATTN_POS_W = auto() # Deepseek-OCR
V_SAM_ATTN_QKV = auto() # Deepseek-OCR
V_SAM_ATTN_OUT = auto() # Deepseek-OCR
V_SAM_MLP_LIN_1 = auto() # Deepseek-OCR
V_SAM_MLP_LIN_2 = auto() # Deepseek-OCR
V_SAM_NECK = auto() # Deepseek-OCR
V_SAM_NET_2 = auto() # Deepseek-OCR
V_SAM_NET_3 = auto() # Deepseek-OCR
V_ENC_EMBD_IMGNL = auto() # Deepseek-OCR
V_ENC_EMBD_VSEP = auto() # Deepseek-OCR

# audio (mtmd)
A_ENC_EMBD_POS = auto()
@@ -1059,6 +1064,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_SAM_NECK: "v.sam.neck.{bid}",
MODEL_TENSOR.V_SAM_NET_2: "v.sam.net_2",
MODEL_TENSOR.V_SAM_NET_3: "v.sam.net_3",
MODEL_TENSOR.V_ENC_EMBD_IMGNL: "v.image_newline_embd", # Deepseek-OCR
MODEL_TENSOR.V_ENC_EMBD_VSEP: "v.view_separator_embd", # Deepseek-OCR
# audio (mtmd)
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
@@ -1095,6 +1102,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_ENC_EMBD_CLS,
MODEL_TENSOR.V_ENC_EMBD_PATCH,
MODEL_TENSOR.V_ENC_EMBD_POS,
MODEL_TENSOR.V_ENC_EMBD_IMGNL,
MODEL_TENSOR.V_ENC_EMBD_VSEP,
MODEL_TENSOR.V_ENC_INPUT_NORM,
MODEL_TENSOR.V_ENC_ATTN_QKV,
MODEL_TENSOR.V_ENC_ATTN_Q,
@@ -1137,6 +1146,19 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
MODEL_TENSOR.V_SAM_POS_EMBD,
MODEL_TENSOR.V_SAM_PATCH_EMBD,
MODEL_TENSOR.V_SAM_PRE_NORM,
MODEL_TENSOR.V_SAM_POST_NORM,
MODEL_TENSOR.V_SAM_ATTN_POS_H,
MODEL_TENSOR.V_SAM_ATTN_POS_W,
MODEL_TENSOR.V_SAM_ATTN_QKV,
MODEL_TENSOR.V_SAM_ATTN_OUT,
MODEL_TENSOR.V_SAM_MLP_LIN_1,
MODEL_TENSOR.V_SAM_MLP_LIN_2,
MODEL_TENSOR.V_SAM_NECK,
MODEL_TENSOR.V_SAM_NET_2,
MODEL_TENSOR.V_SAM_NET_3,
# audio
MODEL_TENSOR.A_ENC_EMBD_POS,
MODEL_TENSOR.A_ENC_CONV1D,
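
A quick self-check one might run after extending constants.py, assuming this branch of gguf-py is on the path and that the tensor list shown above is the MODEL_ARCH.MMPROJ registry: every new DeepSeek-OCR enum member should have a name template and be registered for the mmproj architecture.

from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES

new_tensors = [t for t in MODEL_TENSOR if t.name.startswith("V_SAM_")] + [
    MODEL_TENSOR.V_ENC_EMBD_IMGNL,
    MODEL_TENSOR.V_ENC_EMBD_VSEP,
]
for t in new_tensors:
    assert t in TENSOR_NAMES, f"no name template for {t.name}"
    assert t in MODEL_TENSORS[MODEL_ARCH.MMPROJ], f"{t.name} not registered for MMPROJ"
print(f"{len(new_tensors)} DeepSeek-OCR tensor enums wired up")
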
6 changes: 6 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -1077,6 +1077,12 @@ def add_vision_n_wa_pattern(self, value: int) -> None:
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)


def add_vision_sam_layers_count(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SAM.BLOCK_COUNT, value)

def add_vision_sam_embedding_length(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)
# audio models

def add_audio_projection_dim(self, value: int) -> None:
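
The two new writer helpers in use — a minimal sketch assuming this branch of gguf-py is installed; the "clip" architecture string and the layer/width values are illustrative stand-ins for what the converter passes from the SAM config.

from gguf import GGUFWriter

writer = GGUFWriter("sam-meta-only.gguf", "clip")
writer.add_vision_sam_layers_count(12)        # writes clip.vision.sam.block_count
writer.add_vision_sam_embedding_length(768)   # writes clip.vision.sam.embedding_length

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
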
39 changes: 30 additions & 9 deletions gguf-py/gguf/tensor_mapping.py
@@ -1179,6 +1179,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ_FC: (
"model.connector.modality_projection.proj", # SmolVLM
"model.vision.linear_proj.linear_proj", # cogvlm
"model.projector.layers", # Deepseek-OCR
),

MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1197,6 +1198,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.cls_token", # Intern-S1
"vision_model.class_embedding", # llama 4
"model.vision.patch_embedding.cls_embedding", # cogvlm
"model.vision_model.embeddings.class_embedding", # Deepseek-OCR
),

MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1210,6 +1212,7 @@ class TensorNameMap:
"visual.patch_embed.proj", # qwen2vl
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"model.vision_model.embeddings.patch_embedding", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1222,10 +1225,19 @@ class TensorNameMap:
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
),

MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
"model.image_newline", # Deepseek-OCR
),

MODEL_TENSOR.V_ENC_EMBD_VSEP: (
"model.view_seperator", # Deepseek-OCR
),

MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1238,6 +1250,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated
"model.vision_model.transformer.layers.{bid}.self_attn.q_proj", # Deepseek-OCR CLIP, generated
),

MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
@@ -1255,6 +1268,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated
"model.vision_model.transformer.layers.{bid}.self_attn.k_proj", # Deepseek-OCR CLIP, generated
),

MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
@@ -1272,6 +1286,7 @@ class TensorNameMap:
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
"vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated
"model.vision_model.transformer.layers.{bid}.self_attn.v_proj", # Deepseek-OCR CLIP, generated
),

MODEL_TENSOR.V_ENC_INPUT_NORM: (
@@ -1286,6 +1301,7 @@ class TensorNameMap:
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1301,6 +1317,7 @@ class TensorNameMap:
"visual.blocks.{bid}.attn.proj", # qwen2vl
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"model.vision_model.transformer.layers.{bid}.self_attn.out_proj", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1315,6 +1332,7 @@ class TensorNameMap:
"visual.blocks.{bid}.norm2", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"model.vision_model.transformer.layers.{bid}.layer_norm2", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1329,6 +1347,7 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision_model.transformer.layers.{bid}.mlp.fc1", # Deepseek-OCR CLIP
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
),

@@ -1351,6 +1370,7 @@ class TensorNameMap:
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"model.vision_model.transformer.layers.{bid}.mlp.fc2", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1368,6 +1388,7 @@ class TensorNameMap:
"vision_tower.ln_pre", # pixtral-hf
"vision_encoder.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
),

MODEL_TENSOR.V_POST_NORM: (
@@ -1460,11 +1481,11 @@ class TensorNameMap:
),

MODEL_TENSOR.V_SAM_POS_EMBD: (
"model.sam_model.pos_embed"
"model.sam_model.pos_embed",
),

MODEL_TENSOR.V_SAM_PATCH_EMBD: (
"model.sam_model.patch_embed.proj"
"model.sam_model.patch_embed.proj",
),

MODEL_TENSOR.V_SAM_PRE_NORM: (
@@ -1476,19 +1497,19 @@ class TensorNameMap:
),

MODEL_TENSOR.V_SAM_ATTN_POS_H: (
"model.sam_model.blocks.{bid}.attn.rel_pos_h"
"model.sam_model.blocks.{bid}.attn.rel_pos_h",
),

MODEL_TENSOR.V_SAM_ATTN_POS_W: (
"model.sam_model.blocks.{bid}.attn.rel_pos_w"
"model.sam_model.blocks.{bid}.attn.rel_pos_w",
),

MODEL_TENSOR.V_SAM_ATTN_QKV: (
"model.sam_model.blocks.{bid}.attn.qkv"
"model.sam_model.blocks.{bid}.attn.qkv",
),

MODEL_TENSOR.V_SAM_ATTN_OUT: (
"model.sam_model.blocks.{bid}.attn.proj"
"model.sam_model.blocks.{bid}.attn.proj",
),

MODEL_TENSOR.V_SAM_MLP_LIN_1: (
@@ -1500,15 +1521,15 @@ class TensorNameMap:
),

MODEL_TENSOR.V_SAM_NECK: (
"model.sam_model.neck.{bid}"
"model.sam_model.neck.{bid}",
),

MODEL_TENSOR.V_SAM_NET_2: (
"model.sam_model.net_2"
"model.sam_model.net_2",
),

MODEL_TENSOR.V_SAM_NET_3: (
"model.sam_model.net_3"
"model.sam_model.net_3",
),

MODEL_TENSOR.V_MM_POST_FC_NORM: (
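
To see the new mappings resolve end to end, a small sketch using the existing gguf-py entry points (get_tensor_name_map / TensorNameMap.get_name), assuming this branch is installed and that MODEL_ARCH.MMPROJ is the registry these tensors were added to; the block count is just an illustrative CLIP-L depth.

from gguf import MODEL_ARCH, get_tensor_name_map

tmap = get_tensor_name_map(MODEL_ARCH.MMPROJ, 24)

# SAM neck conv: suffix-aware lookup, as map_tensor_name() does in the converter
print(tmap.get_name("model.sam_model.neck.0.weight", try_suffixes=(".weight", ".bias")))
# -> v.sam.neck.0.weight

# image_newline / view_seperator are bare parameters; the direct lookup hits without a suffix
print(tmap.get_name("model.image_newline"))
# -> v.image_newline_embd
print(tmap.get_name("model.view_seperator"))
# -> v.view_separator_embd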