15 changes: 15 additions & 0 deletions common/arg.cpp
@@ -1824,6 +1824,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
add_opt(common_arg(
{"--dsocr-mode"}, "MODE",
"DeepSeek-OCR resolution mode, one of:\n"
"- auto (default): automatically select resolution\n"
"- tiny, small, base, large: native resolution\n"
"- gundam, gundam-master: dynamic resolution",
[](common_params & params, const std::string & value) {
if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
value == "large" || value == "gundam" || value == "gundam-master") {
params.dsocr_mode = value;
} else {
throw std::invalid_argument("invalid value");
}
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
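The same set of accepted values can also be kept in a single table instead of a chained comparison. A minimal sketch (not part of the PR; `set_dsocr_mode` is a hypothetical helper) that behaves like the handler above:

```cpp
// Sketch only: table-driven validation equivalent to the --dsocr-mode handler above.
#include "common.h"   // common_params; this PR adds the dsocr_mode field

#include <algorithm>
#include <array>
#include <stdexcept>
#include <string>

static void set_dsocr_mode(common_params & params, const std::string & value) {
    static const std::array<const char *, 7> k_modes = {
        "auto", "tiny", "small", "base", "large", "gundam", "gundam-master"
    };
    if (std::find(k_modes.begin(), k_modes.end(), value) == k_modes.end()) {
        throw std::invalid_argument("invalid --dsocr-mode value: " + value);
    }
    params.dsocr_mode = value;
}
```

As registered above, the mode can be selected either with `--dsocr-mode <MODE>` or through the `LLAMA_ARG_DSOCR_MODE` environment variable.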
1 change: 1 addition & 0 deletions common/common.h
@@ -433,6 +433,7 @@ struct common_params {
std::vector<std::string> image; // path to image file(s)
int image_min_tokens = -1;
int image_max_tokens = -1;
std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master

// finetune
struct lr_opt lr;
12 changes: 7 additions & 5 deletions convert_hf_to_gguf.py
@@ -6013,12 +6013,14 @@ def get_vision_config(self) -> dict[str, Any]:


def tensor_force_quant(self, name, new_name, bid, n_dims):
# TODO: increase numerical stability; maybe delete later.
return gguf.GGMLQuantizationType.F32
# related to https://github.com/ggml-org/llama.cpp/issues/13025
if "input_projection" in name:
return gguf.GGMLQuantizationType.F16
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
# if "input_projection" in name:
# return gguf.GGMLQuantizationType.F16
# if ".embeddings." in name:
# return gguf.GGMLQuantizationType.F32
# return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Only process vision-related tensors, skip language model tensors
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/upscale.cu
@@ -214,5 +214,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
} else {
GGML_ABORT("fatal error");
}
}
1 change: 1 addition & 0 deletions ggml/src/ggml.c
@@ -5204,6 +5204,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
GGML_ASSERT(q->ne[3] == v->ne[3]);

if (mask) {
GGML_ASSERT(mask->type == GGML_TYPE_F16);
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
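With this assert in place, a caller has to allocate the KQ mask as F16, contiguous, and padded in its second dimension. A minimal sketch of an allocation that satisfies the checks (`n_kv` and `n_tokens` are illustrative; `GGML_PAD` and `GGML_KQ_MASK_PAD` come from ggml.h):

```cpp
// Sketch only: a mask allocation compatible with the asserts in ggml_flash_attn_ext.
// n_kv / n_tokens are placeholder sizes for this example.
ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
        n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
// a freshly allocated tensor is contiguous, so ggml_is_contiguous(kq_mask) holds
```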
182 changes: 181 additions & 1 deletion tools/mtmd/clip-impl.h
@@ -5,6 +5,7 @@
#include <climits>
#include <cstdarg>
#include <cinttypes>
#include <cstring>
#include <string>
#include <map>
#include <sstream>
@@ -442,6 +443,33 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
// debugging
//


static std::string to_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}

static void print_tensor_info(ggml_tensor * t) {
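// note: intended for operation nodes; assumes t->src[0] is non-NULL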
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];

char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, to_ne_string(src1).c_str());
}

printf("%s: %s = %s(%s{%s}, %s)\n",
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, to_ne_string(src0).c_str(),
src1 ? src1_str : "");
}

static void print_tensor_shape(ggml_tensor * t) {
printf("%s.shape = [", t->name);
for (int i = 0; i < ggml_n_dims(t); ++i) {
@@ -453,12 +481,50 @@ static void print_tensor_shape(ggml_tensor * t) {
printf("]\n");
}

static void print_tensor_sum(ggml_tensor * t, uint8_t * data, int64_t n) {
(void) n; // unused parameter
ggml_type type = t->type;
int64_t * ne = t->ne;
size_t * nb = t->nb;
double sum = 0.0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
sum += v;
}
}
}
}
printf("%s.sum = %.6f\n", t->name, sum);
}

static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
ggml_type type = t->type;
int64_t * ne = t->ne;
size_t * nb = t->nb;
printf("%s.data: [\n", t->name);
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf("%s.data: [\n", t->name);
if (i3 == n && ne[3] > 2*n) {
printf(" ..., \n");
i3 = ne[3] - n;
}
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
@@ -500,6 +566,120 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
}
printf(" ]\n");
}
printf(" ]\n");
}

static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t * data_ptr) {
char filename[512];
snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name);

FILE * f = fopen(filename, "w");
if (!f) {
fprintf(stderr, "Failed to open %s\n", filename);
return;
}

// Check tensor size and warn if too large
int64_t total_elements = ggml_nelements(tensor);
fprintf(stderr, "Saving tensor %s (%lld elements) to %s\n",
tensor->name, (long long)total_elements, filename);

if (total_elements > 10000000) { // 10M elements
fprintf(stderr, "Warning: tensor is very large (%lld elements), this may take time\n",
(long long)total_elements);
}

const uint8_t * data = (data_ptr) ? data_ptr : (uint8_t *) tensor->data;
ggml_type type = tensor->type;
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;

// Use a buffer to reduce I/O calls
const size_t BUF_SIZE = 8192;
char * buf = (char *) malloc(BUF_SIZE);
if (!buf) {
fprintf(stderr, "Failed to allocate buffer\n");
fclose(f);
return;
}
size_t buf_pos = 0;

// Helper lambda to flush buffer
auto flush_buf = [&]() {
if (buf_pos > 0) {
fwrite(buf, 1, buf_pos, f);
buf_pos = 0;
}
};

// Helper to append to buffer
auto append = [&](const char * str, size_t len) {
if (buf_pos + len >= BUF_SIZE) {
flush_buf();
}
if (len >= BUF_SIZE) {
// String too large for buffer, write directly
fwrite(str, 1, len, f);
} else {
memcpy(buf + buf_pos, str, len);
buf_pos += len;
}
};

auto append_str = [&](const char * str) {
append(str, strlen(str));
};

char num_buf[32];

// Write header once for all batches
append_str(tensor->name);
append_str(".data: [\n");

for (int64_t i3 = 0; i3 < ne[3]; i3++) {
append_str(" [\n"); // Start of batch
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
append_str(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
append_str(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
int len = snprintf(num_buf, sizeof(num_buf), "%8.4f", v);
append(num_buf, len);
if (i0 < ne[0] - 1) append_str(", ");
}
append_str("],\n");
}
append_str(" ],\n");
}
append_str(" ]"); // End of batch
if (i3 < ne[3] - 1) {
append_str(",\n"); // Comma between batches
} else {
append_str("\n");
}
}

append_str("]\n"); // Close the top-level array

flush_buf();
free(buf);
fclose(f);
fprintf(stderr, "Tensor saved successfully\n");
}

//
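These dump helpers only see meaningful data once a node has been computed on the backend. One way to drive them during debugging is a scheduler eval callback that copies the node back to host memory and hands it to `print_tensor_sum` or `save_tensor_to_file`. A rough sketch, assuming the standard `ggml_backend_sched` eval-callback API; the `"resampler_out"` name filter is purely illustrative:

```cpp
// Sketch only: dumping a specific tensor from the vision graph while debugging.
// The "resampler_out" name filter is illustrative, not a real tensor name.
#include <cstring>
#include <vector>

static bool debug_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // first phase: return true for nodes we want to inspect after computation
        return strcmp(t->name, "resampler_out") == 0;
    }
    // second phase: the node has been computed; copy it to host memory before reading
    std::vector<uint8_t> buf(ggml_nbytes(t));
    ggml_backend_tensor_get(t, buf.data(), 0, buf.size());
    print_tensor_shape(t);
    print_tensor_sum(t, buf.data(), 3);
    save_tensor_to_file(t, buf.data());
    return true; // keep evaluating the rest of the graph
}

// registration, e.g. right after the scheduler is created:
//   ggml_backend_sched_set_eval_callback(sched, debug_eval_cb, nullptr);
```

llama.cpp's existing eval-callback example uses the same ask/compute two-phase pattern; returning false from the second phase aborts graph evaluation early.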