15 changes: 15 additions & 0 deletions common/arg.cpp
@@ -1824,6 +1824,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.image_max_tokens = value;
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
add_opt(common_arg(
{"--dsocr-mode"}, "MODE",
"DeepSeek-OCR resolution mode, one of:\n"
"- auto (default): automatically select resolution\n"
"- tiny, small, base, large: native resolution\n"
"- gundam, gundam-master: dynamic resolution",
[](common_params & params, const std::string & value) {
if (value == "auto" || value == "tiny" || value == "small" || value == "base" ||
value == "large" || value == "gundam" || value == "gundam-master") {
params.dsocr_mode = value;
} else {
throw std::invalid_argument("invalid value");
}
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_DSOCR_MODE"));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
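The same set of accepted values can also be kept in a single table instead of a chained comparison. A minimal sketch (not part of the PR; `set_dsocr_mode` is a hypothetical helper) that behaves like the handler above:

```cpp
// Sketch only: table-driven validation equivalent to the --dsocr-mode handler above.
#include "common.h"   // common_params; this PR adds the dsocr_mode field

#include <algorithm>
#include <array>
#include <stdexcept>
#include <string>

static void set_dsocr_mode(common_params & params, const std::string & value) {
    static const std::array<const char *, 7> k_modes = {
        "auto", "tiny", "small", "base", "large", "gundam", "gundam-master"
    };
    if (std::find(k_modes.begin(), k_modes.end(), value) == k_modes.end()) {
        throw std::invalid_argument("invalid --dsocr-mode value: " + value);
    }
    params.dsocr_mode = value;
}
```

As registered above, the mode can be selected either with `--dsocr-mode <MODE>` or through the `LLAMA_ARG_DSOCR_MODE` environment variable.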
1 change: 1 addition & 0 deletions common/common.h
@@ -433,6 +433,7 @@ struct common_params {
std::vector<std::string> image; // path to image file(s)
int image_min_tokens = -1;
int image_max_tokens = -1;
std::string dsocr_mode = "auto"; // DeepSeek-OCR resolution mode: auto, tiny, small, base, large, gundam, gundam-master

// finetune
struct lr_opt lr;
12 changes: 7 additions & 5 deletions convert_hf_to_gguf.py
@@ -6013,12 +6013,14 @@ def get_vision_config(self) -> dict[str, Any]:


def tensor_force_quant(self, name, new_name, bid, n_dims):
# TODO: increase numerical stability; maybe delete later.
return gguf.GGMLQuantizationType.F32
# related to https://github.com/ggml-org/llama.cpp/issues/13025
if "input_projection" in name:
return gguf.GGMLQuantizationType.F16
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
# if "input_projection" in name:
# return gguf.GGMLQuantizationType.F16
# if ".embeddings." in name:
# return gguf.GGMLQuantizationType.F32
# return super().tensor_force_quant(name, new_name, bid, n_dims)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Only process vision-related tensors, skip language model tensors
2 changes: 2 additions & 0 deletions ggml/src/ggml-cuda/upscale.cu
@@ -214,5 +214,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
sf0, sf1, sf2, sf3, pixel_offset, stream);
} else {
GGML_ABORT("fatal error");
}
}
1 change: 1 addition & 0 deletions ggml/src/ggml.c
@@ -5204,6 +5204,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
GGML_ASSERT(q->ne[3] == v->ne[3]);

if (mask) {
GGML_ASSERT(mask->type == GGML_TYPE_F16);
GGML_ASSERT(ggml_is_contiguous(mask));
GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
"the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
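With this assert in place, a caller has to allocate the KQ mask as F16, contiguous, and padded in its second dimension. A minimal sketch of an allocation that satisfies the checks (`n_kv` and `n_tokens` are illustrative; `GGML_PAD` and `GGML_KQ_MASK_PAD` come from ggml.h):

```cpp
// Sketch only: a mask allocation compatible with the asserts in ggml_flash_attn_ext.
// n_kv / n_tokens are placeholder sizes for this example.
ggml_tensor * kq_mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,
        n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
// a freshly allocated tensor is contiguous, so ggml_is_contiguous(kq_mask) holds
```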
182 changes: 181 additions & 1 deletion tools/mtmd/clip-impl.h
@@ -5,6 +5,7 @@
#include <climits>
#include <cstdarg>
#include <cinttypes>
#include <cstring>
#include <string>
#include <map>
#include <sstream>
@@ -442,6 +443,33 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
// debugging
//


static std::string to_ne_string(const ggml_tensor * t) {
std::string str;
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
str += std::to_string(t->ne[i]);
if (i + 1 < GGML_MAX_DIMS) {
str += ", ";
}
}
return str;
}

static void print_tensor_info(ggml_tensor * t) {
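// note: intended for operation nodes; assumes t->src[0] is non-NULL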
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];

char src1_str[128] = {0};
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, to_ne_string(src1).c_str());
}

printf("%s: %s = %s(%s{%s}, %s)\n",
t->name, ggml_type_name(t->type), ggml_op_desc(t),
src0->name, to_ne_string(src0).c_str(),
src1 ? src1_str : "");
}

static void print_tensor_shape(ggml_tensor * t) {
printf("%s.shape = [", t->name);
for (int i = 0; i < ggml_n_dims(t); ++i) {
@@ -453,12 +481,50 @@ static void print_tensor_shape(ggml_tensor * t) {
printf("]\n");
}

static void print_tensor_sum(ggml_tensor * t, uint8_t * data, int64_t n) {
(void) n; // unused parameter
ggml_type type = t->type;
int64_t * ne = t->ne;
size_t * nb = t->nb;
double sum = 0.0;
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
sum += v;
}
}
}
}
printf("%s.sum = %.6f\n", t->name, sum);
}

static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
ggml_type type = t->type;
int64_t * ne = t->ne;
size_t * nb = t->nb;
printf("%s.data: [\n", t->name);
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
printf("%s.data: [\n", t->name);
if (i3 == n && ne[3] > 2*n) {
printf(" ..., \n");
i3 = ne[3] - n;
}
printf(" [\n");
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
if (i2 == n && ne[2] > 2*n) {
printf(" ..., \n");
@@ -500,6 +566,120 @@ static void print_tensor_data(ggml_tensor * t, uint8_t * data, int64_t n) {
}
printf(" ]\n");
}
printf(" ]\n");
}

static void save_tensor_to_file(const struct ggml_tensor * tensor, const uint8_t * data_ptr) {
char filename[512];
snprintf(filename, sizeof(filename), "%s_cpp.txt", tensor->name);

FILE * f = fopen(filename, "w");
if (!f) {
fprintf(stderr, "Failed to open %s\n", filename);
return;
}

// Check tensor size and warn if too large
int64_t total_elements = ggml_nelements(tensor);
fprintf(stderr, "Saving tensor %s (%lld elements) to %s\n",
tensor->name, (long long)total_elements, filename);

if (total_elements > 10000000) { // 10M elements
fprintf(stderr, "Warning: tensor is very large (%lld elements), this may take time\n",
(long long)total_elements);
}

const uint8_t * data = (data_ptr) ? data_ptr : (uint8_t *) tensor->data;
ggml_type type = tensor->type;
const int64_t * ne = tensor->ne;
const size_t * nb = tensor->nb;

// Use a buffer to reduce I/O calls
const size_t BUF_SIZE = 8192;
char * buf = (char *) malloc(BUF_SIZE);
if (!buf) {
fprintf(stderr, "Failed to allocate buffer\n");
fclose(f);
return;
}
size_t buf_pos = 0;

// Helper lambda to flush buffer
auto flush_buf = [&]() {
if (buf_pos > 0) {
fwrite(buf, 1, buf_pos, f);
buf_pos = 0;
}
};

// Helper to append to buffer
auto append = [&](const char * str, size_t len) {
if (buf_pos + len >= BUF_SIZE) {
flush_buf();
}
if (len >= BUF_SIZE) {
// String too large for buffer, write directly
fwrite(str, 1, len, f);
} else {
memcpy(buf + buf_pos, str, len);
buf_pos += len;
}
};

auto append_str = [&](const char * str) {
append(str, strlen(str));
};

char num_buf[32];

// Write header once for all batches
append_str(tensor->name);
append_str(".data: [\n");

for (int64_t i3 = 0; i3 < ne[3]; i3++) {
append_str(" [\n"); // Start of batch
for (int64_t i2 = 0; i2 < ne[2]; i2++) {
append_str(" [\n");
for (int64_t i1 = 0; i1 < ne[1]; i1++) {
append_str(" [");
for (int64_t i0 = 0; i0 < ne[0]; i0++) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else {
GGML_ABORT("fatal error");
}
int len = snprintf(num_buf, sizeof(num_buf), "%8.4f", v);
append(num_buf, len);
if (i0 < ne[0] - 1) append_str(", ");
}
append_str("],\n");
}
append_str(" ],\n");
}
append_str(" ]"); // End of batch
if (i3 < ne[3] - 1) {
append_str(",\n"); // Comma between batches
} else {
append_str("\n");
}
}

append_str("]\n"); // Close the top-level array

flush_buf();
free(buf);
fclose(f);
fprintf(stderr, "Tensor saved successfully\n");
}

//
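These dump helpers only see meaningful data once a node has been computed on the backend. One way to drive them during debugging is a scheduler eval callback that copies the node back to host memory and hands it to `print_tensor_sum` or `save_tensor_to_file`. A rough sketch, assuming the standard `ggml_backend_sched` eval-callback API; the `"resampler_out"` name filter is purely illustrative:

```cpp
// Sketch only: dumping a specific tensor from the vision graph while debugging.
// The "resampler_out" name filter is illustrative, not a real tensor name.
#include <cstring>
#include <vector>

static bool debug_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // first phase: return true for nodes we want to inspect after computation
        return strcmp(t->name, "resampler_out") == 0;
    }
    // second phase: the node has been computed; copy it to host memory before reading
    std::vector<uint8_t> buf(ggml_nbytes(t));
    ggml_backend_tensor_get(t, buf.data(), 0, buf.size());
    print_tensor_shape(t);
    print_tensor_sum(t, buf.data(), 3);
    save_tensor_to_file(t, buf.data());
    return true; // keep evaluating the rest of the graph
}

// registration, e.g. right after the scheduler is created:
//   ggml_backend_sched_set_eval_callback(sched, debug_eval_cb, nullptr);
```

llama.cpp's existing eval-callback example uses the same ask/compute two-phase pattern; returning false from the second phase aborts graph evaluation early.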