diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 8ff93f08b9d..5e6cc79f379 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -228,10 +228,12 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { text.add_special = add_bos; text.parse_special = true; + std::string formatted_chat; + if (!mtmd_is_deepseekocr(ctx.ctx_vision.get())) { - auto formatted_chat = chat_add_and_format(ctx, msg); + formatted_chat = chat_add_and_format(ctx, msg); LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); - text.text = formatted_chat.c_str(); + text.text = formatted_chat.c_str(); } if (g_is_interrupted) return 0; @@ -316,8 +318,18 @@ int main(int argc, char ** argv) { if (is_single_turn) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { - for (size_t i = 0; i < params.image.size(); i++) { - params.prompt += mtmd_default_marker(); + if (mtmd_is_deepseekocr(ctx.ctx_vision.get())) { + std::string image_tokens = ""; + for (size_t i = 0; i < params.image.size(); i++) { + image_tokens += mtmd_default_marker(); + image_tokens += '\n'; + } + params.prompt = image_tokens + params.prompt; + } + else { + for (size_t i = 0; i < params.image.size(); i++) { + params.prompt += mtmd_default_marker(); + } } } common_chat_msg msg;