Commit
Merge pull request #589 from quarkiverse/ollama-token-opt
Make token limit configuration for Ollama optional
geoand committed May 16, 2024
2 parents 62c20d5 + 66e3285 commit 369ac9b
Showing 6 changed files with 47 additions and 20 deletions.
@@ -195,7 +195,7 @@ ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_OLLAMA_CHAT_MODEL_NUM_PREDICT+++`
endif::add-copy-button-to-env-var[]
--|int
-|`128`
+|


a| [[quarkus-langchain4j-ollama_quarkus-langchain4j-ollama-chat-model-stop]]`link:#quarkus-langchain4j-ollama_quarkus-langchain4j-ollama-chat-model-stop[quarkus.langchain4j.ollama.chat-model.stop]`
@@ -598,7 +598,7 @@ ifndef::add-copy-button-to-env-var[]
Environment variable: `+++QUARKUS_LANGCHAIN4J_OLLAMA__MODEL_NAME__CHAT_MODEL_NUM_PREDICT+++`
endif::add-copy-button-to-env-var[]
--|int
-|`128`
+|


a| [[quarkus-langchain4j-ollama_quarkus-langchain4j-ollama-model-name-chat-model-stop]]`link:#quarkus-langchain4j-ollama_quarkus-langchain4j-ollama-model-name-chat-model-stop[quarkus.langchain4j.ollama."model-name".chat-model.stop]`
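With this change, `num-predict` no longer defaults to 128: when the property is left unset, no value is sent to Ollama and the model's own default applies. A hypothetical `application.properties` entry to restore the previous behavior:

    # optional as of this PR; formerly defaulted to 128
    quarkus.langchain4j.ollama.chat-model.num-predict=128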
17 changes: 17 additions & 0 deletions docs/modules/ROOT/pages/includes/quarkus-langchain4j.adoc
@@ -46,6 +46,23 @@ endif::add-copy-button-to-env-var[]
|`11434`


+a|icon:lock[title=Fixed at build time] [[quarkus-langchain4j_quarkus-langchain4j-devservices-preload]]`link:#quarkus-langchain4j_quarkus-langchain4j-devservices-preload[quarkus.langchain4j.devservices.preload]`
+
+
+[.description]
+--
+Instructs Ollama to preload a model in order to get faster response times
+
+ifdef::add-copy-button-to-env-var[]
+Environment variable: env_var_with_copy_button:+++QUARKUS_LANGCHAIN4J_DEVSERVICES_PRELOAD+++[]
+endif::add-copy-button-to-env-var[]
+ifndef::add-copy-button-to-env-var[]
+Environment variable: `+++QUARKUS_LANGCHAIN4J_DEVSERVICES_PRELOAD+++`
+endif::add-copy-button-to-env-var[]
+--|boolean
+|`true`
+
+
a| [[quarkus-langchain4j_quarkus-langchain4j-log-requests]]`link:#quarkus-langchain4j_quarkus-langchain4j-log-requests[quarkus.langchain4j.log-requests]`


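The new Dev Services flag defaults to `true`. To skip model preloading, one would set (a sketch based on the property documented above):

    # disable Ollama model preloading in Quarkus Dev Services
    quarkus.langchain4j.devservices.preload=false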
@@ -3,7 +3,7 @@
public record ChatResponse(String model, String createdAt, Message message, Boolean done, Integer promptEvalCount,
Integer evalCount) {

-public static ChatResponse emptyDone() {
+public static ChatResponse emptyNotDone() {
return new ChatResponse(null, null, new Message(Role.ASSISTANT, "", null), true, null, null);
}
}
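The rename reflects the placeholder's new role: it is returned while a partial chunk is still being buffered (see the interceptor change below), not when the stream has finished. A minimal sketch of how a stream consumer might skip such placeholders (hypothetical handler code, not part of this PR; it assumes `Message` exposes a `content()` accessor):

    // Hypothetical consumer: ignore empty placeholder responses emitted while buffering
    void onChunk(ChatResponse response) {
        String content = response.message() == null ? null : response.message().content();
        if (content == null || content.isEmpty()) {
            return; // placeholder from ChatResponse.emptyNotDone(); nothing to emit yet
        }
        handleToken(content); // hypothetical downstream handler
    }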
@@ -93,18 +93,24 @@ public Object aroundReadFrom(ReaderInterceptorContext context) throws IOException
throw e;
}

-// This piece of code deals with is the case where the last message from Ollama is not sent as entire line
-// but in pieces. There is nothing we can do in this case except for returning empty responses.
-// We have to keep track of when "done": true has been recorded in order to make sure that subsequent pieces
-// are dealt with instead of throwing an exception. We keep track of this by using Vert.x duplicated context
-
-if (chunk.contains("\"done\":true")) {
-    ctx.putLocal("done", true);
-    return ChatResponse.emptyDone();
-} else {
-    if (Boolean.TRUE.equals(ctx.getLocal("done"))) {
-        return ChatResponse.emptyDone();
+// This piece of code deals with the case where a message from Ollama is not received as an entire line
+// but in pieces (my guess is that it is a Vert.x bug).
+// There is nothing we can do in this case except return empty responses and, in the meantime, buffer the pieces
+// by storing them in the Vert.x duplicated context.
+String existingBuffer = ctx.getLocal("buffer");
+if ((existingBuffer != null) && !existingBuffer.isEmpty()) {
+    if (chunk.endsWith("}")) {
+        ctx.putLocal("buffer", "");
+        String entireLine = existingBuffer + chunk;
+        return QuarkusJsonCodecFactory.SnakeCaseObjectMapperHolder.MAPPER.readValue(entireLine,
+                ChatResponse.class);
+    } else {
+        ctx.putLocal("buffer", existingBuffer + chunk);
+        return ChatResponse.emptyNotDone();
+    }
+} else {
+    ctx.putLocal("buffer", chunk);
+    return ChatResponse.emptyNotDone();
+}
}
}
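For illustration, a self-contained sketch of the buffering technique above, with the buffer held in a plain field instead of the Vert.x duplicated context (class and field names are illustrative, not from the PR):

    import java.io.IOException;

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.PropertyNamingStrategies;

    class ChunkBuffer {
        private final ObjectMapper mapper = new ObjectMapper()
                .setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE);
        private final StringBuilder buffer = new StringBuilder();

        // Returns a parsed response once a complete JSON line has arrived,
        // or a "not done" placeholder while pieces are still being buffered.
        ChatResponse onChunk(String chunk) throws IOException {
            buffer.append(chunk);
            // same heuristic as the interceptor: a complete JSON line ends with '}'
            if (chunk.endsWith("}")) {
                String entireLine = buffer.toString();
                buffer.setLength(0);
                return mapper.readValue(entireLine, ChatResponse.class);
            }
            return ChatResponse.emptyNotDone();
        }
    }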
@@ -36,9 +36,11 @@ public Supplier<ChatLanguageModel> chatModel(LangChain4jOllamaConfig runtimeConf
Options.Builder optionsBuilder = Options.builder()
.temperature(chatModelConfig.temperature())
.topK(chatModelConfig.topK())
-.topP(chatModelConfig.topP())
-.numPredict(chatModelConfig.numPredict());
+.topP(chatModelConfig.topP());

+if (chatModelConfig.numPredict().isPresent()) {
+    optionsBuilder.numPredict(chatModelConfig.numPredict().getAsInt());
+}
if (chatModelConfig.stop().isPresent()) {
optionsBuilder.stop(chatModelConfig.stop().get());
}
@@ -123,9 +125,11 @@ public Supplier<StreamingChatLanguageModel> streamingChatModel(LangChain4jOllama
Options.Builder optionsBuilder = Options.builder()
.temperature(chatModelConfig.temperature())
.topK(chatModelConfig.topK())
-.topP(chatModelConfig.topP())
-.numPredict(chatModelConfig.numPredict());
+.topP(chatModelConfig.topP());

+if (chatModelConfig.numPredict().isPresent()) {
+    optionsBuilder.numPredict(chatModelConfig.numPredict().getAsInt());
+}
if (chatModelConfig.stop().isPresent()) {
optionsBuilder.stop(chatModelConfig.stop().get());
}
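Because `numPredict()` now returns `OptionalInt`, the guard used in both the blocking and streaming builders could also be written more compactly (an equivalent sketch, not the code the commit uses):

    chatModelConfig.numPredict().ifPresent(optionsBuilder::numPredict);

The commit's explicit `isPresent()`/`getAsInt()` form is equivalent and mirrors the neighboring `stop()` check.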
@@ -2,6 +2,7 @@

import java.util.List;
import java.util.Optional;
+import java.util.OptionalInt;

import io.quarkus.runtime.annotations.ConfigDocDefault;
import io.quarkus.runtime.annotations.ConfigGroup;
@@ -20,8 +21,7 @@ public interface ChatModelConfig {
/**
* Maximum number of tokens to predict when generating text
*/
-@WithDefault("128")
-Integer numPredict();
+OptionalInt numPredict();

/**
* Sets the stop sequences to use. When this pattern is encountered the LLM will stop generating text and return
