From 59df4ed4327bc06b8d37ef6b54f42c13b71511e1 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Mon, 7 Aug 2023 09:51:20 -0700 Subject: [PATCH 1/2] Link to HF greedy search --- clients/python/llmengine/completion.py | 199 ++++++++++++++----------- docs/guides/completions.md | 1 + 2 files changed, 110 insertions(+), 90 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 661ac30e..54473751 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -59,6 +59,14 @@ async def acreate( temperature (float): What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -179,97 +187,108 @@ def create( stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: """ - Creates a completion for the provided prompt and parameters synchronously. - - This API can be used to get the LLM to generate a completion *synchronously*. - It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. - Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. - It returns a - [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) - if `stream=False` or an async iterator of - [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) - with `request_id` and `outputs` fields. - - Args: - model (str): - Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. - - prompt (str): - The prompt to generate completions for, encoded as a string. - - max_new_tokens (int): - The maximum number of tokens to generate in the completion. - - The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See - [Model Zoo](../../model_zoo) for information on each supported model's context length. - - temperature (float): - What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more focused and deterministic. - - timeout (int): - Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. - - stream (bool): - Whether to stream the response. If true, the return type is an - `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. - When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). 
- - - Returns: - response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) - - === "Synchronous completion without token streaming in Python" - ```python - from llmengine import Completion - - response = Completion.create( - model="llama-2-7b", - prompt="Hello, my name is", - max_new_tokens=10, - temperature=0.2, - ) - print(response.json()) - ``` - - === "Response in JSON" - ```json - { - "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", - "output": { - "text": "_______ and I am a _______", - "num_completion_tokens": 10 - } - } - ``` - - Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: - - === "Synchronous completion with token streaming in Python" - ```python - from llmengine import Completion - - stream = Completion.create( - model="llama-2-7b", - prompt="why is the sky blue?", - max_new_tokens=5, - temperature=0.2, - stream=True, - ) - - for response in stream: - if response.output: + Creates a completion for the provided prompt and parameters synchronously. + + This API can be used to get the LLM to generate a completion *synchronously*. + It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. + Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. + It returns a + [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) + if `stream=False` or an async iterator of + [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) + with `request_id` and `outputs` fields. + + Args: + model (str): + Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. + + prompt (str): + The prompt to generate completions for, encoded as a string. + + max_new_tokens (int): + The maximum number of tokens to generate in the completion. + + The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See + [Model Zoo](../../model_zoo) for information on each supported model's context length. + + temperature (float): + What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output + more random, while lower values like 0.2 will make it more focused and deterministic. + <<<<<<< HEAD + ======= + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. + >>>>>>> f99db04 (Link to HF greedy search) + + timeout (int): + Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. + + stream (bool): + Whether to stream the response. If true, the return type is an + `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. + When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). 
+ + + Returns: + response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) + + === "Synchronous completion without token streaming in Python" + ```python + from llmengine import Completion + + response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + ) print(response.json()) - ``` - - === "Response in JSON" - ```json - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } - ``` + ``` + + === "Response in JSON" + ```json + { + "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", + "output": { + "text": "_______ and I am a _______", + "num_completion_tokens": 10 + } + } + ``` + + Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: + + === "Synchronous completion with token streaming in Python" + ```python + from llmengine import Completion + + stream = Completion.create( + model="llama-2-7b", + prompt="why is the sky blue?", + max_new_tokens=5, + temperature=0.2, + stream=True, + ) + + for response in stream: + if response.output: + print(response.json()) + ``` + + === "Response in JSON" + ```json + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } + ``` """ if stream: diff --git a/docs/guides/completions.md b/docs/guides/completions.md index eb16b94e..a5ea9a06 100644 --- a/docs/guides/completions.md +++ b/docs/guides/completions.md @@ -34,6 +34,7 @@ print(response.output.text) - **max_new_tokens:** The maximum number of tokens to generate in the chat completion. - **temperature:** The sampling temperature to use. Higher values make the output more random, while lower values will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. See the full [Completion API reference documentation](../../api/python_client/#llmengine.Completion) to learn more. 
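
The docstring changes in the patch above document three behaviours worth illustrating together: a temperature of 0 switches generation to greedy search, `stop_sequences` cuts generation off at the given strings, and `return_token_log_probs` requests per-token log probabilities. The sketch below is illustrative only and is not part of the patch: the parameter names are taken from the patched docstring, and the response access (`response.output.text`) follows the existing example in `docs/guides/completions.md`.

```python
# Illustrative sketch, not part of the patch: parameter names come from the
# docstring added above; the response shape follows the repo's existing examples.
from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",
    prompt="The capital of France is",
    max_new_tokens=8,
    temperature=0,                # 0 selects greedy search per the new docstring
    stop_sequences=["\n"],        # stop at the first newline in the completion
    return_token_log_probs=True,  # ask for per-token log probabilities
)
print(response.output.text)
```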
From 20888e38bfc74a28b83abad10ef5f0f9a0b7c636 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Mon, 7 Aug 2023 09:53:51 -0700 Subject: [PATCH 2/2] fix --- clients/python/llmengine/completion.py | 199 ++++++++++++------------- 1 file changed, 98 insertions(+), 101 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 54473751..a012e591 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -187,108 +187,105 @@ def create( stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: """ - Creates a completion for the provided prompt and parameters synchronously. - - This API can be used to get the LLM to generate a completion *synchronously*. - It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. - Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. - It returns a - [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) - if `stream=False` or an async iterator of - [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) - with `request_id` and `outputs` fields. - - Args: - model (str): - Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. - - prompt (str): - The prompt to generate completions for, encoded as a string. - - max_new_tokens (int): - The maximum number of tokens to generate in the completion. - - The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See - [Model Zoo](../../model_zoo) for information on each supported model's context length. - - temperature (float): - What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more focused and deterministic. - <<<<<<< HEAD - ======= - When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. - - stop_sequences (Optional[List[str]]): - One or more sequences where the API will stop generating tokens for the current completion. - - return_token_log_probs (Optional[bool]): - Whether to return the log probabilities of generated tokens. - When True, the response will include a list of tokens and their log probabilities. - >>>>>>> f99db04 (Link to HF greedy search) - - timeout (int): - Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. - - stream (bool): - Whether to stream the response. If true, the return type is an - `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. - When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). - - - Returns: - response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) - - === "Synchronous completion without token streaming in Python" - ```python - from llmengine import Completion - - response = Completion.create( - model="llama-2-7b", - prompt="Hello, my name is", - max_new_tokens=10, - temperature=0.2, - ) + Creates a completion for the provided prompt and parameters synchronously. + + This API can be used to get the LLM to generate a completion *synchronously*. 
+ It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. + Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. + It returns a + [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) + if `stream=False` or an async iterator of + [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) + with `request_id` and `outputs` fields. + + Args: + model (str): + Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. + + prompt (str): + The prompt to generate completions for, encoded as a string. + + max_new_tokens (int): + The maximum number of tokens to generate in the completion. + + The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See + [Model Zoo](../../model_zoo) for information on each supported model's context length. + + temperature (float): + What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output + more random, while lower values like 0.2 will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. + + timeout (int): + Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. + + stream (bool): + Whether to stream the response. If true, the return type is an + `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. + When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). + + + Returns: + response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) + + === "Synchronous completion without token streaming in Python" + ```python + from llmengine import Completion + + response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + ) + print(response.json()) + ``` + + === "Response in JSON" + ```json + { + "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", + "output": { + "text": "_______ and I am a _______", + "num_completion_tokens": 10 + } + } + ``` + + Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: + + === "Synchronous completion with token streaming in Python" + ```python + from llmengine import Completion + + stream = Completion.create( + model="llama-2-7b", + prompt="why is the sky blue?", + max_new_tokens=5, + temperature=0.2, + stream=True, + ) + + for response in stream: + if response.output: print(response.json()) - ``` - - === "Response in JSON" - ```json - { - "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", - "output": { - "text": "_______ and I am a _______", - "num_completion_tokens": 10 - } - } - ``` - - Token streaming can be used to reduce _perceived_ latency for applications. 
Here is how applications can use streaming: - - === "Synchronous completion with token streaming in Python" - ```python - from llmengine import Completion - - stream = Completion.create( - model="llama-2-7b", - prompt="why is the sky blue?", - max_new_tokens=5, - temperature=0.2, - stream=True, - ) - - for response in stream: - if response.output: - print(response.json()) - ``` - - === "Response in JSON" - ```json - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } - ``` + ``` + + === "Response in JSON" + ```json + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } + ``` """ if stream:
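
For completeness, here is a hedged sketch of the asynchronous path whose docstring the first patch also updates. Only `acreate`'s documentation appears in the diff, so the `asyncio` wiring and the `await` call are assumptions based on the method being asynchronous, not something the patch shows.

```python
# Assumed usage of the async client: the diff only touches acreate's docstring,
# so the asyncio plumbing here is an illustration, not part of the patch.
import asyncio

from llmengine import Completion


async def main() -> None:
    response = await Completion.acreate(
        model="llama-2-7b",
        prompt="Hello, my name is",
        max_new_tokens=10,
        temperature=0,              # greedy search, as documented above
        return_token_log_probs=True,
    )
    print(response.json())


asyncio.run(main())
```

With `stream=True`, the same call would instead yield an async iterator of `CompletionStreamResponse` chunks, matching the server-sent-events behaviour described in the docstring.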