From 59df4ed4327bc06b8d37ef6b54f42c13b71511e1 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Mon, 7 Aug 2023 09:51:20 -0700 Subject: [PATCH 1/2] Link to HF greedy search --- clients/python/llmengine/completion.py | 199 ++++++++++++++----------- docs/guides/completions.md | 1 + 2 files changed, 110 insertions(+), 90 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 661ac30e..54473751 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -59,6 +59,14 @@ async def acreate( temperature (float): What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -179,97 +187,108 @@ def create( stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: """ - Creates a completion for the provided prompt and parameters synchronously. - - This API can be used to get the LLM to generate a completion *synchronously*. - It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. - Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. - It returns a - [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) - if `stream=False` or an async iterator of - [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) - with `request_id` and `outputs` fields. - - Args: - model (str): - Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. - - prompt (str): - The prompt to generate completions for, encoded as a string. - - max_new_tokens (int): - The maximum number of tokens to generate in the completion. - - The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See - [Model Zoo](../../model_zoo) for information on each supported model's context length. - - temperature (float): - What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more focused and deterministic. - - timeout (int): - Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. - - stream (bool): - Whether to stream the response. If true, the return type is an - `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. - When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). 
- - - Returns: - response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) - - === "Synchronous completion without token streaming in Python" - ```python - from llmengine import Completion - - response = Completion.create( - model="llama-2-7b", - prompt="Hello, my name is", - max_new_tokens=10, - temperature=0.2, - ) - print(response.json()) - ``` - - === "Response in JSON" - ```json - { - "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", - "output": { - "text": "_______ and I am a _______", - "num_completion_tokens": 10 - } - } - ``` - - Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: - - === "Synchronous completion with token streaming in Python" - ```python - from llmengine import Completion - - stream = Completion.create( - model="llama-2-7b", - prompt="why is the sky blue?", - max_new_tokens=5, - temperature=0.2, - stream=True, - ) - - for response in stream: - if response.output: + Creates a completion for the provided prompt and parameters synchronously. + + This API can be used to get the LLM to generate a completion *synchronously*. + It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. + Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. + It returns a + [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) + if `stream=False` or an async iterator of + [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) + with `request_id` and `outputs` fields. + + Args: + model (str): + Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. + + prompt (str): + The prompt to generate completions for, encoded as a string. + + max_new_tokens (int): + The maximum number of tokens to generate in the completion. + + The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See + [Model Zoo](../../model_zoo) for information on each supported model's context length. + + temperature (float): + What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output + more random, while lower values like 0.2 will make it more focused and deterministic. + <<<<<<< HEAD + ======= + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. + >>>>>>> f99db04 (Link to HF greedy search) + + timeout (int): + Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. + + stream (bool): + Whether to stream the response. If true, the return type is an + `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. + When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). 
+ + + Returns: + response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) + + === "Synchronous completion without token streaming in Python" + ```python + from llmengine import Completion + + response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + ) print(response.json()) - ``` - - === "Response in JSON" - ```json - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } - ``` + ``` + + === "Response in JSON" + ```json + { + "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", + "output": { + "text": "_______ and I am a _______", + "num_completion_tokens": 10 + } + } + ``` + + Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: + + === "Synchronous completion with token streaming in Python" + ```python + from llmengine import Completion + + stream = Completion.create( + model="llama-2-7b", + prompt="why is the sky blue?", + max_new_tokens=5, + temperature=0.2, + stream=True, + ) + + for response in stream: + if response.output: + print(response.json()) + ``` + + === "Response in JSON" + ```json + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } + ``` """ if stream: diff --git a/docs/guides/completions.md b/docs/guides/completions.md index eb16b94e..a5ea9a06 100644 --- a/docs/guides/completions.md +++ b/docs/guides/completions.md @@ -34,6 +34,7 @@ print(response.output.text) - **max_new_tokens:** The maximum number of tokens to generate in the chat completion. - **temperature:** The sampling temperature to use. Higher values make the output more random, while lower values will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. See the full [Completion API reference documentation](../../api/python_client/#llmengine.Completion) to learn more. 
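
The docstring changes in the patch above document three behaviours worth illustrating together: a temperature of 0 switches generation to greedy search, `stop_sequences` cuts generation off at the given strings, and `return_token_log_probs` requests per-token log probabilities. The sketch below is illustrative only and is not part of the patch: the parameter names are taken from the patched docstring, and the response access (`response.output.text`) follows the existing example in `docs/guides/completions.md`.

```python
# Illustrative sketch, not part of the patch: parameter names come from the
# docstring added above; the response shape follows the repo's existing examples.
from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",
    prompt="The capital of France is",
    max_new_tokens=8,
    temperature=0,                # 0 selects greedy search per the new docstring
    stop_sequences=["\n"],        # stop at the first newline in the completion
    return_token_log_probs=True,  # ask for per-token log probabilities
)
print(response.output.text)
```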
From 20888e38bfc74a28b83abad10ef5f0f9a0b7c636 Mon Sep 17 00:00:00 2001 From: Yunfeng Bai Date: Mon, 7 Aug 2023 09:53:51 -0700 Subject: [PATCH 2/2] fix --- clients/python/llmengine/completion.py | 199 ++++++++++++------------- 1 file changed, 98 insertions(+), 101 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 54473751..a012e591 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -187,108 +187,105 @@ def create( stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: """ - Creates a completion for the provided prompt and parameters synchronously. - - This API can be used to get the LLM to generate a completion *synchronously*. - It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. - Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. - It returns a - [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) - if `stream=False` or an async iterator of - [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) - with `request_id` and `outputs` fields. - - Args: - model (str): - Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. - - prompt (str): - The prompt to generate completions for, encoded as a string. - - max_new_tokens (int): - The maximum number of tokens to generate in the completion. - - The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See - [Model Zoo](../../model_zoo) for information on each supported model's context length. - - temperature (float): - What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output - more random, while lower values like 0.2 will make it more focused and deterministic. - <<<<<<< HEAD - ======= - When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. - - stop_sequences (Optional[List[str]]): - One or more sequences where the API will stop generating tokens for the current completion. - - return_token_log_probs (Optional[bool]): - Whether to return the log probabilities of generated tokens. - When True, the response will include a list of tokens and their log probabilities. - >>>>>>> f99db04 (Link to HF greedy search) - - timeout (int): - Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. - - stream (bool): - Whether to stream the response. If true, the return type is an - `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. - When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). - - - Returns: - response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) - - === "Synchronous completion without token streaming in Python" - ```python - from llmengine import Completion - - response = Completion.create( - model="llama-2-7b", - prompt="Hello, my name is", - max_new_tokens=10, - temperature=0.2, - ) + Creates a completion for the provided prompt and parameters synchronously. + + This API can be used to get the LLM to generate a completion *synchronously*. 
+ It takes as parameters the `model` ([see Model Zoo](../../model_zoo)) and the `prompt`. + Optionally it takes `max_new_tokens`, `temperature`, `timeout` and `stream`. + It returns a + [CompletionSyncResponse](../../api/data_types/#llmengine.CompletionSyncResponse) + if `stream=False` or an async iterator of + [CompletionStreamResponse](../../api/data_types/#llmengine.CompletionStreamResponse) + with `request_id` and `outputs` fields. + + Args: + model (str): + Name of the model to use. See [Model Zoo](../../model_zoo) for a list of Models that are supported. + + prompt (str): + The prompt to generate completions for, encoded as a string. + + max_new_tokens (int): + The maximum number of tokens to generate in the completion. + + The token count of your prompt plus `max_new_tokens` cannot exceed the model's context length. See + [Model Zoo](../../model_zoo) for information on each supported model's context length. + + temperature (float): + What sampling temperature to use, in the range `(0, 1]`. Higher values like 0.8 will make the output + more random, while lower values like 0.2 will make it more focused and deterministic. + When temperature is 0 [greedy search](https://huggingface.co/docs/transformers/generation_strategies#greedy-search) is used. + + stop_sequences (Optional[List[str]]): + One or more sequences where the API will stop generating tokens for the current completion. + + return_token_log_probs (Optional[bool]): + Whether to return the log probabilities of generated tokens. + When True, the response will include a list of tokens and their log probabilities. + + timeout (int): + Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. + + stream (bool): + Whether to stream the response. If true, the return type is an + `Iterator[CompletionStreamResponse]`. Otherwise, the return type is a `CompletionSyncResponse`. + When streaming, tokens will be sent as data-only [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#event_stream_format). + + + Returns: + response (Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]): The generated response (if `stream=False`) or iterator of response chunks (if `stream=True`) + + === "Synchronous completion without token streaming in Python" + ```python + from llmengine import Completion + + response = Completion.create( + model="llama-2-7b", + prompt="Hello, my name is", + max_new_tokens=10, + temperature=0.2, + ) + print(response.json()) + ``` + + === "Response in JSON" + ```json + { + "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", + "output": { + "text": "_______ and I am a _______", + "num_completion_tokens": 10 + } + } + ``` + + Token streaming can be used to reduce _perceived_ latency for applications. Here is how applications can use streaming: + + === "Synchronous completion with token streaming in Python" + ```python + from llmengine import Completion + + stream = Completion.create( + model="llama-2-7b", + prompt="why is the sky blue?", + max_new_tokens=5, + temperature=0.2, + stream=True, + ) + + for response in stream: + if response.output: print(response.json()) - ``` - - === "Response in JSON" - ```json - { - "request_id": "8bbd0e83-f94c-465b-a12b-aabad45750a9", - "output": { - "text": "_______ and I am a _______", - "num_completion_tokens": 10 - } - } - ``` - - Token streaming can be used to reduce _perceived_ latency for applications. 
Here is how applications can use streaming: - - === "Synchronous completion with token streaming in Python" - ```python - from llmengine import Completion - - stream = Completion.create( - model="llama-2-7b", - prompt="why is the sky blue?", - max_new_tokens=5, - temperature=0.2, - stream=True, - ) - - for response in stream: - if response.output: - print(response.json()) - ``` - - === "Response in JSON" - ```json - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } - {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } - ``` + ``` + + === "Response in JSON" + ```json + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "\\n", "finished": false, "num_completion_tokens": 1 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "I", "finished": false, "num_completion_tokens": 2 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": " don", "finished": false, "num_completion_tokens": 3 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "’", "finished": false, "num_completion_tokens": 4 } } + {"request_id": "ebbde00c-8c31-4c03-8306-24f37cd25fa2", "output": {"text": "t", "finished": true, "num_completion_tokens": 5 } } + ``` """ if stream:
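
For completeness, here is a hedged sketch of the asynchronous path whose docstring the first patch also updates. Only `acreate`'s documentation appears in the diff, so the `asyncio` wiring and the `await` call are assumptions based on the method being asynchronous, not something the patch shows.

```python
# Assumed usage of the async client: the diff only touches acreate's docstring,
# so the asyncio plumbing here is an illustration, not part of the patch.
import asyncio

from llmengine import Completion


async def main() -> None:
    response = await Completion.acreate(
        model="llama-2-7b",
        prompt="Hello, my name is",
        max_new_tokens=10,
        temperature=0,              # greedy search, as documented above
        return_token_log_probs=True,
    )
    print(response.json())


asyncio.run(main())
```

With `stream=True`, the same call would instead yield an async iterator of `CompletionStreamResponse` chunks, matching the server-sent-events behaviour described in the docstring.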