From b9f876682715a43256302133b6a4ccf4834a2fb6 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 00:46:21 +0000 Subject: [PATCH 1/8] add repetition_penalty, top_k, top_p --- clients/python/llmengine/completion.py | 36 +++++++++++++++++++ clients/python/llmengine/data_types.py | 6 ++++ .../model_engine_server/common/dtos/llms.py | 24 +++++++++++++ .../use_cases/llm_model_endpoint_use_cases.py | 15 ++++++++ 4 files changed, 81 insertions(+) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8cecd765..b6ca729f 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,6 +33,9 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, + repetition_penalty: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -72,6 +75,15 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. + repetition_penalty (Optional[float]): + The parameter for repetition penalty. 1.0 means no penalty. + + top_k (Optional[int]): + Integer that controls the number of top tokens to consider. -1 means consider all tokens. + + top_p (Optional[float]): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -164,6 +176,9 @@ async def _acreate_stream( temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, timeout=timeout, ) @@ -184,6 +199,9 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ) @classmethod @@ -195,6 +213,9 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, + repetition_penalty: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -235,6 +256,15 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. + repetition_penalty (Optional[float]): + The parameter for repetition penalty. 1.0 means no penalty. + + top_k (Optional[int]): + Integer that controls the number of top tokens to consider. -1 means consider all tokens. + + top_p (Optional[float]): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. 
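For illustration only (not part of the diff): a minimal sketch of how the new sampling arguments added above might be passed through the Python client; the endpoint name and values are hypothetical.

from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",      # hypothetical endpoint name
    prompt="Why is the sky blue?",
    max_new_tokens=64,
    temperature=0.7,
    repetition_penalty=1.2,  # >1.0 discourages repeated tokens
    top_k=50,                # only sample from the 50 most likely tokens
    top_p=0.9,               # nucleus sampling over the top 90% of probability mass
)
print(response)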
@@ -317,6 +347,9 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ) else: @@ -326,6 +359,9 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ).dict() response = cls.post_sync( resource_name=f"v1/llm/completions-sync?model_endpoint_name={model}", diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 1a8baba3..bcc8bc90 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,6 +268,9 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=False) + top_k: Optional[int] = Field(default=False) + top_p: Optional[float] = Field(default=False) class TokenOutput(BaseModel): @@ -329,6 +332,9 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=False) + top_k: Optional[int] = Field(default=False) + top_p: Optional[float] = Field(default=False) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 2735f577..7b4ccf47 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,6 +116,18 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ + repetition_penalty: Optional[float] = None + """ + The parameter for repetition penalty. 1.0 means no penalty. + """ + top_k: Optional[int] = None + """ + Integer that controls the number of top tokens to consider. + """ + top_p: Optional[float] = None + """ + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + """ class TokenOutput(BaseModel): @@ -157,6 +169,18 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ + repetition_penalty: Optional[float] = None + """ + The parameter for repetition penalty. 1.0 means no penalty. + """ + top_k: Optional[int] = None + """ + Integer that controls the number of top tokens to consider. + """ + top_p: Optional[float] = None + """ + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 
+ """ class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 9e4b6ba6..f0718ef2 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1021,6 +1021,7 @@ async def execute( "parameters": { "max_new_tokens": request.max_new_tokens, "decoder_input_details": True, + "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1028,6 +1029,13 @@ async def execute( if request.temperature > 0: tgi_args["parameters"]["temperature"] = request.temperature tgi_args["parameters"]["do_sample"] = True + if request.top_k == -1: # tgi set to None to consider all tokens. + request.top_k = None + tgi_args["parameters"]["top_k"] = request.top_k + if request.top_p == 1: # tgi set to None to consider all tokens. + request.top_p = None + else: + tgi_args["parameters"]["do_sample"] = False inference_request = SyncEndpointPredictV1Request( args=tgi_args, @@ -1060,6 +1068,13 @@ async def execute( if request.stop_sequences is not None: vllm_args["stop"] = request.stop_sequences vllm_args["temperature"] = request.temperature + if request.temperature > 0: + if request.top_k is None: # vllm set to -1 to consider all tokens. + request.top_k = -1 + vllm_args["top_k"] = request.top_k + if request.top_p is None: # vllm set to 1 to consider all tokens. + request.top_p = 1 + vllm_args["top_p"] = request.top_p if request.return_token_log_probs: vllm_args["logprobs"] = 1 From 1847fa2c9a3e44b4c6fd98941b14a9162a0424bd Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 03:25:26 +0000 Subject: [PATCH 2/8] add frequency_penalty, presence_penalty, add lightllm --- clients/python/llmengine/completion.py | 64 +++++++++++++++---- clients/python/llmengine/data_types.py | 16 +++-- .../model_engine_server/common/dtos/llms.py | 44 +++++++++---- .../use_cases/llm_model_endpoint_use_cases.py | 34 ++++++++-- 4 files changed, 124 insertions(+), 34 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index b6ca729f..7c759299 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,9 +33,11 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, + repetition_penalty: Optional[float] = 1, + presence_penalty: Optional[float] = 0, + frequency_penalty: Optional[float] = 0, + top_k: Optional[int] = -1, + top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -76,13 +78,27 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - The parameter for repetition penalty. 1.0 means no penalty. + The parameter for repetition penalty + https://arxiv.org/pdf/1909.05858.pdf + Range: [1.0, infinity). 1.0 means no penalty + + presence_penalty (Optional[float]): + Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. 
+ https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. + + frequency_penalty (Optional[float]): + Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. top_k (Optional[int]): - Integer that controls the number of top tokens to consider. -1 means consider all tokens. + Integer that controls the number of top tokens to consider. + Range: [1, infinity). -1 means consider all tokens. top_p (Optional[float]): - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + Float that controls the cumulative probability of the top tokens to consider. + Range: (0.0, 1.0]. 1.0 means consider all tokens. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -177,6 +193,8 @@ async def _acreate_stream( stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, timeout=timeout, @@ -200,6 +218,8 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ) @@ -213,9 +233,11 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, + repetition_penalty: Optional[float] = 1, + presence_penalty: Optional[float] = 0, + frequency_penalty: Optional[float] = 0, + top_k: Optional[int] = -1, + top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -257,13 +279,27 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - The parameter for repetition penalty. 1.0 means no penalty. + The parameter for repetition penalty + https://arxiv.org/pdf/1909.05858.pdf + Range: [1.0, infinity). 1.0 means no penalty + + presence_penalty (Optional[float]): + Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. + + frequency_penalty (Optional[float]): + Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. top_k (Optional[int]): - Integer that controls the number of top tokens to consider. -1 means consider all tokens. + Integer that controls the number of top tokens to consider. + Range: [1, infinity). -1 means consider all tokens. 
top_p (Optional[float]): - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + Float that controls the cumulative probability of the top tokens to consider. + Range: (0.0, 1.0]. 1.0 means consider all tokens. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -348,6 +384,8 @@ def _create_stream(**kwargs): stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ) @@ -360,6 +398,8 @@ def _create_stream(**kwargs): stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ).dict() diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index bcc8bc90..a3f4461a 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,9 +268,11 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=False) - top_k: Optional[int] = Field(default=False) - top_p: Optional[float] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=1, ge=1) + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + top_k: Optional[int] = Field(default=-1, ge=-1) + top_p: Optional[float] = Field(default=1, gt=0, le=1) class TokenOutput(BaseModel): @@ -332,9 +334,11 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=False) - top_k: Optional[int] = Field(default=False) - top_p: Optional[float] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=1, ge=1) + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + top_k: Optional[int] = Field(default=-1, ge=-1) + top_p: Optional[float] = Field(default=1, gt=0, le=1) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 7b4ccf47..ad9602bb 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,17 +116,27 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - repetition_penalty: Optional[float] = None + + repetition_penalty: Optional[float] = Field(default=1, ge=1) + """ + Only affects text-generation-inference + The parameter for repetition penalty. 1.0 means no penalty """ - The parameter for repetition penalty. 1.0 means no penalty. 
+ presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) """ - top_k: Optional[int] = None + Only affects vllm, lightllm + presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty + frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + """ + + top_k: Optional[int] = Field(default=-1, ge=-1) """ - Integer that controls the number of top tokens to consider. + Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = None + top_p: Optional[float] = Field(default=1, gt=0, le=1) """ - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ @@ -169,17 +179,27 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - repetition_penalty: Optional[float] = None + + repetition_penalty: Optional[float] = Field(default=1, ge=1) + """ + Only affects text-generation-inference + The parameter for repetition penalty. 1.0 means no penalty """ - The parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) """ - top_k: Optional[int] = None + Only affects vllm, lightllm + presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty + frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + """ + + top_k: Optional[int] = Field(default=-1, ge=-1) """ - Integer that controls the number of top tokens to consider. + Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = None + top_p: Optional[float] = Field(default=1, gt=0, le=1) """ - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index f0718ef2..bc383c98 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -937,6 +937,8 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) + if request.top_k == 0: + request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -1034,6 +1036,7 @@ async def execute( tgi_args["parameters"]["top_k"] = request.top_k if request.top_p == 1: # tgi set to None to consider all tokens. 
request.top_p = None + tgi_args["parameters"]["top_p"] = request.top_p else: tgi_args["parameters"]["do_sample"] = False @@ -1064,16 +1067,14 @@ async def execute( vllm_args: Any = { "prompt": request.prompt, "max_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, } if request.stop_sequences is not None: vllm_args["stop"] = request.stop_sequences vllm_args["temperature"] = request.temperature if request.temperature > 0: - if request.top_k is None: # vllm set to -1 to consider all tokens. - request.top_k = -1 vllm_args["top_k"] = request.top_k - if request.top_p is None: # vllm set to 1 to consider all tokens. - request.top_p = 1 vllm_args["top_p"] = request.top_p if request.return_token_log_probs: vllm_args["logprobs"] = 1 @@ -1105,12 +1106,16 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, }, } # TODO: implement stop sequences if request.temperature > 0: lightllm_args["parameters"]["temperature"] = request.temperature lightllm_args["parameters"]["do_sample"] = True + lightllm_args["top_k"] = request.top_k + lightllm_args["top_p"] = request.top_p else: lightllm_args["parameters"]["do_sample"] = False if request.return_token_log_probs: @@ -1179,6 +1184,9 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) + if request.top_k == 0: + request.top_k = -1 + model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None ) @@ -1237,6 +1245,7 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1244,14 +1253,27 @@ async def execute( if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True + if request.top_k == -1: # tgi set to None to consider all tokens. + request.top_k = None + args["parameters"]["top_k"] = request.top_k + if request.top_p == 1: # tgi set to None to consider all tokens. 
+ request.top_p = None + args["parameters"]["top_p"] = request.top_p + else: + args["parameters"]["do_sample"] = False elif model_content.inference_framework == LLMInferenceFramework.VLLM: args = { "prompt": request.prompt, "max_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, } if request.stop_sequences is not None: args["stop"] = request.stop_sequences args["temperature"] = request.temperature + if request.temperature > 0: + args["top_k"] = request.top_k + args["top_p"] = request.top_p if request.return_token_log_probs: args["logprobs"] = 1 args["stream"] = True @@ -1260,12 +1282,16 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, }, } # TODO: stop sequences if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True + args["parameters"]["top_k"] = request.top_k + args["parameters"]["top_p"] = request.top_p else: args["parameters"]["do_sample"] = False if request.return_token_log_probs: From e5de486a71b284e1dbd71a71afd7908f4f16841f Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 16:26:48 +0000 Subject: [PATCH 3/8] add comments --- clients/python/llmengine/completion.py | 18 ++++++++++++------ .../use_cases/llm_model_endpoint_use_cases.py | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 7c759299..8dacbaf3 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,9 +33,9 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, - presence_penalty: Optional[float] = 0, - frequency_penalty: Optional[float] = 0, + repetition_penalty: Optional[float] = 1, # text-generation-inference + presence_penalty: Optional[float] = 0, # vllm, lightllm + frequency_penalty: Optional[float] = 0, # vllm, lightllm top_k: Optional[int] = -1, top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, @@ -78,16 +78,19 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): + *** Only affects: text-generation-inference *** The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. 
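For illustration only (not part of the diff): roughly how these knobs are forwarded to two of the supported backends, following the tgi_args / vllm_args construction in this series; the prompt and values are made up.

# text-generation-inference: sampling options ride inside "parameters",
# and None is the sentinel for "consider all tokens"
tgi_args = {
    "inputs": "Why is the sky blue?",
    "parameters": {
        "max_new_tokens": 64,
        "repetition_penalty": 1.2,
        "temperature": 0.7,
        "do_sample": True,
        "top_k": 50,    # None would disable top-k filtering
        "top_p": 0.9,   # None would disable nucleus filtering
    },
}

# vLLM: flat arguments, with -1 / 1.0 as the "consider all tokens" sentinels
vllm_args = {
    "prompt": "Why is the sky blue?",
    "max_tokens": 64,
    "presence_penalty": 0.0,
    "frequency_penalty": 0.5,
    "temperature": 0.7,
    "top_k": 50,    # -1 would consider all tokens
    "top_p": 0.9,   # 1.0 would consider all tokens
}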
@@ -233,9 +236,9 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, - presence_penalty: Optional[float] = 0, - frequency_penalty: Optional[float] = 0, + repetition_penalty: Optional[float] = 1, # text-generation-inference + presence_penalty: Optional[float] = 0, # vllm, lightllm + frequency_penalty: Optional[float] = 0, # vllm, lightllm top_k: Optional[int] = -1, top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, @@ -279,16 +282,19 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): + *** Only affects: text-generation-inference *** The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index bc383c98..aac35008 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -937,7 +937,7 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: + if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( @@ -1184,7 +1184,7 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: + if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( From 3d845e2d81800ba6d49712924be6176149f716f2 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 17:08:13 +0000 Subject: [PATCH 4/8] fix --- clients/python/llmengine/completion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8dacbaf3..7179fc75 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -78,19 +78,19 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - *** Only affects: text-generation-inference *** + Only affects: text-generation-inference The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 
1.0 means no penalty presence_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -282,19 +282,19 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - *** Only affects: text-generation-inference *** + Only affects: text-generation-inference The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. From ffd93416593aab9b96ef8b8936a3f71dee019f61 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 22:24:02 +0000 Subject: [PATCH 5/8] fix Optional, add params validation --- clients/python/llmengine/completion.py | 28 ++++++------ clients/python/llmengine/data_types.py | 20 ++++----- .../model_engine_server/common/dtos/llms.py | 24 +++++------ .../use_cases/llm_model_endpoint_use_cases.py | 43 +++++++++++++------ 4 files changed, 67 insertions(+), 48 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 7179fc75..8d4481b7 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,11 +33,11 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, # text-generation-inference - presence_penalty: Optional[float] = 0, # vllm, lightllm - frequency_penalty: Optional[float] = 0, # vllm, lightllm - top_k: Optional[int] = -1, - top_p: Optional[float] = 1, + repetition_penalty: Optional[float] = None, # text-generation-inference + presence_penalty: float = 0.0, # vllm, lightllm + frequency_penalty: float = 0.0, # vllm, lightllm + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -83,13 +83,13 @@ async def acreate( https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 
1.0 means no penalty - presence_penalty (Optional[float]): + presence_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (Optional[float]): + frequency_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details @@ -236,11 +236,11 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, # text-generation-inference - presence_penalty: Optional[float] = 0, # vllm, lightllm - frequency_penalty: Optional[float] = 0, # vllm, lightllm - top_k: Optional[int] = -1, - top_p: Optional[float] = 1, + repetition_penalty: Optional[float] = None, # text-generation-inference + presence_penalty: float = 0.0, # vllm, lightllm + frequency_penalty: float = 0.0, # vllm, lightllm + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -287,13 +287,13 @@ def create( https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (Optional[float]): + presence_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (Optional[float]): + frequency_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. 
https://platform.openai.com/docs/guides/gpt/parameter-details diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index a3f4461a..f7f7782d 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,11 +268,11 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=1, ge=1) - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) - top_k: Optional[int] = Field(default=-1, ge=-1) - top_p: Optional[float] = Field(default=1, gt=0, le=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + top_k: Optional[int] = Field(default=None, ge=-1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) class TokenOutput(BaseModel): @@ -334,11 +334,11 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=1, ge=1) - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) - top_k: Optional[int] = Field(default=-1, ge=-1) - top_p: Optional[float] = Field(default=1, gt=0, le=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + top_k: Optional[int] = Field(default=None, ge=-1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index ad9602bb..df943970 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -104,7 +104,7 @@ class CompletionSyncV1Request(BaseModel): prompt: str max_new_tokens: int - temperature: float = Field(ge=0, le=1) + temperature: float = Field(ge=0.0, le=1.0) """ Temperature of the sampling. Setting to 0 equals to greedy sampling. """ @@ -117,24 +117,24 @@ class CompletionSyncV1Request(BaseModel): Whether to return the log probabilities of the tokens. """ - repetition_penalty: Optional[float] = Field(default=1, ge=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) """ Only affects text-generation-inference The parameter for repetition penalty. 1.0 means no penalty """ - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ Only affects vllm, lightllm presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 
0.0 means no penalty """ - top_k: Optional[int] = Field(default=-1, ge=-1) + top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = Field(default=1, gt=0, le=1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) """ Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ @@ -167,7 +167,7 @@ class CompletionStreamV1Request(BaseModel): prompt: str max_new_tokens: int - temperature: float = Field(ge=0, le=1) + temperature: float = Field(ge=0.0, le=1.0) """ Temperature of the sampling. Setting to 0 equals to greedy sampling. """ @@ -180,24 +180,24 @@ class CompletionStreamV1Request(BaseModel): Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - repetition_penalty: Optional[float] = Field(default=1, ge=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) """ Only affects text-generation-inference The parameter for repetition penalty. 1.0 means no penalty """ - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ Only affects vllm, lightllm presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=-1, ge=-1) + top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = Field(default=1, gt=0, le=1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) """ Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index aac35008..330cf32f 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -831,6 +831,34 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens +def validate_completion_params(inference_framework: LLMInferenceFramework, request): + # can't pass mypy check with Union[CompletionSyncV1Request, CompletionStreamV1Request], doesn't support intersection types? + + if request.temperature == 0: # greedy, do_sample is False + if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: + raise ObjectHasInvalidValueException( + "top_k and top_p can't be enabled when temperature is 0." + ) + + if request.top_k == 0: + raise ObjectHasInvalidValueException( + "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + ) + + if request.top_k in [-1, None]: # disable top_k, consider all tokens. + if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_k = None + elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_k = -1 + if request.top_p in [1.0, None]: # disable top_p, consider all tokens. 
+ if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_p = None + elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_p = 1.0 + + return request + + class CompletionSyncV1UseCase: """ Use case for running a prompt completion on an LLM endpoint. @@ -937,8 +965,6 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k - request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -977,6 +1003,8 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) + request = validate_completion_params(endpoint_content.inference_framework, request) + if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { "prompts": [request.prompt], @@ -1031,11 +1059,7 @@ async def execute( if request.temperature > 0: tgi_args["parameters"]["temperature"] = request.temperature tgi_args["parameters"]["do_sample"] = True - if request.top_k == -1: # tgi set to None to consider all tokens. - request.top_k = None tgi_args["parameters"]["top_k"] = request.top_k - if request.top_p == 1: # tgi set to None to consider all tokens. - request.top_p = None tgi_args["parameters"]["top_p"] = request.top_p else: tgi_args["parameters"]["do_sample"] = False @@ -1184,8 +1208,6 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k - request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -1224,6 +1246,7 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) + request = validate_completion_params(model_content.inference_framework, request) args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED: @@ -1253,11 +1276,7 @@ async def execute( if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True - if request.top_k == -1: # tgi set to None to consider all tokens. - request.top_k = None args["parameters"]["top_k"] = request.top_k - if request.top_p == 1: # tgi set to None to consider all tokens. 
- request.top_p = None args["parameters"]["top_p"] = request.top_p else: args["parameters"]["do_sample"] = False From 63d174c08a400e56d925ab55f5ca57b50e97458d Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 01:16:21 +0000 Subject: [PATCH 6/8] remove repetition_penalty --- clients/python/llmengine/completion.py | 26 +++-------------- clients/python/llmengine/data_types.py | 2 -- .../model_engine_server/common/dtos/llms.py | 28 +++++++------------ .../use_cases/llm_model_endpoint_use_cases.py | 10 +++---- 4 files changed, 19 insertions(+), 47 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8d4481b7..ec52396e 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,7 +33,6 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, # text-generation-inference presence_penalty: float = 0.0, # vllm, lightllm frequency_penalty: float = 0.0, # vllm, lightllm top_k: Optional[int] = None, @@ -77,20 +76,14 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. - repetition_penalty (Optional[float]): - Only affects: text-generation-inference - The parameter for repetition penalty - https://arxiv.org/pdf/1909.05858.pdf - Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -195,7 +188,6 @@ async def _acreate_stream( temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -220,7 +212,6 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -236,7 +227,6 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, # text-generation-inference presence_penalty: float = 0.0, # vllm, lightllm frequency_penalty: float = 0.0, # vllm, lightllm top_k: Optional[int] = None, @@ -281,20 +271,14 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. 
- repetition_penalty (Optional[float]): - Only affects: text-generation-inference - The parameter for repetition penalty - https://arxiv.org/pdf/1909.05858.pdf - Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -389,7 +373,6 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -403,7 +386,6 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index f7f7782d..29e0a529 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,7 +268,6 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) @@ -334,7 +333,6 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index df943970..2f59e065 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,20 +116,16 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects text-generation-inference - The parameter for repetition penalty. 1.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on whether they appear in the text so far. 
0.0 means no penalty """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects vllm, lightllm - presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty - frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. @@ -179,20 +175,16 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects text-generation-inference - The parameter for repetition penalty. 1.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects vllm, lightllm - presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty - frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 330cf32f..c67d9226 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -8,7 +8,7 @@ import math import os from dataclasses import asdict -from typing import Any, AsyncIterable, Dict, List, Optional +from typing import Any, AsyncIterable, Dict, List, Optional, Union from uuid import uuid4 from model_engine_server.common.config import hmi_config @@ -831,8 +831,10 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens -def validate_completion_params(inference_framework: LLMInferenceFramework, request): - # can't pass mypy check with Union[CompletionSyncV1Request, CompletionStreamV1Request], doesn't support intersection types? 
+def validate_completion_params( + inference_framework: LLMInferenceFramework, + request: Union[CompletionSyncV1Request, CompletionStreamV1Request], +) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: if request.temperature == 0: # greedy, do_sample is False if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: @@ -1051,7 +1053,6 @@ async def execute( "parameters": { "max_new_tokens": request.max_new_tokens, "decoder_input_details": True, - "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1268,7 +1269,6 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, - "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: From aa646739911ce51bc535b88786fa14d5c7e1f196 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 07:16:51 +0000 Subject: [PATCH 7/8] add back optional, update validation function --- clients/python/llmengine/completion.py | 16 ++--- clients/python/llmengine/data_types.py | 8 +-- .../model_engine_server/common/dtos/llms.py | 8 +-- .../use_cases/llm_model_endpoint_use_cases.py | 64 ++++++++++++------- 4 files changed, 58 insertions(+), 38 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index ec52396e..507754d8 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,8 +33,8 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - presence_penalty: float = 0.0, # vllm, lightllm - frequency_penalty: float = 0.0, # vllm, lightllm + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, @@ -76,13 +76,13 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. - presence_penalty (float): + presence_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (float): + frequency_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details @@ -227,8 +227,8 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - presence_penalty: float = 0.0, # vllm, lightllm - frequency_penalty: float = 0.0, # vllm, lightllm + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, @@ -271,13 +271,13 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. 
- presence_penalty (float): + presence_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (float): + frequency_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 29e0a529..ecfcd884 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,8 +268,8 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) @@ -333,8 +333,8 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 2f59e065..27a12ddc 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,12 +116,12 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty """ - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty @@ -175,12 +175,12 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far. 
0.0 means no penalty """ - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index c67d9226..eadb4a22 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -831,32 +831,50 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens -def validate_completion_params( +def validate_and_update_completion_params( inference_framework: LLMInferenceFramework, request: Union[CompletionSyncV1Request, CompletionStreamV1Request], -) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: - - if request.temperature == 0: # greedy, do_sample is False - if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: +): + # top_k, top_p + if inference_framework in [ + LLMInferenceFramework.TEXT_GENERATION_INFERENCE, + LLMInferenceFramework.VLLM, + LLMInferenceFramework.LIGHTLLM, + ]: + if request.temperature == 0: + if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: + raise ObjectHasInvalidValueException( + "top_k and top_p can't be enabled when temperature is 0." + ) + if request.top_k == 0: raise ObjectHasInvalidValueException( - "top_k and top_p can't be enabled when temperature is 0." + "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + ) + if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_k = None if request.top_k == -1 else request.top_k + request.top_p = None if request.top_p == 1.0 else request.top_p + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_k = -1 if request.top_k is None else request.top_k + request.top_p = 1.0 if request.top_p is None else request.top_p + else: + if request.top_k or request.top_p: + raise ObjectHasInvalidValueException( + "top_k and top_p are only supported in text-generation-inference, vllm, lightllm." ) - if request.top_k == 0: - raise ObjectHasInvalidValueException( - "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + # presence_penalty, frequency_penalty + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.presence_penalty = ( + 0.0 if request.presence_penalty is None else request.presence_penalty ) - - if request.top_k in [-1, None]: # disable top_k, consider all tokens. - if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: - request.top_k = None - elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: - request.top_k = -1 - if request.top_p in [1.0, None]: # disable top_p, consider all tokens. 
- if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: - request.top_p = None - elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: - request.top_p = 1.0 + request.frequency_penalty = ( + 0.0 if request.frequency_penalty is None else request.frequency_penalty + ) + else: + if request.presence_penalty or request.frequency_penalty: + raise ObjectHasInvalidValueException( + "presence_penalty and frequency_penalty are only supported in vllm, lightllm." + ) return request @@ -1005,7 +1023,9 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_completion_params(endpoint_content.inference_framework, request) + request = validate_and_update_completion_params( + endpoint_content.inference_framework, request + ) if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { @@ -1247,7 +1267,7 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_completion_params(model_content.inference_framework, request) + request = validate_and_update_completion_params(model_content.inference_framework, request) args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED: From 4b1b4fb56a419bb9d0d1348ad56b6a7435bf10a2 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 20:00:17 +0000 Subject: [PATCH 8/8] type check --- .../use_cases/llm_model_endpoint_use_cases.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index eadb4a22..241184fb 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -834,7 +834,7 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: def validate_and_update_completion_params( inference_framework: LLMInferenceFramework, request: Union[CompletionSyncV1Request, CompletionStreamV1Request], -): +) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: # top_k, top_p if inference_framework in [ LLMInferenceFramework.TEXT_GENERATION_INFERENCE, @@ -1023,9 +1023,14 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_and_update_completion_params( + validated_request = validate_and_update_completion_params( endpoint_content.inference_framework, request ) + if not isinstance(validated_request, CompletionSyncV1Request): + raise ValueError( + f"request has type {validated_request.__class__.__name__}, expected type CompletionSyncV1Request" + ) + request = validated_request if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { @@ -1267,7 +1272,14 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_and_update_completion_params(model_content.inference_framework, request) + validated_request = validate_and_update_completion_params( + model_content.inference_framework, request + ) + if not isinstance(validated_request, CompletionStreamV1Request): + raise ValueError( + f"request has 
type {validated_request.__class__.__name__}, expected type CompletionStreamV1Request" + ) + request = validated_request args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED:
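
Note (not part of the patch): a minimal usage sketch of the Python client once this series is applied, showing the newly exposed sampling parameters together. The endpoint name and prompt are placeholders, and the final line assumes the client's existing CompletionSyncResponse shape (output.text); it is an illustration, not the patch author's example.

from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",        # placeholder endpoint name, not from the patch
    prompt="Why is the sky blue?",
    max_new_tokens=64,
    temperature=0.7,           # must be > 0, otherwise top_k / top_p are rejected
    presence_penalty=0.5,      # vllm / lightllm only, range [0.0, 2.0]
    frequency_penalty=0.5,     # vllm / lightllm only, range [0.0, 2.0]
    top_k=50,                  # -1 or None disables top-k filtering
    top_p=0.9,                 # 1.0 or None disables nucleus sampling
)
print(response.output.text)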
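
Note (not part of the patch): a standalone sketch of the Field constraint pattern the request models above use, assuming only Pydantic. Field names mirror the patch; the model name is invented for the example. Omitted parameters stay None, which is what lets validate_and_update_completion_params fill in per-framework defaults (None for text-generation-inference, -1 / 1.0 for vllm and lightllm), while out-of-range values are rejected at request-parsing time.

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class SamplingParams(BaseModel):
    # Same bounds as CompletionSyncV1Request / CompletionStreamV1Request above.
    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_k: Optional[int] = Field(default=None, ge=-1)
    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)

# Omitted fields stay None so the server can apply framework-specific defaults later.
print(SamplingParams())

# Out-of-range values fail validation before any inference call is made.
try:
    SamplingParams(top_p=0.0)  # violates gt=0.0
except ValidationError as err:
    print(err)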