From b9f876682715a43256302133b6a4ccf4834a2fb6 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 00:46:21 +0000 Subject: [PATCH 1/8] add repetition_penalty, top_k, top_p --- clients/python/llmengine/completion.py | 36 +++++++++++++++++++ clients/python/llmengine/data_types.py | 6 ++++ .../model_engine_server/common/dtos/llms.py | 24 +++++++++++++ .../use_cases/llm_model_endpoint_use_cases.py | 15 ++++++++ 4 files changed, 81 insertions(+) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8cecd765..b6ca729f 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,6 +33,9 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, + repetition_penalty: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -72,6 +75,15 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. + repetition_penalty (Optional[float]): + The parameter for repetition penalty. 1.0 means no penalty. + + top_k (Optional[int]): + Integer that controls the number of top tokens to consider. -1 means consider all tokens. + + top_p (Optional[float]): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -164,6 +176,9 @@ async def _acreate_stream( temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, timeout=timeout, ) @@ -184,6 +199,9 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ) @classmethod @@ -195,6 +213,9 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, + repetition_penalty: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -235,6 +256,15 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. + repetition_penalty (Optional[float]): + The parameter for repetition penalty. 1.0 means no penalty. + + top_k (Optional[int]): + Integer that controls the number of top tokens to consider. -1 means consider all tokens. + + top_p (Optional[float]): + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. 
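For illustration only (not part of the diff): a minimal sketch of how the new sampling arguments added above might be passed through the Python client; the endpoint name and values are hypothetical.

from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",      # hypothetical endpoint name
    prompt="Why is the sky blue?",
    max_new_tokens=64,
    temperature=0.7,
    repetition_penalty=1.2,  # >1.0 discourages repeated tokens
    top_k=50,                # only sample from the 50 most likely tokens
    top_p=0.9,               # nucleus sampling over the top 90% of probability mass
)
print(response)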
@@ -317,6 +347,9 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ) else: @@ -326,6 +359,9 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, + repetition_penalty=repetition_penalty, + top_k=top_k, + top_p=top_p, ).dict() response = cls.post_sync( resource_name=f"v1/llm/completions-sync?model_endpoint_name={model}", diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 1a8baba3..bcc8bc90 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,6 +268,9 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=False) + top_k: Optional[int] = Field(default=False) + top_p: Optional[float] = Field(default=False) class TokenOutput(BaseModel): @@ -329,6 +332,9 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=False) + top_k: Optional[int] = Field(default=False) + top_p: Optional[float] = Field(default=False) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 2735f577..7b4ccf47 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,6 +116,18 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ + repetition_penalty: Optional[float] = None + """ + The parameter for repetition penalty. 1.0 means no penalty. + """ + top_k: Optional[int] = None + """ + Integer that controls the number of top tokens to consider. + """ + top_p: Optional[float] = None + """ + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + """ class TokenOutput(BaseModel): @@ -157,6 +169,18 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ + repetition_penalty: Optional[float] = None + """ + The parameter for repetition penalty. 1.0 means no penalty. + """ + top_k: Optional[int] = None + """ + Integer that controls the number of top tokens to consider. + """ + top_p: Optional[float] = None + """ + Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 
+ """ class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 9e4b6ba6..f0718ef2 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -1021,6 +1021,7 @@ async def execute( "parameters": { "max_new_tokens": request.max_new_tokens, "decoder_input_details": True, + "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1028,6 +1029,13 @@ async def execute( if request.temperature > 0: tgi_args["parameters"]["temperature"] = request.temperature tgi_args["parameters"]["do_sample"] = True + if request.top_k == -1: # tgi set to None to consider all tokens. + request.top_k = None + tgi_args["parameters"]["top_k"] = request.top_k + if request.top_p == 1: # tgi set to None to consider all tokens. + request.top_p = None + else: + tgi_args["parameters"]["do_sample"] = False inference_request = SyncEndpointPredictV1Request( args=tgi_args, @@ -1060,6 +1068,13 @@ async def execute( if request.stop_sequences is not None: vllm_args["stop"] = request.stop_sequences vllm_args["temperature"] = request.temperature + if request.temperature > 0: + if request.top_k is None: # vllm set to -1 to consider all tokens. + request.top_k = -1 + vllm_args["top_k"] = request.top_k + if request.top_p is None: # vllm set to 1 to consider all tokens. + request.top_p = 1 + vllm_args["top_p"] = request.top_p if request.return_token_log_probs: vllm_args["logprobs"] = 1 From 1847fa2c9a3e44b4c6fd98941b14a9162a0424bd Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 03:25:26 +0000 Subject: [PATCH 2/8] add frequency_penalty, presence_penalty, add lightllm --- clients/python/llmengine/completion.py | 64 +++++++++++++++---- clients/python/llmengine/data_types.py | 16 +++-- .../model_engine_server/common/dtos/llms.py | 44 +++++++++---- .../use_cases/llm_model_endpoint_use_cases.py | 34 ++++++++-- 4 files changed, 124 insertions(+), 34 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index b6ca729f..7c759299 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,9 +33,11 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, + repetition_penalty: Optional[float] = 1, + presence_penalty: Optional[float] = 0, + frequency_penalty: Optional[float] = 0, + top_k: Optional[int] = -1, + top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -76,13 +78,27 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - The parameter for repetition penalty. 1.0 means no penalty. + The parameter for repetition penalty + https://arxiv.org/pdf/1909.05858.pdf + Range: [1.0, infinity). 1.0 means no penalty + + presence_penalty (Optional[float]): + Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. 
+ https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. + + frequency_penalty (Optional[float]): + Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. top_k (Optional[int]): - Integer that controls the number of top tokens to consider. -1 means consider all tokens. + Integer that controls the number of top tokens to consider. + Range: [1, infinity). -1 means consider all tokens. top_p (Optional[float]): - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + Float that controls the cumulative probability of the top tokens to consider. + Range: (0.0, 1.0]. 1.0 means consider all tokens. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -177,6 +193,8 @@ async def _acreate_stream( stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, timeout=timeout, @@ -200,6 +218,8 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ) @@ -213,9 +233,11 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, + repetition_penalty: Optional[float] = 1, + presence_penalty: Optional[float] = 0, + frequency_penalty: Optional[float] = 0, + top_k: Optional[int] = -1, + top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -257,13 +279,27 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - The parameter for repetition penalty. 1.0 means no penalty. + The parameter for repetition penalty + https://arxiv.org/pdf/1909.05858.pdf + Range: [1.0, infinity). 1.0 means no penalty + + presence_penalty (Optional[float]): + Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. + + frequency_penalty (Optional[float]): + Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. + https://platform.openai.com/docs/guides/gpt/parameter-details + Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. top_k (Optional[int]): - Integer that controls the number of top tokens to consider. -1 means consider all tokens. + Integer that controls the number of top tokens to consider. + Range: [1, infinity). -1 means consider all tokens. 
top_p (Optional[float]): - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. 1.0 means consider all tokens. + Float that controls the cumulative probability of the top tokens to consider. + Range: (0.0, 1.0]. 1.0 means consider all tokens. timeout (int): Timeout in seconds. This is the maximum amount of time you are willing to wait for a response. @@ -348,6 +384,8 @@ def _create_stream(**kwargs): stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ) @@ -360,6 +398,8 @@ def _create_stream(**kwargs): stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, top_k=top_k, top_p=top_p, ).dict() diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index bcc8bc90..a3f4461a 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,9 +268,11 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=False) - top_k: Optional[int] = Field(default=False) - top_p: Optional[float] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=1, ge=1) + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + top_k: Optional[int] = Field(default=-1, ge=-1) + top_p: Optional[float] = Field(default=1, gt=0, le=1) class TokenOutput(BaseModel): @@ -332,9 +334,11 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=False) - top_k: Optional[int] = Field(default=False) - top_p: Optional[float] = Field(default=False) + repetition_penalty: Optional[float] = Field(default=1, ge=1) + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + top_k: Optional[int] = Field(default=-1, ge=-1) + top_p: Optional[float] = Field(default=1, gt=0, le=1) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 7b4ccf47..ad9602bb 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,17 +116,27 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - repetition_penalty: Optional[float] = None + + repetition_penalty: Optional[float] = Field(default=1, ge=1) + """ + Only affects text-generation-inference + The parameter for repetition penalty. 1.0 means no penalty """ - The parameter for repetition penalty. 1.0 means no penalty. 
+ presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) """ - top_k: Optional[int] = None + Only affects vllm, lightllm + presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty + frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + """ + + top_k: Optional[int] = Field(default=-1, ge=-1) """ - Integer that controls the number of top tokens to consider. + Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = None + top_p: Optional[float] = Field(default=1, gt=0, le=1) """ - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ @@ -169,17 +179,27 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - repetition_penalty: Optional[float] = None + + repetition_penalty: Optional[float] = Field(default=1, ge=1) + """ + Only affects text-generation-inference + The parameter for repetition penalty. 1.0 means no penalty """ - The parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) + frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) """ - top_k: Optional[int] = None + Only affects vllm, lightllm + presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty + frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + """ + + top_k: Optional[int] = Field(default=-1, ge=-1) """ - Integer that controls the number of top tokens to consider. + Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = None + top_p: Optional[float] = Field(default=1, gt=0, le=1) """ - Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. + Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index f0718ef2..bc383c98 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -937,6 +937,8 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) + if request.top_k == 0: + request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -1034,6 +1036,7 @@ async def execute( tgi_args["parameters"]["top_k"] = request.top_k if request.top_p == 1: # tgi set to None to consider all tokens. 
request.top_p = None + tgi_args["parameters"]["top_p"] = request.top_p else: tgi_args["parameters"]["do_sample"] = False @@ -1064,16 +1067,14 @@ async def execute( vllm_args: Any = { "prompt": request.prompt, "max_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, } if request.stop_sequences is not None: vllm_args["stop"] = request.stop_sequences vllm_args["temperature"] = request.temperature if request.temperature > 0: - if request.top_k is None: # vllm set to -1 to consider all tokens. - request.top_k = -1 vllm_args["top_k"] = request.top_k - if request.top_p is None: # vllm set to 1 to consider all tokens. - request.top_p = 1 vllm_args["top_p"] = request.top_p if request.return_token_log_probs: vllm_args["logprobs"] = 1 @@ -1105,12 +1106,16 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, }, } # TODO: implement stop sequences if request.temperature > 0: lightllm_args["parameters"]["temperature"] = request.temperature lightllm_args["parameters"]["do_sample"] = True + lightllm_args["top_k"] = request.top_k + lightllm_args["top_p"] = request.top_p else: lightllm_args["parameters"]["do_sample"] = False if request.return_token_log_probs: @@ -1179,6 +1184,9 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) + if request.top_k == 0: + request.top_k = -1 + model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None ) @@ -1237,6 +1245,7 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1244,14 +1253,27 @@ async def execute( if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True + if request.top_k == -1: # tgi set to None to consider all tokens. + request.top_k = None + args["parameters"]["top_k"] = request.top_k + if request.top_p == 1: # tgi set to None to consider all tokens. 
+ request.top_p = None + args["parameters"]["top_p"] = request.top_p + else: + args["parameters"]["do_sample"] = False elif model_content.inference_framework == LLMInferenceFramework.VLLM: args = { "prompt": request.prompt, "max_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, } if request.stop_sequences is not None: args["stop"] = request.stop_sequences args["temperature"] = request.temperature + if request.temperature > 0: + args["top_k"] = request.top_k + args["top_p"] = request.top_p if request.return_token_log_probs: args["logprobs"] = 1 args["stream"] = True @@ -1260,12 +1282,16 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, + "presence_penalty": request.presence_penalty, + "frequency_penalty": request.frequency_penalty, }, } # TODO: stop sequences if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True + args["parameters"]["top_k"] = request.top_k + args["parameters"]["top_p"] = request.top_p else: args["parameters"]["do_sample"] = False if request.return_token_log_probs: From e5de486a71b284e1dbd71a71afd7908f4f16841f Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 16:26:48 +0000 Subject: [PATCH 3/8] add comments --- clients/python/llmengine/completion.py | 18 ++++++++++++------ .../use_cases/llm_model_endpoint_use_cases.py | 4 ++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 7c759299..8dacbaf3 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,9 +33,9 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, - presence_penalty: Optional[float] = 0, - frequency_penalty: Optional[float] = 0, + repetition_penalty: Optional[float] = 1, # text-generation-inference + presence_penalty: Optional[float] = 0, # vllm, lightllm + frequency_penalty: Optional[float] = 0, # vllm, lightllm top_k: Optional[int] = -1, top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, @@ -78,16 +78,19 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): + *** Only affects: text-generation-inference *** The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. 
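For illustration only (not part of the diff): roughly how these knobs are forwarded to two of the supported backends, following the tgi_args / vllm_args construction in this series; the prompt and values are made up.

# text-generation-inference: sampling options ride inside "parameters",
# and None is the sentinel for "consider all tokens"
tgi_args = {
    "inputs": "Why is the sky blue?",
    "parameters": {
        "max_new_tokens": 64,
        "repetition_penalty": 1.2,
        "temperature": 0.7,
        "do_sample": True,
        "top_k": 50,    # None would disable top-k filtering
        "top_p": 0.9,   # None would disable nucleus filtering
    },
}

# vLLM: flat arguments, with -1 / 1.0 as the "consider all tokens" sentinels
vllm_args = {
    "prompt": "Why is the sky blue?",
    "max_tokens": 64,
    "presence_penalty": 0.0,
    "frequency_penalty": 0.5,
    "temperature": 0.7,
    "top_k": 50,    # -1 would consider all tokens
    "top_p": 0.9,   # 1.0 would consider all tokens
}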
@@ -233,9 +236,9 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, - presence_penalty: Optional[float] = 0, - frequency_penalty: Optional[float] = 0, + repetition_penalty: Optional[float] = 1, # text-generation-inference + presence_penalty: Optional[float] = 0, # vllm, lightllm + frequency_penalty: Optional[float] = 0, # vllm, lightllm top_k: Optional[int] = -1, top_p: Optional[float] = 1, timeout: int = COMPLETION_TIMEOUT, @@ -279,16 +282,19 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): + *** Only affects: text-generation-inference *** The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): + *** Only affects: vllm, lightllm *** Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index bc383c98..aac35008 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -937,7 +937,7 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: + if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( @@ -1184,7 +1184,7 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: + if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( From 3d845e2d81800ba6d49712924be6176149f716f2 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 17:08:13 +0000 Subject: [PATCH 4/8] fix --- clients/python/llmengine/completion.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8dacbaf3..7179fc75 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -78,19 +78,19 @@ async def acreate( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - *** Only affects: text-generation-inference *** + Only affects: text-generation-inference The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 
1.0 means no penalty presence_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -282,19 +282,19 @@ def create( When True, the response will include a list of tokens and their log probabilities. repetition_penalty (Optional[float]): - *** Only affects: text-generation-inference *** + Only affects: text-generation-inference The parameter for repetition penalty https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty presence_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (Optional[float]): - *** Only affects: vllm, lightllm *** + Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. From ffd93416593aab9b96ef8b8936a3f71dee019f61 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Thu, 28 Sep 2023 22:24:02 +0000 Subject: [PATCH 5/8] fix Optional, add params validation --- clients/python/llmengine/completion.py | 28 ++++++------ clients/python/llmengine/data_types.py | 20 ++++----- .../model_engine_server/common/dtos/llms.py | 24 +++++------ .../use_cases/llm_model_endpoint_use_cases.py | 43 +++++++++++++------ 4 files changed, 67 insertions(+), 48 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 7179fc75..8d4481b7 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,11 +33,11 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, # text-generation-inference - presence_penalty: Optional[float] = 0, # vllm, lightllm - frequency_penalty: Optional[float] = 0, # vllm, lightllm - top_k: Optional[int] = -1, - top_p: Optional[float] = 1, + repetition_penalty: Optional[float] = None, # text-generation-inference + presence_penalty: float = 0.0, # vllm, lightllm + frequency_penalty: float = 0.0, # vllm, lightllm + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, AsyncIterable[CompletionStreamResponse]]: @@ -83,13 +83,13 @@ async def acreate( https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 
1.0 means no penalty - presence_penalty (Optional[float]): + presence_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (Optional[float]): + frequency_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details @@ -236,11 +236,11 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = 1, # text-generation-inference - presence_penalty: Optional[float] = 0, # vllm, lightllm - frequency_penalty: Optional[float] = 0, # vllm, lightllm - top_k: Optional[int] = -1, - top_p: Optional[float] = 1, + repetition_penalty: Optional[float] = None, # text-generation-inference + presence_penalty: float = 0.0, # vllm, lightllm + frequency_penalty: float = 0.0, # vllm, lightllm + top_k: Optional[int] = None, + top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, stream: bool = False, ) -> Union[CompletionSyncResponse, Iterator[CompletionStreamResponse]]: @@ -287,13 +287,13 @@ def create( https://arxiv.org/pdf/1909.05858.pdf Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (Optional[float]): + presence_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (Optional[float]): + frequency_penalty (float): Only affects: vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. 
https://platform.openai.com/docs/guides/gpt/parameter-details diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index a3f4461a..f7f7782d 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,11 +268,11 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=1, ge=1) - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) - top_k: Optional[int] = Field(default=-1, ge=-1) - top_p: Optional[float] = Field(default=1, gt=0, le=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + top_k: Optional[int] = Field(default=None, ge=-1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) class TokenOutput(BaseModel): @@ -334,11 +334,11 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=1, ge=1) - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) - top_k: Optional[int] = Field(default=-1, ge=-1) - top_p: Optional[float] = Field(default=1, gt=0, le=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + top_k: Optional[int] = Field(default=None, ge=-1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) class CompletionStreamOutput(BaseModel): diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index ad9602bb..df943970 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -104,7 +104,7 @@ class CompletionSyncV1Request(BaseModel): prompt: str max_new_tokens: int - temperature: float = Field(ge=0, le=1) + temperature: float = Field(ge=0.0, le=1.0) """ Temperature of the sampling. Setting to 0 equals to greedy sampling. """ @@ -117,24 +117,24 @@ class CompletionSyncV1Request(BaseModel): Whether to return the log probabilities of the tokens. """ - repetition_penalty: Optional[float] = Field(default=1, ge=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) """ Only affects text-generation-inference The parameter for repetition penalty. 1.0 means no penalty """ - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ Only affects vllm, lightllm presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 
0.0 means no penalty """ - top_k: Optional[int] = Field(default=-1, ge=-1) + top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = Field(default=1, gt=0, le=1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) """ Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ @@ -167,7 +167,7 @@ class CompletionStreamV1Request(BaseModel): prompt: str max_new_tokens: int - temperature: float = Field(ge=0, le=1) + temperature: float = Field(ge=0.0, le=1.0) """ Temperature of the sampling. Setting to 0 equals to greedy sampling. """ @@ -180,24 +180,24 @@ class CompletionStreamV1Request(BaseModel): Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - repetition_penalty: Optional[float] = Field(default=1, ge=1) + repetition_penalty: Optional[float] = Field(default=None, ge=1.0) """ Only affects text-generation-inference The parameter for repetition penalty. 1.0 means no penalty """ - presence_penalty: Optional[float] = Field(default=0, ge=0, le=2) - frequency_penalty: Optional[float] = Field(default=0, ge=0, le=2) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ Only affects vllm, lightllm presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=-1, ge=-1) + top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. """ - top_p: Optional[float] = Field(default=1, gt=0, le=1) + top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) """ Controls the cumulative probability of the top tokens to consider. 1.0 means consider all tokens. """ diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index aac35008..330cf32f 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -831,6 +831,34 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens +def validate_completion_params(inference_framework: LLMInferenceFramework, request): + # can't pass mypy check with Union[CompletionSyncV1Request, CompletionStreamV1Request], doesn't support intersection types? + + if request.temperature == 0: # greedy, do_sample is False + if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: + raise ObjectHasInvalidValueException( + "top_k and top_p can't be enabled when temperature is 0." + ) + + if request.top_k == 0: + raise ObjectHasInvalidValueException( + "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + ) + + if request.top_k in [-1, None]: # disable top_k, consider all tokens. + if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_k = None + elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_k = -1 + if request.top_p in [1.0, None]: # disable top_p, consider all tokens. 
+ if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_p = None + elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_p = 1.0 + + return request + + class CompletionSyncV1UseCase: """ Use case for running a prompt completion on an LLM endpoint. @@ -937,8 +965,6 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k - request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -977,6 +1003,8 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) + request = validate_completion_params(endpoint_content.inference_framework, request) + if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { "prompts": [request.prompt], @@ -1031,11 +1059,7 @@ async def execute( if request.temperature > 0: tgi_args["parameters"]["temperature"] = request.temperature tgi_args["parameters"]["do_sample"] = True - if request.top_k == -1: # tgi set to None to consider all tokens. - request.top_k = None tgi_args["parameters"]["top_k"] = request.top_k - if request.top_p == 1: # tgi set to None to consider all tokens. - request.top_p = None tgi_args["parameters"]["top_p"] = request.top_p else: tgi_args["parameters"]["do_sample"] = False @@ -1184,8 +1208,6 @@ async def execute( request_id = str(uuid4()) add_trace_request_id(request_id) - if request.top_k == 0: # top_k can't be 0, only takes >= 1, or -1/None to disable top_k - request.top_k = -1 model_endpoints = await self.llm_model_endpoint_service.list_llm_model_endpoints( owner=user.team_id, name=model_endpoint_name, order_by=None @@ -1224,6 +1246,7 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) + request = validate_completion_params(model_content.inference_framework, request) args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED: @@ -1253,11 +1276,7 @@ async def execute( if request.temperature > 0: args["parameters"]["temperature"] = request.temperature args["parameters"]["do_sample"] = True - if request.top_k == -1: # tgi set to None to consider all tokens. - request.top_k = None args["parameters"]["top_k"] = request.top_k - if request.top_p == 1: # tgi set to None to consider all tokens. 
- request.top_p = None args["parameters"]["top_p"] = request.top_p else: args["parameters"]["do_sample"] = False From 63d174c08a400e56d925ab55f5ca57b50e97458d Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 01:16:21 +0000 Subject: [PATCH 6/8] remove repetition_penalty --- clients/python/llmengine/completion.py | 26 +++-------------- clients/python/llmengine/data_types.py | 2 -- .../model_engine_server/common/dtos/llms.py | 28 +++++++------------ .../use_cases/llm_model_endpoint_use_cases.py | 10 +++---- 4 files changed, 19 insertions(+), 47 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index 8d4481b7..ec52396e 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,7 +33,6 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, # text-generation-inference presence_penalty: float = 0.0, # vllm, lightllm frequency_penalty: float = 0.0, # vllm, lightllm top_k: Optional[int] = None, @@ -77,20 +76,14 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. - repetition_penalty (Optional[float]): - Only affects: text-generation-inference - The parameter for repetition penalty - https://arxiv.org/pdf/1909.05858.pdf - Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -195,7 +188,6 @@ async def _acreate_stream( temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -220,7 +212,6 @@ async def _acreate_sync(**kwargs) -> CompletionSyncResponse: temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -236,7 +227,6 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - repetition_penalty: Optional[float] = None, # text-generation-inference presence_penalty: float = 0.0, # vllm, lightllm frequency_penalty: float = 0.0, # vllm, lightllm top_k: Optional[int] = None, @@ -281,20 +271,14 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. 
- repetition_penalty (Optional[float]): - Only affects: text-generation-inference - The parameter for repetition penalty - https://arxiv.org/pdf/1909.05858.pdf - Range: [1.0, infinity). 1.0 means no penalty - presence_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. frequency_penalty (float): - Only affects: vllm, lightllm + Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. @@ -389,7 +373,6 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, @@ -403,7 +386,6 @@ def _create_stream(**kwargs): temperature=temperature, stop_sequences=stop_sequences, return_token_log_probs=return_token_log_probs, - repetition_penalty=repetition_penalty, presence_penalty=presence_penalty, frequency_penalty=frequency_penalty, top_k=top_k, diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index f7f7782d..29e0a529 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,7 +268,6 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) @@ -334,7 +333,6 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index df943970..2f59e065 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,20 +116,16 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects text-generation-inference - The parameter for repetition penalty. 1.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on whether they appear in the text so far. 
0.0 means no penalty """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects vllm, lightllm - presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty - frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. @@ -179,20 +175,16 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - - repetition_penalty: Optional[float] = Field(default=None, ge=1.0) + presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects text-generation-inference - The parameter for repetition penalty. 1.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) """ - Only affects vllm, lightllm - presence_penalty: Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty - frequency_penalty: Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty + Only supported in vllm, lightllm + Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty """ - top_k: Optional[int] = Field(default=None, ge=-1) """ Controls the number of top tokens to consider. -1 means consider all tokens. diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index 330cf32f..c67d9226 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -8,7 +8,7 @@ import math import os from dataclasses import asdict -from typing import Any, AsyncIterable, Dict, List, Optional +from typing import Any, AsyncIterable, Dict, List, Optional, Union from uuid import uuid4 from model_engine_server.common.config import hmi_config @@ -831,8 +831,10 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens -def validate_completion_params(inference_framework: LLMInferenceFramework, request): - # can't pass mypy check with Union[CompletionSyncV1Request, CompletionStreamV1Request], doesn't support intersection types? 
+def validate_completion_params( + inference_framework: LLMInferenceFramework, + request: Union[CompletionSyncV1Request, CompletionStreamV1Request], +) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: if request.temperature == 0: # greedy, do_sample is False if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: @@ -1051,7 +1053,6 @@ async def execute( "parameters": { "max_new_tokens": request.max_new_tokens, "decoder_input_details": True, - "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: @@ -1268,7 +1269,6 @@ async def execute( "inputs": request.prompt, "parameters": { "max_new_tokens": request.max_new_tokens, - "repetition_penalty": request.repetition_penalty, }, } if request.stop_sequences is not None: From aa646739911ce51bc535b88786fa14d5c7e1f196 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 07:16:51 +0000 Subject: [PATCH 7/8] add back optional, update validation function --- clients/python/llmengine/completion.py | 16 ++--- clients/python/llmengine/data_types.py | 8 +-- .../model_engine_server/common/dtos/llms.py | 8 +-- .../use_cases/llm_model_endpoint_use_cases.py | 64 ++++++++++++------- 4 files changed, 58 insertions(+), 38 deletions(-) diff --git a/clients/python/llmengine/completion.py b/clients/python/llmengine/completion.py index ec52396e..507754d8 100644 --- a/clients/python/llmengine/completion.py +++ b/clients/python/llmengine/completion.py @@ -33,8 +33,8 @@ async def acreate( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - presence_penalty: float = 0.0, # vllm, lightllm - frequency_penalty: float = 0.0, # vllm, lightllm + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, @@ -76,13 +76,13 @@ async def acreate( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. - presence_penalty (float): + presence_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (float): + frequency_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details @@ -227,8 +227,8 @@ def create( temperature: float = 0.2, stop_sequences: Optional[List[str]] = None, return_token_log_probs: Optional[bool] = False, - presence_penalty: float = 0.0, # vllm, lightllm - frequency_penalty: float = 0.0, # vllm, lightllm + presence_penalty: Optional[float] = None, + frequency_penalty: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, timeout: int = COMPLETION_TIMEOUT, @@ -271,13 +271,13 @@ def create( Whether to return the log probabilities of generated tokens. When True, the response will include a list of tokens and their log probabilities. 
- presence_penalty (float): + presence_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics. https://platform.openai.com/docs/guides/gpt/parameter-details Range: [0.0, 2.0]. Higher values encourage the model to use new tokens. - frequency_penalty (float): + frequency_penalty (Optional[float]): Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim. https://platform.openai.com/docs/guides/gpt/parameter-details diff --git a/clients/python/llmengine/data_types.py b/clients/python/llmengine/data_types.py index 29e0a529..ecfcd884 100644 --- a/clients/python/llmengine/data_types.py +++ b/clients/python/llmengine/data_types.py @@ -268,8 +268,8 @@ class CompletionSyncV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) @@ -333,8 +333,8 @@ class CompletionStreamV1Request(BaseModel): temperature: float = Field(..., ge=0.0) stop_sequences: Optional[List[str]] = Field(default=None) return_token_log_probs: Optional[bool] = Field(default=False) - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) top_k: Optional[int] = Field(default=None, ge=-1) top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0) diff --git a/model-engine/model_engine_server/common/dtos/llms.py b/model-engine/model_engine_server/common/dtos/llms.py index 2f59e065..27a12ddc 100644 --- a/model-engine/model_engine_server/common/dtos/llms.py +++ b/model-engine/model_engine_server/common/dtos/llms.py @@ -116,12 +116,12 @@ class CompletionSyncV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far. 0.0 means no penalty """ - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty @@ -175,12 +175,12 @@ class CompletionStreamV1Request(BaseModel): """ Whether to return the log probabilities of the tokens. Only affects behavior for text-generation-inference models """ - presence_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on whether they appear in the text so far. 
0.0 means no penalty """ - frequency_penalty: float = Field(default=0.0, ge=0.0, le=2.0) + frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0) """ Only supported in vllm, lightllm Penalize new tokens based on their existing frequency in the text so far. 0.0 means no penalty diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index c67d9226..eadb4a22 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -831,32 +831,50 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: return tokens -def validate_completion_params( +def validate_and_update_completion_params( inference_framework: LLMInferenceFramework, request: Union[CompletionSyncV1Request, CompletionStreamV1Request], -) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: - - if request.temperature == 0: # greedy, do_sample is False - if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: +): + # top_k, top_p + if inference_framework in [ + LLMInferenceFramework.TEXT_GENERATION_INFERENCE, + LLMInferenceFramework.VLLM, + LLMInferenceFramework.LIGHTLLM, + ]: + if request.temperature == 0: + if request.top_k not in [-1, None] or request.top_p not in [1.0, None]: + raise ObjectHasInvalidValueException( + "top_k and top_p can't be enabled when temperature is 0." + ) + if request.top_k == 0: raise ObjectHasInvalidValueException( - "top_k and top_p can't be enabled when temperature is 0." + "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + ) + if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: + request.top_k = None if request.top_k == -1 else request.top_k + request.top_p = None if request.top_p == 1.0 else request.top_p + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.top_k = -1 if request.top_k is None else request.top_k + request.top_p = 1.0 if request.top_p is None else request.top_p + else: + if request.top_k or request.top_p: + raise ObjectHasInvalidValueException( + "top_k and top_p are only supported in text-generation-inference, vllm, lightllm." ) - if request.top_k == 0: - raise ObjectHasInvalidValueException( - "top_k needs to be strictly positive, or set it to be -1 / None to disable top_k." + # presence_penalty, frequency_penalty + if inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: + request.presence_penalty = ( + 0.0 if request.presence_penalty is None else request.presence_penalty ) - - if request.top_k in [-1, None]: # disable top_k, consider all tokens. - if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: - request.top_k = None - elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: - request.top_k = -1 - if request.top_p in [1.0, None]: # disable top_p, consider all tokens. 
- if inference_framework == LLMInferenceFramework.TEXT_GENERATION_INFERENCE: - request.top_p = None - elif inference_framework in [LLMInferenceFramework.VLLM, LLMInferenceFramework.LIGHTLLM]: - request.top_p = 1.0 + request.frequency_penalty = ( + 0.0 if request.frequency_penalty is None else request.frequency_penalty + ) + else: + if request.presence_penalty or request.frequency_penalty: + raise ObjectHasInvalidValueException( + "presence_penalty and frequency_penalty are only supported in vllm, lightllm." + ) return request @@ -1005,7 +1023,9 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_completion_params(endpoint_content.inference_framework, request) + request = validate_and_update_completion_params( + endpoint_content.inference_framework, request + ) if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { @@ -1247,7 +1267,7 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_completion_params(model_content.inference_framework, request) + request = validate_and_update_completion_params(model_content.inference_framework, request) args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED: From 4b1b4fb56a419bb9d0d1348ad56b6a7435bf10a2 Mon Sep 17 00:00:00 2001 From: Frances Yuan Date: Fri, 29 Sep 2023 20:00:17 +0000 Subject: [PATCH 8/8] type check --- .../use_cases/llm_model_endpoint_use_cases.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py index eadb4a22..241184fb 100644 --- a/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py +++ b/model-engine/model_engine_server/domain/use_cases/llm_model_endpoint_use_cases.py @@ -834,7 +834,7 @@ def deepspeed_result_to_tokens(result: Dict[str, Any]) -> List[TokenOutput]: def validate_and_update_completion_params( inference_framework: LLMInferenceFramework, request: Union[CompletionSyncV1Request, CompletionStreamV1Request], -): +) -> Union[CompletionSyncV1Request, CompletionStreamV1Request]: # top_k, top_p if inference_framework in [ LLMInferenceFramework.TEXT_GENERATION_INFERENCE, @@ -1023,9 +1023,14 @@ async def execute( endpoint_id=model_endpoint.record.id ) endpoint_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_and_update_completion_params( + validated_request = validate_and_update_completion_params( endpoint_content.inference_framework, request ) + if not isinstance(validated_request, CompletionSyncV1Request): + raise ValueError( + f"request has type {validated_request.__class__.__name__}, expected type CompletionSyncV1Request" + ) + request = validated_request if endpoint_content.inference_framework == LLMInferenceFramework.DEEPSPEED: args: Any = { @@ -1267,7 +1272,14 @@ async def execute( ) model_content = _model_endpoint_entity_to_get_llm_model_endpoint_response(model_endpoint) - request = validate_and_update_completion_params(model_content.inference_framework, request) + validated_request = validate_and_update_completion_params( + model_content.inference_framework, request + ) + if not isinstance(validated_request, CompletionStreamV1Request): + raise ValueError( + f"request has 
type {validated_request.__class__.__name__}, expected type CompletionStreamV1Request" + ) + request = validated_request args: Any = None if model_content.inference_framework == LLMInferenceFramework.DEEPSPEED:
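
Note (not part of the patch): a minimal usage sketch of the Python client once this series is applied, showing the newly exposed sampling parameters together. The endpoint name and prompt are placeholders, and the final line assumes the client's existing CompletionSyncResponse shape (output.text); it is an illustration, not the patch author's example.

from llmengine import Completion

response = Completion.create(
    model="llama-2-7b",        # placeholder endpoint name, not from the patch
    prompt="Why is the sky blue?",
    max_new_tokens=64,
    temperature=0.7,           # must be > 0, otherwise top_k / top_p are rejected
    presence_penalty=0.5,      # vllm / lightllm only, range [0.0, 2.0]
    frequency_penalty=0.5,     # vllm / lightllm only, range [0.0, 2.0]
    top_k=50,                  # -1 or None disables top-k filtering
    top_p=0.9,                 # 1.0 or None disables nucleus sampling
)
print(response.output.text)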
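
Note (not part of the patch): a standalone sketch of the Field constraint pattern the request models above use, assuming only Pydantic. Field names mirror the patch; the model name is invented for the example. Omitted parameters stay None, which is what lets validate_and_update_completion_params fill in per-framework defaults (None for text-generation-inference, -1 / 1.0 for vllm and lightllm), while out-of-range values are rejected at request-parsing time.

from typing import Optional
from pydantic import BaseModel, Field, ValidationError

class SamplingParams(BaseModel):
    # Same bounds as CompletionSyncV1Request / CompletionStreamV1Request above.
    presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    top_k: Optional[int] = Field(default=None, ge=-1)
    top_p: Optional[float] = Field(default=None, gt=0.0, le=1.0)

# Omitted fields stay None so the server can apply framework-specific defaults later.
print(SamplingParams())

# Out-of-range values fail validation before any inference call is made.
try:
    SamplingParams(top_p=0.0)  # violates gt=0.0
except ValidationError as err:
    print(err)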