From 58faee61f32ed2c1b3afb33511741625e088c5c6 Mon Sep 17 00:00:00 2001
From: Jui-Tse Hung
Date: Thu, 20 Jul 2023 17:33:52 +0000
Subject: [PATCH 1/3] Add llm endpoint creation and inference sample code to
 self hosting doc play with it section

---
 docs/guides/self_hosting.md | 62 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/docs/guides/self_hosting.md b/docs/guides/self_hosting.md
index 23f66579..77254151 100644
--- a/docs/guides/self_hosting.md
+++ b/docs/guides/self_hosting.md
@@ -139,5 +139,65 @@ $ kubectl port-forward pod/llm-engine- 5000:5000 -n
+NAME                                                            READY   STATUS    RESTARTS      AGE
+llm-engine-endpoint-id-end-cismpd08agn003rr2kc0-7f86ff64f9qj9xp 2/2     Running   1 (4m41s ago) 7m26s
+```
+Note that the endpoint name may differ.
+
+Then, you can send an inference request to the endpoint:
+```
+$ curl -X POST 'http://localhost:5000/v1/llm/completions-sync?model_endpoint_name=llama-7b' \
+    -H 'Content-Type: application/json' \
+    -d '{
+        "prompts": ["hi"],
+        "max_new_tokens": 10,
+        "temperature": 0.1
+    }' \
+    -u test-user-id:
+```
+
+You should get a response similar to:
+```
+{"status":"SUCCESS","outputs":[{"text":"hi hi hi2 hi2 hi2 hi2","num_completion_tokens":10}],"traceback":null}
+```
\ No newline at end of file

From de7d5252723cea8272a29695e68b960a02ef5718 Mon Sep 17 00:00:00 2001
From: Jui-Tse Hung
Date: Thu, 20 Jul 2023 21:50:59 +0000
Subject: [PATCH 2/3] Small fix on sending list llm endpoints request

---
 docs/guides/self_hosting.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/guides/self_hosting.md b/docs/guides/self_hosting.md
index 77254151..0f5f29e4 100644
--- a/docs/guides/self_hosting.md
+++ b/docs/guides/self_hosting.md
@@ -136,7 +136,7 @@ Forward a port from a `llm-engine` pod:
 $ kubectl port-forward pod/llm-engine- 5000:5000 -n
 ```
 
-Then, try sending a request to get LLM model endpoints for `test-user-id`. You should get a response with empty list:
+Then, try sending a request to get LLM model endpoints for `test-user-id`:
 ```
 $ curl -X GET -H "Content-Type: application/json" -u "test-user-id:" "http://localhost:5000/v1/llm/model-endpoints"
 ```

From 253153797b622e6c191cdbf08d6736473fea0016 Mon Sep 17 00:00:00 2001
From: Jui-Tse Hung
Date: Thu, 20 Jul 2023 22:46:46 +0000
Subject: [PATCH 3/3] Update doc

---
 docs/guides/self_hosting.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/guides/self_hosting.md b/docs/guides/self_hosting.md
index 0f5f29e4..8c6c963b 100644
--- a/docs/guides/self_hosting.md
+++ b/docs/guides/self_hosting.md
@@ -155,7 +155,7 @@ $ curl -X POST 'http://localhost:5000/v1/llm/model-endpoints' \
     "model_name": "llama-7b",
     "source": "hugging_face",
     "inference_framework": "text_generation_inference",
-    "inference_framework_image_tag": "0.9.1",
+    "inference_framework_image_tag": "0.9.3",
     "num_shards": 4,
     "endpoint_type": "streaming",
     "cpus": 32,
@@ -166,7 +166,7 @@
     "min_workers": 1,
     "max_workers": 12,
     "per_worker": 1,
-    "labels": {"team": "infra", "product": "llm_model_zoo"},
+    "labels": {},
     "metadata": {}
 }' \
 -u test_user_id:
@@ -177,7 +177,7 @@ It should output something like:
 ```
 {"endpoint_creation_task_id":"8d323344-b1b5-497d-a851-6d6284d2f8e4"}
 ```
 
-Wait a few minutes for the endpoint to be ready. Once it's ready, you can list pods and see `2/2` in the `READY` column:
+Wait a few minutes for the endpoint to be ready. You can tell that it's ready by listing pods and checking that all containers in the llm endpoint pod are ready:
 ```
 $ kubectl get pods -n
 NAME                                                            READY   STATUS    RESTARTS      AGE
 llm-engine-endpoint-id-end-cismpd08agn003rr2kc0-7f86ff64f9qj9xp 2/2     Running   1 (4m41s ago) 7m26s
 ```
@@ -190,8 +190,8 @@ Then, you can send an inference request to the endpoint:
 $ curl -X POST 'http://localhost:5000/v1/llm/completions-sync?model_endpoint_name=llama-7b' \
     -H 'Content-Type: application/json' \
     -d '{
-        "prompts": ["hi"],
-        "max_new_tokens": 10,
+        "prompts": ["Tell me a joke about AI"],
+        "max_new_tokens": 30,
         "temperature": 0.1
     }' \
     -u test-user-id:
@@ -199,5 +199,5 @@ $ curl -X POST 'http://localhost:5000/v1/llm/completions-sync?model_endpoint_name=llama-7b' \
 
 You should get a response similar to:
 ```
-{"status":"SUCCESS","outputs":[{"text":"hi hi hi2 hi2 hi2 hi2","num_completion_tokens":10}],"traceback":null}
+{"status":"SUCCESS","outputs":[{"text":". Tell me a joke about AI. Tell me a joke about AI. Tell me a joke about AI. Tell me","num_completion_tokens":30}],"traceback":null}
 ```
\ No newline at end of file
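
The list-endpoints and sync-completion requests patched into the guide above can also be scripted. Below is a minimal Python sketch of the same two calls, assuming the `kubectl port-forward` from the guide is active on `localhost:5000`, that the `llama-7b` endpoint has been created, and that `test-user-id` with an empty password is accepted for basic auth exactly as in the curl examples:

```python
import requests  # third-party HTTP client: pip install requests

# Assumption: the port-forward from the guide is active on localhost:5000.
BASE_URL = "http://localhost:5000"
# Same test user ID as the curl examples, with an empty password.
AUTH = ("test-user-id", "")

# List LLM model endpoints (mirrors the GET /v1/llm/model-endpoints curl).
endpoints = requests.get(f"{BASE_URL}/v1/llm/model-endpoints", auth=AUTH, timeout=30)
endpoints.raise_for_status()
print(endpoints.json())

# Send a sync completion request to the llama-7b endpoint
# (mirrors the POST /v1/llm/completions-sync curl).
completion = requests.post(
    f"{BASE_URL}/v1/llm/completions-sync",
    params={"model_endpoint_name": "llama-7b"},
    json={
        "prompts": ["Tell me a joke about AI"],
        "max_new_tokens": 30,
        "temperature": 0.1,
    },
    auth=AUTH,
    timeout=60,
)
completion.raise_for_status()
print(completion.json()["outputs"][0]["text"])
```

The routes, query parameter, JSON body, and auth scheme are taken directly from the curl commands in the patches; no additional API surface is assumed.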