In [None]:
# Copyright 2024 Forusone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Serving hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4 with TGI on Cloud Run (NVIDIA L4)
* [Hugging Face TGI Metrics](https://huggingface.co/docs/text-generation-inference/en/reference/metrics)
* [Run LLM inference on Cloud Run GPUs with Hugging Face TGI (services)](https://cloud.google.com/run/docs/tutorials/gpu-llama3-with-tgi)
* [Deploy Meta Llama 3.1 8B with TGI DLC on Cloud Run](https://huggingface.co/docs/google-cloud/examples/cloud-run-tgi-deployment)

In [None]:
# @title Define deployment constants
# Google Cloud project that gcloud is pointed at in the next cell.
# The value below is the author's project; replace it with your own project ID.
PROJECT_ID="ai-hangsik" # @param {type:"string"}
# Region the Cloud Run service is deployed to (passed as --region).
LOCATION="us-central1"  # @param {type:"string"}
# Container image for the service (passed as --image to the deploy command).
CONTAINER_URI="us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-4.ubuntu2204.py311" # @param {type:"string"}
# Name of the Cloud Run service that the deploy command creates or updates.
SERVICE_NAME="hf-tgi-llama31-8b" # @param {type:"string"}

In [None]:
# @title Authentication
# The two login commands below are intentionally left disabled; uncomment them
# if gcloud has not been authenticated in this runtime yet.
# !gcloud auth login
# !gcloud auth application-default login
# Make PROJECT_ID the default project for the gcloud commands in later cells.
!gcloud config set project {PROJECT_ID}

In [None]:
# @title Enable Cloud Run APIs
# Enables the Cloud Run Admin API (run.googleapis.com). No --project flag is
# given, so this relies on the default project configured in gcloud.
!gcloud services enable run.googleapis.com

## Deploy a Cloud Run service with a model
* You need to request NVIDIA L4 GPU quota for Cloud Run before deploying. [Quota increase](https://cloud.google.com/run/quotas#increase).
* [TGI launcher arguments](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/launcher)


In [None]:
# @title Cloud run command to deploy a model.
!gcloud beta run deploy $SERVICE_NAME \
    --image=$CONTAINER_URI \
    --args="--model-id=hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,--quantize=awq,--max-concurrent-requests=64" \
    --port=8080 \
    --cpu=4 \
    --memory=16Gi \
    --no-cpu-throttling \
    --gpu=1 \
    --gpu-type=nvidia-l4 \
    --max-instances=1 \
    --concurrency=64 \
    --region={LOCATION} \
    --allow-unauthenticated

Deploying container to Cloud Run service [[1mhf-tgi-llama31-8b[m] in project [[1mai-hangsik[m] region [[1mus-central1[m]
Service [[1mhf-tgi-llama31-8b[m] revision [[1mhf-tgi-llama31-8b-00001-q2c[m] has been deployed and is serving [1m100[m percent of traffic.
Service URL: [1mhttps://hf-tgi-llama31-8b-721521243942.us-central1.run.app[m


## Run a demo using Cloud Run proxy on local machine

* Authenticate
```
gcloud auth login
```

* Execute the following command on your local machine.
```
gcloud run services proxy $SERVICE_NAME --port 8088 --region $LOCATION
 --> gcloud run services proxy hf-tgi-llama31-8b --port 8088 --region us-central1
```
* You should see output similar to the following.
```
/Users/hangsik$ gcloud run services proxy hf-tgi-llama31-8b --port 8088 --region us-central1
Proxying to Cloud Run service [hf-tgi-llama31-8b] in project [ai-hangsik] region [us-central1]
http://127.0.0.1:8088 proxies to https://hf-tgi-llama31-8b-o5gpdmpuwq-uc.a.run.app
```


* Execute the following command on your local machine.
```
curl http://localhost:8088/v1/chat/completions \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{
        "model": "tgi",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant."
            },
            {
                "role": "user",
                "content": "What is Deep Learning?"
            }
        ],
        "max_tokens": 128
    }'
```

* Response
```
{"object":"chat.completion","id":"","created":1736821056,"model":"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4","system_fingerprint":"2.4.0-native","choices":[{"index":0,"message":{"role":"assistant","content":"Deep Learning is a subfield of machine learning (a subset of artificial intelligence) that uses multi-layered artificial neural networks to analyze and learn from data. These neural networks are inspired by the structure and function of the human brain, where connections between layers of neurons enable complex patterns to be recognized and learned.\n\nOver the last decade, Deep Learning has gained tremendous popularity due to its remarkable performance in various applications such as:\n\n1. **Image Classification**: Self-driving cars, facial recognition, object detection in video footage.\n2. **Natural Language Processing (NLP)**: Sentiment analysis, language translation, text summarization, chatbots.\n3"},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":46,"completion_tokens":128,"total_tokens":174}}/Users/hangsik$
```