In [3]:
!pip install nemo-microservices==1.4.0

# Register Model Configuration
- In the `NeMoMicroservices` client, set `base_url` to the URL of the nemo-deployment-management service.
- To register an MLIS model endpoint, we need its URL, API token, and model name:

```json
    {
        "host_url": "<URL>",
        "api_key": "<API Token>",
        "enabled_models": ["<Model Name>"]
    }
```

In [4]:
import os

from nemo_microservices import NeMoMicroservices

# Service URLs used by the client: configuration calls go to the
# deployment-management service, inference calls go through the NIM proxy.
DEPLOYMENT_MANAGEMENT_URL = 'http://nemo-deployment-management:8000'
NIM_PROXY_URL = 'http://nemo-nim-proxy:8000'

deploy_client = NeMoMicroservices(
    base_url=DEPLOYMENT_MANAGEMENT_URL,
    inference_base_url=NIM_PROXY_URL,
)

In [None]:
# Register the MLIS-hosted Llama 3.1 8B Instruct endpoint as an external endpoint.
# The token is taken from the MLIS_LLAMA_TOKEN environment variable so it does not
# have to be saved inside the notebook; when the variable is unset, the original
# placeholder string is used and must be replaced by hand before running.
deploy_client.deployment.configs.create(
    name="mlis-llama-31-8b-instruct",
    namespace="mlis",
    description="mlis endpoint configuration",
    external_endpoint={
        "host_url": "https://llm-llama-3-1-8b-predictor-user.app.pcai.XXXX",
        "api_key": os.environ.get("MLIS_LLAMA_TOKEN", "<put MLIS's endpoint token>"),
        "enabled_models": [
            "meta/llama-3.1-8b-instruct"
        ]
    },
)

In [None]:
# Register the MLIS-hosted Qwen2.5 7B Instruct endpoint as an external endpoint.
# The token is taken from the MLIS_QWEN_TOKEN environment variable so it does not
# have to be saved inside the notebook; when the variable is unset, the original
# placeholder string is used and must be replaced by hand before running.
deploy_client.deployment.configs.create(
    name="mlis-qwen-25-7b-instruct",
    namespace="mlis",
    description="mlis endpoint configuration",
    external_endpoint={
        "host_url": "https://qwen-25-7b-instruct-stream-predictor-user.app.pcai.XXXX",
        "api_key": os.environ.get("MLIS_QWEN_TOKEN", "<put MLIS's endpoint token>"),
        "enabled_models": [
            "Qwen/Qwen2.5-7B-Instruct"
        ]
    },
)

In [59]:
# Fetch the model listing through the client's inference API, then show it
# as a plain dict (the last expression is what the cell displays).
available_models = deploy_client.inference.models.list()
available_models.dict()

# Inference with nim-proxy url
- Once the model configuration has been registered successfully, we can send inference requests for that model to the **nim-proxy URL**, using the same token as for the MLIS endpoint.

In [None]:
# MLIS endpoint tokens, one per registered model. Each is read from an
# environment variable so the secret does not have to be stored in the notebook;
# when the variable is unset, the original placeholder string is kept and must be
# replaced by hand before sending requests.
llama_token = os.environ.get("MLIS_LLAMA_TOKEN", "<put MLIS's endpoint token>")
qwen_token = os.environ.get("MLIS_QWEN_TOKEN", "<put MLIS's endpoint token>")

In [None]:
# One profile per model: the token to authenticate with and the model name
# to put in the request payload.
model_profiles = [
    {"api_key": token, "model_name": name}
    for token, name in (
        (llama_token, "meta/llama-3.1-8b-instruct"),
        (qwen_token, "Qwen/Qwen2.5-7B-Instruct"),
    )
]

In [52]:
import requests

# Chat-completions route on the NIM proxy.
url = 'http://nemo-nim-proxy:8000' + '/v1/chat/completions'

# Upper bound (seconds) on how long to wait for the proxy. Without a timeout,
# requests.post can block indefinitely if the endpoint never answers.
REQUEST_TIMEOUT_S = 120

# Raw HTTP responses, one per entry in model_profiles, in the same order.
responses = []

for model in model_profiles:
    # Each model is called with its own bearer token.
    headers = {
        'Authorization': 'Bearer ' + model['api_key']
    }
    payload = {
        "model": model['model_name'],
        "messages": [{"role":"user", "content":"Write a limerick about the wonders of GPU computing."}],
        "max_tokens": 128
    }
    responses.append(
        requests.post(url=url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_S)
    )

In [53]:
# Print each model's name and the generated message, separated by a rule.
for response in responses:
    # Surface HTTP failures as an explicit HTTPError instead of a KeyError
    # on the missing 'model' / 'choices' keys of an error body.
    response.raise_for_status()
    body = response.json()  # parse the JSON body once and reuse it
    print(f"*** {body['model']} ***")
    print(body['choices'][0]['message']['content'])
    print('=' * 30)