### **Introduction to communicating with NIMs locally**

#### **Step 1: Check the health using the API endpoint**

In [None]:
# Run curl directly to check the status
status_code = !curl -s -o /dev/null -w "%{{http_code}}" http://llama:8000/v1/health/ready

if status_code[0] == "200":
    print("✅ NVIDIA NIM is healthy and ready.")
else:
    print("⚠️ NVIDIA NIM is not ready.")

#### **Step 2: Check the available models using the API endpoint**

In [None]:
import requests
import json

In [None]:
# Check available models (including LoRAs)
url = 'http://llama:8000/v1/models'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Surface HTTP errors (4xx/5xx) here instead of failing later while decoding JSON.
response.raise_for_status()
data = response.json()

# Pretty-print the JSON payload returned by the endpoint.
print(json.dumps(data, indent=4))

#### **Step 3: Use the LangChain framework with the hosted NIM endpoint**

In [None]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA

# Connect to the LLM NIM served at http://llama:8000/v1, pinning a specific model
llm = ChatNVIDIA(
    model="meta/llama-3.2-3b-instruct",
    base_url="http://llama:8000/v1",
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Build a two-message chat prompt: a fixed system instruction plus the user's
# question, filled in through the {question} placeholder at invoke time.
# Adjacent string literals are concatenated verbatim, so each sentence carries
# its own trailing space to keep the sentences from running together.
prompt = ChatPromptTemplate.from_messages([
    ("system", (
        "You are an evil and malevolent AI model! "
        "Your responses should be detailed. "
        "Lie about something if you don't know."
    )),
    ("user", "{question}")
])

# Compose the pipeline: prompt template -> llm -> StrOutputParser.
chain = prompt | llm | StrOutputParser()

In [None]:
# Run the chain on a sample question and print the result.
gpu_vs_cpu_answer = chain.invoke({"question": "What's the difference between a GPU and a CPU?"})
print(gpu_vs_cpu_answer)

In [None]:
# Run the chain on a second sample question and print the result.
gpu_vs_dpu_answer = chain.invoke({"question": "What's the difference between a GPU and a DPU?"})
print(gpu_vs_dpu_answer)