## AI Foundry With APIM Integration

![GPT-4o Inferencing](Assets/ai-foundry-sdk.gif)

In [None]:
%pip install python-dotenv

### Make Sample API Call

In [None]:
import requests
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
url = os.getenv("APIM_ENDPOINT")
api_key = os.getenv("APIM_SUBSCRIPTION_KEY")
model = os.getenv("MODEL")

# Fail fast with a readable message: otherwise an unset value is sent as the
# literal string "None" (header / model) or trips an opaque error inside requests.
missing = [
    name
    for name, value in (
        ("APIM_ENDPOINT", url),
        ("APIM_SUBSCRIPTION_KEY", api_key),
        ("MODEL", model),
    )
    if not value
]
if missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

# Never echo the subscription key: cell output is saved inside the notebook file.
print(f"Endpoint: {url}")
print(f"Model: {model}")
print("Subscription key loaded: yes")

headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "api-key": api_key,
}

payload = {
    "model": model,
    "messages": [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "How are you?"},
    ],
    "max_tokens": 50,
}

# json= lets requests serialize the body; the timeout keeps a stalled gateway
# from hanging the kernel indefinitely.
response = requests.post(url, headers=headers, json=payload, timeout=60)
print(response.status_code)

# Error responses are not guaranteed to be JSON, so parse once and fall back to raw text.
try:
    json_response = response.json()
except ValueError:
    json_response = None
    print(response.text)
else:
    print(json_response)

# Only dig into "choices" on a successful call; error payloads have no such key.
choices = json_response.get("choices") if isinstance(json_response, dict) else None
if response.ok and choices:
    r = (choices[0].get("message") or {}).get("content")
    print("Response:")
    print(r)
else:
    r = None
    print("No completion in the response; see the status code and body above.")

### Analyse the Response Headers

In [None]:
from tabulate import tabulate

# Show each header of the most recent response as a row in a two-column grid.
headers_list = list(response.headers.items())
print("Response Headers:")
header_table = tabulate(
    headers_list,
    headers=["Header", "Value"],
    tablefmt="fancy_grid",
)
print(header_table)

### Call DeepSeek-R1 Model

In [None]:
import requests
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
url = os.getenv("APIM_ENDPOINT")
api_key = os.getenv("APIM_SUBSCRIPTION_KEY")
deepseek_model_deployment_name = os.getenv("DEEPSEEK_MODEL_DEPLOYMENT_NAME")

# Fail fast with a readable message: otherwise an unset value is sent as the
# literal string "None" (header / model) or trips an opaque error inside requests.
missing = [
    name
    for name, value in (
        ("APIM_ENDPOINT", url),
        ("APIM_SUBSCRIPTION_KEY", api_key),
        ("DEEPSEEK_MODEL_DEPLOYMENT_NAME", deepseek_model_deployment_name),
    )
    if not value
]
if missing:
    raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

# Never echo the subscription key: cell output is saved inside the notebook file.
print(f"Endpoint: {url}")
print(f"Model: {deepseek_model_deployment_name}")
print("Subscription key loaded: yes")

headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "api-key": api_key,
}

payload = {
    "model": deepseek_model_deployment_name,
    "messages": [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "How are you?"},
    ],
    "max_tokens": 50,
}

# json= lets requests serialize the body; the timeout keeps a stalled gateway
# from hanging the kernel indefinitely.
response = requests.post(url, headers=headers, json=payload, timeout=60)
print(response.status_code)

# Error responses are not guaranteed to be JSON, so parse once and fall back to raw text.
try:
    json_response = response.json()
except ValueError:
    json_response = None
    print(response.text)
else:
    print(json_response)

# Only dig into "choices" on a successful call; error payloads have no such key.
choices = json_response.get("choices") if isinstance(json_response, dict) else None
if response.ok and choices:
    r = (choices[0].get("message") or {}).get("content")
    print("Response:")
    print(r)
else:
    r = None
    print("No completion in the response; see the status code and body above.")

### Exceed the Rate Limit Threshold

In [None]:
# Exhaust the rate limit
# Reuses url, headers and payload from the most recently executed API-call cell.
import time

MAX_REQUESTS = 200  # Adjust as needed to exceed your rate limit
PAUSE_SECONDS = 0.5  # Optional: sleep to avoid flooding too quickly

for i in range(MAX_REQUESTS):
    # The timeout keeps one stalled request from blocking the whole loop.
    response = requests.post(url, headers=headers, data=json.dumps(payload), timeout=60)
    print(f"Request {i+1}: Status {response.status_code}")
    if response.status_code == 429:
        print("Rate limit exceeded!")
        # A 429 body is not guaranteed to be JSON; fall back to the raw text.
        try:
            print(response.json())
        except ValueError:
            print(response.text)
        break
    time.sleep(PAUSE_SECONDS)
else:
    # Runs only when the loop finished without a break, i.e. no 429 was seen.
    print(f"Sent {MAX_REQUESTS} requests without hitting the rate limit.")

### Application Insights Monitoring

Go to your [Application Insights](https://portal.azure.com/) resource linked to APIM.

- **Metrics:** View request count, failed requests, and response times.
- **Logs:** Use Log Analytics to query traces, requests, and custom events.


---
**Summary:**  
- Tested the GPT-4o and DeepSeek-R1 endpoints via APIM with rate limiting  
- Observed rate limit enforcement  
- Monitored traffic and errors in Application Insights