## Azure Redis Cache with APIM

![Redis-Cache-with-APIM](semantic-caching.gif)

The `azure-openai-semantic-cache-lookup` policy performs a cache lookup of responses to Azure OpenAI Chat Completion API and Completion API requests from a pre-configured external cache. It works by comparing the vector proximity of the incoming prompt to that of prior requests and returning a cached response when the similarity score meets a configured threshold. Caching responses reduces the bandwidth and processing demands on the backend Azure OpenAI API, and thus lowers the latency perceived by API consumers.

### Result
![semantic-caching-results](result.png)

In [None]:
import requests
import json
import os
from dotenv import load_dotenv
import time

In [None]:
%%time

# Send a chat-completion request through the APIM endpoint and show the reply.
#
# Reads APIM_ENDPOINT, APIM_SUBSCRIPTION_KEY and MODEL from the .env file,
# posts a single prompt, then prints the HTTP status, the raw JSON body and
# the assistant's message text. Leaves `response`, `json_response` and `r`
# defined for later cells.
load_dotenv()
url = os.getenv("APIM_ENDPOINT")
api_key = os.getenv("APIM_SUBSCRIPTION_KEY")
model = os.getenv("MODEL")

if not url:
    # requests would fail on a missing URL anyway; say which setting is absent.
    raise RuntimeError("APIM_ENDPOINT is not set; check your .env file")

# Deliberately do not print the subscription key: notebook output is saved
# with the file and is easily shared or committed.
print(url, model)

# NOTE(review): confirm that sending "Cache-Control: no-cache" does not make
# the gateway bypass the cache this notebook is meant to demonstrate.
headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "api-key": f"{api_key}",
}

payload = {
    "model": f"{model}",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Please Write me a poem about Kittens"},
    ],
    "max_tokens": 50,
}

# A timeout keeps the cell from hanging forever if the gateway never answers.
response = requests.post(
    url,
    headers=headers,
    data=json.dumps(payload),
    timeout=60,
)
print(response.status_code)

json_response = None
r = None
if response.ok:
    # Parse the body once and reuse it for both the dump and the extraction.
    json_response = response.json()
    print(json_response)
    choices = json_response.get("choices") or []
    if choices:
        r = (choices[0].get("message") or {}).get("content")
else:
    # Error payloads have no "choices"; show the body as-is instead of
    # failing with an unrelated TypeError while indexing into None.
    print(response.text)

print("Response:")
print(r)

In [None]:
from tabulate import tabulate

# Show every header of the most recent response as a two-column grid.
header_rows = list(response.headers.items())
print("Response Headers:")
header_table = tabulate(
    header_rows,
    headers=["Header", "Value"],
    tablefmt="fancy_grid",
)
print(header_table)

In [None]:
%%time

# Send a second, differently worded chat-completion request through APIM.
#
# Reads APIM_ENDPOINT, APIM_SUBSCRIPTION_KEY and MODEL from the .env file,
# posts a single prompt, then prints the HTTP status, the raw JSON body and
# the assistant's message text. Leaves `response`, `json_response` and `r`
# defined for later cells.
load_dotenv()
url = os.getenv("APIM_ENDPOINT")
api_key = os.getenv("APIM_SUBSCRIPTION_KEY")
model = os.getenv("MODEL")

if not url:
    # requests would fail on a missing URL anyway; say which setting is absent.
    raise RuntimeError("APIM_ENDPOINT is not set; check your .env file")

# Deliberately do not print the subscription key: notebook output is saved
# with the file and is easily shared or committed.
print(url, model)

# NOTE(review): confirm that sending "Cache-Control: no-cache" does not make
# the gateway bypass the cache this notebook is meant to demonstrate.
headers = {
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "api-key": f"{api_key}",
}

payload = {
    "model": f"{model}",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "Write me a poem about Puppies"},
    ],
    "max_tokens": 50,
}

# A timeout keeps the cell from hanging forever if the gateway never answers.
response = requests.post(
    url,
    headers=headers,
    data=json.dumps(payload),
    timeout=60,
)
print(response.status_code)

json_response = None
r = None
if response.ok:
    # Parse the body once and reuse it for both the dump and the extraction.
    json_response = response.json()
    print(json_response)
    choices = json_response.get("choices") or []
    if choices:
        r = (choices[0].get("message") or {}).get("content")
else:
    # Error payloads have no "choices"; show the body as-is instead of
    # failing with an unrelated TypeError while indexing into None.
    print(response.text)

print("Response:")
print(r)

In [None]:
from tabulate import tabulate

# Show every header of the most recent response as a two-column grid.
header_rows = list(response.headers.items())
print("Response Headers:")
header_table = tabulate(
    header_rows,
    headers=["Header", "Value"],
    tablefmt="fancy_grid",
)
print(header_table)