In [1]:
import os
import hashlib
import time
from typing import Dict, Any
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

# Load variables from a local .env file; override=True lets values from the
# file replace any that are already set in the process environment.
load_dotenv(override=True)

# Index os.environ directly so a missing key raises KeyError here, at setup
# time, rather than surfacing later as a failed request.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Shared chat model client used by the cells below.
llm = ChatOpenAI(
    model="gpt-5-nano",
    temperature=0,
    api_key=OPENAI_API_KEY,
)

In [2]:
class DeterministicPromptCache:
    """In-memory, exact-match cache of responses keyed by a prompt digest.

    Prompts are identified by the SHA-256 hex digest of their encoded text,
    so only byte-identical prompts share an entry. Every ``get`` call is
    counted so that ``stats`` can report hits, misses and the hit rate.
    """

    def __init__(self):
        """Start with an empty store and zeroed counters."""
        # digest -> {"response": <stored value>, "timestamp": <time.time()>}
        self.cache: Dict[str, Dict[str, Any]] = {}
        self.metrics = dict(hits=0, misses=0, total_requests=0)

    def _hash_prompt(self, prompt: str) -> str:
        """Return the SHA-256 hex digest of ``prompt``."""
        hasher = hashlib.sha256()
        hasher.update(prompt.encode())
        return hasher.hexdigest()

    def get(self, prompt: str):
        """Look up ``prompt`` and update the request counters.

        Returns:
            The stored response on a hit, or ``None`` on a miss.
        """
        self.metrics["total_requests"] += 1
        digest = self._hash_prompt(prompt)

        # Guard clause: unknown prompt -> record the miss and bail out.
        if digest not in self.cache:
            self.metrics["misses"] += 1
            return None

        self.metrics["hits"] += 1
        return self.cache[digest]["response"]

    def set(self, prompt: str, response: str):
        """Store ``response`` for ``prompt``, replacing any existing entry.

        The current ``time.time()`` value is saved alongside the response;
        nothing in this class reads it back.
        """
        entry = {"response": response, "timestamp": time.time()}
        self.cache[self._hash_prompt(prompt)] = entry

    def stats(self):
        """Return a copy of the counters plus ``hit_rate``.

        ``hit_rate`` is hits / total_requests rounded to two decimals, or 0
        when no lookups have been made yet.
        """
        total = self.metrics["total_requests"]
        if total > 0:
            hit_rate = self.metrics["hits"] / total
        else:
            hit_rate = 0

        summary = dict(self.metrics)
        summary["hit_rate"] = round(hit_rate, 2)
        return summary

In [3]:
# Single shared cache instance; cached_llm_call reads and writes this global.
cache = DeterministicPromptCache()

In [4]:
def cached_llm_call(prompt: str):
    """Return the LLM response for ``prompt``, consulting the shared cache first.

    The prompt is looked up in the module-level ``cache``. On a hit the stored
    response is returned as-is. On a miss the module-level ``llm`` is invoked
    with the prompt wrapped in a single ``HumanMessage``; the ``content`` of
    the reply is stored in the cache under the prompt and then returned.

    Args:
        prompt: Prompt text; the exact string is also the cache key.

    Returns:
        The cached response, or the ``content`` of the fresh LLM reply.
    """
    cached_response = cache.get(prompt)

    # cache.get returns None on a miss. Compare against None instead of
    # relying on truthiness: a cached empty response is a valid hit and must
    # not trigger another LLM call.
    if cached_response is not None:
        print("Cache HIT")
        return cached_response

    print("Cache MISS — Calling LLM")

    response = llm.invoke([HumanMessage(content=prompt)])
    output = response.content

    cache.set(prompt, output)

    return output


In [5]:
def measure_latency(prompt: str):
    """Run ``cached_llm_call(prompt)``, print how long it took, and return its result.

    Args:
        prompt: Prompt text forwarded unchanged to ``cached_llm_call``.

    Returns:
        Whatever ``cached_llm_call`` returned for ``prompt``.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() is wall-clock time, can jump when the
    # system clock is adjusted, and may be too coarse to register a cache hit.
    start = time.perf_counter()
    result = cached_llm_call(prompt)
    elapsed = time.perf_counter() - start

    latency = round(elapsed, 3)

    print(f"\nLatency: {latency} seconds\n")
    return result

In [6]:
# Demo: issue the same prompt twice to contrast an uncached call with a cached one.
prompt = "Explain AI in 2 sentences"

# The cache starts empty, so this call goes to the LLM and stores the reply.
print("First Call (Expected MISS)")
measure_latency(prompt)

# Identical prompt text -> same cache key, so this call is served from the cache.
# As the last expression in the cell, its return value is what the notebook displays.
print("Second Call (Expected HIT)")
measure_latency(prompt)

First Call (Expected MISS)
Cache MISS — Calling LLM

Latency: 4.426 seconds

Second Call (Expected HIT)
Cache HIT

Latency: 0.0 seconds



'Artificial intelligence is the field of computer science focused on creating systems that can perform tasks that normally require human intelligence, such as learning, reasoning, and understanding language. Most modern AI uses machine learning, where algorithms learn from large datasets to improve their performance and adapt to new tasks, enabling applications like voice assistants, image recognition, and autonomous vehicles.'