In [2]:
import os
import time
import numpy as np
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity

# Load variables from a local .env file; override=True is passed so that
# .env values take precedence over variables already set in the process
# environment (per python-dotenv).
load_dotenv(override=True)
# Indexing os.environ raises KeyError immediately if the key is missing.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Chat model used to answer prompts on a cache miss.
# NOTE(review): confirm gpt-5-nano honours a non-default temperature.
llm = ChatOpenAI(model="gpt-5-nano",temperature=0,api_key=OPENAI_API_KEY)

# Embedding model used to compare prompts semantically.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small",api_key=OPENAI_API_KEY)

In [3]:
def get_embedding(text: str):
    """Embed ``text`` and return the result as a single-row 2-D array.

    Args:
        text: The string to embed with the notebook's ``embedding_model``.

    Returns:
        A NumPy array of shape ``(1, dim)``; the row layout is what
        pairwise similarity functions expect for a single sample.
    """
    values = np.array(embedding_model.embed_query(text))
    # Force a (1, dim) row vector regardless of the embedding length.
    return np.reshape(values, (1, -1))

In [4]:
# Sanity check: embed a short string and display the resulting row vector.
get_embedding('Hi')

array([[-0.00692033, -0.03531143,  0.00159792, ..., -0.01301259,
        -0.01907527, -0.00606268]], shape=(1, 1536))

In [5]:
class SemanticPromptCache:
    """In-memory cache that matches prompts by embedding similarity.

    Each entry stores a prompt, its embedding (the ``(1, dim)`` row vector
    returned by ``get_embedding``) and the response recorded for it. A lookup
    is a hit when the cosine similarity between the query and the *closest*
    stored prompt is at least ``similarity_threshold``.

    Attributes:
        cache: List of ``{"prompt", "embedding", "response"}`` dicts in
            insertion order.
        similarity_threshold: Minimum cosine similarity that counts as a hit.
        metrics: Running ``hits`` / ``misses`` / ``total_requests`` counters.
    """

    def __init__(self, similarity_threshold=0.9):
        """Create an empty cache.

        Args:
            similarity_threshold: Minimum cosine similarity for a hit.
        """
        self.cache = []
        self.similarity_threshold = similarity_threshold
        self.metrics = {
            "hits": 0,
            "misses": 0,
            "total_requests": 0
        }
        # (prompt, embedding) from the most recent get(); lets set() reuse it
        # instead of paying for a second embedding call after a miss.
        self._last_lookup = None

    def get(self, prompt: str):
        """Return the response of the most similar cached prompt, if any.

        Every call increments ``total_requests`` and then either ``hits`` or
        ``misses``, and prints a HIT/MISS line.

        Args:
            prompt: The prompt to look up.

        Returns:
            The cached response of the best-matching entry when its similarity
            reaches ``similarity_threshold``; otherwise ``None``.
        """
        self.metrics["total_requests"] += 1
        new_embedding = get_embedding(prompt)
        self._last_lookup = (prompt, new_embedding)

        if self.cache:
            # Score all entries in one call: (1, dim) vs (n, dim) -> (1, n).
            stored = np.vstack([entry["embedding"] for entry in self.cache])
            scores = cosine_similarity(new_embedding, stored)[0]

            # Pick the closest entry, not merely the first one over the
            # threshold; argmax keeps the earliest entry on exact ties.
            best_index = int(np.argmax(scores))
            similarity = scores[best_index]

            if similarity >= self.similarity_threshold:
                self.metrics["hits"] += 1
                print(f"Semantic Cache HIT (similarity={round(similarity,2)})")
                return self.cache[best_index]["response"]

        self.metrics["misses"] += 1
        print("Semantic Cache MISS")
        return None

    def set(self, prompt: str, response: str):
        """Store ``response`` under ``prompt``.

        If ``prompt`` is the one most recently passed to ``get``, the
        embedding computed there is reused; otherwise it is embedded here.

        Args:
            prompt: The prompt that produced ``response``.
            response: The text to return for semantically similar prompts.
        """
        # getattr keeps this safe for instances created without __init__.
        last_lookup = getattr(self, "_last_lookup", None)
        if last_lookup is not None and last_lookup[0] == prompt:
            embedding = last_lookup[1]
        else:
            embedding = get_embedding(prompt)

        self.cache.append({
            "prompt": prompt,
            "embedding": embedding,
            "response": response
        })

    def stats(self):
        """Return the metric counters plus ``hit_rate`` rounded to 2 places.

        ``hit_rate`` is ``hits / total_requests``, or ``0`` before any request.
        """
        hit_rate = (
            self.metrics["hits"] / self.metrics["total_requests"]
            if self.metrics["total_requests"] > 0
            else 0
        )

        return {
            **self.metrics,
            "hit_rate": round(hit_rate, 2)
        }


In [6]:
# Notebook-wide cache instance used by semantic_cached_llm_call; a lookup
# counts as a hit when cosine similarity is at least 0.9.
semantic_cache = SemanticPromptCache(similarity_threshold=0.9)

In [7]:
def semantic_cached_llm_call(prompt: str):
    """Answer ``prompt`` from the semantic cache, calling the LLM on a miss.

    Args:
        prompt: The user prompt.

    Returns:
        The cached response when ``semantic_cache.get`` finds a match;
        otherwise the content of a fresh ``llm.invoke`` call, which is also
        stored in the cache before being returned.
    """
    cached_response = semantic_cache.get(prompt)

    # Compare against None rather than truthiness: a cached empty string is a
    # real hit (and was already counted as one), so it must not trigger a new
    # LLM call and a duplicate cache entry.
    if cached_response is not None:
        return cached_response

    response = llm.invoke([HumanMessage(content=prompt)])
    output = response.content

    semantic_cache.set(prompt, output)

    return output

In [8]:
def measure_latency(prompt: str):
    """Run ``semantic_cached_llm_call`` on ``prompt`` and print how long it took.

    Args:
        prompt: The prompt to send through the cached LLM call.

    Returns:
        Whatever ``semantic_cached_llm_call`` returns for ``prompt``.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed (or go negative) when the system clock is adjusted,
    # unlike wall-clock time.time().
    start = time.perf_counter()
    result = semantic_cached_llm_call(prompt)
    elapsed = time.perf_counter() - start

    print(f"Latency: {round(elapsed, 3)} seconds\n")
    return result

In [9]:
# Two differently worded prompts: the second should be answered from the
# cache if its embedding is similar enough to the first.
prompt1 = "Explain AI in two sentence"
prompt2 = "Describe AI in two sentence"

# The cache starts empty, so this call goes to the LLM and stores the answer.
print("First Call (MISS)")
measure_latency(prompt1)

# Expected to be served from the cache when similarity reaches the threshold.
print("Second Call (Should HIT if similar)")
measure_latency(prompt2)

# Summarise hit/miss counters and the resulting hit rate.
print("Cache Stats:")
print(semantic_cache.stats())

First Call (MISS)
Semantic Cache MISS
Latency: 6.734 seconds

Second Call (Should HIT if similar)
Semantic Cache HIT (similarity=0.95)
Latency: 0.221 seconds

Cache Stats:
{'hits': 1, 'misses': 1, 'total_requests': 2, 'hit_rate': 0.5}
