# Get API Keys

In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

In [2]:
from openai import AsyncOpenAI

# Async OpenAI client, constructed with no explicit arguments.
# NOTE(review): presumably picks up its API key from the environment — confirm.
client = AsyncOpenAI()
# Model name used for every chat-completion call in this notebook.
GPT_4O_MINI = "gpt-4o-mini"

In [3]:
def _msg(role: str, content: str) -> dict[str, str]:
    return {"role": role, "content": content}


def _system(content: str) -> dict[str, str]:
    """Build a chat message whose role is ``"system"``."""
    message = _msg("system", content)
    return message


def _user(content: str) -> dict[str, str]:
    """Build a chat message whose role is ``"user"``."""
    message = _msg("user", content)
    return message


def _assistant(content: str) -> dict[str, str]:
    """Build a chat message whose role is ``"assistant"``."""
    message = _msg("assistant", content)
    return message


# Not Cached

In [4]:
# Baseline: call the API directly, with no caching layer in between.
# max_tokens=100 caps the length of the generated reply.
completion = await client.chat.completions.create(
    model=GPT_4O_MINI,
    messages=[_user(content="What is Caching in Generative AI?")],
    max_tokens=100,
)

In [5]:
from pprint import pprint

# Show only the text of the first choice; pprint wraps the long string
# across several lines so it is readable in the cell output.
pprint(object=completion.choices[0].message.content)

('Caching in generative AI refers to the practice of storing previously '
 'generated outputs or intermediate results in order to improve efficiency and '
 'reduce computation time for future requests. Here’s a closer look at how '
 'caching can be beneficial in the context of generative AI:\n'
 '\n'
 '1. **Efficiency Improvement**: By storing the results of computations that '
 'are frequently requested, the system can quickly retrieve cached outputs '
 'instead of recalculating them. This significantly speeds up response times, '
 'especially for popular queries or prompts.\n'
 '\n'
 '2. **Resource Optimization')


# Cached

In [6]:
from diskcache import Cache

# Keep the cache in the local ".cache" directory.
# Alternative: `Cache()` with no directory gives a temporary cache that is
# refreshed every time the notebook is restarted.
cache = Cache(directory=".cache")

In [7]:
# Smoke test: store a value under a key (the output below shows set() returning True).
cache.set(key="Welcome", value="Subrata Mondal")

True

In [8]:
# Read the stored value back by key.
cache.get(key="Welcome")

'Subrata Mondal'

**`Cache`** is not async by default, so we wrap its blocking `get` and `set` calls with `asyncio.to_thread` to use them from async code.

In [16]:
import asyncio


async def set_async(key, value, **kwargs):
    """Store ``value`` under ``key`` in the module-level ``cache``, asynchronously.

    ``cache.set`` is executed in a worker thread through ``asyncio.to_thread``
    so the event loop is not blocked while it runs. Extra keyword arguments
    are forwarded to ``cache.set`` unchanged, and its result is returned.
    """
    outcome = await asyncio.to_thread(cache.set, key, value, **kwargs)
    return outcome


async def get_async(key, default=None, **kwargs):
    """Look up ``key`` in the module-level ``cache``, asynchronously.

    ``cache.get`` is executed in a worker thread through ``asyncio.to_thread``
    so the event loop is not blocked while it runs. ``default`` is passed
    positionally as the fallback, extra keyword arguments are forwarded
    unchanged, and the result of ``cache.get`` is returned.
    """
    found = await asyncio.to_thread(cache.get, key, default, **kwargs)
    return found

In [17]:
# Deliberately not awaited: calling the async wrapper only creates a
# coroutine object (see the output); the cache lookup has not run yet.
get_async(key="Welcome")

<coroutine object get_async at 0x107ea35a0>

In [18]:
# Awaiting the coroutine runs the lookup and yields the stored value.
await get_async(key="Welcome")

'Subrata Mondal'

In [19]:
# A key that is not in the cache yields the supplied default instead.
await get_async(key="Invalid Key", default="Alright! Alright!")

'Alright! Alright!'

720

# Caching LLM Calls

In [22]:
from hashlib import md5

# md5 hex digest of a byte string: a short, fixed-length fingerprint of its input.
md5(b"subrata").hexdigest()

'78a8f834561c67fa660760d41a61dc62'

In [28]:
from hashlib import md5

# Sample positional and keyword arguments to fingerprint.
args = ["1", "2"]
kwargs = {"a": 2, "b": 4}

# Quick-and-dirty approach: concatenate the str() forms of both containers
# and hash the resulting text.
dirty = f"{args}{kwargs}"
md5(dirty.encode("utf-8")).hexdigest()

'8fe21d329868d5b6aef0d2d118a97c62'

In [33]:
import json


def make_cache_key(key_name, **kwargs):
    """Build a deterministic cache key of the form ``<key_name>__<md5 hex digest>``.

    Args:
        key_name: Human-readable prefix identifying what is being cached.
        **kwargs: The arguments that distinguish one cached call from another.

    Returns:
        ``key_name`` joined by a double underscore to the md5 hex digest of
        the JSON-encoded keyword arguments.

    Notes:
        * Keyword order does not matter because the JSON is emitted with
          sorted keys; the order of list elements does.
        * Values that JSON cannot encode (bytes, sets, arbitrary objects) are
          fingerprinted through ``repr()`` instead of raising ``TypeError``.
          An object whose ``repr`` is not stable between runs will therefore
          never produce a cache hit.
    """
    kwargs_string = json.dumps(kwargs, sort_keys=True, default=repr)
    # md5 is only a fingerprint here, not a security measure; saying so keeps
    # it usable on FIPS-restricted Python builds. The digest is unchanged.
    kwargs_hash = md5(
        kwargs_string.encode("utf-8"),
        usedforsecurity=False,
    ).hexdigest()
    return f"{key_name}__{kwargs_hash}"

In [34]:
def _make_cache_key_for_chat_completion(model, messages, **kwargs):
    """Derive the cache key for a chat-completion request.

    The key is built by ``make_cache_key`` under the fixed name
    ``"openai_chat_completion"`` from the model, the messages and every
    additional keyword argument of the request.
    """
    cache_key = make_cache_key(
        "openai_chat_completion",
        model=model,
        messages=messages,
        **kwargs,
    )
    return cache_key

In [50]:
from openai.types.chat import ChatCompletion
from functools import update_wrapper

# Unique marker meaning "key not present in the cache". It is compared by
# identity, so no stored value can be mistaken for a miss.
CACHE_MISS_SENTINEL = object()


async def cached_chat_completion(model, messages, **kwargs) -> ChatCompletion:
    """Return a chat completion, served from the disk cache when possible.

    The cache key is derived from ``model``, ``messages`` and every extra
    keyword argument, so changing any of them produces a separate entry.

    Args:
        model: Model name forwarded to ``client.chat.completions.create``.
        messages: Chat messages forwarded to the same call.
        **kwargs: Any further arguments for ``create`` (e.g. ``max_tokens``).

    Returns:
        The ``ChatCompletion`` returned by the API on a cache miss, or one
        rebuilt from the cached JSON string on a cache hit.

    Note:
        The response is stored via ``model_dump_json()``, so only calls whose
        result is a pydantic model can be cached (presumably this excludes
        ``stream=True`` — confirm before relying on it).
    """
    cache_key = _make_cache_key_for_chat_completion(model, messages, **kwargs)
    cached_value = await get_async(key=cache_key, default=CACHE_MISS_SENTINEL)

    # CACHE HIT: the entry is the JSON string written on an earlier miss.
    if cached_value is not CACHE_MISS_SENTINEL:
        return ChatCompletion.model_validate_json(cached_value)

    # CACHE MISS: call the API, then remember the response as JSON text.
    completion = await client.chat.completions.create(
        model=model,
        messages=messages,
        **kwargs,
    )
    # model_dump_json / model_validate_json are the pydantic v2 names for the
    # deprecated .json() / .validate(); the stored format stays a JSON string.
    await set_async(key=cache_key, value=completion.model_dump_json())
    return completion


# Copy the metadata of the real `create` method onto the caching wrapper.
# This replaces the wrapper's __name__ and __doc__ and sets __wrapped__, so
# help() and signature introspection show the client's own documentation.
cached_chat_completion = update_wrapper(
    wrapper=cached_chat_completion,
    wrapped=client.chat.completions.create,
)

In [48]:
import time

start = time.time()

completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="What is Caching in Software Engineering?")],
    max_tokens=100,
)
print(completion)

end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [37]:
# Display the object currently bound to `completion`.
completion

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [47]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="What is Caching in Software Engineering?")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [45]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="Teach me Caching like a beginner")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdcx8uUPQuCrfxOlOmrT7jxXmQiv', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content="Sure! Caching is a technique used in computing to improve the performance of systems by storing copies of frequently accessed data in a location that is faster to access than the original source. Here's a beginner-friendly guide to understanding caching:\n\n### What is Caching?\n\n1. **Definition**: Caching involves storing data in a temporary storage area (the cache) to speed up future access to that data. Instead of fetching the data from a slow source (like a database or a remote server), the system", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103859, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_3de1288069', usage=CompletionUsage(completion_tokens=100, prompt_tokens=14, total_tokens=114, completion_tokens_details=Com

In [46]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="Teach me Caching like a beginner")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdcx8uUPQuCrfxOlOmrT7jxXmQiv', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content="Sure! Caching is a technique used in computing to improve the performance of systems by storing copies of frequently accessed data in a location that is faster to access than the original source. Here's a beginner-friendly guide to understanding caching:\n\n### What is Caching?\n\n1. **Definition**: Caching involves storing data in a temporary storage area (the cache) to speed up future access to that data. Instead of fetching the data from a slow source (like a database or a remote server), the system", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103859, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_3de1288069', usage=CompletionUsage(completion_tokens=100, prompt_tokens=14, total_tokens=114, completion_tokens_details=Com