# Get API Keys

In [8]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (presumably
# where the API keys for this notebook live). The `True` shown as the cell
# result is load_dotenv()'s return value.
load_dotenv()

True

# Set Up Tracing

In [9]:
from langfuse.openai import AsyncOpenAI

# Chat Helper

In [10]:
def _msg(role: str, content: str) -> dict[str, str]:
    """Build a single chat message mapping with "role" and "content" keys.

    Args:
        role: Value stored under the "role" key.
        content: Value stored under the "content" key.

    Returns:
        A new dict ``{"role": role, "content": content}``.
    """
    message = dict(role=role, content=content)
    return message


def _system(content: str) -> dict[str, str]:
    """Wrap ``content`` as a chat message with the "system" role."""
    return _msg("system", content)


def _user(content: str) -> dict[str, str]:
    """Wrap ``content`` as a chat message with the "user" role."""
    return _msg("user", content)


def _assistant(content: str) -> dict[str, str]:
    """Wrap ``content`` as a chat message with the "assistant" role."""
    return _msg("assistant", content)

# Set Up Client

In [11]:
# Async client from `langfuse.openai` (imported in the tracing cell); it is
# constructed with no explicit arguments, so it presumably picks up its
# credentials from the environment loaded by load_dotenv() — TODO confirm.
client = AsyncOpenAI()
# Model name shared by every request made later in the notebook.
GPT_4O_MINI = "gpt-4o-mini"

# Exploratory: list the attributes available on the client object.
dir(client)

['__aenter__',
 '__aexit__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_base_url',
 '_build_headers',
 '_build_request',
 '_calculate_retry_timeout',
 '_client',
 '_custom_headers',
 '_custom_query',
 '_default_stream_cls',
 '_enforce_trailing_slash',
 '_idempotency_header',
 '_idempotency_key',
 '_limits',
 '_make_sse_decoder',
 '_make_status_error',
 '_make_status_error_from_response',
 '_maybe_override_cast_to',
 '_parse_retry_after_header',
 '_platform',
 '_prepare_options',
 '_prepare_request',
 '_prepare_url',
 '_process_response',
 '_process_response_data',
 '

# Caching

In [12]:
from diskcache import Cache

# Alternative: `cache = Cache()` gives a temporary cache that is refreshed
# every time the notebook is restarted.
# Passing a directory keeps the cached entries in the ".cache" folder instead.
cache = Cache(directory=".cache")

In [13]:
# Smoke test: write one entry to the cache (the cell result `True` is the
# value returned by cache.set).
cache.set(key="Welcome", value="Subrata Mondal")

True

In [14]:
# Read the entry back to confirm the round trip.
cache.get(key="Welcome")

'Subrata Mondal'

## Convert Sync to Async

**`Cache`** is synchronous by default, so we wrap its `set` and `get` methods in coroutines that run them in a worker thread.

In [15]:
import asyncio


async def set_async(key, value, **kwargs):
    """Store ``value`` under ``key`` in the module-level ``cache`` from async code.

    The synchronous ``cache.set`` call is handed to ``asyncio.to_thread`` so
    it runs in a worker thread while the coroutine awaits the result.

    Args:
        key: Cache key to write.
        value: Value to store.
        **kwargs: Extra keyword arguments forwarded unchanged to ``cache.set``.

    Returns:
        Whatever ``cache.set`` returns.
    """
    pending_write = asyncio.to_thread(cache.set, key, value, **kwargs)
    return await pending_write


async def get_async(key, default=None, **kwargs):
    """Look up ``key`` in the module-level ``cache`` from async code.

    The synchronous ``cache.get`` call is handed to ``asyncio.to_thread`` so
    it runs in a worker thread while the coroutine awaits the result.

    Args:
        key: Cache key to read.
        default: Passed to ``cache.get`` as the fallback for a missing key.
        **kwargs: Extra keyword arguments forwarded unchanged to ``cache.get``.

    Returns:
        Whatever ``cache.get`` returns.
    """
    pending_read = asyncio.to_thread(cache.get, key, default, **kwargs)
    return await pending_read

In [16]:
# Demonstration: calling an async function without `await` only creates a
# coroutine object; the cache lookup has not run yet.
get_async(key="Welcome")

<coroutine object get_async at 0x113af4200>

In [17]:
# Awaiting the coroutine runs the lookup and yields the stored value.
await get_async(key="Welcome")

'Subrata Mondal'

In [18]:
# A key that is not in the cache falls back to the supplied default.
await get_async(key="Invalid Key", default="Alright! Alright!")

'Alright! Alright!'

# Caching and Tracing LLM Calls

In [19]:
from hashlib import md5

# md5 maps a byte string to a fixed-length hexadecimal digest.
md5(b"subrata").hexdigest()

'78a8f834561c67fa660760d41a61dc62'

In [20]:
from hashlib import md5

# Prototype of a cache key: stringify example positional and keyword
# arguments, concatenate them, and hash the resulting text.
args = ["1", "2"]
kwargs = dict(a=2, b=4)

# NOTE: str(dict) follows insertion order, so the same keyword arguments
# given in a different order would hash differently here.
dirty = str(args) + str(kwargs)
md5(dirty.encode("utf-8")).hexdigest()

'8fe21d329868d5b6aef0d2d118a97c62'

In [21]:
import json


def make_cache_key(key_name, **kwargs):
    """Derive a deterministic cache key from a name and keyword arguments.

    The keyword arguments are serialised to JSON with sorted keys, so the
    order in which they are passed does not change the result. The MD5 hex
    digest of that text is appended to ``key_name`` after a double underscore.

    Args:
        key_name: Prefix identifying the kind of cached value.
        **kwargs: Values that distinguish one cache entry from another.

    Returns:
        A string of the form ``"<key_name>__<md5 hex digest>"``.

    Raises:
        TypeError: If a keyword argument cannot be serialised by ``json.dumps``.
    """
    canonical_bytes = json.dumps(kwargs, sort_keys=True).encode("utf-8")
    digest = md5(canonical_bytes).hexdigest()
    return f"{key_name}__{digest}"

In [22]:
def _make_cache_key_for_chat_completion(model, messages, **kwargs):
    """Build the cache key for one chat-completion request.

    The model, the messages and every extra keyword argument are passed to
    ``make_cache_key`` under the fixed name "openai_chat_completion".

    Args:
        model: Model name of the request.
        messages: Chat messages of the request.
        **kwargs: Any further request parameters.

    Returns:
        The string produced by ``make_cache_key`` for these parameters.
    """
    request_params = dict(model=model, messages=messages, **kwargs)
    return make_cache_key(key_name="openai_chat_completion", **request_params)

In [29]:
from openai.types.chat import ChatCompletion
from functools import update_wrapper, wraps
import backoff
from openai import APITimeoutError, RateLimitError

# Unique marker passed as the `default` of a cache lookup; it is compared with
# `is`, so no value actually stored in the cache can be mistaken for a miss.
CACHE_MISS_SENTINEL = object()


# NOTE: `wraps` copies the metadata of the client's `create` method (name,
# docstring, annotations, where available) onto this wrapper, so at runtime
# `help()` shows the client's documentation rather than the docstring below.
@wraps(client.chat.completions.create)
async def cached_chat_completion(model, messages, **kwargs) -> ChatCompletion:
    """Return a chat completion, reusing a cached response when one exists.

    The cache key is derived from ``model``, ``messages`` and every extra
    keyword argument, so any change to the request produces a new entry.

    Args:
        model: Model name forwarded to the client.
        messages: Chat messages forwarded to the client.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.chat.completions.create``. For cached (non-streaming)
            requests they must be JSON-serialisable because they are part of
            the cache key.

    Returns:
        The ``ChatCompletion`` returned by the API (cache miss) or one rebuilt
        from the cached JSON (cache hit). When ``stream=True`` is passed, the
        client's streaming response is returned as-is and nothing is cached.

    Raises:
        APITimeoutError, RateLimitError: If the API call still fails once the
            backoff retries (``max_time=60``) are exhausted.
    """

    # Retry with exponential backoff on timeouts and rate limits.
    @backoff.on_exception(
        backoff.expo, (APITimeoutError, RateLimitError), max_time=60
    )
    async def do_call():
        return await client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs,
        )

    # A streaming response is not a finished ChatCompletion and cannot be
    # serialised for the cache. Previously the request was sent and then the
    # serialisation step failed, losing the response. Pass such requests
    # straight through instead.
    if kwargs.get("stream"):
        return await do_call()

    cache_key = _make_cache_key_for_chat_completion(
        model,
        messages,
        **kwargs,
    )
    cached_value = await get_async(key=cache_key, default=CACHE_MISS_SENTINEL)

    # CACHE HIT: rebuild the response object from the stored JSON text.
    if cached_value is not CACHE_MISS_SENTINEL:
        # TODO: Tracing Code
        # NOTE(review): `.validate()` here and `.json()` below are the pydantic
        # v1 spellings; under pydantic v2 they emit deprecation warnings.
        # Consider `model_validate_json()` / `model_dump_json()` once the
        # installed pydantic version is confirmed.
        return ChatCompletion.validate(json.loads(cached_value))

    # CACHE MISS: call the API, store the JSON form, return the live object.
    completion = await do_call()
    await set_async(key=cache_key, value=completion.json())
    return completion

In [30]:
import time

start = time.time()

completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="What is Caching in Software Engineering?")],
    max_tokens=100,
)
print(completion)

end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [31]:
# Show the response object from the previous cell as the cell result.
completion

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [32]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="What is Caching in Software Engineering?")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdYkVYjYfzq6lS5cRKeNElIjfLpt', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='Caching in software engineering refers to the process of storing copies of files or data in a temporary storage location (the cache) for quick access. The main goal of caching is to improve the performance and efficiency of applications by reducing the time and resources required to fetch data that has already been retrieved or computed.\n\n### Key Concepts of Caching:\n\n1. **Cache Storage**:\n   - Caches can reside in different layers, such as in-memory (RAM), disk, or even in external services (e', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103598, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_0705bf87c0', usage=CompletionUsage(completion_tokens=100, prompt_tokens=15, total_tokens=115, completion_tokens_details=Comple

In [33]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="Teach me Caching like a beginner")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdcx8uUPQuCrfxOlOmrT7jxXmQiv', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content="Sure! Caching is a technique used in computing to improve the performance of systems by storing copies of frequently accessed data in a location that is faster to access than the original source. Here's a beginner-friendly guide to understanding caching:\n\n### What is Caching?\n\n1. **Definition**: Caching involves storing data in a temporary storage area (the cache) to speed up future access to that data. Instead of fetching the data from a slow source (like a database or a remote server), the system", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103859, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_3de1288069', usage=CompletionUsage(completion_tokens=100, prompt_tokens=14, total_tokens=114, completion_tokens_details=Com

In [34]:
start = time.time()
completion = await cached_chat_completion(
    model=GPT_4O_MINI,
    messages=[_user(content="Teach me Caching like a beginner")],
    max_tokens=100,
)
print(completion)
end = time.time()

print(end - start)

ChatCompletion(id='chatcmpl-AVdcx8uUPQuCrfxOlOmrT7jxXmQiv', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content="Sure! Caching is a technique used in computing to improve the performance of systems by storing copies of frequently accessed data in a location that is faster to access than the original source. Here's a beginner-friendly guide to understanding caching:\n\n### What is Caching?\n\n1. **Definition**: Caching involves storing data in a temporary storage area (the cache) to speed up future access to that data. Instead of fetching the data from a slow source (like a database or a remote server), the system", refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1732103859, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier=None, system_fingerprint='fp_3de1288069', usage=CompletionUsage(completion_tokens=100, prompt_tokens=14, total_tokens=114, completion_tokens_details=Com