In [3]:
import asyncio
from typing import Any, Dict, List

from langchain_openai import ChatOpenAI
from langchain_core.callbacks import AsyncCallbackHandler, BaseCallbackHandler
from langchain_core.messages import HumanMessage
from langchain_core.outputs import LLMResult

In [4]:
class MyCustomSyncHandler(BaseCallbackHandler):
    """Synchronous callback handler that echoes every streamed token to stdout."""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Print one line for the newly received ``token``.

        Args:
            token: The token text passed to this hook.
            **kwargs: Additional callback arguments; not used here.
        """
        line = f"Sync handler being called in a `thread_pool_executor`: token: {token}"
        print(line)

In [6]:
class MyCustomAsyncHandler(AsyncCallbackHandler):
    """Async callback handler that can be used to handle callbacks from langchain.

    Both hooks print a message, await a 0.3 second ``asyncio.sleep``, and then
    print a second message.
    """

    async def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when the LLM starts running.

        Args:
            serialized: Mapping passed to the hook (presumably the serialized
                model description -- confirm against the LangChain docs);
                not inspected here.
            prompts: List of prompt strings passed to the hook; not inspected here.
            **kwargs: Additional callback arguments; not used here.
        """
        print("zzzz....")
        # Awaiting the sleep yields to the event loop instead of blocking it.
        await asyncio.sleep(0.3)
        # NOTE: the former `serialized["name"]` lookup was dropped -- its result
        # was never used and it could only raise KeyError on a payload lacking
        # that key.
        print("Hi! I just woke up. Your llm is starting")

    async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when the LLM ends running.

        Args:
            response: The ``LLMResult`` passed to the hook; not inspected here.
            **kwargs: Additional callback arguments; not used here.
        """
        print("zzzz....")
        # Awaiting the sleep yields to the event loop instead of blocking it.
        await asyncio.sleep(0.3)
        print("Hi! I just woke up. Your llm is ending")

In [7]:
# Streaming is switched on by passing `streaming=True` to the ChatModel constructor.
# Both custom handlers (sync and async) are supplied through `callbacks`.
chat = ChatOpenAI(
    model="gpt-4o-mini",
    max_tokens=25,
    streaming=True,
    callbacks=[MyCustomSyncHandler(), MyCustomAsyncHandler()],
)

In [8]:
await chat.agenerate([[HumanMessage(content="Tell me a joke")]])

zzzz....
Hi! I just woke up. Your llm is starting
Sync handler being called in a `thread_pool_executor`: token: 
Sync handler being called in a `thread_pool_executor`: token: Why
Sync handler being called in a `thread_pool_executor`: token:  don't
Sync handler being called in a `thread_pool_executor`: token:  skeleton
Sync handler being called in a `thread_pool_executor`: token: s
Sync handler being called in a `thread_pool_executor`: token:  fight
Sync handler being called in a `thread_pool_executor`: token:  each
Sync handler being called in a `thread_pool_executor`: token:  other
Sync handler being called in a `thread_pool_executor`: token: ?


Sync handler being called in a `thread_pool_executor`: token: They
Sync handler being called in a `thread_pool_executor`: token:  don't
Sync handler being called in a `thread_pool_executor`: token:  have
Sync handler being called in a `thread_pool_executor`: token:  the
Sync handler being called in a `thread_pool_executor`: token:  guts
Sync 

LLMResult(generations=[[ChatGeneration(text="Why don't skeletons fight each other?\n\nThey don't have the guts!", generation_info={'finish_reason': 'stop', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_483d39d857'}, message=AIMessage(content="Why don't skeletons fight each other?\n\nThey don't have the guts!", response_metadata={'finish_reason': 'stop', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_483d39d857'}, id='run-41bc7bdb-367a-4872-b4ca-62baa1650f90-0'))]], llm_output={'token_usage': {}, 'model_name': 'gpt-4o-mini'}, run=[RunInfo(run_id=UUID('41bc7bdb-367a-4872-b4ca-62baa1650f90'))])