In [3]:
from langchain_google_vertexai import ChatVertexAI
# Chat model shared by the cells below.
llm = ChatVertexAI(
    model="gemini-2.0-flash-001",
    temperature=1.0,
)

# Cache

Let's set up an in-memory cache and invoke an LLM:

In [4]:
from langchain_core.caches import InMemoryCache
from langchain_core.globals import set_llm_cache

# Create the in-memory cache and register it through set_llm_cache.
cache = InMemoryCache()
set_llm_cache(cache)

# Invoke the model once; the next cell inspects what ended up in the cache.
llm.invoke("What is the capital of UK?")

AIMessage(content='The capital of the UK is **London**.\n', additional_kwargs={}, response_metadata={'is_blocked': False, 'safety_ratings': [], 'usage_metadata': {'prompt_token_count': 7, 'candidates_token_count': 10, 'total_token_count': 17, 'prompt_tokens_details': [{'modality': 1, 'token_count': 7}], 'candidates_tokens_details': [{'modality': 1, 'token_count': 10}], 'cached_content_token_count': 0, 'cache_tokens_details': []}, 'finish_reason': 'STOP', 'avg_logprobs': -0.004826634004712105, 'model_name': 'gemini-2.0-flash-001'}, id='run-103f112f-ca89-4820-a4a4-ea7f05c05c1d-0', usage_metadata={'input_tokens': 7, 'output_tokens': 10, 'total_tokens': 17})

Now we can see the request-response pair has been cached:

In [12]:
import langchain
from langchain_core.globals import get_llm_cache

# Read the registered cache back through the supported accessor (the
# counterpart of set_llm_cache) instead of the deprecated root-module
# attribute `langchain.llm_cache`.
# `_cache` is a private attribute of InMemoryCache; it is accessed here only
# to show the stored (prompt, llm_string) -> generations mapping.
print(get_llm_cache()._cache)

{('[{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of UK?", "type": "human"}}]', '{"id": ["langchain", "chat_models", "vertexai", "ChatVertexAI"], "kwargs": {"default_metadata": [], "endpoint_version": "v1beta1", "location": "us-central1", "max_retries": 6, "model_family": "2", "model_name": "gemini-2.0-flash-001", "n": 1, "perform_literal_eval_on_string_raw_content": true, "project": "kuligin-sandbox1", "request_parallelism": 5, "temperature": 1.0}, "lc": 1, "name": "ChatVertexAI", "type": "constructor"}---[(\'stop\', None)]'): [ChatGeneration(text='The capital of the UK is **London**.\n', generation_info={'is_blocked': False, 'safety_ratings': [], 'usage_metadata': {'prompt_token_count': 7, 'candidates_token_count': 10, 'total_token_count': 17, 'prompt_tokens_details': [{'modality': 1, 'token_count': 7}], 'candidates_tokens_details': [{'modality': 1, 'token_count': 10}], 'cached_content_token_coun

This is how an LLM instance is serialized for caching purposes:

In [6]:
# Private helper (leading underscore): returns the string that identifies this
# LLM's configuration. Its output matches the second element of the cache key
# printed by the cache-inspection cell.
llm._get_llm_string()

'{"id": ["langchain", "chat_models", "vertexai", "ChatVertexAI"], "kwargs": {"default_metadata": [], "endpoint_version": "v1beta1", "location": "us-central1", "max_retries": 6, "model_family": "2", "model_name": "gemini-2.0-flash-001", "n": 1, "perform_literal_eval_on_string_raw_content": true, "project": "kuligin-sandbox1", "request_parallelism": 5, "temperature": 1.0}, "lc": 1, "name": "ChatVertexAI", "type": "constructor"}---[(\'stop\', None)]'

We can also create an in-memory store to store and search for key-value pairs:

In [7]:
from langgraph.store.memory import InMemoryStore
from langchain_google_vertexai import VertexAIEmbeddings

# Store created with default settings (no arguments are passed).
in_memory_store = InMemoryStore()

# An entry stored directly under the user's namespace.
in_memory_store.put(
    namespace=("users", "user1"),
    key="fact1",
    value={"message1": "My name is John."},
)
# An entry stored one level deeper, under a conversation-specific namespace.
in_memory_store.put(
    namespace=("users", "user1", "conv1"),
    key="address",
    value={"message": "I live in Berlin."},
)

In [8]:
# NOTE(review): "address" was put under ("users", "user1", "conv1"), while this
# lookup uses ("users", "user1"); the cell shows no output, so presumably
# nothing is found here — confirm whether the namespace mismatch is intentional.
in_memory_store.get(namespace=("users", "user1"), key="address")

In [9]:
# Search the conversation-level namespace, passing "name" as the query.
# NOTE(review): the returned item is the address entry with score=None, so the
# query does not appear to rank or filter results — presumably because the
# store was created without an index/embeddings configuration; confirm.
in_memory_store.search(("users", "user1", "conv1"), query="name")

[Item(namespace=['users', 'user1', 'conv1'], key='address', value={'message': 'I live in Berlin.'}, created_at='2025-03-18T14:25:23.305500+00:00', updated_at='2025-03-18T14:25:23.305501+00:00', score=None)]