In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
import warnings

# load_dotenv() above has already copied the .env entries into os.environ, so
# the LangSmith settings need no re-assignment. The former
# `os.environ[k] = os.environ.get(k)` lines did nothing when the variable was
# set and raised `TypeError: str expected, not NoneType` when it was not.
LANGSMITH_ENV_VARS = (
    "LANGCHAIN_API_KEY",
    "LANGSMITH_ENDPOINT",
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_PROJECT",
)


def find_missing_env_vars(names, env=None):
    """Return the entries of ``names`` that have no value in ``env``.

    Args:
        names: Iterable of environment variable names to look up.
        env: Mapping to inspect; defaults to ``os.environ``.

    Returns:
        A list, in input order, of the names whose lookup yields ``None``.
        A variable set to the empty string counts as present.
    """
    env = os.environ if env is None else env
    return [name for name in names if env.get(name) is None]


# No other cell in this notebook reads these variables directly, so a missing
# one is reported as a warning instead of stopping the notebook.
_missing_langsmith_vars = find_missing_env_vars(LANGSMITH_ENV_VARS)
if _missing_langsmith_vars:
    warnings.warn(
        "Environment variables not set: " + ", ".join(_missing_langsmith_vars)
    )

# Project and location passed to vertexai.init() further down; each is None
# when the corresponding variable is not set.
VERTEX_AI_PROJECT=os.environ.get('VERTEX_AI_PROJECT')
VERTEX_AI_LOCATION=os.environ.get('VERTEX_AI_LOCATION')

In [2]:
VERTEX_AI_LOCATION

'us-central1'

# Chains
In a conversational RAG application, queries issued to the retriever should be informed by the context of the conversation. LangChain provides a **create_history_aware_retriever** constructor to simplify this. It constructs a chain that accepts the keys **input** and **chat_history** as input, and has the same output schema as a retriever. **create_history_aware_retriever** requires the following inputs:

1. LLM
2. Retriever
3. Prompt
   
First we obtain these objects:

# LLM

In [3]:
import vertexai


In [4]:
# Initialise the Vertex AI SDK with the project and location read from the
# environment in the first cell.
vertexai.init(project=VERTEX_AI_PROJECT, location=VERTEX_AI_LOCATION)

In [21]:
from langchain_google_vertexai import VertexAI

# Model used by both the history-aware retriever and the QA chain built below.
GEMINI_MODEL_NAME = "gemini-pro"
llm = VertexAI(model_name=GEMINI_MODEL_NAME)


# Basic Retriever

In [7]:
# Retriever

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter



USER_AGENT environment variable not set, consider setting it to identify your requests.


In [8]:
from llama_index.indices.managed.llama_cloud import LlamaCloudIndex

In [9]:
# The LlamaCloud API key is a secret, so it is read from the environment
# (for example the .env file loaded in the first cell) instead of being
# embedded in the notebook. A missing variable raises KeyError naming it.
# NOTE(review): the key that used to be hardcoded here is exposed in the
# file's history and should be revoked and replaced.
index = LlamaCloudIndex(
  name="Schneider_test",
  project_name="Default",
  organization_id="05d118aa-cdd9-4de1-8cae-58356a536f4c",
  api_key=os.environ["LLAMA_CLOUD_API_KEY"],
)

In [10]:
from langchain_community.retrievers.llama_index import LlamaIndexRetriever
# NOTE(review): `retriever` is reassigned to a LlamaQueryEngineRetriever in a
# later cell before any chain uses it, so this instance appears to be unused.
# Confirm and consider removing this cell.
retriever = LlamaIndexRetriever(index=index, query_kwargs={})

In [11]:
# Build a LlamaIndex query engine on top of the cloud index. It is queried
# directly in the next cells and later wrapped as a LangChain retriever.
query_engine = index.as_query_engine(query_kwargs={})

In [12]:
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x14ff1340c20>

In [13]:
# Smoke-test the query engine.
# NOTE(review): this result is overwritten by the next cell before it is displayed.
response = query_engine.query("What do Discharge lamps depend on?")


In [14]:
# Second direct query against the engine; this is the response printed below.
response = query_engine.query("What is Internal MV distribution?")


In [15]:
# Printing the response object shows the answer text.
print(response)


Internal MV distribution refers to the distribution of medium voltage electricity within a facility or building.


# Prompt

### First create a contextualized system prompt

We'll use a prompt that includes a MessagesPlaceholder variable under the name "chat_history". 

This allows us to pass in a list of Messages to the prompt using the "chat_history" input key, 

and these messages will be inserted after the system message and before the human message containing the latest question.

<span style="color:red">
The purpose of this prerequisite chain is to reformulate the question, if required, in the context of the chat history.

If reformulation is not needed, the question is returned as is.</span>



It takes in 3 things:
1. The system Prompt for question Reformulation
2. Chat History
3. Original Question

In [16]:
# Prompt

from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

# System instruction for the question-rewriting step: turn a follow-up that
# leans on earlier turns into a self-contained question, without answering it.
contextualize_q_system_prompt = "".join(
    [
        "Given a chat history and the latest user question ",
        "which might reference context in the chat history, ",
        "formulate a standalone question which can be understood ",
        "without the chat history. Do NOT answer the question, ",
        "just reformulate it if needed and otherwise return it as is.",
    ]
)

# Message layout for the rewriting step: the system instruction, then the
# prior conversation, then the latest user question.
contextualize_q_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_q_messages)

# 1. Create History Aware Retriever Chain

We can then instantiate the history-aware retriever:

This chain prepends a rephrasing of the input query to our retriever, so that the retrieval incorporates the context of the conversation.



In [17]:
from langchain.schema import Document
from langchain_core.retrievers import BaseRetriever
from pydantic import Field
from typing import Any, Dict, List

class LlamaQueryEngineRetriever(BaseRetriever):
    """
    A LangChain retriever that wraps a LlamaIndex query engine.

    Each lookup calls ``query_engine.query`` and converts the response's
    ``source_nodes`` into LangChain ``Document`` objects. Only the source
    nodes are used; the rest of the response is ignored.
    """

    # The LlamaIndex query engine. It defaults to None so the field is
    # optional at construction time, but it must be set before retrieving.
    query_engine: Any = None

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return the source nodes for ``query`` as LangChain documents.

        Args:
            query: The question text forwarded to the query engine.
            run_manager: Accepted for interface compatibility; not used here.

        Returns:
            One ``Document`` per source node, in the order the engine returned
            them. The list is empty when the response carries no source nodes.

        Raises:
            ValueError: If no query engine has been configured.
        """
        if self.query_engine is None:
            raise ValueError(
                "LlamaQueryEngineRetriever requires a query_engine, but none was provided."
            )

        # Run the query through the LlamaIndex query engine
        response = self.query_engine.query(query)

        # A response without source nodes yields no documents instead of an error.
        source_nodes = getattr(response, "source_nodes", None) or []

        # Convert the LlamaIndex response into LangChain Documents
        return [
            Document(
                page_content=source_node.get_content(),
                # Copy the metadata so edits to a Document do not reach the node.
                metadata=dict(source_node.metadata or {}),
            )
            for source_node in source_nodes
        ]


In [18]:
# Wrap the LlamaIndex query engine so the LangChain chains below can use it as a retriever.
retriever = LlamaQueryEngineRetriever(query_engine=query_engine)


In [19]:
retriever

LlamaQueryEngineRetriever(query_engine=<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine object at 0x0000014FF1340C20>)

In [22]:
# Combine the LLM, the retriever and the question-rewriting prompt into the
# history-aware retriever chain.
history_aware_retriever_chain = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

# 2. Build QA Chain

As in the RAG tutorial, we will use **create_stuff_documents_chain** to generate a **question_answer_chain**, with input keys 

1. **context**
2. **chat_history**
3. **input**

It accepts the retrieved context alongside the conversation history and query to generate an answer.



In [23]:
# System instruction for the answering step. The "{context}" placeholder is
# left in the text for the prompt template to fill in.
system_prompt = "".join(
    [
        "You are an assistant for question-answering tasks. ",
        "Use the following pieces of retrieved context to answer ",
        "the question. If you don't know the answer, say that you ",
        "don't know. Use three sentences maximum and keep the ",
        "answer concise.",
        "\n\n",
        "{context}",
    ]
)
# Message layout for the answering step: the system instruction (with the
# retrieved context), then the prior conversation, then the user question.
qa_messages = [
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Chain that passes the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# 3. Build RAG Chain


We build our final **rag_chain** with **create_retrieval_chain**. 

This chain applies the 
**history_aware_retriever_chain** and **question_answer_chain** (created above)

in sequence, retaining intermediate outputs such as the retrieved context for convenience. It has the input keys **input** and **chat_history**, and includes **input**, **chat_history**, **context**, and **answer** in its output.


In [24]:
# Retrieval step first, answering step second.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever_chain,
    combine_docs_chain=question_answer_chain,
)

# Adding chat history
To manage the chat history, we will need:

- An object for storing the chat history;
- An object that wraps our chain and manages updates to the chat history.

For these we will use **BaseChatMessageHistory** and **RunnableWithMessageHistory**. The latter is a wrapper for an LCEL chain and a **BaseChatMessageHistory** that handles injecting chat history into inputs and updating it after each invocation.

In [25]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``.

    An empty ``ChatMessageHistory`` is created and registered in ``store``
    the first time a session id is seen; later calls return that same object.
    """
    try:
        return store[session_id]
    except KeyError:
        new_history = ChatMessageHistory()
        store[session_id] = new_history
        return new_history


In [26]:
# Function to generate a unique session ID
import uuid

def generate_session_id() -> str:
    """Return a new random (version 4) UUID rendered as a string."""
    fresh_id = uuid.uuid4()
    return str(fresh_id)

# 4. Build Conversational RAG Chain


Finally we build our final **conversational_rag_chain** with 
1. rag_chain 
2. get_session_history
3. input_messages_key="input"
4. history_messages_key="chat_history"
5. output_messages_key="answer"


In [27]:
# Wrap the RAG chain so chat history is looked up per session via
# get_session_history, using the "input", "chat_history" and "answer" keys.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)

In [28]:
# Generate a fresh random session id. Every invocation below passes this id,
# so all the turns share one chat history.
session_id_1 = generate_session_id()

In [29]:
session_id_1

'678181cd-0531-4f5b-b7e8-4b69aaac27d3'

In [31]:
# Ask the first question with a synchronous invoke call; the session id
# selects which stored chat history the wrapper uses.
session_config = {"configurable": {"session_id": session_id_1}}
response = conversational_rag_chain.invoke(
    {"input": "what do Discharge lamps depend on?"},
    config=session_config,
)

# Print the full chain output
print(response)

{'input': 'what do Discharge lamps depend on?', 'chat_history': [], 'context': [Document(metadata={'file_size': 1031804, 'last_modified_at': '2025-02-08T12:34:21', 'file_path': 'Schneider_2018_small.pdf', 'file_name': 'Schneider_2018_small.pdf', 'external_file_id': 'Schneider_2018_small.pdf', 'file_id': 'e95aa0e4-a724-4fb7-9657-836519ab515d', 'pipeline_file_id': '6e8f09cb-3bdb-4312-aac2-9536435be726', 'pipeline_id': 'd42e94f1-972d-4b7d-99ec-5af1d540c623', 'page_label': 1, 'start_page_index': 0, 'start_page_label': 1, 'end_page_index': 0, 'end_page_label': 1, 'document_id': '1a37c32a2d258cae41793ceb72ed2d2e95d4332d81f73d6fbc', 'start_char_idx': 0, 'end_char_idx': 2485}, page_content='# A - General rules of electrical installation design\n\n# 3 Installed power loads - Characteristics\n\n|Type of lamp|Lamp power (W)|Current at 230 V (A)|\n|---|---|---|\n|Separated ballast lamp|10|0.080|\n| |18|0.110|\n| |26|0.150|\n|Integrated ballast lamp|8|0.075|\n| |11|0.095|\n| |16|0.125|\n| |21|0.170

In [32]:
# Ask a second question in the same session with a synchronous invoke call.
session_config = {"configurable": {"session_id": session_id_1}}
response = conversational_rag_chain.invoke(
    {"input": "what is Internal MV distribution?"},
    config=session_config,
)

# Print the full chain output
print(response)



{'input': 'what is Internal MV distribution?', 'chat_history': [HumanMessage(content='what do Discharge lamps depend on?', additional_kwargs={}, response_metadata={}), AIMessage(content='Discharge lamps depend on the luminous electrical discharge through a gas or vapor of a metallic compound.', additional_kwargs={}, response_metadata={})], 'context': [Document(metadata={'file_size': 1031804, 'last_modified_at': '2025-02-08T12:34:21', 'file_path': 'Schneider_2018_small.pdf', 'file_name': 'Schneider_2018_small.pdf', 'external_file_id': 'Schneider_2018_small.pdf', 'file_id': 'e95aa0e4-a724-4fb7-9657-836519ab515d', 'pipeline_file_id': '6e8f09cb-3bdb-4312-aac2-9536435be726', 'pipeline_id': 'd42e94f1-972d-4b7d-99ec-5af1d540c623', 'page_label': 3, 'start_page_index': 2, 'start_page_label': 3, 'end_page_index': 2, 'end_page_label': 3, 'document_id': '1a37c32a2d258cae41793ceb72ed2d2e95d4332d81f73d6fbc', 'start_char_idx': 11204, 'end_char_idx': 13611}, page_content='# B - Connection to the MV ut

In [None]:
# Inspect the Python type of the chain output.
type(response)

In [33]:
# The generated answer text is stored under the "answer" key of the chain output.
response["answer"]

'Internal MV distribution describes a system where multiple secondary substations are supplied by a MV distribution circuit within the installation. This is typically used when several MV/LV transformers are needed, and all are located within the main substation.'

In [None]:
# Follow-up question in the same session; display only the answer text.
session_config = {"configurable": {"session_id": session_id_1}}
followup_response = conversational_rag_chain.invoke(
    {"input": "Why it depends?"},
    config=session_config,
)
followup_response["answer"]

In [None]:
# Another follow-up in the same session; the full output dict is displayed.
session_config = {"configurable": {"session_id": session_id_1}}
conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config=session_config,
)

# Streaming final outputs


The .stream method will by default stream each key in a sequence.

In [None]:
# Create a stream of the chain output for a follow-up question in the same session.
session_config = {"configurable": {"session_id": session_id_1}}
stream = conversational_rag_chain.stream(
    {"input": "What are common ways of doing it?"},
    config=session_config,
)

In [None]:
# Print every streamed chunk as it arrives, one per line.
for streamed_chunk in stream:
    print(streamed_chunk)

# Streaming only Answers

We are free to process chunks as they are streamed out. If we just want to stream the answer tokens, for example, we can select chunks with the corresponding key:

In [None]:
# Fresh stream for the same follow-up question, consumed by the next cell.
session_config = {"configurable": {"session_id": session_id_1}}
stream = conversational_rag_chain.stream(
    {"input": "What are common ways of doing it?"},
    config=session_config,
)

Stream with a separator character (`|`) after each answer chunk

In [None]:
# Print only the non-empty "answer" pieces, each followed by "|".
for chunk in stream:
    answer_chunk = chunk.get("answer")
    if not answer_chunk:
        continue
    print(f"{answer_chunk}|", end="")

Stream without a separator character

In [None]:
# Fresh stream for the same follow-up question, consumed by the next cell.
session_config = {"configurable": {"session_id": session_id_1}}
stream = conversational_rag_chain.stream(
    {"input": "What are common ways of doing it?"},
    config=session_config,
)

In [None]:
# Print only the non-empty "answer" pieces back to back, with no separator.
for chunk in stream:
    answer_chunk = chunk.get("answer")
    if not answer_chunk:
        continue
    print(f"{answer_chunk}", end="")

### Stream the answer using the .pick method

Reference: https://python.langchain.com/v0.2/api_reference/core/runnables/langchain_core.runnables.base.Runnable.html#langchain_core.runnables.base.Runnable.pick

More simply, we can use the .pick method to select only the desired key:



In [None]:
# NOTE(review): this stream is never iterated; `stream` is reassigned from
# pick_answer_chain two cells below. Confirm and consider removing this cell.
stream= conversational_rag_chain.stream(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": session_id_1}},
)

In [None]:
# Derive a chain that outputs only the "answer" key of the conversational chain's result.
pick_answer_chain = conversational_rag_chain.pick("answer")


In [None]:
# Stream the same follow-up question through the answer-only chain.
session_config = {"configurable": {"session_id": session_id_1}}
stream = pick_answer_chain.stream(
    {"input": "What are common ways of doing it?"},
    config=session_config,
)

In [None]:
# Print the streamed pieces back to back, with no separator.
for answer_piece in stream:
    print(f"{answer_piece}", end="")