In [1]:
import os

# LangSmith tracing configuration.
# LangSmith picks its settings up from environment variables, so the values
# are exported to os.environ; assigning plain Python variables alone has no
# effect on tracing. The module-level names are kept for later cells.
LANGSMITH_TRACING = True
LANGSMITH_ENDPOINT = "https://api.smith.langchain.com"
LANGSMITH_PROJECT = "rag_pipeline_with_chat_history"

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = LANGSMITH_ENDPOINT
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT

# The API key is only read, never written: it must already be in the
# environment. It can be None at this point when the key lives only in a
# .env file, because load_dotenv() is called in a later cell.
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')

In [2]:
from dotenv import load_dotenv
# Populate the process environment from a local .env file before any client
# below is constructed (presumably where the Google / LangSmith keys live --
# confirm against your .env).
load_dotenv()

from langchain_google_genai import ChatGoogleGenerativeAI
# Chat model shared by every chain defined in the cells below.
llm = ChatGoogleGenerativeAI(model='gemini-2.5-flash-preview-04-17')

from langchain.embeddings import HuggingFaceEmbeddings
# Embedding model used when building the Chroma vector stores below.
# NOTE(review): the recorded cell output echoes this line, which looks like a
# deprecation warning for this import path -- confirm and migrate if so.
embedding_model = HuggingFaceEmbeddings(model_name = 'BAAI/bge-base-en-v1.5')

  embedding_model = HuggingFaceEmbeddings(model_name = 'BAAI/bge-base-en-v1.5')
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Identify this notebook's HTTP requests. The recorded output of this cell
# shows "USER_AGENT environment variable not set"; the variable has to be set
# before the loader module is imported. setdefault keeps any value the user
# has already exported.
os.environ.setdefault("USER_AGENT", "rag-pipeline-with-chat-history-notebook")

from langchain_community.document_loaders import WebBaseLoader
import bs4

# Fetch the blog post and keep only the title, header and body elements;
# everything else on the page is skipped by the SoupStrainer filter.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks, embed them into a
# Chroma vector store, and expose that store as a retriever.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(
    embedding=embedding_model,
    documents=splits,
)
retriever = vectorstore.as_retriever()

In [5]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain

# System instruction for the question-rewriting step: turn a follow-up that
# leans on the chat history into a standalone question, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, then prior turns, then the new input.
# (The spelling of this name and of `history_aware_retriver` is kept as-is
# because later cells reference them.)
contexualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Retriever built from the LLM, the base retriever and the rewriting prompt.
history_aware_retriver = create_history_aware_retriever(
    llm,
    retriever,
    contexualize_q_prompt,
)



In [6]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# System instruction for the answering step; "{context}" is the template slot
# for the retrieved documents.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n{context}"
)

# Prompt layout: system instruction, then prior turns, then the user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# Answering chain over the LLM and QA prompt, combined with the
# history-aware retriever into the full retrieval chain.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(
    history_aware_retriver,
    question_answer_chain,
)

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    A new, empty ``ChatMessageHistory`` is stored under the id the first time
    it is requested; later calls return that same stored object.
    """
    try:
        return store[session_id]
    except KeyError:
        history = store[session_id] = ChatMessageHistory()
        return history


# Wrap the retrieval chain with per-session message history. The three key
# names tell the wrapper which input field carries the user message, which
# prompt variable receives the history, and which output field is the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    output_messages_key="answer",
    history_messages_key="chat_history",
    input_messages_key="input",
)

In [7]:
# First turn of session "abc123"; the cell displays only the "answer" field.
first_turn = conversational_rag_chain.invoke(
    {"input": "What is Task Decomposition?"},
    config={"configurable": {"session_id": "abc123"}},
)
first_turn["answer"]

'Task decomposition is a technique used to break down complex or hard tasks into smaller, simpler steps. This transforms big tasks into multiple manageable parts, often by instructing a model to "think step by step." Chain of Thought (CoT) is a standard prompting technique that utilizes task decomposition to enhance model performance on complex tasks.'

In [8]:
# Follow-up in the same session ("abc123"), so the stored history is
# available to resolve "it"; the cell displays only the "answer" field.
follow_up_turn = conversational_rag_chain.invoke(
    {"input": "What are common ways of doing it?"},
    config={"configurable": {"session_id": "abc123"}},
)
follow_up_turn["answer"]

'Based on the text, task decomposition can be done by an LLM using simple prompting or task-specific instructions, or with human inputs. Chain of Thought (CoT) is a standard technique that instructs models to "think step by step" to decompose tasks. Tree of Thoughts extends CoT by decomposing problems into thought steps and generating multiple thoughts per step.'

### Implementing streaming capabilities

In [9]:
### implementing streaming capabilities
# LangSmith configuration for the streaming section. The values are exported
# to os.environ because plain Python variables are not read by LangSmith.
# NOTE(review): LangSmith may cache environment lookups once tracing has
# already run in this kernel -- confirm the new project name takes effect, or
# restart the kernel before running this section.
LANGSMITH_TRACING = True
LANGSMITH_ENDPOINT = "https://api.smith.langchain.com"
LANGSMITH_PROJECT = "rag_pipeline_with_chat_history_streaming"

os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = LANGSMITH_ENDPOINT
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT

# This section uses its own key variable; only override the active key when
# that variable is actually set, so a missing value does not wipe a working key.
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY_STREAMING')
if LANGSMITH_API_KEY:
    os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY


In [10]:
from langchain import hub
# Reload the blog post, keeping only the title, header and body elements.
bs_strainer = bs4.SoupStrainer(class_=("post-content", "post-title", "post-header"))
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": bs_strainer},
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Use a dedicated collection for this section. An earlier cell already indexed
# the same chunks with Chroma.from_documents using the default collection;
# indexing them there again would store every chunk twice and make the
# retriever return duplicate passages.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    collection_name="rag_pipeline_streaming",
)


In [26]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Retriever over the vector store built in the previous cell.
retriever = vectorstore.as_retriever()
# Pull the "rlm/rag-prompt" prompt from the LangChain hub.
# NOTE(review): `prompt` is not referenced by any cell visible in this
# notebook (the chains below use `qa_prompt`) -- confirm whether it is needed.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Documents are joined in order, separated by a blank line. An empty input
    produces an empty string.
    """
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)


In [27]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instruction for the question-rewriting step: produce a standalone
# version of the latest question without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, prior turns, then the new question.
# The tag is attached via with_config.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
]).with_config(tags=["contextualize_q_system_prompt"])

# prompt -> LLM -> plain string, tagged as a unit.
contextualize_q_chain = (
    contextualize_q_prompt
    | llm
    | StrOutputParser()
).with_config(tags=["contextualize_q_chain"])

# System instruction for the answering step; "{context}" is the template slot
# for the retrieved text.
qa_system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n{context}"
)

# Prompt layout: system instruction, prior turns, then the user's question.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])


def contextualized_question(input: dict):
    """Choose how the retrieval query is obtained for this request.

    With no (or empty) ``chat_history`` the raw ``question`` string is
    returned. Otherwise the ``contextualize_q_chain`` runnable itself is
    returned rather than a string.
    """
    if not input.get("chat_history"):
        return input["question"]
    return contextualize_q_chain


# Context branch: obtain the query, retrieve documents, join them into text.
context_pipeline = contextualized_question | retriever | format_docs

# Add "context" to the incoming dict, fill the QA prompt, and call the LLM.
rag_chain = RunnablePassthrough.assign(context=context_pipeline) | qa_prompt | llm

In [30]:
from langchain_core.messages import HumanMessage

# Two-turn conversation with manually managed history. After each call the
# user question and the model reply are appended, so the next call sees them.
chat_history = []

question = "What is Task Decomposition?"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])

second_question = "What are common ways of doing it?"
ai_message2 = rag_chain.invoke({"question": second_question, "chat_history": chat_history})
# Record the question that was actually asked in this turn (second_question,
# not the first one) so the history matches the conversation.
chat_history.extend([HumanMessage(content=second_question), ai_message2])

# Only the last expression of a cell is displayed; the first reply is shown
# by the next cell.
ai_message2

AIMessage(content='Task decomposition can be commonly done using simple prompts for Large Language Models (LLMs) or with specific task instructions. Human inputs are also a way to decompose tasks. Techniques like Chain of Thought and Tree of Thoughts are specific methods that utilize decomposition by breaking problems into sequential or multiple thought steps.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'models/gemini-2.5-flash-preview-04-17', 'safety_ratings': []}, id='run--f7c5804c-a9e9-476e-8f16-c5e0bcc045cd-0', usage_metadata={'input_tokens': 966, 'output_tokens': 58, 'total_tokens': 1719, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 695}})

In [31]:
# Display the reply to the first question from the previous cell.
ai_msg

AIMessage(content='Task decomposition is a technique used to break down complicated tasks into smaller, simpler steps or subgoals. This process makes hard tasks more manageable. Techniques like Chain of Thought and Tree of Thoughts utilize task decomposition.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'models/gemini-2.5-flash-preview-04-17', 'safety_ratings': []}, id='run--bfe279ad-1fc8-47f3-b7aa-70f5b0cc7c10-0', usage_metadata={'input_tokens': 915, 'output_tokens': 41, 'total_tokens': 1274, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 318}})