In [92]:
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import WikipediaLoader,TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate,SystemMessagePromptTemplate,HumanMessagePromptTemplate
from langchain_core.messages import HumanMessage,SystemMessage,AIMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableMap,RunnableLambda,RunnableParallel,RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from pydantic import BaseModel,Field
from dotenv import load_dotenv
# Read settings from a local .env file into the process environment before the
# model clients below are constructed.
load_dotenv()

# Chat model shared by every chain defined in this notebook.
llm_model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    max_retries=2,
)

# Embedding model used to vectorise the document chunks for the vector store.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [109]:
# Fetch up to 15 English Wikipedia pages matching the query.
documents = WikipediaLoader(
    query='Virat Kohli',
    lang='en',
    load_max_docs=15
).load()

print(f"Number of docs loaded:\n{len(documents)}")

# Fail with a readable message instead of a bare IndexError on documents[0] below.
assert documents, "WikipediaLoader returned no documents; check the query and network access"

# Cap on how much page text is echoed, so the cell output stays skimmable.
PREVIEW_CHARS = 500

print(f"Document metadata:\n{documents[0].metadata}")
print(f"Document Page content:\n{documents[0].page_content[:PREVIEW_CHARS]}")

Number of docs loaded:
15
Document metadata:
{'title': 'Virat Kohli', 'summary': "Virat Kohli (Hindi pronunciation: [ʋɪˈɾaːʈᵊ ˈkoːɦᵊliː] , born 5 November 1988) is an Indian international cricketer and the former captain of the Indian national cricket team. He is a right-handed batsman and an occasional medium-pace bowler. He currently represents Royal Challengers Bengaluru in the IPL and Delhi in domestic cricket.  Kohli is widely regarded as one of the greatest all-format batters of all time. He also holds the record for scoring the most centuries in ODI cricket and stands second in the list of most international centuries scored, and is highest run-scorer in the IPL. Kohli was a member of the Indian team that won the 2011 Cricket World Cup, 2013 ICC Champions Trophy, 2024 T20 World Cup and 2025 Champions Trophy. Under his captaincy India held the ICC World Test Championship Mace between 2016 and 2020, winning the end of season prize on three occasions.\nIn 2013, Kohli was ranked num

In [110]:
# Split the loaded pages into ~500-character chunks with a 50-character overlap,
# trying paragraph breaks first, then line breaks, then spaces.
chunker = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=500,
    chunk_overlap=50,
)
doc_chunks = chunker.split_documents(documents)

# Quick sanity check on the split: total count plus the first chunk.
print(f"Number of chunks loaded:\n{len(doc_chunks)}")
print(f"Chunk metadata:\n{doc_chunks[0].metadata}")
print(f"Chunk Page content:\n{doc_chunks[0].page_content}")

Number of chunks loaded:
188
Chunk metadata:
{'title': 'Virat Kohli', 'summary': "Virat Kohli (Hindi pronunciation: [ʋɪˈɾaːʈᵊ ˈkoːɦᵊliː] , born 5 November 1988) is an Indian international cricketer and the former captain of the Indian national cricket team. He is a right-handed batsman and an occasional medium-pace bowler. He currently represents Royal Challengers Bengaluru in the IPL and Delhi in domestic cricket.  Kohli is widely regarded as one of the greatest all-format batters of all time. He also holds the record for scoring the most centuries in ODI cricket and stands second in the list of most international centuries scored, and is highest run-scorer in the IPL. Kohli was a member of the Indian team that won the 2011 Cricket World Cup, 2013 ICC Champions Trophy, 2024 T20 World Cup and 2025 Champions Trophy. Under his captaincy India held the ICC World Test Championship Mace between 2016 and 2020, winning the end of season prize on three occasions.\nIn 2013, Kohli was ranked num

In [111]:
persist_dir = './chromadb'
collection_name = 'query_decomp2'

# from_documents adds to whatever is already persisted under this collection name
# (each run gets fresh random ids), so re-running this cell would store every chunk
# again and the retriever would start returning duplicates. Drop any copy left by
# an earlier run first so the cell is idempotent under Restart & Run All.
Chroma(
    collection_name=collection_name,
    embedding_function=embedding_model,
    persist_directory=persist_dir,
).delete_collection()

# Embed the chunks and persist them in a fresh collection.
vector_store = Chroma.from_documents(
    documents=doc_chunks,
    embedding=embedding_model,
    collection_name=collection_name,
    persist_directory=persist_dir
)
print(f"Number of vectors created in the store: {vector_store._collection.count()}")

# Retriever using maximal-marginal-relevance search, returning 3 chunks per query.
retriever = vector_store.as_retriever(search_type='mmr',
                                      search_kwargs={'k':3})

Number of vectors created in the store: 188


In [112]:
### query decomposition chain ##

def format_questions_list(questions):
    """Return the values held by ``questions`` as a plain list.

    ``questions`` is passed through ``dict(...)``, so it can be anything that
    constructor accepts (a mapping or an iterable of key/value pairs). Only the
    values are kept, in the order of the resulting dict.
    """
    return [*dict(questions).values()]
    

# Structured-output schema for the decomposition step: exactly three string
# fields, one per sub-question. Passed to llm_model.with_structured_output(...)
# in decompositionChain.
# NOTE(review): documented with `#` comments rather than a class docstring because
# a docstring would presumably be included in the schema sent to the model — confirm
# before adding one.
class decomposition_resp(BaseModel):
    subques1:str = Field(...,description='SubQuestion number 1')  # first sub-question (required)
    subques2:str = Field(...,description='SubQuestion number 2')  # second sub-question (required)
    subques3:str = Field(...,description='SubQuestion number 3')  # third sub-question (required)

def decompositionChain(query):
    """Break ``query`` into three sub-questions using the chat model.

    A system instruction plus the human query are piped into ``llm_model``
    constrained to the ``decomposition_resp`` schema, and the structured reply
    is flattened by ``format_questions_list``.

    Args:
        query: The user's original question.

    Returns:
        The result of ``format_questions_list`` applied to the model's
        structured reply (a list of the sub-question values).
    """
    structured_llm = llm_model.with_structured_output(decomposition_resp)

    messages = [
        SystemMessagePromptTemplate.from_template(
            '''You are an intelligent assistant, which understands the human query nuances.
    Decompose the question into 3 sub-questions for better retrieval and better coverage of the human query'''
        ),
        HumanMessagePromptTemplate.from_template('''Human Query : {query}'''),
    ]

    pipeline = ChatPromptTemplate.from_messages(messages) | structured_llm | format_questions_list
    return pipeline.invoke({'query': query})


In [113]:
## question retriever chain ###
from typing import List

def synthesizer(query,subans1,subans2,subans3):
    """Merge three sub-answers into one compact answer for ``query``.

    Args:
        query: The user's original question.
        subans1: Answer to the first sub-question.
        subans2: Answer to the second sub-question.
        subans3: Answer to the third sub-question.

    Returns:
        The chat model's combined answer, parsed to a string.
    """
    template_text = '''
    You are an intelligent synthesizer, based on following query & retrieved answers,
    create a combined answer, which  represents all the three answers, but keep the combined output compact.

    query : {query}
    answer1 : {subans1},
    answer2 : {subans2},
    answer3 : {subans3}
    '''

    prompt_inputs = {
        'query': query,
        'subans1': subans1,
        'subans2': subans2,
        'subans3': subans3,
    }

    merge_chain = ChatPromptTemplate.from_template(template_text) | llm_model | StrOutputParser()
    return merge_chain.invoke(prompt_inputs)



def format_docs(docs)->str:
    """Build one context string from the ``page_content`` of each item in ``docs``.

    Consecutive texts are separated by a blank line; an empty ``docs`` yields ``""``.
    """
    texts = [chunk.page_content for chunk in docs]
    return "\n\n".join(texts)



def rag_chain(query):
    """Answer ``query`` via query decomposition, per-question RAG, and synthesis.

    Steps:
      1. ``decompositionChain`` splits ``query`` into sub-questions.
      2. Each sub-question is answered from its own retrieved context.
      3. ``synthesizer`` merges the three sub-answers into the final answer.

    Args:
        query: The user's original question.

    Returns:
        The synthesized answer string produced by ``synthesizer``.
    """
    subquestions = decompositionChain(query)
    print(f"3 decomposed questions : {subquestions}")

    rag_instruction_prompt = ChatPromptTemplate.from_template(
    '''
    You are an intelligent assistant, based on following  retrieved context documents,
    answer the question in a concise and streamlined manner in about 4-5 sentences.

    Context : {context},
    question : {query}
    '''
    )

    # A single chain answers any one sub-question: the incoming question is sent to
    # the retriever (whose documents are flattened into the context) and is also
    # passed through unchanged as the prompt's `query`.
    sub_answer_chain = (
        {"context": retriever | format_docs, "query": RunnablePassthrough()}
        | rag_instruction_prompt
        | llm_model
        | StrOutputParser()
    )

    # batch() processes the sub-questions together and returns the answers in input
    # order, replacing three hand-copied branches with hard-coded indices.
    sub_answers = sub_answer_chain.batch(subquestions)

    # synthesizer takes exactly three sub-answers, matching decomposition_resp's fields.
    return synthesizer(query, *sub_answers)
    



In [114]:
# Run the full decompose -> retrieve -> synthesize pipeline on a sample question;
# the returned answer is displayed as the cell's output.
rag_chain("Is Virat Kohli the best cricketer?")

3 decomposed questions : ["What are Virat Kohli's career statistics and achievements?", "How do Virat Kohli's statistics compare to those of other top cricketers?", "What are the subjective opinions of experts and fans regarding Virat Kohli's status as a cricketer?"]


"Virat Kohli is a highly accomplished Indian cricketer and former captain, celebrated as a right-handed top-order batsman and considered one of the greatest all-format batters. His impressive record includes 82 international centuries (30 in Tests, 51 in ODIs, and 1 in T20Is). He has received numerous accolades, including the Arjuna Award, Padma Shri, Khel Ratna, and was named Wisden Leading Cricketer in the World multiple times. Kohli's influence extends beyond the field, as recognized by Time magazine, and he is also one of the most commercially successful athletes globally."