<a href="https://colab.research.google.com/github/shivam110601/advance-rag-application/blob/main/advance_rag_application.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain torch chromadb langchain-community langchain-huggingface

In [None]:
!pip install -qU langchain-chroma langchain_google_genai pypdf

In [8]:
# Import necessary libraries
import os
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader #
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter #
from langchain_huggingface import HuggingFaceEmbeddings #
from langchain_chroma import Chroma #
from langchain.chains import create_retrieval_chain #
from langchain.chains.combine_documents import create_stuff_documents_chain #
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate #
import torch
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI #
from langchain.retrievers import ContextualCompressionRetriever #
from langchain.retrievers.document_compressors import CrossEncoderReranker #
from langchain_community.cross_encoders import HuggingFaceCrossEncoder #
from typing import List
from langchain_core.output_parsers import BaseOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever

# Fetch the Gemini API key from Colab's `userdata` store instead of hardcoding it.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# DOC, CHUNKS, VECTORSTORE

In [4]:
def text_extract(folder_path, chunk_size=1000, chunk_overlap=0):
    """Load every PDF in a folder and split the pages into character-based chunks.

    Args:
        folder_path: Directory handed to ``PyPDFDirectoryLoader``.
        chunk_size: Upper bound on chunk length passed to the splitter
            (default 1000, the value that used to be hard-coded).
        chunk_overlap: Overlap between consecutive chunks (default 0, the
            value that used to be hard-coded).

    Returns:
        A ``(chunks, message)`` tuple: the split documents and a short
        summary string stating how many chunks were produced.
    """
    loader = PyPDFDirectoryLoader(folder_path)
    text_splitter = RecursiveCharacterTextSplitter(
        # Tried in order: paragraph, line, sentence, word, then single characters.
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Load and split explicitly: the loader's load_and_split() helper is
    # deprecated and performs these same two steps.
    pages = text_splitter.split_documents(loader.load())
    return pages, f"Splitted documents into {len(pages)} chunks"


def token_text_split(split_texts):
    """Re-split documents by token count using the MiniLM sentence-transformer tokenizer.

    Args:
        split_texts: Documents to split further (``process_document`` passes
            the output of ``text_extract``).

    Returns:
        A ``(chunks, message)`` tuple: the token-split documents and a short
        summary string stating how many chunks were produced.
    """
    splitter_options = dict(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        tokens_per_chunk=256,
        chunk_overlap=0,
    )
    splitter = SentenceTransformersTokenTextSplitter(**splitter_options)
    token_chunks = splitter.split_documents(split_texts)
    chunk_total = len(token_chunks)
    return token_chunks, f"Splitted document into {chunk_total} chunks"



def create_vector_store(split_tokens, c_name="sample-set"):
    """Embed the chunks into a Chroma collection and expose it as a retriever.

    Args:
        split_tokens: Documents to embed and index.
        c_name: Name of the Chroma collection to write into.

    Returns:
        A ``(retriever, message)`` tuple: a retriever configured with
        ``k=20`` and a summary string reporting the collection's item count.
    """
    # NOTE(review): no persist directory is given, so this looks like an
    # in-memory collection; presumably re-running in the same kernel with the
    # same collection name adds the chunks a second time -- confirm.
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma.from_documents(
        documents=split_tokens,
        embedding=embedder,
        collection_name=c_name,
    )
    top_k_retriever = store.as_retriever(search_kwargs={"k": 20})
    # Count is read from the underlying collection object (private attribute).
    stored_count = store._collection.count()
    return top_k_retriever, f"Created vector store with {stored_count} embeddings"

def process_document(file_path):
    """Run the ingestion pipeline end to end and return a ready-to-use retriever.

    Chains ``text_extract`` -> ``token_text_split`` -> ``create_vector_store``
    and prints the status message each stage returns.

    Args:
        file_path: Forwarded unchanged to ``text_extract``, which passes it to
            a directory loader (so it is a folder path despite the name).

    Returns:
        The retriever produced by ``create_vector_store``.
    """
    char_chunks, extract_msg = text_extract(file_path)
    print(extract_msg)

    token_chunks, split_msg = token_text_split(char_chunks)
    print(split_msg)

    doc_retriever, store_msg = create_vector_store(token_chunks)
    print(store_msg)

    return doc_retriever

In [5]:
# Build the index once from the PDFs in the upload folder. `retriever` is a
# notebook-level global that get_rag_response() reads on every query.
retriever = process_document("/content/uploaded_pdfs")

Splitted documents into 351 chunks


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Splitted document into 353 chunks
Created vector store with 353 embeddings


In [6]:
def pretty_print_docs(docs):
    """Format documents as a numbered plain-text preview.

    Each entry shows its 1-based position, the ``page`` and ``source``
    metadata values (``None`` when a key is absent) and the first 100
    characters of the content, always followed by ``...``.

    Args:
        docs: Iterable of objects exposing ``metadata`` (with ``.get``) and
            ``page_content`` (sliceable text).

    Returns:
        The concatenated preview text; an empty string when ``docs`` is empty.
    """
    def _preview(number, doc):
        meta = doc.metadata
        return (
            f"\n\nDocument {number}:\n"
            f"Page: {meta.get('page')} and Source: {meta.get('source')}\n"
            f"Content: {doc.page_content[:100]}..."
        )

    return "".join(_preview(number, doc) for number, doc in enumerate(docs, start=1))

# Query Processing

In [9]:
# Shared Gemini chat model: multi_query_retriever() uses it to rewrite the
# question and rag_chain() uses it to generate the final answer.
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY, temperature=0.2)

In [10]:
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns newline-separated model output into a list of lines."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` on newlines and return the non-blank lines.

        Each line is stripped of surrounding whitespace first, so
        whitespace-only lines, stray carriage returns and indentation do not
        end up as (or inside) the returned entries.

        Args:
            text: Raw text to split.

        Returns:
            The stripped, non-empty lines in their original order.
        """
        stripped_lines = (line.strip() for line in text.strip().split("\n"))
        return [line for line in stripped_lines if line]


def multi_query_retriever(retriever):
    """Wrap a retriever so each question is expanded into several LLM-written variants.

    The notebook-level ``model`` is prompted for 5 similar questions, one per
    line, and ``LineListOutputParser`` splits the reply into a list. The
    resulting ``MultiQueryRetriever`` is created with ``include_original=True``
    so the user's own wording is searched alongside the generated ones.

    Args:
        retriever: Base retriever to wrap.

    Returns:
        A ``MultiQueryRetriever`` built on top of ``retriever``.
    """
    output_parser = LineListOutputParser()

    prompt_template = """
        You are a helpful and creative assistant with a goal to assist users.
        When the user asks a question, your task is to generate 5 semantically
        similar questions to it, to help them find the information they need.
        Make sure that the questions are diverse and cover different aspects
        of the topic. Suggest only short queries without compound sentences.
        Output one question per line. Do not number the questions.
        User question: {question}

        Similar questions:
    """

    # The template's only variable is 'question'.
    q_prompt = PromptTemplate(input_variables=["question"], template=prompt_template)

    # prompt -> chat model -> list of query strings
    llm_chain = q_prompt | model | output_parser

    # parser_key is intentionally not passed: MultiQueryRetriever documents it
    # as deprecated and no longer used.
    mretriever = MultiQueryRetriever(
        include_original=True,
        retriever=retriever,
        llm_chain=llm_chain,
    )

    return mretriever


In [11]:
def rag_chain(retriever, top_n=5):
    """Assemble the retrieval chain: multi-query search, cross-encoder rerank, answer.

    Args:
        retriever: Base retriever; it is wrapped by ``multi_query_retriever``.
        top_n: Number of documents the cross-encoder reranker keeps
            (default 5, the value that used to be hard-coded).

    Returns:
        The chain returned by ``create_retrieval_chain``. ``get_rag_response``
        invokes it with ``{"input": ...}`` and reads the ``"answer"`` and
        ``"context"`` keys of the result.
    """
    prompt = ChatPromptTemplate.from_template("""
    You are an expert helpful assistant.
    Answer the following question based on the provided context. For any question,
    if you cannot answer the question from the context, just say "I don't know".
    Keep the answers concise for the most part.

    Context: {context}

    Question: {input}

    Answer:
    """)

    # Stuffs the retrieved documents into the prompt's {context} slot.
    document_chain = create_stuff_documents_chain(model, prompt)

    mretriever = multi_query_retriever(retriever)

    # Rerank the multi-query candidates with a cross-encoder and keep the best top_n.
    cross_encoder = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
    compressor = CrossEncoderReranker(model=cross_encoder, top_n=top_n)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=mretriever
    )

    rag_rerank_chain = create_retrieval_chain(compression_retriever, document_chain)
    return rag_rerank_chain

# Holds the most recently built chain together with the retriever and model
# objects it was built from, so repeated queries do not rebuild it.
_rag_chain_cache = {}


def get_rag_response(query):
    """Answer ``query`` with the RAG chain built on the notebook-level ``retriever``.

    Building the chain instantiates the cross-encoder reranker, so the chain
    is built once and reused. It is rebuilt only when the global ``retriever``
    or ``model`` has been replaced by a different object (e.g. after
    re-running the cell that creates it).

    Args:
        query: The user's question.

    Returns:
        An ``(answer, context)`` tuple taken from the chain result's
        ``"answer"`` and ``"context"`` entries.
    """
    cached = _rag_chain_cache.get("entry")
    # Identity checks: a re-created retriever/model must not reuse a stale chain.
    if cached is None or cached[0] is not retriever or cached[1] is not model:
        cached = (retriever, model, rag_chain(retriever))
        _rag_chain_cache["entry"] = cached
    response = cached[2].invoke({"input": query})
    return response["answer"], response["context"]

In [13]:
# Example usage
query = "What are the key financial highlights from Microsoft's 2022 annual report?"
response, docs = get_rag_response(query)
print(response, (pretty_print_docs(docs)))



Microsoft Cloud revenue increased 32% to $91.2 billion, Office commercial products and cloud services revenue increased 13%, Office consumer products and cloud services revenue increased 11%, LinkedIn revenue increased 34%, Dynamics products and cloud services revenue increased 25%, server products and cloud services revenue increased 28%, Windows OEM revenue increased 11%, Windows commercial products and cloud services revenue increased 11%, Xbox content and services revenue increased 3%, and search and news advertising revenue excluding traffic acquisition costs increased 27%. 
 

Document 1:
Page: 31 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: highlights from fiscal year 2022 compared with fiscal year 2021 included : • microsoft cloud ( forme...

Document 2:
Page: 31 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 30 management ’ s discussion and analysis of financial condition and results of operations the follo...

Docu

In [14]:
# Short factual question; prints the answer followed by a preview of the
# documents returned as context.
query = "What is the total revenue?"
response, docs = get_rag_response(query)
print(response, pretty_print_docs(docs))



$198,270 million 
 

Document 1:
Page: 48 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 47 financial statements and supplementary data income statements ( in millions, except per share amo...

Document 2:
Page: 83 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: productivity and business processes $ 29, 687 $ 24, 351 $ 18, 724 intelligent cloud 32, 721 26, 126 ...

Document 3:
Page: 83 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: revenue, classified by significant product and service offerings, was as follows : ( in millions ) y...

Document 4:
Page: 54 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 53 service and other revenue includes sales from cloud - based solutions that provide customers with...

Document 5:
Page: 36 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: • windows revenue increased $ 2. 3 billion or 10 % driven by growth i

In [15]:
# Question about the RAG project guidelines; in the recorded output the
# context includes chunks from the assignment PDF as well as the annual report.
query = "What are the project guidelines for rag project?"
response, docs = get_rag_response(query)
print(response, pretty_print_docs(docs))



The project guidelines for the RAG project are:

* **Use Docker to containerize the application for easy deployment.**
* **Ensure the system can handle large documents and multiple queries without significant performance drops.**
* **Share the code, deployment instructions, and the final working model through GitHub.** 
 

Document 1:
Page: 0 and Source: /content/uploaded_pdfs/Gen AI Engineer _ Machine Learning Engineer Assignment.pdf
Content: gen ai engineer / machine learning engineer assignment part 1 : retrieval - augmented generation ( r...

Document 2:
Page: 1 and Source: /content/uploaded_pdfs/Gen AI Engineer _ Machine Learning Engineer Assignment.pdf
Content: example interactions demonstrating the bot's capabilities. guidelines : ● use docker to containerize...

Document 3:
Page: 27 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 26 • gaming, focuses on developing hardware, content, and services across a large range of platforms...

Document 4:
Page

In [16]:
# LinkedIn revenue question; prints the answer and the source previews.
query = "How did microsoft's linkedin revenue increased?"
response, docs = get_rag_response(query)
print(response, pretty_print_docs(docs))



LinkedIn revenue increased 34% driven by a strong job market in their talent solutions business and advertising demand in their marketing solutions business. 
 

Document 1:
Page: 31 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: highlights from fiscal year 2022 compared with fiscal year 2021 included : • microsoft cloud ( forme...

Document 2:
Page: 36 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 35 reportable segments fiscal year 2022 compared with fiscal year 2021 productivity and business pro...

Document 3:
Page: 21 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: members and increase their engagement. linkedin revenue is mainly affected by demand from enterprise...

Document 4:
Page: 36 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: • windows revenue increased $ 2. 3 billion or 10 % driven by growth in windows oem and windows comme...

Document 5:
Page: 84 and S

In [17]:
# Xbox revenue question, run through the same get_rag_response() pipeline.
query = "How did microsoft's xbox revenue increased?"
response, docs = get_rag_response(query)
print(response, pretty_print_docs(docs))



Xbox revenue increased by $860 million or 6% due to strong demand for Xbox Series X|S and growth in Xbox content and services. 
 

Document 1:
Page: 37 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 36 • search and news advertising revenue increased $ 2. 3 billion or 25 %. search and news advertisi...

Document 2:
Page: 31 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: highlights from fiscal year 2022 compared with fiscal year 2021 included : • microsoft cloud ( forme...

Document 3:
Page: 36 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: • windows revenue increased $ 2. 3 billion or 10 % driven by growth in windows oem and windows comme...

Document 4:
Page: 25 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: applications and services and to benefit our developer and partner ecosystems by providing access to...

Document 5:
Page: 36 and Source: /content/uploaded_pdfs/m

In [18]:
# Vague query with no specific topic -- presumably a check of the prompt's
# "I don't know" fallback (the recorded output shows that reply).
query = "How do you do things like this?"
response, docs = get_rag_response(query)
print(response, pretty_print_docs(docs))



I don't know. 
 

Document 1:
Page: 5 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: our political activities, our workforce demographics, our human rights work, and more. we should all...

Document 2:
Page: 15 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 14 tools like search, news, and maps have given us immediate access to the world ’ s information. to...

Document 3:
Page: 17 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: microsoft aims to recruit, develop, and retain world - changing talent from a diversity of backgroun...

Document 4:
Page: 5 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: 4 our commitment to responsibly develop and use technologies like ai is core to who we are. we put o...

Document 5:
Page: 15 and Source: /content/uploaded_pdfs/microsoft_annual_report_2022.pdf
Content: • applying ai to drive insights and act on our customer ’ s behalf by unde