# RAG: Data Ingestion and Retrieval for complex documents


<img src="arch.png" width="500" alt="Architecture diagram">

In [1]:
import time
from typing import List, Optional, Union

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from rag_101.retriever import (
    create_parent_retriever,
    load_embedding_model,
    load_pdf,
    load_reranker_model,
    retrieve_context,
)
from rich import print

Load the PDF documents and create the retriever.

In [2]:
# Source documents to ingest.
# NOTE(review): this is an absolute, environment-specific path; adjust it to
# wherever the PDF lives before running on another machine.
docllm_paper_path = "/teamspace/studios/this_studio/example_data/2401.00908.pdf"  # DocLLM paper
files = [docllm_paper_path]

# Load the PDFs, then build the retriever over them with the embedding model.
docs = load_pdf(files=files)
embedding_model = load_embedding_model()
retriever = create_parent_retriever(docs, embedding_model)

# The reranker model is handed to `retrieve_context` in the query cells.
reranker_model = load_reranker_model()

Initialize LangChain's Ollama chat integration using the `mistral` model, and create the chain with a prompt template.

In [3]:
# Prompt text, one entry per line; `{question}` and `{context}` are the
# template variables supplied when the chain is invoked.
prompt_lines = [
    "Please answer the following question based on the provided `context` that follows the question.",
    "If you do not know the answer then just say 'I do not know'",
    "question: {question}",
    "context: ```{context}```",
]

llm = ChatOllama(model="mistral")
# Every line, including the last, is terminated by a newline.
prompt_template = ChatPromptTemplate.from_template("\n".join(prompt_lines) + "\n")

# Pipeline: fill the prompt -> call the chat model -> parse the reply to a string.
output_parser = StrOutputParser()
chain = prompt_template | llm | output_parser

Retrieve the first matching document for the query, then run the chain with the document's text as `context` and the query as `question`.

In [4]:
query = "What is the source of the dataset the model was trained on?"

# Take the first retrieval result; it unpacks into a (document, score) pair.
ranked_results = retrieve_context(query, retriever, reranker_model)
top_document, similarity_score = ranked_results[0]

# Only the document's text is passed to the prompt as context.
context = top_document.page_content

output = chain.invoke({"question": query, "context": context})
print("LLM Response:", output)

In [5]:
# Run through some sample queries.
# Each entry is sent through retrieval and the LLM chain once, so every question
# is listed a single time (a repeated entry only costs an extra model call).
# NOTE(review): some questions (agriculture dataset, GROBID, web scraping) may
# not be answerable from the loaded paper -- confirm they are intentional.
samples = [
    "What are the metrics used to evaluate the answers?",
    "How many pdf data were collected from the USA?",
    "What is the DocLLM architecture ?",
    "Which countries were used to collect dataset?",
    "Where was the agriculture dataset collected for the USA?",
    "how was the content and structure of available documents augmented?",
    "What was the answer generation process used in the paper?",
    "Explain the DocLLM model architecture step by step",
    "what is the training dataset used in DocLLM?",
    "which pretrained model was used for DocLLM?",
    "which tools were used for web scraping?",
    "which tools were used to extract information from pdfs?",
    "what is GROBID mentioned in the paper?",
]

In [6]:
# Answer every sample query, using the first retrieved document as the context.
for query in samples:
    print(query)
    results = retrieve_context(
        query, retriever=retriever, reranker_model=reranker_model
    )

    # Indexing an empty result with [0] would raise IndexError and abort the
    # remaining queries, so report the miss and move on to the next one.
    if len(results) == 0:
        print("No context retrieved; skipping this query.")
        print("\n", "=" * 100, "\n\n")
        continue

    # The first result unpacks into a (document, score) pair; only the
    # document's text is given to the prompt, so `context` is always a string.
    document, similarity_score = results[0]
    context = document.page_content

    output = chain.invoke({"context": context, "question": query})
    print("LLM Response:", output)

    print("\n", "=" * 100, "\n\n")