# RAG: Data Ingestion and Retrieval for complex documents


<img src="arch.png" width=500px>

In [1]:
import time
from typing import List, Optional, Union

from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from retriever import (
    create_parent_retriever,
    load_embedding_model,
    load_pdf,
    load_reranker_model,
    retrieve_context,
)
from rich import print

Load the PDF documents, then build the embedding model, the parent-document retriever, and the reranker model.

In [2]:
# PDF files to ingest (paths are relative to the notebook's working directory).
files = ["example_data/nw1.pdf", "example_data/nw2.pdf"]

# Load the PDFs into documents via the project-local `load_pdf` helper.
docs = load_pdf(files=files)

# Build the retrieval components: the embedding model feeds the parent
# retriever; the reranker model is handed to `retrieve_context` at query time.
embedding_model = load_embedding_model()
retriever = create_parent_retriever(docs, embedding_model)
reranker_model = load_reranker_model()

Initialize LangChain's Ollama chat integration using the `orca2` model and create the chain with a prompt template.

In [23]:
# Prompt text: the question comes first, followed by the retrieved context
# fenced in triple backticks. The model is told to say so when it does not know.
prompt_text = (
    "Please answer the following question based on the provided `context` that follows the question and answer it directly.\n"
    "If you do not know the answer then just say 'I do not know'\n"
    "question: {question}\n"
    "context: ```{context}```\n"
)
prompt_template = ChatPromptTemplate.from_template(prompt_text)

# Ollama-served chat model.
# NOTE(review): temperature=0 is presumably chosen to keep answers as
# repeatable as possible — confirm.
llm = ChatOllama(model="orca2", temperature=0)

# Pipeline: fill the prompt -> call the chat model -> parse the reply to a string.
chain = prompt_template | llm | StrOutputParser()

For each sample query, retrieve the matching documents and run the chain, passing the first retrieved document's text as `context` and the query as `question`.

In [31]:
# Sample queries that the next cell runs through the retrieval + LLM chain.
samples = [
    "How many properties available for investment",
    "What is the available rentable area 1401 N Casaloma Dr, Appleton, WI",
    "Summarize the demographics at Appleton location",
    "Which property I should invest from all the properties available to make profit?",
]

In [32]:
# Answer every sample query: retrieve, pick the first hit, then generate.
for query in samples:
    print(query)

    retrieved_hits = retrieve_context(
        query,
        retriever=retriever,
        reranker_model=reranker_model,
    )
    # NOTE(review): assumes at least one hit comes back and that a hit is
    # indexable with a document (exposing `page_content`) at position 0 —
    # confirm against `retrieve_context`.
    context = retrieved_hits[0]
    chain_inputs = {"context": context[0].page_content, "question": query}
    output = chain.invoke(chain_inputs)
    print("LLM Response:", output)

    # Visual separator between queries.
    print("\n", "=" * 100, "\n\n")