In [1]:
# %pip install llama-index
# %pip install llama-index qdrant_client torch transformers

# %pip install llama-index-embeddings-huggingface

# %pip install llama-index-llms-ollama

# %pip install llama-index-vector-stores-qdrant

# %pip install ruff

In [2]:
import nest_asyncio

# Allow the asyncio event loop to be re-entered; presumably needed because the
# notebook kernel already runs a loop — TODO confirm which later call needs it.
nest_asyncio.apply()

In [3]:
## Set up the Qdrant vector database
import qdrant_client

# Name of the Qdrant collection used by the vector store in this notebook.
collection_name = "chat_with_docs"

# Client for a Qdrant server expected to be reachable at localhost:6333.
client = qdrant_client.QdrantClient(host="localhost", port=6333)

In [4]:
# Read the documents

from llama_index.core import SimpleDirectoryReader

# Directory that is searched (including sub-directories) for source files;
# only files with a .pdf extension are picked up.
input_dir_path = "./data"

loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    required_exts=[".pdf"],
    recursive=True,
)

# Read every matching file into a list of document objects.
docs = loader.load_data()

In [5]:
# Sanity check: container type and number of loaded document objects.
type(docs), len(docs)

(list, 32)

In [6]:
## A function to index data

from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext


def create_index(documents):
    """Build a ``VectorStoreIndex`` over ``documents`` backed by Qdrant.

    Uses the notebook-level ``client`` and ``collection_name`` to create the
    Qdrant vector store that the index writes to.

    Args:
        documents: Documents to index, e.g. the list returned by
            ``SimpleDirectoryReader.load_data()``.

    Returns:
        The ``VectorStoreIndex`` created from ``documents``.
    """
    qdrant_store = QdrantVectorStore(client=client, collection_name=collection_name)
    qdrant_context = StorageContext.from_defaults(vector_store=qdrant_store)
    return VectorStoreIndex.from_documents(documents, storage_context=qdrant_context)

In [7]:
## Load the embedding model and index the data

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository — confirm it is actually required for this model.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    trust_remote_code=True,
)

# Register the embedding model on the global Settings before the index is built.
Settings.embed_model = embed_model

# Index the loaded documents into the Qdrant-backed vector store.
index = create_index(docs)

In [8]:
from llama_index.llms.ollama import Ollama

# Ollama-served model used as the LLM; request_timeout is 120.0 (presumably seconds).
llm = Ollama(
    model="llama3.2:1b",
    request_timeout=120.0,
)

# Register it on the global Settings so later components use this LLM.
Settings.llm = llm

In [9]:
## Define the prompt template

from llama_index.core import PromptTemplate

# Prompt used as the text-QA template. The query engine fills `{context_str}`
# and `{query_str}`; placeholder names are case-sensitive, so the query slot
# must be spelled `query_str` or the user's question never reaches the LLM.
template = """Context information is below:
              ----------------------
              {context_str}
              ----------------------
              Given the context information above I want you to think
              step by step to answer the query in a crisp manner,
              incase you don't know the answer say 'I don't know!'

              Query: {query_str}

              Answer:"""

# Wrap the raw string in a PromptTemplate so it can be installed on the query engine.
qa_prompt_tmpl = PromptTemplate(template)

In [10]:
## ReRank the chunks
from llama_index.core.postprocessor import SentenceTransformerRerank

# Cross-encoder reranker; top_n=3 caps how many nodes it keeps.
rerank = SentenceTransformerRerank(model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3)

In [11]:
## Query the document
# Retrieve the 10 most similar chunks, then hand them to the reranker.
# The keyword must be `node_postprocessors`: an unrecognized keyword such as
# `node_processors` is silently swallowed, so the reranker would never run.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[rerank],
)

# Replace the default text-QA prompt with the custom template.
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)

response = query_engine.query("What exactly is DSPy?")

In [12]:
from IPython.display import Markdown, display

# Render the answer text as Markdown in the cell output.
display(Markdown(str(response)))

DSPy is a programming model that abstracts language models as text transformation graphs, allowing developers to define and optimize these models through declarative modules. The key features of DSPy include the ability to parameterize and learn new tasks, with support for teleprompters to optimize arbitrary pipelines of modules. This enables the development of complex language models that can handle a wide range of tasks and applications.

### Conclusions and limitations
 - Like any other system, RAG isn't perfect either.

#### Problems with RAG
 * Questions are not semantically similar to their answers
 * Semantic similarity can be diluted
 * We cannot ask questions that require aggregation
 * Document order matters

 #### Limitations of the current implementation
 * How to deal with documents that include tables and figures
 * Fine-tune the embedding and reranker models for domain-specific use cases
 * This implementation is slow
 * Multimodal RAG
 * Evaluate the RAG app
 * Observability to ensure consistency