# Introduction to Advanced RAG in LlamaIndex

In [None]:
# Install nest_asyncio into the kernel's environment; the next cell imports it.
%pip install nest_asyncio

In [None]:
import nest_asyncio
# NOTE(review): presumably needed so async LlamaIndex calls can run inside the
# notebook's already-running event loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

In [None]:
# Install or upgrade LlamaIndex in the kernel's environment (-U upgrade, -q quiet).
%pip install -Uq llama-index

## Extract

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read every file found under ./data into the `docs` list used by later cells.
docs = SimpleDirectoryReader(input_dir="./data").load_data()

# Alternative: use the file name as the document id
# docs_name_as_id = SimpleDirectoryReader(input_dir="./data", filename_as_id=True).load_data()

In [None]:
# Number of loaded documents.
len(docs)  # one per page

In [None]:
import pprint
# Pretty-print every loaded document; the output grows with the size of `docs`.
pprint.pprint(docs)

## Transform

In [None]:
# Inspect the raw attributes of the first document to decide which
# metadata keys should be hidden from the LLM.

docs[0].__dict__ # too much data about one doc

In [None]:
# Quick example, using a hand-made test document, of what the LLM and the
# embedding model each see once metadata exclusions and templates are applied.

from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    # Uncomment to hide the file name from the embedding model as well:
    # excluded_embed_metadata_keys=["file_name"],
    # "category" is hidden from the LLM view only.
    excluded_llm_metadata_keys=["category"],
    # NOTE(review): keyword spelling ("seperator") is kept as written —
    # confirm it matches the installed llama-index version.
    metadata_seperator="\n",
    # How each metadata entry is rendered, and how metadata and content are combined.
    metadata_template="{key}:{value}",
    text_template="Metadata:\n{metadata_str}\n-----\nContent:\n{content}",
)

# Render the document as requested for MetadataMode.LLM.
print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
# Same document rendered for MetadataMode.EMBED (disabled):
# print(
#     "The Embedding model sees this: \n",
#     document.get_content(metadata_mode=MetadataMode.EMBED),
# )

In [None]:
from llama_index.core.schema import MetadataMode

# Render the first loaded document as requested for each metadata mode.
# print(docs[0].get_content(metadata_mode=MetadataMode.LLM))   # what the LLM sees
print(docs[0].get_content(metadata_mode=MetadataMode.EMBED)) # what the embedding model sees; at this point it is the same as the LLM view

In [None]:
# Mutates every document in `docs` in place.
for doc in docs:
    # define the content/metadata template
    doc.text_template = "Metadata:\n{metadata_str}\n---\nContent:\n{content}"

    # exclude page label from embedding
    # (the membership check keeps the list free of duplicates when the cell is re-run)
    if "page_label" not in doc.excluded_embed_metadata_keys:
        doc.excluded_embed_metadata_keys.append("page_label")

In [None]:
# Embedding view of the first document after the template/exclusion edits above.

print(docs[0].get_content(metadata_mode=MetadataMode.EMBED))

Here are other, more advanced transformations. Some require an LLM to work. We will use a small, fast model served through Groq (the code below uses Llama 3.1 8B Instant; an earlier version of this notebook used Qwen 2.5 32B Instruct 128k), which is an affordable, high-rate option. It should be enough to extract Q&As and titles from the documents.

In [None]:
%pip install -Uq llama-index-llms-groq

from llama_index.llms.groq import Groq
import os
import getpass

os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")


#llm_transformations = Groq(model="qwen-2.5-32b", api_key=os.environ["GROQ_API_KEY"])
from llama_index.llms.groq import Groq
import os

# transformations: cheap/fast
llm_transformations = Groq(
    model="llama-3.1-8b-instant",           # ← was "qwen-2.5-32b"
    api_key=os.environ["GROQ_API_KEY"]
)

# querying: quality
llm_querying = Groq(
    model="llama-3.3-70b-versatile",        # you already used this; it's supported
    api_key=os.environ["GROQ_API_KEY"]
)


In [None]:
pip install -U llama-index-llms-openai


In [None]:
import getpass
import os

# Never hard-code the key in the notebook: reuse OPENAI_API_KEY from the
# environment when it is set, otherwise prompt for it without echoing the input.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
from llama_index.llms.openai import OpenAI
import os

# Both clients read the key from the OPENAI_API_KEY environment variable and
# rebind the names `llm_transformations` / `llm_querying` used by later cells.

# Fast + cheaper for transformations (titles, QA)
llm_transformations = OpenAI(
    model="gpt-4o-mini",           # fast/affordable
    api_key=os.environ["OPENAI_API_KEY"]
)

# Higher quality for answering
llm_querying = OpenAI(
    model="gpt-4.1",               # strong general model
    api_key=os.environ["OPENAI_API_KEY"]
)


In [None]:
# other transformations

from llama_index.core.extractors import (
    TitleExtractor,
    QuestionsAnsweredExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Chunking step: split on spaces with the chunk size / overlap given here.
text_splitter = SentenceSplitter(separator=" ", chunk_size=1024, chunk_overlap=128)

# LLM-backed extractors; both use the cheaper "transformations" model.
title_extractor = TitleExtractor(llm=llm_transformations, nodes=5)
qa_extractor = QuestionsAnsweredExtractor(llm=llm_transformations, questions=3)

# One pipeline, built once: split first, then enrich each chunk's metadata.
pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        title_extractor,
        qa_extractor,
    ]
)

# Produces the `nodes` list that the indexing cells consume.
nodes = pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

By default, LlamaIndex uses OpenAI's embedding models, but you can also load a free model from HuggingFace (it will be slower).

In [None]:
# Number of nodes returned by the ingestion pipeline.
len(nodes)

In [None]:
import pprint

# Full attribute dump of the first node (disabled):
# pprint.pprint(nodes[0].__dict__)

# Render the first node as requested for MetadataMode.LLM.
print(nodes[0].get_content(metadata_mode=MetadataMode.LLM))

## Index

In [None]:
# Install or upgrade the HuggingFace embeddings integration (-U upgrade, -q quiet).
%pip install -Uq llama-index-embeddings-huggingface

In [None]:
pip install -U llama-index-embeddings-openai


# Embeddings

from llama_index.embeddings.huggingface import HuggingFaceEmbedding

hf_embeddings = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

test_embed = hf_embeddings.get_text_embedding("Hello world")
print(test_embed)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding

# `embed_model` is the embedding model passed to every index built below.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")  # great quality/cost
# Smoke test: print the length of the embedding returned for a short string.
print(len(embed_model.get_text_embedding("hello")))


In [None]:
# create index

from llama_index.core import VectorStoreIndex

if "nodes" in locals():
    # Preferred path: index the chunked, metadata-enriched nodes produced by
    # the ingestion pipeline.
    source_objects = nodes
    index = VectorStoreIndex(source_objects, embed_model=embed_model)
else:
    # Fallback when the pipeline cell was skipped: from_documents() runs the
    # default chunking on the raw documents instead of indexing each whole
    # document as a single node.
    source_objects = docs
    index = VectorStoreIndex.from_documents(source_objects, embed_model=embed_model)


## Query

In [None]:
from llama_index.llms.openai import OpenAI

# define your answering llm
# NOTE: this rebinds `llm_querying`; every later cell uses this gpt-4o-mini
# client. `os` is not imported in this cell and must already be in scope.
llm_querying = OpenAI(
    model="gpt-4o-mini",               # fast + cost-effective
    api_key=os.environ["OPENAI_API_KEY"]
)

# build the query engine with that llm
query_engine = index.as_query_engine(llm=llm_querying)

# run a sample query
response = query_engine.query("what does this model do?")
print(response)


In [None]:
# Show the raw attributes of the response object returned by the last query.
response.__dict__

## Store

In [None]:
# Persist the index's storage context under ./vectors; the next cell reloads it from there.
index.storage_context.persist(persist_dir="./vectors")

In [None]:
from llama_index.core import StorageContext, load_index_from_storage

# rebuild storage context from your persisted folder
storage_context = StorageContext.from_defaults(persist_dir="./vectors")

# load index with your OpenAI embed model
# (the same `embed_model` object that was used to build the persisted index)
index_from_storage = load_index_from_storage(storage_context, embed_model=embed_model)


In [None]:
# Query engine over the reloaded index, answering with `llm_querying`.
qa = index_from_storage.as_query_engine(llm=llm_querying)

In [None]:
# Ask the reloaded index the same question that was asked of the in-memory index.
response = qa.query("what does this model do?")
print(response)

# Using Vector Stores

In [None]:
# Install or upgrade Chroma and its LlamaIndex vector-store integration.
%pip install -Uq chromadb
%pip install -Uq llama-index-vector-stores-chroma

In [None]:
# REQUIRES:
# pip install chromadb llama-index-vector-stores-chroma

import chromadb
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore

# 1) Chroma persistent client & collection
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("healthGPT")

# 2) Wire vector store to storage context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# 3) Build the index once, then reuse what is already stored on disk.
#    The collection is persistent, so inserting `nodes` on every run would
#    keep adding the same chunks again (and pay for their embeddings again).
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        nodes,
        storage_context=storage_context,
        embed_model=embed_model,        # ← was hf_embeddings
    )
else:
    # Delete ./chroma_db (or the collection) to force a rebuild, e.g. after
    # changing the documents or the embedding model.
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model,
    )

# 4) Query
query_engine = index.as_query_engine(llm=llm_querying)
response = query_engine.query("What is this model good at?")
print(response)


In [None]:
# Re-run the same question against the current `query_engine`.
response = query_engine.query("What is this model good at?")
print(response)

In [None]:
# Coarse recall from the vector store
retriever = index.as_retriever(similarity_top_k=20)  # try 20–30


In [None]:
from llama_index.core.postprocessor import LLMRerank

# Keep only the best M chunks after reranking; the reranker uses `llm_querying`.
reranker = LLMRerank(top_n=5, llm=llm_querying)  # M=5 is a good start


In [None]:
from llama_index.core.postprocessor import LLMRerank

# fine filter: keep best 5 after rerank
# (rebinds `reranker` with the same settings as the previous cell)
reranker = LLMRerank(top_n=5, llm=llm_querying)

# build query engine:
# - similarity_top_k=20 sets the coarse ANN recall
# - node_postprocessors applies the reranker
query_engine = index.as_query_engine(
    llm=llm_querying,
    similarity_top_k=20,
    node_postprocessors=[reranker],
)

response = query_engine.query("what is this model good at?")
print(response)


In [None]:
# Run retrieval and reranking by hand (outside a query engine) to inspect
# which chunks survive the rerank.
q = "what is this model good at?"
cands = index.as_retriever(similarity_top_k=20).retrieve(q)
reranked = LLMRerank(top_n=5, llm=llm_querying).postprocess_nodes(cands, query_str=q)

# One entry per surviving chunk: rank, score, source file, then a 220-character preview.
for i, n in enumerate(reranked, 1):
    print(f"{i:02d}. score={getattr(n, 'score', None)} | source={n.metadata.get('file_name')}")
    print(n.get_content()[:220].replace("\n"," "), "\n")


In [None]:
# Peek at what the extractors wrote into metadata so we know the keys
n0 = nodes[0]
print("Known keys on node[0].metadata:", list(n0.metadata.keys()))
print("\nTitle (if present):", n0.metadata.get("title") or n0.metadata.get("document_title"))
# Try the same question keys, in the same order, as the later QA-inspection
# cells; "questions_this_excerpt_can_answer" was missing from this lookup.
print(
    "\nQuestions (if present):",
    n0.metadata.get("questions_this_excerpt_can_answer")
    or n0.metadata.get("questions_this_node_can_answer")
    or n0.metadata.get("questions")
    or n0.metadata.get("qa_pairs"),
)


In [None]:
# Step 1: inspect QA metadata on your nodes

def get_qas(md):
    """Return the questions stored in a metadata mapping as a list of strings.

    The candidate keys are checked in a fixed order; the first one that is
    present with a truthy value wins. A plain string becomes a one-element
    list, an iterable has each item converted with ``str``, and anything that
    cannot be iterated is stringified as a whole. Returns ``[]`` when no
    candidate key holds a value.
    """
    candidate_keys = (
        "questions_this_excerpt_can_answer",
        "questions_this_node_can_answer",
        "questions",
        "qa_pairs",
    )
    for name in candidate_keys:
        if name not in md:
            continue
        value = md[name]
        if not value:
            continue
        # A string is iterable too, so it must be handled before the generic path.
        if isinstance(value, str):
            return [value]
        try:
            return list(map(str, value))
        except Exception:
            # Non-iterable value (e.g. a number): fall back to its string form.
            return [str(value)]
    return []

# Count how many nodes carry QA metadata and keep up to three examples.
total = len(nodes)
with_qas = 0
samples = []

for n in nodes:
    # `or {}` guards against a node whose metadata is None/empty.
    qas = get_qas(n.metadata or {})
    if qas:
        with_qas += 1
        if len(samples) < 3:
            samples.append({
                "title": n.metadata.get("title") or n.metadata.get("document_title"),
                "qas": qas[:5],
                # first 200 characters of the chunk, flattened onto one line
                "text_preview": n.text[:200].replace("\n"," ")
            })

print(f"Total nodes: {total}")
print(f"Nodes that have non-empty QA metadata: {with_qas}")

# Show the collected examples.
for i, s in enumerate(samples, 1):
    print(f"\n--- Sample node {i} ---")
    print("Title:", s["title"])
    print("QAs:", s["qas"])
    print("Text preview:", s["text_preview"])


In [None]:
import re
from typing import List, Sequence, Any
from llama_index.core.schema import TextNode

def _coerce_list(x: Any) -> List[str]:
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        return [str(i) for i in x]
    return [str(x)]

def _strip_outer_quotes(s: str) -> str:
    s = s.strip()
    if (s.startswith('"') and s.endswith('"')) or (s.startswith("'") and s.endswith("'")):
        return s[1:-1]
    return s

def _extract_questions(q_blob: Any) -> List[str]:
    """
    Your QA extractor is storing a long Markdown-like block as a single string.
    This pulls out clean question lines so we don't embed the whole paragraph.

    Args:
        q_blob: the raw metadata value (string, list/tuple of strings, or None).

    Returns:
        Up to five unique questions (case-insensitive dedupe, original order),
        each between 5 and 220 characters, with list numbering/bullets removed.
        Assumes each question sits on a single line of the blob.
    """
    text = "\n".join(_coerce_list(q_blob))
    # normalize bullets/markdown
    text = text.replace("**", "")
    # regex: grab sentences that look like questions (end with '?')
    candidates = re.findall(r'([^?]{3,}?\?)', text, flags=re.MULTILINE|re.DOTALL)
    # strip numbering like "1. " or "- "
    cleaned = []
    for c in candidates:
        # A match runs from just after the previous '?' to this one, so it can
        # drag along earlier lines (headings, intro sentences). Keep only the
        # line that actually ends with the question mark.
        question_line = c.strip().splitlines()[-1]
        line = re.sub(r'^\s*(\d+[\.\)]\s+|\-\s+|\*\s+)?', '', question_line)
        # keep reasonably sized questions
        if 5 <= len(line) <= 220:
            cleaned.append(line)
    # dedupe while preserving order
    seen = set()
    unique = []
    for q in cleaned:
        k = q.lower()
        if k not in seen:
            seen.add(k)
            unique.append(q)
    # cap to a handful so embedding text stays compact
    return unique[:5]

def make_nodes_with_title_and_qas(nodes: Sequence[TextNode]) -> List[TextNode]:
    """Build new nodes whose text carries the title and parsed questions.

    For each input node the text becomes: an optional ``[Title]`` line, the
    original chunk, then an optional ``[QuestionsThisChunkCanAnswer]`` bullet
    list. Node id and metadata are copied from the input node.

    The input node's metadata exclusion lists are carried over (the original
    version dropped them), and the title/question metadata keys that were
    folded into the text are additionally hidden from the embedding view, so
    the raw QA blob is not embedded a second time through the metadata.

    Args:
        nodes: nodes produced by the ingestion pipeline.

    Returns:
        One new ``TextNode`` per input node, in the same order.
    """
    title_keys = ("title", "document_title")
    question_keys = (
        "questions_this_excerpt_can_answer",
        "questions_this_node_can_answer",
        "questions",
        "qa_pairs",
    )

    out = []
    for n in nodes:
        md = n.metadata or {}
        title = md.get("title") or md.get("document_title") or ""
        title = _strip_outer_quotes(title) if isinstance(title, str) else title
        q_blob = (
            md.get("questions_this_excerpt_can_answer")
            or md.get("questions_this_node_can_answer")
            or md.get("questions")
            or md.get("qa_pairs")
        )
        q_list = _extract_questions(q_blob)

        parts = []
        if title:
            parts.append(f"[Title] {title}")
        parts.append(n.text)  # original chunk
        if q_list:
            parts.append("[QuestionsThisChunkCanAnswer]")
            parts.extend(f"- {q}" for q in q_list)

        enriched_text = "\n".join(parts)

        # Start from the node's own exclusions (copied, so the input node is
        # not mutated), then hide the keys whose content now lives in the text.
        embed_excluded = list(n.excluded_embed_metadata_keys)
        for key in title_keys + question_keys:
            if key in md and key not in embed_excluded:
                embed_excluded.append(key)

        out.append(TextNode(
            id_=n.node_id,
            text=enriched_text,
            metadata=dict(md),
            excluded_embed_metadata_keys=embed_excluded,
            excluded_llm_metadata_keys=list(n.excluded_llm_metadata_keys),
        ))
    return out

# Build enriched nodes and preview one
# Build the enriched nodes from the pipeline output and preview the first
# 800 characters of the first one.
nodes_with_title_qas = make_nodes_with_title_and_qas(nodes)
print("✅ Built enriched nodes with title + parsed QAs folded into text.")
print("\nPreview enriched text:\n", nodes_with_title_qas[0].text[:800])


In [None]:
from llama_index.core import VectorStoreIndex

# Build a new index so embeddings include [Title] + parsed QAs
# (a separate, in-memory index; the earlier `index` is left untouched)
index_with_meta = VectorStoreIndex(
    nodes_with_title_qas,
    embed_model=embed_model  # your OpenAI embedding model from earlier
)

print("✅ Index rebuilt with metadata-aware embeddings (index_with_meta).")


In [None]:
# build query engine (you can also plug in reranker later)
# NOTE: rebinds `query_engine`, which previously pointed at the earlier index.
query_engine = index_with_meta.as_query_engine(llm=llm_querying)

# test with a question that overlaps the QAs we injected
response = query_engine.query(
    "What novel technique does HealthGPT employ to adapt heterogeneous knowledge?"
)

print(response)


In [None]:
# Ask the same question again and list the source chunks behind the answer.
resp = query_engine.query(
    "What novel technique does HealthGPT employ to adapt heterogeneous knowledge?"
)

print("\nANSWER:\n", resp)

print("\n--- SOURCES USED ---")
for i, s in enumerate(resp.source_nodes, 1):
    # Missing metadata falls back to placeholders instead of raising.
    md = s.node.metadata or {}
    title = md.get("title") or md.get("document_title") or "(untitled)"
    page  = md.get("page_label") or "?"
    fname = md.get("file_name") or "?"
    print(f"\n{i}. score={getattr(s, 'score', None)}")
    print(f"   Title: {title}")
    print(f"   File/Page: {fname} / {page}")
    # first 300 characters of the chunk, flattened onto one line
    print("   Snippet:", s.node.get_content()[:300].replace("\n"," "))


In [None]:
from llama_index.core.postprocessor import LLMRerank

# Keep best 5 after LLM-based reranking
reranker = LLMRerank(top_n=5, llm=llm_querying)

# Build a query engine that:
# - retrieves top-20 by embeddings (from your metadata-enriched index)
# - reranks them down to top-5
# NOTE: rebinds `query_engine` once more.
query_engine = index_with_meta.as_query_engine(
    llm=llm_querying,
    similarity_top_k=20,
    node_postprocessors=[reranker],
)

resp = query_engine.query("What novel technique does HealthGPT employ to adapt heterogeneous knowledge?")
print(resp)


In [None]:
# Step — Inspect rerank vs. embedding similarity scores

from llama_index.core.postprocessor import LLMRerank

# 1) pick your question (edit as you like)
q = "What novel technique does HealthGPT employ to adapt heterogeneous knowledge?"

# 2) retrieve top-K by embeddings (coarse recall)
K = 20
retriever = index_with_meta.as_retriever(similarity_top_k=K)
candidates = retriever.retrieve(q)

# keep original (embedding) scores by node_id
orig_scores = {nws.node.node_id: (nws.score or 0.0) for nws in candidates}

# 3) rerank down to top-M using the answering LLM (fine precision)
M = 5
reranker = LLMRerank(top_n=M, llm=llm_querying)
reranked = reranker.postprocess_nodes(candidates, query_str=q)

# 4) pretty print comparison
print(f"Query: {q}\n")
print(f"Embedding-retrieved K={K}, LLM-reranked M={M}\n")

for i, nws in enumerate(reranked, 1):
    rerank_score = nws.score  # LLM-based relevance score
    node = nws.node
    md = node.metadata or {}
    title = md.get("title") or md.get("document_title") or "(untitled)"
    page  = md.get("page_label") or "?"
    fname = md.get("file_name") or "?"
    # original embedding similarity (from the first retrieval)
    emb_score = orig_scores.get(node.node_id, None)

    # Either score may be None (missing rerank score, or a node id that is not
    # in orig_scores); applying ':.3f' to None raises TypeError, so format
    # defensively and show "n/a" instead.
    rerank_txt = "n/a" if rerank_score is None else f"{rerank_score:.3f}"
    emb_txt = "n/a" if emb_score is None else f"{emb_score:.3f}"

    print(f"{i}. RERANK={rerank_txt} | EMBEDDING={emb_txt}  | {title}")
    print(f"   Source: {fname} / p.{page}")
    print("   Snippet:", node.get_content()[:220].replace("\n", " "))
    print()


In [None]:
# Use LLM reranker inside the query engine so only top reranked chunks feed the LLM

from llama_index.core.postprocessor import LLMRerank

K = 20  # coarse recall (embedding retrieval)
M = 5   # fine precision (reranker keeps best M)

reranker = LLMRerank(top_n=M, llm=llm_querying)

# Build a query engine that:
#  - retrieves top-K by embeddings from your metadata-enriched index
#  - reranks to top-M with the LLM
#  - passes only those M chunks to the answering LLM
qe_reranked = index_with_meta.as_query_engine(
    llm=llm_querying,
    similarity_top_k=K,
    node_postprocessors=[reranker],
)

# Try it
resp = qe_reranked.query("What novel technique does HealthGPT employ to adapt heterogeneous knowledge?")
print("ANSWER:\n", resp)

# One line per source chunk attached to the response; missing metadata
# falls back to placeholders instead of raising.
print("\n--- RERANKED SOURCES ---")
for i, s in enumerate(resp.source_nodes, 1):
    md = s.node.metadata or {}
    title = md.get("title") or md.get("document_title") or "(untitled)"
    page  = md.get("page_label") or "?"
    fname = md.get("file_name") or "?"
    print(f"{i}. score={getattr(s, 'score', None)} | {title} — {fname} / p.{page}")


In [None]:
import time
from llama_index.core.postprocessor import LLMRerank

QUESTION = "What novel technique does HealthGPT employ to adapt heterogeneous knowledge?"

# Durations are measured with time.perf_counter(): unlike time.time() it is
# monotonic, so a system clock adjustment cannot skew (or negate) the result.
# Each timing covers building the query engine plus running the query.

# ---------- Baseline: embeddings only ----------
t0 = time.perf_counter()
qe_base = index_with_meta.as_query_engine(
    llm=llm_querying,
    similarity_top_k=5,          # directly take top-5 by embeddings
    node_postprocessors=[],      # no rerank
)
resp_base = qe_base.query(QUESTION)
t_base = time.perf_counter() - t0

# ---------- Reranked: embeddings (K=20) + LLM rerank (M=5) ----------
t1 = time.perf_counter()
reranker = LLMRerank(top_n=5, llm=llm_querying)
qe_rerank = index_with_meta.as_query_engine(
    llm=llm_querying,
    similarity_top_k=20,         # K
    node_postprocessors=[reranker],  # -> M=5
)
resp_rerank = qe_rerank.query(QUESTION)
t_rerank = time.perf_counter() - t1

# ---------- Pretty print ----------
def print_sources(resp, title):
    """Print a header followed by one summary line per source node of ``resp``.

    Args:
        resp: object exposing ``source_nodes``; each entry has a ``node`` with a
            ``metadata`` mapping (or None) and, optionally, a ``score``.
        title: label used in the ``--- <title> SOURCES ---`` header.
    """
    print(f"\n--- {title} SOURCES ---")
    for rank, src in enumerate(resp.source_nodes, start=1):
        meta = src.node.metadata or {}
        # Fall back to placeholders when a metadata entry is missing or empty.
        heading = meta.get("title") or meta.get("document_title") or "(untitled)"
        page_no = meta.get("page_label") or "?"
        file_name = meta.get("file_name") or "?"
        score = getattr(src, 'score', None)
        print(f"{rank}. score={score} | {heading} — {file_name} / p.{page_no}")

# Report for the baseline run: elapsed time, answer, then its source chunks.
print("\n==================== BASELINE (Embeddings only) ====================")
print(f"Time: {t_base:.2f}s")
print("ANSWER:\n", resp_base)
print_sources(resp_base, "BASELINE")

# Same report for the run that adds the LLM reranker.
print("\n==================== RERANKED (Embeddings + LLM Rerank) ====================")
print(f"Time: {t_rerank:.2f}s")
print("ANSWER:\n", resp_rerank)
print_sources(resp_rerank, "RERANKED")
