In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch asyncio so async code can run inside the notebook's already-running event loop.
nest_asyncio.apply()

# Load variables from a .env file discovered from the working directory upwards.
# The previous call passed an empty path, which python-dotenv treats as
# "no file" and therefore loaded nothing. Existing environment variables are
# not overridden.
load_dotenv()

# Make the project's helper modules (utils, prompts) importable.
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# Take the key from the environment when present; otherwise prompt for it.
# `.get` is required here: indexing os.environ raises KeyError for an unset
# variable, which would prevent the getpass fallback from ever running.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

In [None]:
# Location handed to setup_vector_store() when the vector store is created.
# NOTE(review): ":memory:" presumably selects qdrant-client's in-process,
# non-persistent mode — confirm against the helper in ../helpers/utils.
QDRANT_URL = ":memory:"

In [None]:
# Take the key from the environment when present; otherwise prompt for it.
# `.get` is required here: indexing os.environ raises KeyError for an unset
# variable, which would prevent the getpass fallback from ever running.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# System prompt that instructs the model to answer from the retrieved context only.
grounded_system_prompt = """Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
    Your response must be grounded in the provided context and relevant to the essence of the user's query.
    """

# Configure the LLM through the project helper.
setup_llm(
    provider="openai",
    api_key=OPENAI_API_KEY,
    model="gpt-4",
    temperature=0.75,
    system_prompt=grounded_system_prompt,
)

# Configure the embedding model through the project helper.
setup_embed_model(provider="openai", model="text-embedding-3-small", api_key=OPENAI_API_KEY)

In [None]:
import random
from utils import get_documents_from_docstore, group_documents_by_author, sample_documents

# Load the documents from the docstore directory via the project helper.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Seed Python's global RNG before sampling.
# NOTE(review): this presumably makes sample_documents() reproducible — it only
# does so if the helper draws from the `random` module; confirm in utils.
random.seed(42)

# Group the loaded documents by author (helper semantics not visible here).
documents_by_author = group_documents_by_author(documents)

# Draw the working set used for the rest of the notebook.
# NOTE(review): num_samples=10 may be per author or in total — confirm in utils.
senpai_documents = sample_documents(documents_by_author, num_samples=10)

# Document Summary Index

<img src="https://docs.llamaindex.ai/en/stable/_static/production_rag/decouple_chunks.png" style="width:50%; height:50%">

Source: [LlamaIndex Documentation](https://docs.llamaindex.ai/en/stable/optimizing/production_rag/#decoupling-chunks-used-for-retrieval-vs-chunks-used-for-synthesis)

This method extracts summaries for each document to improve retrieval performance over traditional semantic search on text chunks alone. It uses concise summaries and LLM reasoning capabilities to enhance retrieval before synthesis over retrieved chunks.

### 🚫 Limitations of chunk-based retrieval

- Chunks lack global context 

- Careful tuning of similarity thresholds required

- Embeddings may not capture relevance well

- Keyword filtering has its own challenges

#### 📝 The Document Summary Index stores

- A summary extracted by an LLM for each document

- The document split into text chunks 

- Mapping between summaries and source documents/chunks

#### 🔍 Retrieval approaches

1. 🤖 LLM-based: LLM scores relevance of document summaries 

2. 📐 Embedding-based: Retrieve based on summary embedding similarity

### ⚖️ Advantages

- Summaries provide more context than chunks alone

- LLM can reason over summaries before full documents

- Different optimal representations for retrieval vs. synthesis

### 🚀 Key techniques

1. Embed summaries linked to document chunks

2. Retrieve summaries, replace with full document content



## Setup Vector Store

In [None]:
from llama_index.core import StorageContext
from llama_index.core.settings import Settings

from utils import create_index, create_query_engine, ingest, setup_vector_store

# Collection name passed to the vector-store helper for this notebook's index.
COLLECTION_NAME = "words-of-the-senpai-document-summary-index"

# Create the vector store from the Qdrant location, API key and collection name.
# NOTE(review): setup_vector_store is a project helper; it presumably returns a
# Qdrant-backed LlamaIndex vector store — confirm in utils.
doc_summary_vector_store = setup_vector_store(QDRANT_URL, QDRANT_API_KEY, COLLECTION_NAME)

## Ingest using [`DocumentSummaryIndex`](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/indices/document_summary/base.py)

The `DocumentSummaryIndex`:

- 📝 Builds an index from a set of documents

- 🎯 Generates a summary for each document using a response synthesizer

- 💾 Stores the summaries and their corresponding document nodes in the index

#### 🌐 Retrieval

- Supports two retrieval modes: embedding-based and LLM-based
- 🪢 Embedding-based retrieval:
  - Embeds the summaries using an embedding model
  - Retrieves relevant summaries based on similarity to a query embedding

- 🧠 LLM-based retrieval:
  - Uses an LLM to retrieve relevant summaries based on a query

It focuses on indexing documents, generating summaries, and providing efficient retrieval methods based on either embeddings or LLMs. The retriever also supports document management operations like adding and deleting documents from the index.

#### The high-level API uses embedding based retrieval by default.

In [None]:
from llama_index.core import DocumentSummaryIndex, StorageContext, get_response_synthesizer
from llama_index.core.node_parser import TokenTextSplitter

# Token-based chunking applied to the documents when the index is built.
splitter = TokenTextSplitter(chunk_size=1024, chunk_overlap=16)

# Synthesizer passed to the index for generating the per-document summaries.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", use_async=True
)

# A custom vector store reaches the index only through a StorageContext.
# Passing `vector_store=` straight to from_documents() is not a recognised
# argument: it is swallowed as an extra keyword and the index silently falls
# back to its default in-memory store.
doc_summary_storage_context = StorageContext.from_defaults(
    vector_store=doc_summary_vector_store
)

doc_summary_index = DocumentSummaryIndex.from_documents(
    senpai_documents,
    llm=Settings.llm,
    embed_model=Settings.embed_model,
    transformations=[splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
    storage_context=doc_summary_storage_context,
)

### 🔧 Setup Query Engine and Pipeline


In [None]:
from llama_index.core import PromptTemplate
from utils import create_query_engine
from prompts import HYPE_ANSWER_GEN_PROMPT

# Wrap the raw prompt string so it can be installed on a query engine.
HYPE_ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(HYPE_ANSWER_GEN_PROMPT)

# High-level query engine over the summary index.
# The keyword was previously misspelled as `similiarty_top_k`, so the requested
# top-k of 5 could not have reached the retriever under its real name.
# NOTE(review): confirm create_query_engine accepts/forwards `similarity_top_k`.
doc_summaries_query_engine = create_query_engine(
    index=doc_summary_index,
    mode="query",
    response_mode="compact",
    similarity_top_k=5,
    vector_store_query_mode="mmr",
    vector_store_kwargs={"mmr_threshold": 0.42},
)

# Replace the synthesizer's question-answering prompt with the custom template.
doc_summaries_query_engine.update_prompts({'response_synthesizer:text_qa_template':HYPE_ANSWER_GEN_PROMPT_TEMPLATE})

Note: We won't run inference with the query engine above, because I also want to show you the low-level API for embedding-based retrieval. We'll use that for generation.

## 📜 [Document Summary Retrievers](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/indices/document_summary/retrievers.py)


<img src="https://www.llamaindex.ai/_next/image?url=https%3A%2F%2Fcdn.sanity.io%2Fimages%2F7m9jw85w%2Fproduction%2F6d78d199badf9b45f5637d2a87aee0b12b9a335c-2099x1134.png%3Ffit%3Dmax%26auto%3Dformat&w=1920&q=75" style="width:70%; height:70%">

Source: [LlamaIndex Blog](https://www.llamaindex.ai/blog/a-new-document-summary-index-for-llm-powered-qa-systems-9a32ece2f9ec)

- 📂 Contains two types of retrievers:
  1. 🧠 LLM-based retriever (`DocumentSummaryIndexLLMRetriever`)
  2. 🎨 Embedding-based retriever (`DocumentSummaryIndexEmbeddingRetriever`)

These document summary retrievers efficiently retrieve relevant summaries from a document summary index. 

The LLM-based retriever uses language models to select relevant summaries based on a query, while the embedding-based retriever uses embedding similarity to find relevant summaries. 


#### 🧠 [`DocumentSummaryIndexLLMRetriever`](https://github.com/run-llama/llama_index/blob/99984eb87afb2e7feda65d5246ad166b0042f6fe/llama-index-core/llama_index/core/indices/document_summary/retrievers.py#L28)

- 📜 Retrieves relevant summaries from the index using LLM calls

- 🎛️ Customizable prompt for selecting relevant summaries

- 🍰 Processes summary nodes in batches

- 🔝 Retrieves top-k summary nodes based on LLM's relevance scoring

- 🤖 Uses an LLM to select relevant summaries

##### Arguments you need to know:

- `index`:  The index to retrieve from.

- `choice_select_prompt`: The prompt to use for selecting relevant summaries. The default prompt can be found [here](https://github.com/run-llama/llama_index/blob/99984eb87afb2e7feda65d5246ad166b0042f6fe/llama-index-core/llama_index/core/prompts/default_prompts.py#L392)

- `choice_batch_size`: The number of summary nodes to send to LLM at a time. The default value is 10

- `choice_top_k`: The number of summary nodes to retrieve. The default value is 1.

- `format_node_batch_fn`: Function to format a batch of nodes for LLM. This defaults to `default_format_node_batch_fn`, which formats a batch of summary nodes by assigning each node a number and joining their contents with a separator.

- `parse_choice_select_answer_fn`: Function to parse LLM response. It defaults to `default_parse_choice_select_answer_fn`, which parses the answer string from the LLM, extracting the selected answer numbers and their corresponding relevance scores, and returns them as lists.

- `llm` (LLM): The llm to use.

In [None]:
from llama_index.core.indices.document_summary import DocumentSummaryIndexLLMRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Rebinds `response_synthesizer`: the earlier instance (created for building the
# index) set use_async=True; this one uses tree_summarize without that flag and
# is the synthesizer handed to the retriever query engines that follow.
response_synthesizer = get_response_synthesizer(response_mode="tree_summarize")

In [None]:
# LLM-based retrieval: the LLM scores the document summaries and the five
# best-scoring ones are kept. The remaining options (choice_select_prompt,
# choice_batch_size, format_node_batch_fn, parse_choice_select_answer_fn)
# are left at their defaults.
doc_llm_retriever = DocumentSummaryIndexLLMRetriever(
    doc_summary_index,
    choice_top_k=5,
    llm=Settings.llm,
)

doc_llm_query_engine = RetrieverQueryEngine(
    retriever=doc_llm_retriever,
    response_synthesizer=response_synthesizer,
)

# This engine's synthesizer runs in "tree_summarize" mode, whose only prompt
# key is `summary_template`. The previous override targeted `text_qa_template`,
# which that synthesizer does not have, so the custom prompt was silently ignored.
doc_llm_query_engine.update_prompts({'response_synthesizer:summary_template':HYPE_ANSWER_GEN_PROMPT_TEMPLATE})

In [None]:
# Query the LLM-retriever engine directly; the returned response object is
# displayed as the cell output.
doc_llm_query_engine.query("How can I stop overanalyzing my own moods and feelings?")

In [None]:
from utils import create_query_pipeline

from llama_index.core.query_pipeline import InputComponent

# Entry point of the pipeline; also reused by the embedding-based chain later on.
input_component = InputComponent()

# Ordered components: pipeline input followed by the LLM-retriever query engine.
doc_llm__chain = [input_component, doc_llm_query_engine]

# Build the pipeline from the chain via the project helper.
doc_llm_query_pipeline = create_query_pipeline(doc_llm__chain)

In [None]:
# Run the same question through the LLM-retriever pipeline.
doc_llm_query_pipeline.run(input="How can I stop overanalyzing my own moods and feelings?")

#### 🎨 [`DocumentSummaryIndexEmbeddingRetriever`](https://github.com/run-llama/llama_index/blob/aad4a6fb94c8fcaf1b7dfac56b88b9e277886bfe/llama-index-core/llama_index/core/indices/document_summary/retrievers.py#L121)

- 📜 Retrieves relevant summaries from the index using embedding similarity

- 🔢 Retrieves top-k summary nodes based on embedding similarity

- 🪢 Uses an embedding model to embed the query

- 📏 Queries the vector store to find similar summaries

##### Arguments you need to know

- `index`: The index to retrieve from.

- `similarity_top_k`: The number of summary nodes to retrieve.


In [None]:
from llama_index.core.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever

# Embedding-based retrieval over the summary index; similarity_top_k is not
# set, so the retriever's default applies.
doc_embed_retriever = DocumentSummaryIndexEmbeddingRetriever(index=doc_summary_index)

# Pair the retriever with the shared response synthesizer.
doc_embed_query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer,
    retriever=doc_embed_retriever,
)

In [None]:
# Ordered components: the shared pipeline input followed by the
# embedding-retriever query engine.
doc_embed__chain = [input_component, doc_embed_query_engine]

# Build the pipeline from the chain via the project helper.
doc_embed_query_pipeline = create_query_pipeline(doc_embed__chain)

In [None]:
# Run the same question through the embedding-retriever pipeline.
doc_embed_query_pipeline.run(input="How can I stop overanalyzing my own moods and feelings?")