In [1]:
from dotenv import load_dotenv
load_dotenv()
import os

NOTE: This is ONLY necessary in jupyter notebook.
Details: Jupyter runs an event-loop behind the scenes. 
This results in nested event-loops when we start an event-loop to make async queries.
This is normally not allowed, we use nest_asyncio to allow it for convenience.

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Global Models


In [4]:
from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [5]:
# Using the LlamaDebugHandler to print the trace of the sub questions
# captured by the SUB_QUESTION callback event type
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

# Composable Objects
In this notebook, we show how you can combine multiple objects into a single top-level index.

This approach works by setting up IndexNode objects, with an obj field that points to a:

query engine
retriever
query pipeline
another node!

In [6]:
%pip install llama-index-storage-docstore-mongodb
%pip install llama-index-vector-stores-qdrant
%pip install llama-index-storage-docstore-firestore
%pip install llama-index-retrievers-bm25
%pip install llama-index-storage-docstore-redis
%pip install llama-index-storage-docstore-dynamodb
%pip install llama-index-readers-file pymupdf

Collecting llama-index-storage-docstore-mongodb
  Downloading llama_index_storage_docstore_mongodb-0.2.1-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-storage-kvstore-mongodb<0.3.0,>=0.2.1 (from llama-index-storage-docstore-mongodb)
  Downloading llama_index_storage_kvstore_mongodb-0.2.1-py3-none-any.whl.metadata (738 bytes)
Collecting motor<4.0,>=3.6 (from llama-index-storage-kvstore-mongodb<0.3.0,>=0.2.1->llama-index-storage-docstore-mongodb)
  Downloading motor-3.6.0-py3-none-any.whl.metadata (21 kB)
Collecting pymongo<5.0.0,>=4.6.1 (from llama-index-storage-kvstore-mongodb<0.3.0,>=0.2.1->llama-index-storage-docstore-mongodb)
  Downloading pymongo-4.10.1-cp310-cp310-win_amd64.whl.metadata (22 kB)
  Downloading pymongo-4.9.2-cp310-cp310-win_amd64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo<5.0.0,>=4.6.1->llama-index-storage-kvstore-mongodb<0.3.0,>=0.2.1->llama-index-storage-docstore-mongodb)
  Downloading dnspython-2.7.0-py3-none-any.whl.meta

In [12]:
# !curl "Mozilla" "https://arxiv.org/pdf/2307.09288" -o "./llama2.pdf"
# !curl "Mozilla" "https://arxiv.org/pdf/1706.03762" -o "./attention.pdf"

In [14]:
from llama_index.core import download_loader

from llama_index.readers.file import PyMuPDFReader

llama2_docs = PyMuPDFReader().load_data(
    file_path="./llama2.pdf", metadata=True
)
attention_docs = PyMuPDFReader().load_data(
    file_path="./attention.pdf", metadata=True
)

# Retriever Setup


In [15]:
from llama_index.core.node_parser import TokenTextSplitter

nodes = TokenTextSplitter(
    chunk_size=1024, chunk_overlap=128
).get_nodes_from_documents(llama2_docs + attention_docs)

In [16]:
nodes

[TextNode(id_='f0558ce3-25bd-4d61-a17e-ef47c8e32a42', embedding=None, metadata={'total_pages': 77, 'file_path': './llama2.pdf', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0453fa4f-82f7-49eb-9d7f-86a6c4d566f4', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 77, 'file_path': './llama2.pdf', 'source': '1'}, hash='4b6a83e91d55ec44eeee2d8531b07ed992c1b6d3f635e2138662981b8f258030')}, text='Llama 2: Open Foundation and Fine-Tuned Chat Models\nHugo Touvron∗\nLouis Martin†\nKevin Stone†\nPeter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra\nPrajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen\nGuillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller\nCynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou\nHakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kl

In [17]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.storage.docstore.firestore import FirestoreDocumentStore
from llama_index.storage.docstore.dynamodb import DynamoDBDocumentStore

docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

In [18]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

client = QdrantClient(path="./qdrant_data")
vector_store = QdrantVectorStore("composable", client=client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex(nodes=nodes)
vector_retriever = index.as_retriever(similarity_top_k=2)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore, similarity_top_k=2
)

resource module not available on Windows


  from .autonotebook import tqdm as notebook_tqdm


**********
Trace: index_construction
    |_embedding -> 4.526768 seconds
    |_embedding -> 2.661247 seconds
**********
DEBUG:bm25s:Building index from IDs objects
Building index from IDs objects


# Composing Objects
Here, we construct the ```IndexNodes```. Note that the text is what is used to index the node by the top-level index.

For a vector index, the text is embedded, for a keyword index, the text is used for keywords.

In this example, the ```SummaryIndex``` is used, which does not technically need the text for retrieval, since it always retrieves all nodes.

In [19]:
from llama_index.core.schema import IndexNode

vector_obj = IndexNode(
    index_id="vector", obj=vector_retriever, text="Vector Retriever"
)
bm25_obj = IndexNode(
    index_id="bm25", obj=bm25_retriever, text="BM25 Retriever"
)

In [20]:
from llama_index.core import SummaryIndex

summary_index = SummaryIndex(objects=[vector_obj, bm25_obj])

**********
Trace: index_construction
**********


# Querying
When we query, all objects will be retrieved and used to generate the nodes to get a final answer.

Using ```tree_summarize``` with ```aquery()``` ensures concurrent execution and faster responses.

In [21]:
query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize", verbose=True
)

In [22]:
response = await query_engine.aquery(
    "How does attention work in transformers?"
)

[1;3;38;2;11;159;203mRetrieval entering vector: VectorIndexRetriever
[0m[1;3;38;2;11;159;203mRetrieval entering bm25: BM25Retriever
[0m**********
Trace: query
    |_query -> 4.105918 seconds
      |_synthesize -> 2.543057 seconds
        |_templating -> 0.0 seconds
        |_llm -> 2.519062 seconds
**********


In [23]:
print(str(response))

Attention in transformers operates by mapping a query and a set of key-value pairs to an output. This process involves computing a weighted sum of the values, where the weights are determined by the compatibility of the query with the keys. Each input element is transformed into three vectors: a query vector, a key vector, and a value vector. The attention mechanism calculates a score for each key based on its similarity to the query, typically using a dot product. These scores are then normalized using a softmax function to produce attention weights, which are applied to the value vectors to generate the final output. This allows the model to focus on different parts of the input sequence when producing each output, effectively capturing dependencies regardless of their distance in the sequence.


In [24]:
response = await query_engine.aquery(
    "What is the architecture of Llama2 based on?"
)

[1;3;38;2;11;159;203mRetrieval entering vector: VectorIndexRetriever
[0m[1;3;38;2;11;159;203mRetrieval entering bm25: BM25Retriever
[0m**********
Trace: query
    |_query -> 1.722148 seconds
      |_synthesize -> 1.109148 seconds
        |_templating -> 0.0 seconds
        |_llm -> 1.095143 seconds
**********


In [25]:
print(str(response))

The architecture of Llama 2 is based on the standard transformer architecture, which incorporates several enhancements for improved performance. These enhancements include pre-normalization using RMSNorm, the SwiGLU activation function, rotary positional embeddings (RoPE), and an increased context length. Additionally, the model employs grouped-query attention (GQA) for better inference scalability, particularly in its larger variants.


In [26]:
response = await query_engine.aquery(
    "What was used before attention in transformers?"
)

[1;3;38;2;11;159;203mRetrieval entering vector: VectorIndexRetriever
[0m[1;3;38;2;11;159;203mRetrieval entering bm25: BM25Retriever
[0m**********
Trace: query
    |_query -> 1.773313 seconds
      |_synthesize -> 1.281316 seconds
        |_templating -> 0.0 seconds
        |_llm -> 1.273315 seconds
**********


In [27]:
print(str(response))

Before the introduction of attention mechanisms in transformers, dominant sequence transduction models relied on complex recurrent neural networks (RNNs) or convolutional neural networks (CNNs) that included both an encoder and a decoder. These models often connected the encoder and decoder through attention mechanisms, but the foundational architectures were based on recurrence or convolutions.


# Note on Saving and Loading
Since objects aren't technically serializable, when saving and loading, then need to be provided at load time as well.

Here's an example of how I might save/load this setup.

# Save


In [28]:
# qdrant is already saved automatically!
# we only need to save the docstore here

# save our docstore nodes for bm25
docstore.persist("./docstore.json")

# Load

In [30]:
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

docstore = SimpleDocumentStore.from_persist_path("./docstore.json")

client = QdrantClient(path="./qdrant_data")
vector_store = QdrantVectorStore("composable", client=client)

RuntimeError: Storage folder ./qdrant_data is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.

In [None]:
index = VectorStoreIndex.from_vector_store(vector_store)
vector_retriever = index.as_retriever(similarity_top_k=2)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=docstore, similarity_top_k=2
)

In [None]:
from llama_index.core.schema import IndexNode

vector_obj = IndexNode(
    index_id="vector", obj=vector_retriever, text="Vector Retriever"
)
bm25_obj = IndexNode(
    index_id="bm25", obj=bm25_retriever, text="BM25 Retriever"
)

In [None]:
# if we had added regular nodes to the summary index, we could save/load that as well
# summary_index.persist("./summary_index.json")
# summary_index = load_index_from_storage(storage_context, objects=objects)

from llama_index.core import SummaryIndex

summary_index = SummaryIndex(objects=[vector_obj, bm25_obj])