# Test Complex Queries over Multiple Documents (with and without Query Decomposition)

Query Decomposition: The ability to decompose a complex query into a simpler query given the content of the index.

Use OpenAI as the LLM model and embedding model.

In [4]:
import logging
import sys

# Uncomment the next two lines to stream INFO-level log records to stdout.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Mark the root logger as disabled.
# Comment out the two lines below if you want logging turned back on.
logger = logging.getLogger()
logger.disabled = True

In [5]:
from gpt_index import (
    GPTSimpleVectorIndex, 
    GPTSimpleKeywordTableIndex, 
    GPTListIndex, 
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
import requests

ImportError: cannot import name 'BaseRetriever' from partially initialized module 'gpt_index.indices.base_retriever' (most likely due to a circular import) (/Users/suo/dev/gpt_index/gpt_index/indices/base_retriever.py)

#### Load Datasets

Load the Wikipedia page for each of the cities listed below.

In [None]:
# Wikipedia article titles of the cities that are downloaded and indexed below.
wiki_titles = [
    "Toronto",
    "Seattle",
    "San Francisco",
    "Chicago",
    "Boston",
    "Washington, D.C.",
    "Cambridge, Massachusetts",
    "Houston",
]

In [None]:
from pathlib import Path
import requests

# Directory that receives one plain-text file per downloaded Wikipedia article.
data_path = Path('data_wiki')
# Create the directory once, up front; exist_ok keeps the cell safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the article.
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        # Fail instead of hanging forever if the API is unreachable.
        timeout=30,
    )
    # Surface HTTP errors (4xx/5xx) here rather than as a confusing JSON/KeyError later.
    http_response.raise_for_status()
    response = http_response.json()

    # The result is keyed by page id; a single title was requested, so take the first entry.
    page = next(iter(response['query']['pages'].values()))
    if 'extract' not in page:
        raise ValueError(f"No text extract returned for Wikipedia title {title!r}")
    wiki_text = page['extract']

    # Article text contains non-ASCII characters, so pin the encoding instead of
    # relying on the platform default.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [None]:
# Read every downloaded article back from disk: `city_docs` maps each title to
# the documents loaded from its file, `all_docs` is the flat list across cities.
city_docs = {
    wiki_title: SimpleDirectoryReader(
        input_files=[data_path / f"{wiki_title}.txt"]
    ).load_data()
    for wiki_title in wiki_titles
}
all_docs = [doc for wiki_title in wiki_titles for doc in city_docs[wiki_title]]


In [None]:
# Shared service context, handed to every index build/load and graph query in this notebook.
service_context = ServiceContext.from_defaults(chunk_size_limit=512)

### Building the document indices
Build a separate vector index for each city's Wikipedia page.

We also build a "global" vector index, which ingests the documents for *all* cities.

This allows us to test different types of data structures!

In [None]:
# Build one vector index per city (keyed by article title) and record a short
# summary string for each one.
city_indices = {}
index_summaries = {}
for wiki_title in wiki_titles:
    print(f"Building index for {wiki_title}")
    city_index = GPTSimpleVectorIndex.from_documents(
        city_docs[wiki_title],
        service_context=service_context,
    )
    city_indices[wiki_title] = city_index
    # Summary text describing what this city's index contains.
    index_summaries[wiki_title] = f"Wikipedia articles about {wiki_title}"
    # Persist the index so it can be reloaded without rebuilding.
    city_index.save_to_disk(f'index_{wiki_title}.json')

In [None]:
# Also set up a single "global" vector index over the documents of every city.
global_index = GPTSimpleVectorIndex.from_documents(
    all_docs,
    service_context=service_context,
)
# Persist it so it can be reloaded from disk later.
global_index.save_to_disk('index_cities_global.json')

### Loading the indices

If the index is already built, run these cells to just load index from disk.

In [None]:
# Restore the per-city vector indices from their saved JSON files.
city_indices = {
    wiki_title: GPTSimpleVectorIndex.load_from_disk(
        f'index_{wiki_title}.json', service_context=service_context
    )
    for wiki_title in wiki_titles
}

In [None]:
# Restore the global (all-cities) vector index from its saved JSON file.
global_index = GPTSimpleVectorIndex.load_from_disk(
    'index_cities_global.json',
    service_context=service_context,
)

### Creating the right structure to run compare/contrast queries

Our key goal in this notebook is to run compare/contrast queries between different cities.

We currently have a separate vector index for every city document. We want to set up a "graph" structure that routes each query
to the right indices, so that the relevant text sections are retrieved for each city.

We compose a keyword table index on top of all the vector indices.

In [None]:
from gpt_index.indices.composability import ComposableGraph

In [None]:
# Recreate the per-city summaries here so this cell also works when the indices
# were loaded from disk (the build cell is the only other place that defines them).
index_summaries = {
    wiki_title: f"Wikipedia articles about {wiki_title}"
    for wiki_title in wiki_titles
}

# Compose a keyword table index on top of the city vector indices. Children and
# summaries are both derived from `wiki_titles`, so position i of one list
# always describes position i of the other.
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    [city_indices[wiki_title] for wiki_title in wiki_titles],
    [index_summaries[wiki_title] for wiki_title in wiki_titles],
    max_keywords_per_chunk=50,
)

In [None]:
# [optional] save the composed graph to disk as JSON
graph.save_to_disk("index_multi_doc_graph.json")

In [None]:
# [optional] load a previously saved graph from disk instead of composing it again
graph = ComposableGraph.load_from_disk("index_multi_doc_graph.json")

### Define Query Transformation + Query Configs

We also define a "query decomposition" transform. Since we have a graph structure over multiple indexes, query decomposition
allows us to break a complex question into a simpler one over a given index.

This works well in comparing/contrasting different cities because it allows us to ask questions specific to each city.

**Query Transform**

In [None]:
from gpt_index.indices.query.query_transform.base import DecomposeQueryTransform
# `llm_predictor` was referenced here without being defined anywhere in the
# notebook. Take it from the shared service context so the transform uses the
# same LLM predictor as the indices.
llm_predictor = service_context.llm_predictor
decompose_transform = DecomposeQueryTransform(
    llm_predictor, verbose=True
)

In [None]:
# Query settings, one entry per index struct type found in the graph.

# Config for the vector indices (struct type "simple_dict").
vector_index_config = {
    "index_struct_type": "simple_dict",
    "query_mode": "default",
    "query_kwargs": {"similarity_top_k": 1, "verbose": True},
    # NOTE: the query transform is set for the sub-indices
    "query_transform": decompose_transform,
}

# Config for the keyword table index.
keyword_table_config = {
    "index_struct_type": "keyword_table",
    "query_mode": "simple",
    "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
}

query_configs = [vector_index_config, keyword_table_config]

### Let's Run Some Queries! 

We run queries over the graphs and analyze the results.

We also compare results against the baseline global vector index. In the majority of cases the global vector index provides insufficient answers.

**Complex Query 1**

In [None]:
# Question spanning three cities; sent both to the graph (with query
# decomposition in the subindices) and to the baseline global index.
query_str = "Compare and contrast the demographics in Seattle, Houston, and Toronto. "

In [None]:
# Run the question through the composed graph using the per-index query configs.
response = graph.query(query_str, query_configs=query_configs, service_context=service_context)

In [None]:
# Answer produced by the composed graph.
print(str(response))

In [None]:
# Baseline: the same question against the single global vector index.
response = global_index.query(
    query_str,
    similarity_top_k=3,
    response_mode="tree_summarize",
)

In [None]:
# NOTE: at first glance the global vector index seems to provide the right results...
# BUT check the retrieved source nodes in the next cell before trusting the answer!
print(str(response))

In [None]:
# NOTE: there's hallucination! the sources only reference Toronto
# Print every retrieved source chunk instead of hard-coded positions 0 and 1,
# so all sources are shown and the cell cannot fail with an IndexError.
for source_node in response.source_nodes:
    print(source_node.source_text)

**Complex Query 2**

In [None]:
# Question spanning two cities; sent both to the graph (with query
# decomposition) and to the baseline global index.
query_str = "What are the basketball teams in Houston and Boston?"

In [None]:
# Run the question through the composed graph using the per-index query configs.
response = graph.query(query_str, query_configs=query_configs, service_context=service_context)

In [None]:
# Answer produced by the composed graph.
print(str(response))

In [None]:
# Baseline: the same question against the single global vector index.
response = global_index.query(
    query_str,
    similarity_top_k=2,
    response_mode="tree_summarize",
)

In [None]:
# Answer produced by the baseline global vector index.
print(str(response))

**Complex Query 3**

In [None]:
# Question spanning two cities; sent both to the graph (with query
# decomposition) and to the baseline global index.
query_str = "Compare and contrast the climate of Houston and Boston "

In [None]:
# Run the question through the composed graph using the per-index query configs.
response = graph.query(query_str, query_configs=query_configs, service_context=service_context)

In [None]:
# Answer produced by the composed graph.
print(response)

In [None]:
# Baseline: the same question against the single global vector index.
response = global_index.query(
    query_str,
    similarity_top_k=2,
    response_mode="tree_summarize",
)

In [None]:
# Answer produced by the baseline global vector index.
print(str(response))