# Defining a Unified Query Interface over your Data

Query Decomposition: The ability to decompose a complex query into a simpler query given the content of the index.

Use an OpenAI chat model (ChatGPT / GPT-4) as the LLM.

In [28]:
import logging
import sys

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Silence the root logger for the rest of the notebook.
# Comment out these two lines (and enable the handlers above) to see log output.
logger = logging.getLogger()
logger.disabled = True

In [29]:
from gpt_index import (
    GPTSimpleVectorIndex, 
    GPTSimpleKeywordTableIndex, 
    GPTListIndex, 
    GPTTreeIndex,
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
from langchain.llms.openai import OpenAIChat, OpenAI
import requests

#### Load Datasets

Load the Wikipedia page for each city.

In [30]:
# Wikipedia article titles (one per city) used throughout the notebook
wiki_titles = [
    "Toronto",
    "Seattle",
    "Chicago",
    "Boston",
    "Houston",
]

In [31]:
from pathlib import Path

import requests
# Create the output directory once, up front, rather than re-checking it on
# every loop iteration.
data_path = Path('data')
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the article.
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        # Avoid hanging the notebook forever on a stalled connection.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    # The API keys pages by page id; a single title was requested, so take the first.
    page = next(iter(response['query']['pages'].values()))
    if 'extract' not in page:
        # Same exception type as the plain lookup would raise, but with context.
        raise KeyError(f"No Wikipedia extract returned for title {title!r}")
    wiki_text = page['extract']

    # Article text contains non-ASCII characters; write UTF-8 explicitly so the
    # result does not depend on the platform's default encoding.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [32]:
# Read each saved article back from disk, keyed by city title
city_docs = {
    wiki_title: SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"]).load_data()
    for wiki_title in wiki_titles
}


### Building each Vector Index
Build a vector index for each city's Wikipedia page.

In [33]:
# LLM predictor (gpt-3.5-turbo). It is referenced by name further down when the
# query-decomposition transform is constructed, so it has to be defined here for
# a fresh-kernel "Restart & Run All" to succeed.
llm_predictor_chatgpt = LLMPredictor(llm=OpenAIChat(temperature=0, model_name="gpt-3.5-turbo"))

# LLM predictor (gpt-4), used by the service context that is passed to index
# construction and to the graph queries below.
llm_predictor_gpt4 = LLMPredictor(llm=OpenAIChat(temperature=0, model_name="gpt-4"))
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_gpt4, chunk_size_limit=1024
)

# To run everything on gpt-3.5-turbo instead, swap the predictor:
# service_context = ServiceContext.from_defaults(
#     llm_predictor=llm_predictor_chatgpt, chunk_size_limit=1024
# )

In [None]:
# Build, tag and persist one vector index per city
vector_indices = {}
for wiki_title in wiki_titles:
    city_index = GPTSimpleVectorIndex.from_documents(
        city_docs[wiki_title], service_context=service_context
    )
    vector_indices[wiki_title] = city_index
    # use the city title as the index id
    city_index.index_struct.index_id = wiki_title
    # write the index to a per-city JSON file
    city_index.save_to_disk(f'index_{wiki_title}.json')

In [35]:
# One summary string per city index
index_summaries = {
    wiki_title: (
        f"This content contains Wikipedia articles about {wiki_title}. "
        f"Use this index if you need to lookup specific facts about {wiki_title}.\n"
        "Do not use this index if you want to analyze multiple cities."
    )
    for wiki_title in wiki_titles
}

In [36]:
# Reload the per-city indices from their saved JSON files
vector_indices = {
    wiki_title: GPTSimpleVectorIndex.load_from_disk(
        f'index_{wiki_title}.json', service_context=service_context
    )
    for wiki_title in wiki_titles
}

#### Test Querying the Vector Index

In [None]:
# Query a single city index directly with a factual question
toronto_index = vector_indices["Toronto"]
response = toronto_index.query("What are the sports teams in Toronto?")

In [49]:
# print() already stringifies its argument
print(response)


The sports teams in Toronto are the Toronto Maple Leafs (NHL), Toronto Blue Jays (MLB), Toronto Raptors (NBA), Toronto Argonauts (CFL), Toronto FC (MLS), Toronto Rock (NLL), Toronto Wolfpack (RFL), and Toronto Rush (NARL).


### Build a Graph for Compare/Contrast Queries

We compose a keyword table index on top of all the vector indices.
We use this composed index for compare/contrast queries.

In [37]:
from gpt_index.indices.composability import ComposableGraph

In [None]:
# Compose a keyword-table index over the per-city vector indices,
# pairing each child index with its summary (both dicts share the same key order)
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    list(vector_indices.values()),
    list(index_summaries.values()),
    max_keywords_per_chunk=50,
)

In [39]:
# Summary text describing the compare/contrast index
root_summary = (
    "This index contains Wikipedia articles about multiple cities. "
    "Use this index if you want to compare multiple cities. "
)

# Fetch the graph's root keyword-table index and give it an explicit id
root_index = graph.get_index(graph.index_struct.root_id, GPTSimpleKeywordTableIndex)
root_index.index_struct.index_id = "compare_contrast"

#### Test querying the graph

In [40]:
# define decompose_transform
from gpt_index.indices.query.query_transform.base import DecomposeQueryTransform
# Query transform built on the gpt-3.5-turbo predictor; `llm_predictor_chatgpt`
# must already exist in the kernel namespace when this cell runs.
decompose_transform = DecomposeQueryTransform(llm_predictor_chatgpt, verbose=True)

In [14]:
# Query settings for the compare/contrast graph, one entry per index type

# vector (simple_dict) sub-indices: top-1 retrieval, with the decompose
# transform applied to the incoming query
vector_query_config = {
    "index_struct_type": "simple_dict",
    "query_mode": "default",
    "query_kwargs": {"similarity_top_k": 1},
    "query_transform": decompose_transform,
}

# keyword-table index: tree_summarize response mode, verbose output
keyword_query_config = {
    "index_struct_type": "keyword_table",
    "query_mode": "simple",
    "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
}

query_configs = [vector_query_config, keyword_query_config]

In [None]:
# Run a compare/contrast question against the keyword-table graph
query_str = "Compare and contrast the arts and culture of Houston and Boston. "
response = graph.query(
    query_str, query_configs=query_configs, service_context=service_context
)

In [51]:
# Display the answer returned by the graph query
print(response)

Houston and Boston both have rich arts and culture scenes, with a variety of cultural institutions and events. Both cities have a strong presence of performing arts, with Houston having the Houston Theater District, Houston Grand Opera, Houston Ballet, and Houston Symphony Orchestra, while Boston has the Boston Symphony Orchestra, Boston Lyric Opera Company, Opera Boston, and several other music ensembles. Both cities also host annual gay pride parades and festivals.

In terms of visual arts, both Houston and Boston have notable museums. Houston is home to the Museum of Fine Arts, the Houston Museum of Natural Science, and the Contemporary Arts Museum Houston, among others. Boston boasts the Museum of Fine Arts, the Isabella Stewart Gardner Museum, and the Institute of Contemporary Art. Unique to Houston are institutions like the Menil Collection, Rothko Chapel, and the Byzantine Fresco Chapel Museum, while Boston has the Boston Athenæum.

Houston has a diverse range of cultural events

### Build the Outer Router Index

In [41]:
# Children of the outer tree index: every city index plus the compare/contrast index
child_indices = [*vector_indices.values(), root_index]
child_summaries = [*index_summaries.values(), root_summary]

# Pass the full child count as num_children
num_children = len(child_indices)
outer_graph = ComposableGraph.from_indices(
    GPTTreeIndex,
    child_indices,
    child_summaries,
    num_children=num_children,
)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [42]:
# Query settings for the outer graph: keyword-table and tree entries first,
# then one entry per city vector index (addressed by index id)
query_configs = [
    {
        "index_struct_type": "keyword_table",
        "query_mode": "simple",
        "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
    },
    {
        "index_struct_type": "tree",
        "query_mode": "default",
    },
]
query_configs.extend(
    {
        "index_struct_id": wiki_title,
        "index_struct_type": "simple_dict",
        "query_mode": "default",
        "query_kwargs": {"similarity_top_k": 1},
        # the decompose transform is attached to each city sub-index
        "query_transform": decompose_transform,
    }
    for wiki_title in wiki_titles
)

In [None]:
# Send the compare/contrast question through the outer graph
compare_query = "Compare and contrast the arts and culture of Houston and Boston."
response = outer_graph.query(
    compare_query, query_configs=query_configs, service_context=service_context
)

In [45]:
# Show the outer-graph compare/contrast answer as a plain string
str(response)

In [None]:
# Ask a single-city factual question through the outer graph.
# NOTE(review): unlike the compare/contrast call, no query_configs or
# service_context is passed here, so presumably the library defaults apply —
# confirm this is intentional.
response = outer_graph.query("What are the sports teams in Toronto?")

In [48]:
# Show the outer-graph answer to the Toronto question as a plain string
str(response)

'\nThe sports teams in Toronto are the Toronto Maple Leafs (NHL), Toronto Blue Jays (MLB), Toronto Raptors (NBA), Toronto Argonauts (CFL), Toronto FC (MLS), Toronto Rock (NLL), Toronto Wolfpack (RFL), and Toronto Rush (NARL).'