# Defining a Unified Query Interface over your Data

Query Decomposition: The ability to decompose a complex query into a simpler query given the content of the index.

This notebook uses ChatGPT (`gpt-3.5-turbo`) as the LLM.

In [1]:
import logging
import sys

# Uncomment the two lines below to send INFO-level log records to stdout.
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# The root logger is currently disabled; comment out these two lines to re-enable it.
logger = logging.getLogger()
logger.disabled = True

In [2]:
from gpt_index import (
    GPTSimpleVectorIndex, 
    GPTSimpleKeywordTableIndex, 
    GPTListIndex, 
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
from langchain.llms.openai import OpenAIChat, OpenAI
import requests

  from .autonotebook import tqdm as notebook_tqdm


#### Load Datasets

Load the Wikipedia pages for five cities: Toronto, Seattle, Chicago, Boston, and Houston.

In [3]:
# Cities whose Wikipedia pages are downloaded and indexed below
wiki_titles = [
    "Toronto",
    "Seattle",
    "Chicago",
    "Boston",
    "Houston",
]

In [4]:
from pathlib import Path

import requests
# Download the plain-text extract of each city's Wikipedia page into data/<title>.txt.
data_path = Path('data')
# Create the output directory once, up front; exist_ok makes the cell safe to re-run.
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        # Without a timeout a stalled connection would hang the notebook indefinitely.
        timeout=30,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    http_response.raise_for_status()
    response = http_response.json()

    # The API keys pages by page id; only one title is requested, so take the first entry.
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']

    # Write as UTF-8 explicitly: the platform default encoding may not be able to
    # represent every character in the article text.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [5]:
# Read each downloaded wiki page back in, keyed by city title
city_docs = {
    city: SimpleDirectoryReader(input_files=[f"data/{city}.txt"]).load_data()
    for city in wiki_titles
}


### Building each Vector Index
Build a vector index for each city's Wikipedia page.

In [6]:
# LLM predictor backed by ChatGPT (gpt-3.5-turbo); the resulting service context
# is passed to every index build/load and query call in this notebook.
llm_predictor_chatgpt = LLMPredictor(
    llm=OpenAIChat(temperature=0, model_name="gpt-3.5-turbo"),
)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_chatgpt,
    chunk_size_limit=1024,
)



In [None]:
# Build one vector index per city and persist each one to disk
vector_indices = {}
for wiki_title in wiki_titles:
    city_index = GPTSimpleVectorIndex.from_documents(
        city_docs[wiki_title], service_context=service_context
    )
    vector_indices[wiki_title] = city_index
    # use the city title as the index id (query configs refer to it later)
    city_index.index_struct.index_id = wiki_title
    city_index.save_to_disk(f'index_{wiki_title}.json')

In [7]:
# Summary text for each city index, keyed by city title
index_summaries = {
    city: (
        f"This index contains Wikipedia articles about {city}. "
        f"Use this index if you need to lookup specific facts about {city}. "
    )
    for city in wiki_titles
}

In [8]:
# Reload the per-city indices from their saved index_<title>.json files
vector_indices = {
    city: GPTSimpleVectorIndex.load_from_disk(
        f'index_{city}.json', service_context=service_context
    )
    for city in wiki_titles
}

#### Test Querying the Vector Index

In [9]:
# Query a single city index directly, before any graph is composed on top of it
response = vector_indices["Toronto"].query("What are the sports teams in Toronto?")

{}
getting query transform: QueryConfig(index_struct_type=<IndexStructType.SIMPLE_DICT: 'simple_dict'>, query_mode=<QueryMode.DEFAULT: 'default'>, query_kwargs={}, index_struct_id=None, query_transform=None, query_combiner=None)


Exception: 

In [37]:
# print() already converts its argument with str()
print(response)

The sports teams in Toronto include the Toronto Maple Leafs (NHL), Toronto Blue Jays (MLB), Toronto Raptors (NBA), Toronto Argonauts (CFL), Toronto FC (MLS), Toronto Rock (National Lacrosse League), and the Toronto Wolfpack (professional rugby league team).


### Build a Graph for Compare/Contrast Queries

We compose a keyword table index on top of all the vector indices,
and use this index for compare/contrast queries.

In [9]:
from gpt_index.indices.composability import ComposableGraph

In [10]:
# Keyword-table index composed over the per-city vector indices and their summaries
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    list(vector_indices.values()),
    list(index_summaries.values()),
    max_keywords_per_chunk=50,
)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [11]:
# summary text for the multi-city (compare/contrast) index
root_summary = (
    "This index contains Wikipedia articles about multiple cities. "
    "Use this index if you need to lookup specific facts about multiple cities. "
)
# fetch the graph's root index and assign it a fixed id
root_index = graph.get_index(graph.index_struct.root_id, GPTSimpleKeywordTableIndex)
root_index.index_struct.index_id = "compare_contrast"

#### Test querying the graph

In [14]:
# define decompose_transform
from gpt_index.indices.query.query_transform.base import DecomposeQueryTransform
# Query-decomposition transform driven by the ChatGPT predictor, with verbose output enabled
decompose_transform = DecomposeQueryTransform(
    llm_predictor_chatgpt, verbose=True
)

In [22]:
# Query settings for the compare/contrast graph, one entry per index type
vector_index_config = {
    "index_struct_type": "simple_dict",
    "query_mode": "default",
    "query_kwargs": {"similarity_top_k": 1},
    # NOTE: set query transform for subindices
    "query_transform": decompose_transform,
}
keyword_table_config = {
    "index_struct_type": "keyword_table",
    "query_mode": "simple",
    "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
}
query_configs = [vector_index_config, keyword_table_config]

In [23]:
# Compare/contrast question sent through the keyword-table graph
query_str = "Compare and contrast the arts and culture of Houston and Boston. "
response_chatgpt = graph.query(
    query_str,
    query_configs=query_configs,
    service_context=service_context,
)

INFO:gpt_index.indices.keyword_table.query:> Starting query: Compare and contrast the arts and culture of Houston and Boston. 
INFO:gpt_index.indices.keyword_table.query:query keywords: ['compare', 'arts', 'boston', 'culture', 'contrast', 'houston']
INFO:gpt_index.indices.keyword_table.query:> Extracted keywords: ['boston', 'houston']


index struct id: compare_contrast
[33;1m[1;3m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable cultural institutions or events in Boston?
[0mindex struct id: Boston
[33;1m[1;3m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[38;5;200m[1;3m> New query: What are some notable arts and cultural institutions in Houston?
[0mindex struct id: Houston



KeyboardInterrupt



In [17]:
# Print the response returned by the compare/contrast graph query
print(response_chatgpt)

Houston and Boston both have a rich arts and culture scene with a variety of institutions and events. Houston is known for the Houston Livestock Show and Rodeo, the Houston Greek Festival, the Art Car Parade, and the Houston Theater District. Boston, on the other hand, is known for the Boston Lyric Opera Company, the Boston Early Music Festival, the annual Boston Arts Festival, and the Italian summer feasts in the North End. Both cities have museums of fine arts, but Boston also has the Isabella Stewart Gardner Museum, the Institute of Contemporary Art, and the Boston Athenæum. Houston has the Museum of Natural Science, the Holocaust Museum Houston, and the Children's Museum of Houston. Overall, both cities offer a diverse range of cultural experiences for residents and visitors alike.


### Build the Outer Router Index

In [12]:
# put in all vector indexes + the root index for the graph
outer_graph = ComposableGraph.from_indices(
    GPTSimpleVectorIndex,
    [*vector_indices.values(), root_index],
    [*index_summaries.values(), root_summary],
)

INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:gpt_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 134 tokens


In [15]:
# Query configs for the outer graph: two fixed entries plus one entry per city index.
query_configs = [
    {
        # NOTE(review): "compare_contrast" is the id assigned to the keyword-table
        # root index, yet this entry declares "simple_dict" — confirm this is intended.
        "index_struct_id": "compare_contrast",
        "index_struct_type": "simple_dict",
        "query_mode": "default",
        "query_kwargs": {"similarity_top_k": 1},
    },
    {
        "index_struct_type": "keyword_table",
        "query_mode": "simple",
        "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
    },
]
# Each city index is targeted by id and gets the decompose transform.
query_configs.extend(
    {
        "index_struct_id": city,
        "index_struct_type": "simple_dict",
        "query_mode": "default",
        "query_kwargs": {"similarity_top_k": 1},
        # NOTE: set query transform for subindices
        "query_transform": decompose_transform,
    }
    for city in wiki_titles
)

In [17]:
# Ask a compare/contrast question through the outer graph,
# passing the per-index query configs and the shared service context
response = outer_graph.query(
    "Compare and contrast the arts and culture of Houston and Boston.",
    query_configs=query_configs,
    service_context=service_context
)

{'compare_contrast': QueryConfig(index_struct_type='simple_dict', query_mode=<QueryMode.DEFAULT: 'default'>, query_kwargs={'similarity_top_k': 1}, index_struct_id='compare_contrast', query_transform=None, query_combiner=None), 'Toronto': QueryConfig(index_struct_type='simple_dict', query_mode=<QueryMode.DEFAULT: 'default'>, query_kwargs={'similarity_top_k': 1}, index_struct_id='Toronto', query_transform=<gpt_index.indices.query.query_transform.base.DecomposeQueryTransform object at 0x2b96b4c40>, query_combiner=None), 'Seattle': QueryConfig(index_struct_type='simple_dict', query_mode=<QueryMode.DEFAULT: 'default'>, query_kwargs={'similarity_top_k': 1}, index_struct_id='Seattle', query_transform=<gpt_index.indices.query.query_transform.base.DecomposeQueryTransform object at 0x2b96b4c40>, query_combiner=None), 'Chicago': QueryConfig(index_struct_type='simple_dict', query_mode=<QueryMode.DEFAULT: 'default'>, query_kwargs={'similarity_top_k': 1}, index_struct_id='Chicago', query_transform=<


KeyboardInterrupt



In [82]:
# Display the full Response object (answer text and source nodes)
response

Response(response='The context information only provides information about notable cultural institutions or events in Houston, not Boston.', source_nodes=[NodeWithScore(node=Node(text="Some notable cultural institutions or events in Houston according to the Wikipedia index include the Theater District, Bayou Place entertainment complex, Space Center Houston, the Galleria shopping mall, the Downtown Aquarium, the Houston Zoo, the Houston Museum of Natural Science, and various parks and green spaces such as Hermann Park and Buffalo Bayou Park. The city also has a significant music scene, particularly in hip-hop, and is home to the chopped and screwed remixing-technique in hip-hop. Additionally, there are ethnic enclaves such as Houston's Chinatown and the Mahatma Gandhi District.", doc_id='6d3ca6dd-de55-4b80-b349-a99333afd624', embedding=None, doc_hash='5945a98908b9e657c2e17b3d7a1c40731fb7b59d2fd97055732f8bbbd093f23c', extra_info=None, node_info=None, relationships={}), score=0.805562031

In [68]:
# Ask a single-city question through the outer graph, without passing query_configs
response = outer_graph.query("What are the sports teams in Toronto?")

In [72]:
# Inspect the node held by the second entry (index 1) of the response's source nodes
response.source_nodes[1].node

Node(text='\nThe sports teams in Toronto are the Toronto Maple Leafs (NHL), Toronto Blue Jays (MLB), Toronto Raptors (NBA), Toronto Argonauts (CFL), Toronto FC (MLS), Toronto Rock (NLL), Toronto Wolfpack (RFL), and Toronto Rush (NARL).', doc_id='2fbc9b34-11f7-4314-9446-e3286ebbc00f', embedding=None, doc_hash='b202900dff4de0f0680b2991858081456feeb3bcfc1e0e50a671b39bfa876f8d', extra_info=None, node_info=None, relationships={})