# NYC Wikipedia Embeddings Demo

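This notebook demonstrates embedding-based querying (`mode="embedding"`) with `GPTTreeIndex` and `GPTListIndex`, using the Wikipedia article on New York City as the source document.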

### Setup + Data Prep

In [1]:
# Fetch the plain-text extract of the "New York City" Wikipedia page and write it
# to data/nyc_text.txt, the directory that later cells read with
# SimpleDirectoryReader('data').
from pathlib import Path

import requests

http_response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'New York City',
        'prop': 'extracts',
        # 'exintro': True,
        'explaintext': True,
    },
    # requests applies no timeout by default; without one a stalled connection
    # would block the kernel indefinitely.
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of hitting a confusing KeyError below.
http_response.raise_for_status()

# `response` stays bound to the decoded JSON payload, as before.
response = http_response.json()
# The API keys pages by page id; only one title was requested, so take the first entry.
page = next(iter(response['query']['pages'].values()))
nyc_text = page['extract']

data_path = Path('data')
# exist_ok=True makes the cell safe to re-run without a separate existence check.
data_path.mkdir(exist_ok=True)

# Explicit UTF-8: the article text may contain characters that the platform's
# default encoding cannot represent.
with open(data_path / 'nyc_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(nyc_text)

In [None]:
# OpenAI credentials: reuse OPENAI_API_KEY when it is already set in the
# environment; otherwise prompt for it, so the key is never written into the
# notebook source and an existing key is never overwritten by a placeholder.
import os
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    # getpass does not echo what is typed, keeping the key out of the cell output.
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

### GPTTreeIndex - Embedding-based Query

In [1]:
from gpt_index import GPTTreeIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [None]:
# Read the documents from the 'data' directory and build a tree index over them.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTTreeIndex(documents)

In [3]:
# Persist the tree index to a JSON file so it can be reloaded without rebuilding.
tree_index_file = 'index.json'
index.save_to_disk(tree_index_file)

In [4]:
# Reload the tree index from the JSON file it was saved to.
tree_index_file = 'index.json'
new_index = GPTTreeIndex.load_from_disk(tree_index_file)

In [None]:
# Embedding-mode query against the reloaded tree index.
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding", verbose=True)

In [6]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>The professional women's basketball team in New York City is the New York Liberty.</b>

In [None]:
# Embedding-mode query against the reloaded tree index.
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(question, mode="embedding", verbose=True)

In [8]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>The Battle of Long Island, the largest battle of the American Revolutionary War, was fought in August 1776 within the modern-day borough of Brooklyn.</b>

In [None]:
# Embedding-mode query against the reloaded tree index.
question = "What are the airports in New York City?"
response = new_index.query(
    question,
    mode="embedding",
    verbose=True,
)

In [10]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>The airports in New York City are John F. Kennedy International Airport, Newark Liberty International Airport, LaGuardia Airport, and Stewart International Airport.</b>

### GPTListIndex - Embedding-based Query

In [11]:
from gpt_index import GPTListIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [None]:
# Read the documents from the 'data' directory and build a list index over them.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTListIndex(documents)

In [13]:
# Persist the list index to its own JSON file, separate from the tree index file.
list_index_file = 'index_list_emb.json'
index.save_to_disk(list_index_file)

In [14]:
# Check that the saved list index can be loaded back from disk.
list_index_file = 'index_list_emb.json'
new_index = GPTListIndex.load_from_disk(list_index_file)

In [None]:
# Embedding-mode query against the reloaded list index.
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding", verbose=True)

In [16]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>
The New York Liberty is the professional women's basketball team in New York City.</b>

In [None]:
# Embedding-mode query against the reloaded list index.
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(
    question,
    mode="embedding",
    verbose=True,
)

In [18]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>
The Battle of Long Island, the largest battle of the American Revolutionary War, was fought in August 1776 within the modern-day borough of Brooklyn.</b>

In [None]:
# Embedding-mode query against the reloaded list index.
question = "What are the airports in New York City?"
response = new_index.query(question, mode="embedding", verbose=True)

In [20]:
# Render the answer in bold via Markdown.
bold_answer = f"<b>{response}</b>"
display(Markdown(bold_answer))

<b>
The airports in New York City are John F. Kennedy International Airport, Newark Liberty International Airport, and LaGuardia Airport.</b>