# NYC Wikipedia Embeddings Demo

In [None]:
import logging
import sys

# Route log records to stdout at INFO level so they appear in the cell output.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)
# Attach a stdout handler only when one is not already installed. Calling
# basicConfig() and then addHandler() registers two handlers, which prints
# every record twice, and each re-run of the cell would add yet another.
if not any(
    isinstance(existing, logging.StreamHandler)
    and getattr(existing, "stream", None) is sys.stdout
    for existing in root_logger.handlers
):
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    # Same "LEVEL:logger:message" layout that basicConfig() would have used.
    stdout_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    root_logger.addHandler(stdout_handler)

This notebook demonstrates embedding-based querying with `GPTTreeIndex` and `GPTListIndex`.

### Setup + Data Prep

In [None]:
import logging
import sys

# Switch the root logger to DEBUG. The level is set explicitly because
# logging.basicConfig() does nothing once the root logger already has
# handlers, so a repeated basicConfig(level=DEBUG) would silently leave the
# previous level in place.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
# Attach a stdout handler only when one is not already installed, so records
# are not printed once per handler and re-running the cell stays idempotent.
if not any(
    isinstance(existing, logging.StreamHandler)
    and getattr(existing, "stream", None) is sys.stdout
    for existing in root_logger.handlers
):
    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    # Same "LEVEL:logger:message" layout that basicConfig() would have used.
    stdout_handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
    root_logger.addHandler(stdout_handler)

In [1]:
# fetch "New York City" page from Wikipedia and save its plain-text extract
# to data/nyc_text.txt, which the index-building cells read via the 'data' dir
from pathlib import Path

import requests

http_response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'New York City',
        'prop': 'extracts',
        # 'exintro': True,
        'explaintext': True,
    },
    # Without a timeout a stalled connection would hang the cell indefinitely.
    timeout=30,
)
# Surface HTTP failures here instead of as a confusing KeyError below.
http_response.raise_for_status()
response = http_response.json()
# Only one title was requested, so take the first (only) entry under 'pages'.
page = next(iter(response['query']['pages'].values()))
nyc_text = page['extract']

data_path = Path('data')
data_path.mkdir(exist_ok=True)

# Write as UTF-8 explicitly: the article text may contain non-ASCII characters
# that the platform default encoding (e.g. cp1252 on Windows) cannot encode.
with open(data_path / 'nyc_text.txt', 'w', encoding='utf-8') as fp:
    fp.write(nyc_text)

In [None]:
# My OpenAI Key
import os

# Prefer a key that is already exported in the environment. The value below is
# only used when none is set, so running this cell never overwrites a working
# key with the placeholder. Replace the placeholder (or export the variable
# before starting Jupyter) rather than committing a real key to the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = "INSERT OPENAI KEY"

### GPTTreeIndex - Embedding-based Query

In [2]:
from gpt_index import GPTTreeIndex, SimpleDirectoryReader
from IPython.display import Markdown

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
# Load everything under data/ and build a tree index over the documents.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTTreeIndex(documents)

In [4]:
# Persist the tree index to index.json so it can be reloaded in a later cell.
index.save_to_disk('index.json')

In [13]:
# Reload the tree index from index.json; the queries below run against this copy.
new_index = GPTTreeIndex.load_from_disk('index.json')

In [None]:
# Query the loaded index in embedding mode.
# (set Logging to DEBUG for more detailed outputs)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding")

In [13]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

<b>The New York Liberty is the professional women's basketball team in New York City.</b>

In [None]:
# Query the loaded index in embedding mode.
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(question, mode="embedding")

In [15]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

<b>The Battle of Long Island, the largest battle of the American Revolutionary War, was fought in August 1776 within the modern-day borough of Brooklyn.</b>

In [None]:
# Query the loaded index in embedding mode.
# (set Logging to DEBUG for more detailed outputs)
question = "What are the airports in New York City?"
response = new_index.query(question, mode="embedding")

In [15]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

<b>The airports in New York City are John F. Kennedy International Airport, Newark Liberty International Airport, and LaGuardia Airport.</b>

### GPTListIndex - Embedding-based Query

In [5]:
from gpt_index import GPTListIndex, SimpleDirectoryReader
from IPython.display import Markdown

In [None]:
# Load everything under data/ and build a list index over the documents.
reader = SimpleDirectoryReader('data')
documents = reader.load_data()
index = GPTListIndex(documents)

In [7]:
# Persist the list index to index_list_emb.json so it can be reloaded in a later cell.
index.save_to_disk('index_list_emb.json')

In [8]:
# try loading: read the list index back from index_list_emb.json
# NOTE: this rebinds `new_index`, which the tree-index section bound to a GPTTreeIndex.
new_index = GPTListIndex.load_from_disk('index_list_emb.json')

In [None]:
# Query the loaded index in embedding mode.
# (set Logging to DEBUG for more detailed outputs)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding")

In [8]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

<b>
The New York Liberty is the professional women's basketball team in New York City.</b>

In [None]:
# Query the loaded index in embedding mode.
# (set Logging to DEBUG for more detailed outputs)
question = "What battles took place in New York City in the American Revolution?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

In [None]:
# Query the loaded index in embedding mode.
# (set Logging to DEBUG for more detailed outputs)
question = "What are the airports in New York City?"
response = new_index.query(question, mode="embedding")

In [None]:
# Render the answer in bold.
answer_md = Markdown(f"<b>{response}</b>")
display(answer_md)

## Try out other embeddings! 
(courtesy of langchain)

In [1]:
from gpt_index import GPTListIndex, SimpleDirectoryReader
from IPython.display import Markdown

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load in HF embedding model from langchain
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from gpt_index import LangchainEmbedding
# Wrap langchain's HuggingFaceEmbeddings (constructed with its defaults) so it
# can be passed to a query as `embed_model`.
hf_embeddings = HuggingFaceEmbeddings()
embed_model = LangchainEmbedding(hf_embeddings)

In [3]:
# try loading index: read the list index from index_list_emb.json, which must
# already exist on disk (it is written in the GPTListIndex section of this notebook)
new_index = GPTListIndex.load_from_disk('index_list_emb.json')

In [None]:
# Same embedding-mode query, but with the embedding model passed in explicitly.
# (set Logging to DEBUG for more detailed outputs)
question = "What is the name of the professional women's basketball team in New York City?"
response = new_index.query(question, mode="embedding", embed_model=embed_model)

In [5]:
# Echo the response as this cell's output (no Markdown formatting).
response

"\n\nThe name of the professional women's basketball team in New York City is the New York Liberty."