# SQL Index Demo

In [1]:
from gpt_index import GPTSQLStructStoreIndex, SimpleDirectoryReader, WikipediaReader
from IPython.display import Markdown, display

### Load Wikipedia Data

In [None]:
# install wikipedia python package
!pip install wikipedia

In [2]:
# Wikipedia pages to load for the demo
city_pages = ['Toronto', 'Berlin', 'Tokyo']
wiki_docs = WikipediaReader().load_data(pages=city_pages)

### Create Database Schema

In [3]:
from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, select, column

In [4]:
# SQLite engine using the in-memory `:memory:` database URL
engine = create_engine("sqlite:///:memory:")
# Schema container bound to the engine above.
# NOTE(review): `MetaData(bind=...)` is deprecated in SQLAlchemy 1.4 and
# removed in 2.0 — confirm which SQLAlchemy version this notebook targets.
metadata_obj = MetaData(bind=engine)

In [5]:
# create city SQL table
table_name = "city_stats"
city_stats_table = Table(
    table_name,
    metadata_obj,
    # city_name is the primary key
    Column("city_name", String(16), primary_key=True),
    Column("population", Integer),
    # country is required (NOT NULL)
    Column("country", String(16), nullable=False),
)
# Pass the engine explicitly instead of relying on the MetaData's implicit
# bind, which is deprecated in SQLAlchemy 1.4 and removed in 2.0.
metadata_obj.create_all(engine)

### Build Index

In [6]:
from gpt_index import LLMPredictor
from langchain import OpenAI

In [7]:
# LLM configuration: OpenAI "text-davinci-002" with temperature 0,
# wrapped in an LLMPredictor for use by the index
openai_llm = OpenAI(temperature=0, model_name="text-davinci-002")
llm_predictor = LLMPredictor(llm=openai_llm)

In [None]:
# Build the SQL struct-store index over the Wikipedia documents, targeting the
# table created above. `table_name` is reused so the index and the schema
# cannot drift apart.
# NOTE(review): presumably this is the step that populates the table shown in
# the next cell — confirm against the GPTSQLStructStoreIndex docs.
index = GPTSQLStructStoreIndex(
    wiki_docs,
    sql_engine=engine,
    table_name=table_name,
    llm_predictor=llm_predictor,
)

In [9]:
# view current table
# Table.select() selects every column of city_stats (city_name, population,
# country) and works on both SQLAlchemy 1.x and 2.0; the legacy list form
# select([...]) was removed in 2.0.
stmt = city_stats_table.select()

# the connection is closed automatically when the `with` block exits
with engine.connect() as connection:
    results = connection.execute(stmt).fetchall()

print(results)


[('Toronto', 2731571, 'Canada'), ('Berlin', 3769495, 'Germany'), ('Tokyo', 13929286, 'Japan')]


### Query Index

We first run a raw SQL query, which executes directly against the table.

In [15]:
# mode="sql": the query string is passed as a SQL statement
sql_text = "SELECT city_name from city_stats"
response = index.query(sql_text, mode="sql")

> [query] Total token usage: 0 tokens


In [16]:
# render the response as bold Markdown
bold_response = f"<b>{response}</b>"
display(Markdown(bold_response))

<b>[('Berlin',), ('Tokyo',), ('Toronto',)]</b>

Next we run a natural-language query, which is translated into a SQL query under the hood.

In [12]:
# mode="default" with verbose=True: the predicted SQL query is printed
nl_question = "Which city has the highest population?"
response = index.query(
    nl_question,
    mode="default",
    verbose=True,
)

> Predicted SQL query: SELECT city_name, population
FROM city_stats
ORDER BY population DESC
LIMIT 1
> [query] Total token usage: 144 tokens


In [13]:
# render the response as bold Markdown
bold_response = f"<b>{response}</b>"
display(Markdown(bold_response))

<b>[('Tokyo', 13929286)]</b>

In [14]:
# You can also fetch the raw result from SQLAlchemy: it is stored under the
# "result" key of the response's `extra_info`. As the last expression in the
# cell, its value is shown as the cell output.
response.extra_info["result"]

[('Tokyo', 13929286)]