In [None]:
import os

from llama_index.tools.waii import WaiiToolSpec

# Build the Waii tool spec used by every later cell.
# Credentials are taken from the environment when set, so real secrets never
# have to be saved inside the notebook. The placeholder literals are kept as
# fallbacks: either export WAII_API_KEY / WAII_DATABASE_KEY or replace them.
waii_tool = WaiiToolSpec(
    url="https://tweakit.waii.ai/api/",
    # API Key of Waii (not OpenAI API key)
    api_key=os.environ.get("WAII_API_KEY", "3........"),
    # Which database you want to use; the db connection must be added to Waii first
    database_key=os.environ.get("WAII_DATABASE_KEY", "snowflake://...."),
    verbose=True,
)

In [None]:
# VectorStoreIndex is imported from the `llama_index.core` namespace, matching
# the package layout used by the other imports in this notebook
# (`llama_index.core.agent.workflow`, `llama_index.tools.waii`, ...).
from llama_index.core import VectorStoreIndex

# Use as Data Loader, load data to index and query it
documents = waii_tool.load_data("Get all tables with their number of columns")
# NOTE: despite its name, `index` holds the query engine built on top of the index.
index = VectorStoreIndex.from_documents(documents).as_query_engine()

# The trailing expression is left bare so the notebook displays the answer text.
index.query(
    "Which table contains most columns, tell me top 5 tables with number of columns?"
).response

.

'SELECT\n    table_schema,\n    table_name,\n    COUNT(column_name) AS number_of_columns\nFROM waii.information_schema.columns\nGROUP BY\n    table_schema,\n    table_name\nORDER BY\n    table_schema,\n    table_name\n'

..

Unnamed: 0,TABLE_SCHEMA,TABLE_NAME,NUMBER_OF_COLUMNS
0,BATTLE_DEATH,BATTLE,6
1,BATTLE_DEATH,DEATH,5
2,BATTLE_DEATH,SHIP,7
3,CAR,CARS_DATA,8
4,CAR,CAR_MAKERS,4
...,...,...,...
107,VOTER,VOTES,5
108,WORLD,CITY,5
109,WORLD,COUNTRY,15
110,WORLD,COUNTRYLANGUAGE,4


"The table 'COLUMNS' contains the most columns. The top 5 tables with the number of columns are 'COLUMNS' with 43 columns, 'TABLES' with 25 columns, and the remaining tables have fewer than 25 columns."

In [None]:
# Use as tool, initialize it
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.openai import OpenAI

agent = FunctionAgent(
    waii_tool.to_tool_list(), llm=OpenAI(model="gpt-4.1"),
)

from llama_index.core.workflow import Context

ctx = Context(agent)

print(await agent.run("Give me top 3 countries with the most number of car factory", ctx=ctx))
print(await agent.run("What are the car factories of these countries", ctx=ctx))

In [None]:
# Do performance analysis
print(
    await agent.run(
        "Give me top 3 longest running queries, include the complete query_id and their duration. And analyze performance of the first query",
        ctx=ctx,
    )
)

In [None]:
# Diff two queries
previous_query = """
SELECT
    employee_id,
    department,
    salary,
    AVG(salary) OVER (PARTITION BY department) AS department_avg_salary,
    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg
FROM
    employees;
"""
current_query = """
SELECT
    employee_id,
    department,
    salary,
    MAX(salary) OVER (PARTITION BY department) AS department_max_salary,
    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg
FROM
    employees;
LIMIT 100;
"""
print(await agent.run(f"tell me difference between {previous_query} and {current_query}", ctx=ctx))

In [None]:
# Describe dataset
print(await agent.run("Summarize the dataset", ctx=ctx))

In [None]:
q = """
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, lag, lead, round
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("yearly_car_analysis").getOrCreate()

yearly_avg_hp = cars_data.groupBy("year").agg(avg("horsepower").alias("avg_horsepower"))

windowSpec = Window.orderBy("year")

yearly_comparisons = yearly_avg_hp.select(
    "year",
    "avg_horsepower",
    lag("avg_horsepower").over(windowSpec).alias("prev_year_hp"),
    lead("avg_horsepower").over(windowSpec).alias("next_year_hp")
)

final_result = yearly_comparisons.select(
    "year",
    "avg_horsepower",
    round(
        (yearly_comparisons.avg_horsepower - yearly_comparisons.prev_year_hp) / 
        yearly_comparisons.prev_year_hp * 100, 2
    ).alias("percentage_diff_prev_year"),
    round(
        (yearly_comparisons.next_year_hp - yearly_comparisons.avg_horsepower) / 
        yearly_comparisons.avg_horsepower * 100, 2
    ).alias("percentage_diff_next_year")
).orderBy("year")

final_result.show()
"""
print(await agent.run(f"translate this pyspark query {q}, to Snowflake", ctx=ctx))