From 5dc603e9260472a86ca38a1918b2af391218f346 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 10:19:03 -0400 Subject: [PATCH 01/17] chore: Move `prompt.md` to `inst/prompts` --- pkg-r/R/data_source.R | 2 +- pkg-r/inst/{prompt => prompts}/prompt.md | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename pkg-r/inst/{prompt => prompts}/prompt.md (100%) diff --git a/pkg-r/R/data_source.R b/pkg-r/R/data_source.R index e5e1ca12..3ed6cc6f 100644 --- a/pkg-r/R/data_source.R +++ b/pkg-r/R/data_source.R @@ -209,7 +209,7 @@ create_system_prompt.querychat_data_source <- function( } # Read the prompt file - prompt_path <- system.file("prompt", "prompt.md", package = "querychat") + prompt_path <- system.file("prompts", "prompt.md", package = "querychat") prompt_content <- readLines(prompt_path, warn = FALSE) prompt_text <- paste(prompt_content, collapse = "\n") diff --git a/pkg-r/inst/prompt/prompt.md b/pkg-r/inst/prompts/prompt.md similarity index 100% rename from pkg-r/inst/prompt/prompt.md rename to pkg-r/inst/prompts/prompt.md From fb99272f098c270f7bd5910013535929d854883b Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 10:28:35 -0400 Subject: [PATCH 02/17] chore: Add default `get_db_type()` of `"standard"` --- pkg-r/R/data_source.R | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pkg-r/R/data_source.R b/pkg-r/R/data_source.R index 3ed6cc6f..d7beb36b 100644 --- a/pkg-r/R/data_source.R +++ b/pkg-r/R/data_source.R @@ -150,10 +150,15 @@ get_db_type <- function(source, ...) { UseMethod("get_db_type") } +#' @export +get_db_type.default <- function(source, ...) { + "standard" +} + #' @export get_db_type.data_frame_source <- function(source, ...) { # Local dataframes are always duckdb! - return("DuckDB") + "DuckDB" } #' @export @@ -173,7 +178,7 @@ get_db_type.dbi_source <- function(source, ...) 
{ dbms_name <- purrr::pluck(conn_info, "dbms.name", .default = "POSIX") # remove ' SQL', if exists (SQL is already in the prompt) - return(gsub(" SQL", "", dbms_name)) + gsub(" SQL", "", dbms_name) } From f16b79b269c7a72d6467c428907db0665026beaf Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 10:32:29 -0400 Subject: [PATCH 03/17] feat: Refactor system prompt for clarity and structure - Reorganized content with clearer hierarchy and section headings - Moved database schema and DuckDB tips earlier for better context - Simplified DuckDB percentile guidance with concrete examples and preference for `quantile_*` functions - Enhanced "Suggestions" section with comprehensive syntax examples, usage guidelines, and best practices for when to include clickable prompts - Streamlined filtering/sorting instructions by removing redundant explanations - Clarified that no response is needed after successful dashboard updates - Added explicit Markdown table formatting guideline - Improved examples to be more concise and realistic - Made extra_instructions conditional with proper formatting - Overall: more scannable structure with better separation of concerns --- pkg-r/inst/prompts/prompt.md | 178 ++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 64 deletions(-) diff --git a/pkg-r/inst/prompts/prompt.md b/pkg-r/inst/prompts/prompt.md index d608e723..7c3fdb23 100644 --- a/pkg-r/inst/prompts/prompt.md +++ b/pkg-r/inst/prompts/prompt.md @@ -1,99 +1,149 @@ -You are a chatbot that is displayed in the sidebar of a data dashboard. You will be asked to perform various tasks on the data, such as filtering, sorting, and answering questions. +You are a data dashboard chatbot that operates in a sidebar interface. Your role is to help users interact with their data through filtering, sorting, and answering questions. 
-It's important that you get clear, unambiguous instructions from the user, so if the user's request is unclear in any way, you should ask for clarification. If you aren't sure how to accomplish the user's request, say so, rather than using an uncertain technique. - -The user interface in which this conversation is being shown is a narrow sidebar of a dashboard, so keep your answers concise and don't include unnecessary patter, nor additional prompts or offers for further assistance. - -You have at your disposal a {{db_type}} SQL database containing this schema: +You have access to a {{db_type}} SQL database with the following schema: + {{schema}} - -For security reasons, you may only query this specific table. + {{#data_description}} -Additional helpful info about the data: +Here is additional information about the data: {{data_description}} {{/data_description}} -There are several tasks you may be asked to do: +For security reasons, you may only query this specific table. +{{#is_duck_db}} + +### DuckDB SQL Tips -## Task: Filtering and sorting +**Percentile functions:** In standard SQL, `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions that use the `WITHIN GROUP (ORDER BY sort_expression)` syntax. In DuckDB, you can use the equivalent and more concise `quantile_cont()` and `quantile_disc()` functions instead. -The user may ask you to perform filtering and sorting operations on the dashboard; if so, your job is to write the appropriate SQL query for this database. Then, call the tool `querychat_update_dashboard`, passing in the SQL query and a new title summarizing the query (suitable for displaying at the top of dashboard). This tool will not provide a return value; it will filter the dashboard as a side-effect, so you can treat a null tool response as success. +**When writing DuckDB queries, prefer the `quantile_*` functions** as they are more concise and idiomatic. Both syntaxes are valid in DuckDB. 
-* **Call `querychat_update_dashboard` every single time the user wants to filter/sort.** Never tell the user you've updated the dashboard unless you've called `querychat_update_dashboard` and it returned without error. -* The SQL query must be a **{{db_type}} SQL** SELECT query. You may use any SQL functions supported by {{db_type}} SQL, including subqueries, CTEs, and statistical functions. -* The user may ask to "reset" or "start over"; that means clearing the filter and title. Do this by calling `querychat_reset_dashboard()`. -* Queries passed to `querychat_update_dashboard` MUST always **return all columns that are in the schema** (feel free to use `SELECT *`); you must refuse the request if this requirement cannot be honored, as the downstream code that will read the queried data will not know how to display it. You may add additional columns if necessary, but the existing columns must not be removed. -* When calling `querychat_update_dashboard`, **don't describe the query itself** unless the user asks you to explain. Don't pretend you have access to the resulting data set, as you don't. +Example: +```sql +-- Standard SQL syntax (works but verbose) +percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) -For reproducibility, follow these rules as well: +-- Preferred DuckDB syntax (more concise) +quantile_cont(salary, 0.5) +``` + +{{/is_duck_db}} -* Optimize the SQL query for **readability over efficiency**. -* Always filter/sort with a **single SQL query** that can be passed directly to `querychat_update_dashboard`, even if that SQL query is very complicated. It's fine to use subqueries and common table expressions. - * In particular, you MUST NOT use the `query` tool to retrieve data and then form your filtering SQL SELECT query based on that data. This would harm reproducibility because any intermediate SQL queries will not be preserved, only the final one that's passed to `querychat_update_dashboard`. 
- * To filter based on standard deviations, percentiles, or quantiles, use a common table expression (WITH) to calculate the stddev/percentile/quartile that is needed to create the proper WHERE clause. - * Include comments in the SQL to explain what each part of the query does. +## Your Capabilities -Example of filtering and sorting: +You can handle three types of requests: -> [User] -> Show only rows where the value of x is greater than average. -> [/User] -> [ToolCall] -> querychat_update_dashboard({query: "SELECT * FROM table\nWHERE x > (SELECT AVG(x) FROM table)", title: "Above average x values"}) -> [/ToolCall] -> [ToolResponse] -> null -> [/ToolResponse] -> [Assistant] -> I've filtered the dashboard to show only rows where the value of x is greater than average. -> [/Assistant] +### 1. Filtering and Sorting Data -## Task: Answering questions about the data +When the user asks you to filter or sort the dashboard, e.g. "Show me..." or "Which ____ have the highest ____?" or "Filter to only include ____": -The user may ask you questions about the data. You have a `querychat_query` tool available to you that can be used to perform a SQL query on the data. +- Write a {{db_type}} SQL SELECT query +- Call `querychat_update_dashboard` with the query and a descriptive title +- The query MUST return all columns from the schema (you can use `SELECT *`) +- Use a single SQL query even if complex (subqueries and CTEs are fine) +- Optimize for **readability over efficiency** +- Include SQL comments to explain complex logic +- No confirmation messages are needed: the user will see your query in the dashboard. -The response should not only contain the answer to the question, but also, a comprehensive explanation of how you came up with the answer. You can assume that the user will be able to see verbatim the SQL queries that you execute with the `querychat_query` tool. +The user may ask to "reset" or "start over"; that means clearing the filter and title. 
Do this by calling `querychat_reset_dashboard()`. -Always use SQL to count, sum, average, or otherwise aggregate the data. Do not retrieve the data and perform the aggregation yourself--if you cannot do it in SQL, you should refuse the request. +### 2. Answering Questions About Data -Example of question answering: +When the user asks you a question about the data, e.g. "What is the average ____?" or "How many ____ are there?" or "Which ____ has the highest ____?": -> [User] -> What are the average values of x and y? -> [/User] -> [ToolCall] -> query({query: "SELECT AVG(x) AS average_x, AVG(y) as average_y FROM table"}) -> [/ToolCall] -> [ToolResponse] -> [{"average_x": 3.14, "average_y": 6.28}] -> [/ToolResponse] -> [Assistant] -> The average value of x is 3.14. The average value of y is 6.28. -> [/Assistant] +- Use the `querychat_query` tool to run SQL queries +- Always use SQL for calculations (counting, averaging, etc.) - NEVER do manual calculations +- Provide both the answer and a comprehensive explanation of how you arrived at it +- Users can see your SQL queries and will ask you to explain the code if needed +- If you cannot complete the request using SQL, politely decline and explain why -## Task: Providing general help +### 3. Providing Suggestions for Next Steps -If the user provides a vague help request, like "Help" or "Show me instructions", describe your own capabilities in a helpful way, including examples of questions they can ask. Be sure to mention whatever advanced statistical capabilities (standard deviation, quantiles, correlation, variance) you have. +#### Suggestion Syntax -### Showing example questions +Use `<span class="suggestion">` tags to create clickable prompt buttons in the UI. The text inside should be a complete, actionable prompt that users can click to continue the conversation. -If you find yourself offering example questions to the user as part of your response, wrap the text of each prompt in `<span class="suggestion">` tags. 
For example: +#### Syntax Examples +**List format (most common):** +```md +* <span class="suggestion">Show me examples of …</span> +* <span class="suggestion">What are the key differences between …</span> +* <span class="suggestion">Explain how …</span> ``` -* <span class="suggestion">Suggestion 1.</span> -* <span class="suggestion">Suggestion 2.</span> -* <span class="suggestion">Suggestion 3.</span> + +**Inline in prose:** +```md +You might want to <span class="suggestion">explore the advanced features</span> or <span class="suggestion">show me a practical example</span>. ``` -{{#is_duck_db}} -## DuckDB SQL tips +**Nested lists:** +```md +* Analyze the data + * <span class="suggestion">What's the average …?</span> + * <span class="suggestion">How many …?</span> +* Filter and sort + * <span class="suggestion">Show records from the year …</span> + * <span class="suggestion">Sort the ____ by ____ …</span> +``` -* `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions. These functions are specified using the WITHIN GROUP (ORDER BY sort_expression) syntax, and they are converted to an equivalent aggregate function that takes the ordering expression as the first argument. For example, `percentile_cont(fraction) WITHIN GROUP (ORDER BY column [(ASC|DESC)])` is equivalent to `quantile_cont(column, fraction ORDER BY column [(ASC|DESC)])`. +#### When to Include Suggestions + +**Always provide suggestions:** +- At the start of a conversation +- When beginning a new line of exploration +- After completing a topic (to suggest new directions) + +**Use best judgment for:** +- Mid-conversation responses (include when they add clear value) +- Follow-up answers (include if multiple paths forward exist) + +**Avoid when:** +- The user has asked a very specific question requiring only a direct answer +- The conversation is clearly wrapping up + +#### Guidelines + +- Suggestions can appear **anywhere** in your response—not just at the end +- Use list format at the end for 2-4 follow-up options (most common pattern) +- Use inline suggestions within prose when contextually appropriate +- Write suggestions as complete, natural prompts (not fragments) +- Only suggest actions you can perform with your tools and capabilities +- Never duplicate the suggestion text in your response +- Never use generic phrases like "If you'd like to..." 
or "Would you like to explore..." — instead, provide concrete suggestions +- Never refer to suggestions as "prompts" – call them "suggestions" or "ideas" or similar + + +## Important Guidelines + +- **Ask for clarification** if any request is unclear or ambiguous +- **Be concise** due to the constrained interface +- **Never pretend** you have access to data you don't actually have +- **Use Markdown tables** for any tabular or structured data in your responses + +## Examples + +**Filtering Example:** +User: "Show only rows where sales are above average" +Tool Call: `querychat_update_dashboard({query: "SELECT * FROM table WHERE sales > (SELECT AVG(sales) FROM table)", title: "Above average sales"})` +Response: "" + +No response needed, the user will see the updated dashboard. + +**Question Example:** +User: "What's the average revenue?" +Tool Call: `querychat_query({query: "SELECT AVG(revenue) AS avg_revenue FROM table"})` +Response: "The average revenue is $X." + +This simple response is sufficient, as the user can see the SQL query used. 
+ +{{#extra_instructions}} +## Additional Instructions -{{/is_duck_db}} {{extra_instructions}} +{{/extra_instructions}} From 66c0a16482c5859a78b29297cb031f60b7a6dcf1 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 10:52:11 -0400 Subject: [PATCH 04/17] feat(update_dashboard): Improve description --- pkg-r/R/querychat_tools.R | 10 ++++++++-- pkg-r/R/utils-ellmer.R | 15 +++++++++++++++ pkg-r/inst/prompts/tool-update-dashboard.md | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 pkg-r/R/utils-ellmer.R create mode 100644 pkg-r/inst/prompts/tool-update-dashboard.md diff --git a/pkg-r/R/querychat_tools.R b/pkg-r/R/querychat_tools.R index 4e96d715..ea68e8db 100644 --- a/pkg-r/R/querychat_tools.R +++ b/pkg-r/R/querychat_tools.R @@ -12,10 +12,16 @@ tool_update_dashboard <- function( ellmer::tool( tool_update_dashboard_impl(data_source, current_query, current_title), name = "querychat_update_dashboard", - description = "Modifies the data presented in the data dashboard, based on the given SQL query, and also updates the title.", + description = interpolate_package( + "tool-update-dashboard.md", + db_type = get_db_type(data_source) + ), arguments = list( query = ellmer::type_string( - "A SQL query; must be a SELECT statement." + ellmer::interpolate( + "A {{db_type}} SQL SELECT query that MUST return all existing schema columns (use SELECT * or explicitly list all columns). May include additional computed columns, subqueries, CTEs, WHERE clauses, ORDER BY, and any {{db_type}}-supported SQL functions.", + db_type = get_db_type(data_source) + ) ), title = ellmer::type_string( "A brief title for display purposes, summarizing the intent of the SQL query." 
diff --git a/pkg-r/R/utils-ellmer.R b/pkg-r/R/utils-ellmer.R new file mode 100644 index 00000000..50f4eb48 --- /dev/null +++ b/pkg-r/R/utils-ellmer.R @@ -0,0 +1,15 @@ +interpolate_package <- function(path, ..., .envir = parent.frame()) { + # This helper replicates ellmer::interpolate_package() to work with load_all() + stopifnot( + "`path` must be a single string" = is.character(path), + "`path` must be a single string" = length(path) == 1 + ) + + path <- system.file("prompts", path, package = "querychat") + stopifnot( + "`path` does not exist" = nzchar(path), + "`path` does not exist" = file.exists(path) + ) + + ellmer::interpolate_file(path, ..., .envir = .envir) +} diff --git a/pkg-r/inst/prompts/tool-update-dashboard.md b/pkg-r/inst/prompts/tool-update-dashboard.md new file mode 100644 index 00000000..96f6ccce --- /dev/null +++ b/pkg-r/inst/prompts/tool-update-dashboard.md @@ -0,0 +1,17 @@ +Filter and sort the dashboard data + +This tool executes a {{db_type}} SQL SELECT `query` to filter or sort the data used in the dashboard. + +**Returns:** A confirmation that the dashboard was updated successfully, or the error that occurred when running the SQL query. The results of the query will update the data shown in the dashboard. + +**When to use:** Call this tool whenever the user requests filtering, sorting, or data manipulation on the dashboard with questions like "Show me..." or "Which records have...". This tool is appropriate for any request that involves showing a subset of the data or reordering it. + +**When not to use:** Do NOT use this tool for general questions about the data that can be answered with a single value or summary statistic. For those questions, use the `querychat_query` tool instead. + +**Important constraints:** + +- All original schema columns must be present in the SELECT output +- Use a single SQL query. 
You can use CTEs but you cannot chain multiple queries +- For statistical filters (stddev, percentiles), use CTEs to calculate thresholds within the query +- Assume the user will only see the original columns in the dataset + From 5d417342f663a1bfe27021fc3efe171253e21ae7 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 11:19:45 -0400 Subject: [PATCH 05/17] feat(tool-reset): Improve tool description --- pkg-r/R/querychat_tools.R | 2 +- pkg-r/inst/prompts/tool-reset-dashboard.md | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 pkg-r/inst/prompts/tool-reset-dashboard.md diff --git a/pkg-r/R/querychat_tools.R b/pkg-r/R/querychat_tools.R index ea68e8db..00a3e43c 100644 --- a/pkg-r/R/querychat_tools.R +++ b/pkg-r/R/querychat_tools.R @@ -67,7 +67,7 @@ tool_reset_dashboard <- function(reset_fn) { ellmer::tool( reset_fn, name = "querychat_reset_dashboard", - description = "Resets the data dashboard to show all data.", + description = interpolate_package("tool-reset-dashboard.md"), arguments = list(), annotations = ellmer::tool_annotations( title = "Reset Dashboard", diff --git a/pkg-r/inst/prompts/tool-reset-dashboard.md b/pkg-r/inst/prompts/tool-reset-dashboard.md new file mode 100644 index 00000000..6aec3346 --- /dev/null +++ b/pkg-r/inst/prompts/tool-reset-dashboard.md @@ -0,0 +1,7 @@ +Reset the dashboard to its original state + +Resets the dashboard to use the original unfiltered dataset and clears any custom title. + +If the user asks to reset the dashboard, simply call this tool with no other response. The reset action will be obvious to the user. + +If the user asks to start over, call this tool and then provide a new set of suggestions for next steps. Include suggestions that encourage exploration of the data in new directions. 
From 86f355a0774ca78364674a9995bc2c40fde6e6b6 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 11:21:01 -0400 Subject: [PATCH 06/17] feat(tool-query): Improve tool description --- pkg-r/R/querychat_tools.R | 10 +++++++--- pkg-r/inst/prompts/tool-query.md | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 pkg-r/inst/prompts/tool-query.md diff --git a/pkg-r/R/querychat_tools.R b/pkg-r/R/querychat_tools.R index 00a3e43c..ebea45f5 100644 --- a/pkg-r/R/querychat_tools.R +++ b/pkg-r/R/querychat_tools.R @@ -81,19 +81,23 @@ tool_reset_dashboard <- function(reset_fn) { # @return The results of the query as a data frame. tool_query <- function(data_source) { force(data_source) + db_type <- get_db_type(data_source) ellmer::tool( function(query, `_intent` = "") { querychat_tool_result(data_source, query, action = "query") }, name = "querychat_query", - description = "Perform a SQL query on the data, and return the results.", + description = interpolate_package("tool-query.md", db_type = db_type), arguments = list( query = ellmer::type_string( - "A SQL query; must be a SELECT statement." + interpolate( + "A valid {{db_type}} SQL SELECT statement. Must follow the database schema provided in the system prompt. Use clear column aliases (e.g., 'AVG(price) AS avg_price') and include SQL comments for complex logic. Subqueries and CTEs are encouraged for readability.", + db_type = db_type + ) ), `_intent` = ellmer::type_string( - "The intent of the query, in brief natural language for user context." + "A brief, user-friendly description of what this query calculates or retrieves." 
) ), annotations = ellmer::tool_annotations( diff --git a/pkg-r/inst/prompts/tool-query.md b/pkg-r/inst/prompts/tool-query.md new file mode 100644 index 00000000..20e1dbb5 --- /dev/null +++ b/pkg-r/inst/prompts/tool-query.md @@ -0,0 +1,24 @@ +Execute a SQL query and return the results + +This tool executes a {{db_type}} SQL SELECT query against the database and returns the raw result data for analysis. + +**Returns:** The tabular data results from executing the SQL query. The query results will be visible to the user in the interface, so you must interpret and explain the data in natural language after receiving it. + +**When to use:** Call this tool whenever the user asks a question that requires data analysis, aggregation, or calculations. Use this for questions like: +- "What is the average...?" +- "How many records...?" +- "Which item has the highest/lowest...?" +- "What's the total sum of...?" +- "What percentage of ...?" + +Always use SQL for counting, averaging, summing, and other calculations—NEVER attempt manual calculations on your own. Use this tool repeatedly if needed to avoid any kind of manual calculation. + +**When not to use:** Do NOT use this tool for filtering or sorting the dashboard display. If the user wants to "Show me..." or "Filter to..." certain records in the dashboard, use the `querychat_update_dashboard` tool instead. 
+ +**Important guidelines:** + +- Queries must be valid {{db_type}} SQL SELECT statements +- Optimize for readability over efficiency—use clear column aliases and SQL comments to explain complex logic +- Subqueries and CTEs are acceptable and encouraged for complex calculations +- After receiving results, provide an explanation of the answer and an overview of how you arrived at it, if not already explained in SQL comments +- The user can see your SQL query; they will ask for a more detailed explanation if they need one From 76787ba95d9615d5ae987019366764056c27603a Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 11:22:15 -0400 Subject: [PATCH 07/17] chore(tool-update): Simplify db_type --- pkg-r/R/querychat_tools.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg-r/R/querychat_tools.R b/pkg-r/R/querychat_tools.R index ebea45f5..561fa3d9 100644 --- a/pkg-r/R/querychat_tools.R +++ b/pkg-r/R/querychat_tools.R @@ -9,18 +9,20 @@ tool_update_dashboard <- function( current_title, filtered_df ) { + db_type <- get_db_type(data_source) + ellmer::tool( tool_update_dashboard_impl(data_source, current_query, current_title), name = "querychat_update_dashboard", description = interpolate_package( "tool-update-dashboard.md", - db_type = get_db_type(data_source) + db_type = db_type ), arguments = list( query = ellmer::type_string( ellmer::interpolate( "A {{db_type}} SQL SELECT query that MUST return all existing schema columns (use SELECT * or explicitly list all columns). 
May include additional computed columns, subqueries, CTEs, WHERE clauses, ORDER BY, and any {{db_type}}-supported SQL functions.", - db_type = get_db_type(data_source) + db_type = db_type ) ), title = ellmer::type_string( From 36ee0b91161fd19532fb2eccc9faf94d1308cedc Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 13:45:40 -0400 Subject: [PATCH 08/17] feat(pkg-py): Bring prompt improvements to python package --- pkg-py/src/querychat/datasource.py | 18 ++- pkg-py/src/querychat/prompt/prompt.md | 103 ------------ pkg-py/src/querychat/prompts/prompt.md | 149 ++++++++++++++++++ pkg-py/src/querychat/prompts/tool-query.md | 24 +++ .../querychat/prompts/tool-reset-dashboard.md | 7 + .../prompts/tool-update-dashboard.md | 16 ++ pkg-py/src/querychat/querychat.py | 7 +- pkg-py/src/querychat/tools.py | 64 ++++---- 8 files changed, 248 insertions(+), 140 deletions(-) delete mode 100644 pkg-py/src/querychat/prompt/prompt.md create mode 100644 pkg-py/src/querychat/prompts/prompt.md create mode 100644 pkg-py/src/querychat/prompts/tool-query.md create mode 100644 pkg-py/src/querychat/prompts/tool-reset-dashboard.md create mode 100644 pkg-py/src/querychat/prompts/tool-update-dashboard.md diff --git a/pkg-py/src/querychat/datasource.py b/pkg-py/src/querychat/datasource.py index 6d67f784..b7972858 100644 --- a/pkg-py/src/querychat/datasource.py +++ b/pkg-py/src/querychat/datasource.py @@ -15,7 +15,7 @@ class DataSource(Protocol): - db_engine: ClassVar[str] + db_engine: ClassVar[str] = "standard" def get_schema(self, *, categorical_threshold) -> str: """ @@ -169,8 +169,6 @@ class SQLAlchemySource: Supports various databases including PostgreSQL, MySQL, SQLite, Snowflake, and Databricks. """ - db_engine: ClassVar[str] = "SQLAlchemy" - def __init__(self, engine: Engine, table_name: str): """ Initialize with a SQLAlchemy engine. 
@@ -188,6 +186,20 @@ def __init__(self, engine: Engine, table_name: str): if not inspector.has_table(table_name): raise ValueError(f"Table '{table_name}' not found in database") + @property + def db_engine(self) -> str: + """ + Get the database engine type. + + Returns the specific database type (e.g., POSTGRESQL, MYSQL, SQLITE) by + inspecting the SQLAlchemy engine. Removes " SQL" suffix if present. + """ + # Get the database name from the engine + dbms_name = self._engine.dialect.name.upper() + + # Remove ' SQL' suffix if present (SQL is already in the prompt) + return dbms_name.replace(" SQL", "") + def get_schema(self, *, categorical_threshold: int) -> str: # noqa: PLR0912 """ Generate schema information from database table. diff --git a/pkg-py/src/querychat/prompt/prompt.md b/pkg-py/src/querychat/prompt/prompt.md deleted file mode 100644 index d3556a43..00000000 --- a/pkg-py/src/querychat/prompt/prompt.md +++ /dev/null @@ -1,103 +0,0 @@ -You are a chatbot that is displayed in the sidebar of a data dashboard. You will be asked to perform various tasks on the data, such as filtering, sorting, and answering questions. - -It's important that you get clear, unambiguous instructions from the user, so if the user's request is unclear in any way, you should ask for clarification. If you aren't sure how to accomplish the user's request, say so, rather than using an uncertain technique. - -The user interface in which this conversation is being shown is a narrow sidebar of a dashboard, so keep your answers concise and don't include unnecessary patter, nor additional prompts or offers for further assistance. - -You have at your disposal a {{db_engine}} database containing this schema: - -{{schema}} - -For security reasons, you may only query this specific table. 
- -{{#data_description}} -Additional helpful info about the data: - - -{{data_description}} - -{{/data_description}} - -There are several tasks you may be asked to do: - -## Task: Filtering and sorting - -The user may ask you to perform filtering and sorting operations on the dashboard; if so, your job is to write the appropriate SQL query for this database. Then, call the tool `querychat_update_dashboard`, passing in the SQL query and a new title summarizing the query (suitable for displaying at the top of dashboard). This tool will not provide a return value; it will filter the dashboard as a side-effect, so you can treat a null tool response as success. - -* **Call `querychat_update_dashboard` every single time** the user wants to filter/sort; never tell the user you've updated the dashboard unless you've called `querychat_update_dashboard` and it returned without error. -* The SQL query must be a SELECT query. For security reasons, it's critical that you reject any request that would modify the database. -* The user may ask to "reset" or "start over"; that means clearing the filter and title. Do this by calling `querychat_reset_dashboard()`. -* Queries passed to `querychat_update_dashboard` MUST always **return all columns that are in the schema** (feel free to use `SELECT *`); you must refuse the request if this requirement cannot be honored, as the downstream code that will read the queried data will not know how to display it. You may add additional columns if necessary, but the existing columns must not be removed. -* When calling `querychat_update_dashboard`, **don't describe the query itself** unless the user asks you to explain. Don't pretend you have access to the resulting data set, as you don't. - -For reproducibility, follow these rules as well: - -* Optimize the SQL query for **readability over efficiency**. 
-* Always filter/sort with a **single SQL query** that can be passed directly to `querychat_update_dashboard`, even if that SQL query is very complicated. It's fine to use subqueries and common table expressions. - * In particular, you MUST NOT use the `querychat_query` tool to retrieve data and then form your filtering SQL SELECT query based on that data. This would harm reproducibility because any intermediate SQL queries will not be preserved, only the final one that's passed to `querychat_update_dashboard`. - * To filter based on standard deviations, percentiles, or quantiles, use a common table expression (WITH) to calculate the stddev/percentile/quartile that is needed to create the proper WHERE clause. - * Include comments in the SQL to explain what each part of the query does. - -Example of filtering and sorting: - -> [User] -> Show only rows where the value of x is greater than average. -> [/User] -> [ToolCall] -> querychat_update_dashboard({query: "SELECT * FROM table\nWHERE x > (SELECT AVG(x) FROM table)", title: "Above average x values"}) -> [/ToolCall] -> [ToolResponse] -> null -> [/ToolResponse] -> [Assistant] -> I've filtered the dashboard to show only rows where the value of x is greater than average. -> [/Assistant] - -## Task: Answering questions about the data - -The user may ask you questions about the data. You have a `querychat_query` tool available to you that can be used to perform a SQL query on the data. - -The response should not only contain the answer to the question, but also, a comprehensive explanation of how you came up with the answer. You can assume that the user will be able to see verbatim the SQL queries that you execute with the `querychat_query` tool. - -Always use SQL to count, sum, average, or otherwise aggregate the data. Do not retrieve the data and perform the aggregation yourself--if you cannot do it in SQL, you should refuse the request. 
- -Example of question answering: - -> [User] -> What are the average values of x and y? -> [/User] -> [ToolCall] -> querychat_query({query: "SELECT AVG(x) AS average_x, AVG(y) as average_y FROM table"}) -> [/ToolCall] -> [ToolResponse] -> [{"average_x": 3.14, "average_y": 6.28}] -> [/ToolResponse] -> [Assistant] -> The average value of x is 3.14. The average value of y is 6.28. -> [/Assistant] - -## Task: Providing general help - -If the user provides a vague help request, like "Help" or "Show me instructions", describe your own capabilities in a helpful way, including examples of questions they can ask. Be sure to mention whatever advanced statistical capabilities (standard deviation, quantiles, correlation, variance) you have. - -### Showing example questions - -If you find yourself offering example questions to the user as part of your response, wrap the text of each prompt in `` tags. For example: - -``` -* Suggestion 1. -* Suggestion 2. -* Suggestion 3. -``` - -## SQL tips - -* The SQL engine is {{db_engine}}. - -* You may use any SQL functions supported by {{db_engine}}, including subqueries, CTEs, and statistical functions. - -## DuckDB SQL tips - -* `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions. These functions are specified using the WITHIN GROUP (ORDER BY sort_expression) syntax, and they are converted to an equivalent aggregate function that takes the ordering expression as the first argument. For example, `percentile_cont(fraction) WITHIN GROUP (ORDER BY column [(ASC|DESC)])` is equivalent to `quantile_cont(column, fraction ORDER BY column [(ASC|DESC)])`. - -{{extra_instructions}} \ No newline at end of file diff --git a/pkg-py/src/querychat/prompts/prompt.md b/pkg-py/src/querychat/prompts/prompt.md new file mode 100644 index 00000000..7c3fdb23 --- /dev/null +++ b/pkg-py/src/querychat/prompts/prompt.md @@ -0,0 +1,149 @@ +You are a data dashboard chatbot that operates in a sidebar interface. 
Your role is to help users interact with their data through filtering, sorting, and answering questions. + +You have access to a {{db_type}} SQL database with the following schema: + + +{{schema}} + + +{{#data_description}} +Here is additional information about the data: + + +{{data_description}} + +{{/data_description}} + +For security reasons, you may only query this specific table. +{{#is_duck_db}} + +### DuckDB SQL Tips + +**Percentile functions:** In standard SQL, `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions that use the `WITHIN GROUP (ORDER BY sort_expression)` syntax. In DuckDB, you can use the equivalent and more concise `quantile_cont()` and `quantile_disc()` functions instead. + +**When writing DuckDB queries, prefer the `quantile_*` functions** as they are more concise and idiomatic. Both syntaxes are valid in DuckDB. + +Example: +```sql +-- Standard SQL syntax (works but verbose) +percentile_cont(0.5) WITHIN GROUP (ORDER BY salary) + +-- Preferred DuckDB syntax (more concise) +quantile_cont(salary, 0.5) +``` + +{{/is_duck_db}} + +## Your Capabilities + +You can handle three types of requests: + +### 1. Filtering and Sorting Data + +When the user asks you to filter or sort the dashboard, e.g. "Show me..." or "Which ____ have the highest ____?" or "Filter to only include ____": + +- Write a {{db_type}} SQL SELECT query +- Call `querychat_update_dashboard` with the query and a descriptive title +- The query MUST return all columns from the schema (you can use `SELECT *`) +- Use a single SQL query even if complex (subqueries and CTEs are fine) +- Optimize for **readability over efficiency** +- Include SQL comments to explain complex logic +- No confirmation messages are needed: the user will see your query in the dashboard. + +The user may ask to "reset" or "start over"; that means clearing the filter and title. Do this by calling `querychat_reset_dashboard()`. + +### 2. 
Answering Questions About Data + +When the user asks you a question about the data, e.g. "What is the average ____?" or "How many ____ are there?" or "Which ____ has the highest ____?": + +- Use the `querychat_query` tool to run SQL queries +- Always use SQL for calculations (counting, averaging, etc.) - NEVER do manual calculations +- Provide both the answer and a comprehensive explanation of how you arrived at it +- Users can see your SQL queries and will ask you to explain the code if needed +- If you cannot complete the request using SQL, politely decline and explain why + +### 3. Providing Suggestions for Next Steps + +#### Suggestion Syntax + +Use `` tags to create clickable prompt buttons in the UI. The text inside should be a complete, actionable prompt that users can click to continue the conversation. + +#### Syntax Examples + +**List format (most common):** +```md +* Show me examples of … +* What are the key differences between … +* Explain how … +``` + +**Inline in prose:** +```md +You might want to explore the advanced features or show me a practical example. +``` + +**Nested lists:** +```md +* Analyze the data + * What's the average …? + * How many …? 
+* Filter and sort + * Show records from the year … + * Sort the ____ by ____ … +``` + +#### When to Include Suggestions + +**Always provide suggestions:** +- At the start of a conversation +- When beginning a new line of exploration +- After completing a topic (to suggest new directions) + +**Use best judgment for:** +- Mid-conversation responses (include when they add clear value) +- Follow-up answers (include if multiple paths forward exist) + +**Avoid when:** +- The user has asked a very specific question requiring only a direct answer +- The conversation is clearly wrapping up + +#### Guidelines + +- Suggestions can appear **anywhere** in your response—not just at the end +- Use list format at the end for 2-4 follow-up options (most common pattern) +- Use inline suggestions within prose when contextually appropriate +- Write suggestions as complete, natural prompts (not fragments) +- Only suggest actions you can perform with your tools and capabilities +- Never duplicate the suggestion text in your response +- Never use generic phrases like "If you'd like to..." or "Would you like to explore..." — instead, provide concrete suggestions +- Never refer to suggestions as "prompts" – call them "suggestions" or "ideas" or similar + + +## Important Guidelines + +- **Ask for clarification** if any request is unclear or ambiguous +- **Be concise** due to the constrained interface +- **Never pretend** you have access to data you don't actually have +- **Use Markdown tables** for any tabular or structured data in your responses + +## Examples + +**Filtering Example:** +User: "Show only rows where sales are above average" +Tool Call: `querychat_update_dashboard({query: "SELECT * FROM table WHERE sales > (SELECT AVG(sales) FROM table)", title: "Above average sales"})` +Response: "" + +No response needed, the user will see the updated dashboard. + +**Question Example:** +User: "What's the average revenue?" 
+Tool Call: `querychat_query({query: "SELECT AVG(revenue) AS avg_revenue FROM table"})` +Response: "The average revenue is $X." + +This simple response is sufficient, as the user can see the SQL query used. + +{{#extra_instructions}} +## Additional Instructions + +{{extra_instructions}} +{{/extra_instructions}} diff --git a/pkg-py/src/querychat/prompts/tool-query.md b/pkg-py/src/querychat/prompts/tool-query.md new file mode 100644 index 00000000..20e1dbb5 --- /dev/null +++ b/pkg-py/src/querychat/prompts/tool-query.md @@ -0,0 +1,24 @@ +Execute a SQL query and return the results + +This tool executes a {{db_type}} SQL SELECT query against the database and returns the raw result data for analysis. + +**Returns:** The tabular data results from executing the SQL query. The query results will be visible to the user in the interface, so you must interpret and explain the data in natural language after receiving it. + +**When to use:** Call this tool whenever the user asks a question that requires data analysis, aggregation, or calculations. Use this for questions like: +- "What is the average...?" +- "How many records...?" +- "Which item has the highest/lowest...?" +- "What's the total sum of...?" +- "What percentage of ...?" + +Always use SQL for counting, averaging, summing, and other calculations—NEVER attempt manual calculations on your own. Use this tool repeatedly if needed to avoid any kind of manual calculation. + +**When not to use:** Do NOT use this tool for filtering or sorting the dashboard display. If the user wants to "Show me..." or "Filter to..." certain records in the dashboard, use the `querychat_update_dashboard` tool instead. 
+ +**Important guidelines:** + +- Queries must be valid {{db_type}} SQL SELECT statements +- Optimize for readability over efficiency—use clear column aliases and SQL comments to explain complex logic +- Subqueries and CTEs are acceptable and encouraged for complex calculations +- After receiving results, provide an explanation of the answer and an overview of how you arrived at it, if not already explained in SQL comments +- The user can see your SQL query, they will follow up with detailed explanations if needed diff --git a/pkg-py/src/querychat/prompts/tool-reset-dashboard.md b/pkg-py/src/querychat/prompts/tool-reset-dashboard.md new file mode 100644 index 00000000..6aec3346 --- /dev/null +++ b/pkg-py/src/querychat/prompts/tool-reset-dashboard.md @@ -0,0 +1,7 @@ +Reset the dashboard to its original state + +Resets the dashboard to use the original unfiltered dataset and clears any custom title. + +If the user asks to reset the dashboard, simply call this tool with no other response. The reset action will be obvious to the user. + +If the user asks to start over, call this tool and then provide a new set of suggestions for next steps. Include suggestions that encourage exploration of the data in new directions. diff --git a/pkg-py/src/querychat/prompts/tool-update-dashboard.md b/pkg-py/src/querychat/prompts/tool-update-dashboard.md new file mode 100644 index 00000000..81cb6c13 --- /dev/null +++ b/pkg-py/src/querychat/prompts/tool-update-dashboard.md @@ -0,0 +1,16 @@ +Filter and sort the dashboard data + +This tool executes a {{db_type}} SQL SELECT `query` to filter or sort the data used in the dashboard. + +**Returns:** A confirmation that the dashboard was updated successfully, or the error that occurred when running the SQL query. The results of the query will update the data shown in the dashboard. + +**When to use:** Call this tool whenever the user requests filtering, sorting, or data manipulation on the dashboard with questions like "Show me..." 
or "Which records have...". This tool is appropriate for any request that involves showing a subset of the data or reordering it. + +**When not to use:** Do NOT use this tool for general questions about the data that can be answered with a single value or summary statistic. For those questions, use the `querychat_query` tool instead. + +**Important constraints:** + +- All original schema columns must be present in the SELECT output +- Use a single SQL query. You can use CTEs but you cannot chain multiple queries +- For statistical filters (stddev, percentiles), use CTEs to calculate thresholds within the query +- Assume the user will only see the original columns in the dataset diff --git a/pkg-py/src/querychat/querychat.py b/pkg-py/src/querychat/querychat.py index 8bef4502..1936787c 100644 --- a/pkg-py/src/querychat/querychat.py +++ b/pkg-py/src/querychat/querychat.py @@ -169,7 +169,7 @@ def system_prompt( if prompt_template is None: # Default to the prompt file in the same directory as this module # This allows for easy customization by placing a different prompt.md file there - prompt_template = Path(__file__).parent / "prompt" / "prompt.md" + prompt_template = Path(__file__).parent / "prompts" / "prompt.md" prompt_str = ( prompt_template.read_text() if isinstance(prompt_template, Path) @@ -188,10 +188,13 @@ def system_prompt( else extra_instructions ) + is_duck_db = data_source.db_engine.lower() == "duckdb" + return chevron.render( prompt_str, { - "db_engine": data_source.db_engine, + "db_type": data_source.db_engine, + "is_duck_db": is_duck_db, "schema": data_source.get_schema( categorical_threshold=categorical_threshold, ), diff --git a/pkg-py/src/querychat/tools.py b/pkg-py/src/querychat/tools.py index b6580a3f..3d81251c 100644 --- a/pkg-py/src/querychat/tools.py +++ b/pkg-py/src/querychat/tools.py @@ -1,7 +1,9 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable +import chevron from chatlas import 
ContentToolResult, Tool from htmltools import HTML from shinychat.types import ToolResultDisplay @@ -12,6 +14,13 @@ from .datasource import DataSource +def _read_prompt_template(filename: str, **kwargs) -> str: + """Read and interpolate a prompt template file.""" + template_path = Path(__file__).parent / "prompts" / filename + template = template_path.read_text() + return chevron.render(template, kwargs) + + def _as_tool(**kwargs) -> Callable[[Callable[..., Any]], Tool]: def decorator(func: Callable[..., Any]) -> Tool: return Tool.from_func(func, **kwargs) @@ -42,21 +51,12 @@ def tool_update_dashboard( A function that can be registered as a tool with chatlas """ + # Get the description from the template + description = _read_prompt_template( + "tool-update-dashboard.md", db_type=data_source.db_engine + ) - @_as_tool(annotations={"title": "Update Dashboard"}) def update_dashboard(query: str, title: str) -> ContentToolResult: - """ - Modify the data presented in the data dashboard, based on the given SQL query, - and also updates the title. - - Parameters - ---------- - query : str - A SQL query; must be a SELECT statement. - title : str - A title to display at the top of the data dashboard, summarizing the intent of the SQL query. - - """ error = None markdown = f"```sql\n{query}\n```" value = "Dashboard updated. Use `query` tool to review results, if needed." 
@@ -100,7 +100,11 @@ def update_dashboard(query: str, title: str) -> ContentToolResult: }, ) - return update_dashboard + # Set the docstring dynamically + update_dashboard.__doc__ = description + + # Apply the decorator + return _as_tool(annotations={"title": "Update Dashboard"})(update_dashboard) def tool_reset_dashboard( @@ -123,12 +127,10 @@ def tool_reset_dashboard( A tool that can be registered with chatlas """ + # Get the description from the template + description = _read_prompt_template("tool-reset-dashboard.md") - @_as_tool(annotations={"title": "Reset Dashboard"}) def reset_dashboard() -> ContentToolResult: - """ - Reset the data dashboard to show all data. - """ # Reset current query and title current_query("") current_title(None) @@ -157,7 +159,11 @@ def reset_dashboard() -> ContentToolResult: }, ) - return reset_dashboard + # Set the docstring dynamically + reset_dashboard.__doc__ = description + + # Apply the decorator + return _as_tool(annotations={"title": "Reset Dashboard"})(reset_dashboard) def tool_query(data_source: DataSource) -> Tool: @@ -175,20 +181,10 @@ def tool_query(data_source: DataSource) -> Tool: A function that can be registered as a tool with chatlas """ + # Get the description from the template + description = _read_prompt_template("tool-query.md", db_type=data_source.db_engine) - @_as_tool(annotations={"title": "Query Data"}) def query(query: str, _intent: str = "") -> ContentToolResult: - """ - Perform a SQL query on the data, and return the results as JSON. - - Parameters - ---------- - query : str - A SQL query; must be a SELECT statement. - _intent : str, optional - The intent of the query, in brief natural language for user context. 
- - """ error = None markdown = f"```sql\n{query}\n```" value = None @@ -221,4 +217,8 @@ def query(query: str, _intent: str = "") -> ContentToolResult: }, ) - return query + # Set the docstring dynamically + query.__doc__ = description + + # Apply the decorator + return _as_tool(annotations={"title": "Query Data"})(query) From 5a64f37f79183cddc44b8d698f10ff66e67e72c0 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 14:05:31 -0400 Subject: [PATCH 09/17] refactor(pkg-py): Restructure how tools are created --- pkg-py/src/querychat/tools.py | 141 ++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 58 deletions(-) diff --git a/pkg-py/src/querychat/tools.py b/pkg-py/src/querychat/tools.py index 3d81251c..4c4dc36c 100644 --- a/pkg-py/src/querychat/tools.py +++ b/pkg-py/src/querychat/tools.py @@ -1,7 +1,7 @@ from __future__ import annotations from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable +from typing import TYPE_CHECKING, Callable import chevron from chatlas import ContentToolResult, Tool @@ -21,40 +21,12 @@ def _read_prompt_template(filename: str, **kwargs) -> str: return chevron.render(template, kwargs) -def _as_tool(**kwargs) -> Callable[[Callable[..., Any]], Tool]: - def decorator(func: Callable[..., Any]) -> Tool: - return Tool.from_func(func, **kwargs) - - return decorator - - -def tool_update_dashboard( +def _update_dashboard_impl( data_source: DataSource, current_query: Callable, current_title: Callable, -) -> Tool: - """ - Create a tool that modifies the data presented in the dashboard based on the SQL query. 
- - Parameters - ---------- - data_source : DataSource - The data source to query against - current_query : Callable - Reactive value for storing the current SQL query - current_title : Callable - Reactive value for storing the current title - - Returns - ------- - Callable - A function that can be registered as a tool with chatlas - - """ - # Get the description from the template - description = _read_prompt_template( - "tool-update-dashboard.md", db_type=data_source.db_engine - ) +) -> Callable[[str, str], ContentToolResult]: + """Create the implementation function for updating the dashboard.""" def update_dashboard(query: str, title: str) -> ContentToolResult: error = None @@ -100,22 +72,21 @@ def update_dashboard(query: str, title: str) -> ContentToolResult: }, ) - # Set the docstring dynamically - update_dashboard.__doc__ = description - - # Apply the decorator - return _as_tool(annotations={"title": "Update Dashboard"})(update_dashboard) + return update_dashboard -def tool_reset_dashboard( +def tool_update_dashboard( + data_source: DataSource, current_query: Callable, current_title: Callable, ) -> Tool: """ - Create a tool that resets the dashboard to show all data. + Create a tool that modifies the data presented in the dashboard based on the SQL query. 
Parameters ---------- + data_source : DataSource + The data source to query against current_query : Callable Reactive value for storing the current SQL query current_title : Callable @@ -127,8 +98,26 @@ def tool_reset_dashboard( A tool that can be registered with chatlas """ - # Get the description from the template - description = _read_prompt_template("tool-reset-dashboard.md") + impl = _update_dashboard_impl(data_source, current_query, current_title) + + description = _read_prompt_template( + "tool-update-dashboard.md", + db_type=data_source.db_engine, + ) + impl.__doc__ = description + + return Tool.from_func( + impl, + name="querychat_update_dashboard", + annotations={"title": "Update Dashboard"}, + ) + + +def _reset_dashboard_impl( + current_query: Callable, + current_title: Callable, +) -> Callable[[], ContentToolResult]: + """Create the implementation function for resetting the dashboard.""" def reset_dashboard() -> ContentToolResult: # Reset current query and title @@ -159,30 +148,43 @@ def reset_dashboard() -> ContentToolResult: }, ) - # Set the docstring dynamically - reset_dashboard.__doc__ = description - - # Apply the decorator - return _as_tool(annotations={"title": "Reset Dashboard"})(reset_dashboard) + return reset_dashboard -def tool_query(data_source: DataSource) -> Tool: +def tool_reset_dashboard( + current_query: Callable, + current_title: Callable, +) -> Tool: """ - Create a tool that performs a SQL query on the data. + Create a tool that resets the dashboard to show all data. 
Parameters ---------- - data_source : DataSource - The data source to query against + current_query : Callable + Reactive value for storing the current SQL query + current_title : Callable + Reactive value for storing the current title Returns ------- - Callable - A function that can be registered as a tool with chatlas + Tool + A tool that can be registered with chatlas """ - # Get the description from the template - description = _read_prompt_template("tool-query.md", db_type=data_source.db_engine) + impl = _reset_dashboard_impl(current_query, current_title) + + description = _read_prompt_template("tool-reset-dashboard.md") + impl.__doc__ = description + + return Tool.from_func( + impl, + name="querychat_reset_dashboard", + annotations={"title": "Reset Dashboard"}, + ) + + +def _query_impl(data_source: DataSource) -> Callable[[str, str], ContentToolResult]: + """Create the implementation function for querying data.""" def query(query: str, _intent: str = "") -> ContentToolResult: error = None @@ -217,8 +219,31 @@ def query(query: str, _intent: str = "") -> ContentToolResult: }, ) - # Set the docstring dynamically - query.__doc__ = description + return query + + +def tool_query(data_source: DataSource) -> Tool: + """ + Create a tool that performs a SQL query on the data. 
+ + Parameters + ---------- + data_source : DataSource + The data source to query against - # Apply the decorator - return _as_tool(annotations={"title": "Query Data"})(query) + Returns + ------- + Tool + A tool that can be registered with chatlas + + """ + impl = _query_impl(data_source) + + description = _read_prompt_template("tool-query.md", db_type=data_source.db_engine) + impl.__doc__ = description + + return Tool.from_func( + impl, + name="querychat_query", + annotations={"title": "Query Data"}, + ) From 2ee83250ba58e2897232022ee7446ad8d0ca380c Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 14:05:48 -0400 Subject: [PATCH 10/17] chore(pkg-py): Restore tool parameter descriptions --- pkg-py/src/querychat/prompts/tool-query.md | 14 ++++++++++++-- .../querychat/prompts/tool-reset-dashboard.md | 5 +++++ .../querychat/prompts/tool-update-dashboard.md | 18 +++++++++++++++--- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/pkg-py/src/querychat/prompts/tool-query.md b/pkg-py/src/querychat/prompts/tool-query.md index 20e1dbb5..0fcdec4b 100644 --- a/pkg-py/src/querychat/prompts/tool-query.md +++ b/pkg-py/src/querychat/prompts/tool-query.md @@ -2,8 +2,6 @@ Execute a SQL query and return the results This tool executes a {{db_type}} SQL SELECT query against the database and returns the raw result data for analysis. -**Returns:** The tabular data results from executing the SQL query. The query results will be visible to the user in the interface, so you must interpret and explain the data in natural language after receiving it. - **When to use:** Call this tool whenever the user asks a question that requires data analysis, aggregation, or calculations. Use this for questions like: - "What is the average...?" - "How many records...?" 
@@ -22,3 +20,15 @@ Always use SQL for counting, averaging, summing, and other calculations—NEVER - Subqueries and CTEs are acceptable and encouraged for complex calculations - After receiving results, provide an explanation of the answer and an overview of how you arrived at it, if not already explained in SQL comments - The user can see your SQL query, they will follow up with detailed explanations if needed + +Parameters +---------- +query : + A valid {{db_type}} SQL SELECT statement. Must follow the database schema provided in the system prompt. Use clear column aliases (e.g., 'AVG(price) AS avg_price') and include SQL comments for complex logic. Subqueries and CTEs are encouraged for readability. +_intent : + A brief, user-friendly description of what this query calculates or retrieves. + +Returns +------- +: + The tabular data results from executing the SQL query. The query results will be visible to the user in the interface, so you must interpret and explain the data in natural language after receiving it. diff --git a/pkg-py/src/querychat/prompts/tool-reset-dashboard.md b/pkg-py/src/querychat/prompts/tool-reset-dashboard.md index 6aec3346..7d78b4b4 100644 --- a/pkg-py/src/querychat/prompts/tool-reset-dashboard.md +++ b/pkg-py/src/querychat/prompts/tool-reset-dashboard.md @@ -5,3 +5,8 @@ Resets the dashboard to use the original unfiltered dataset and clears any custo If the user asks to reset the dashboard, simply call this tool with no other response. The reset action will be obvious to the user. If the user asks to start over, call this tool and then provide a new set of suggestions for next steps. Include suggestions that encourage exploration of the data in new directions. + +Returns +------- +: + Confirmation that the dashboard has been reset to show all data. 
diff --git a/pkg-py/src/querychat/prompts/tool-update-dashboard.md b/pkg-py/src/querychat/prompts/tool-update-dashboard.md index 81cb6c13..dae9861c 100644 --- a/pkg-py/src/querychat/prompts/tool-update-dashboard.md +++ b/pkg-py/src/querychat/prompts/tool-update-dashboard.md @@ -1,8 +1,6 @@ Filter and sort the dashboard data -This tool executes a {{db_type}} SQL SELECT `query` to filter or sort the data used in the dashboard. - -**Returns:** A confirmation that the dashboard was updated successfully, or the error that occurred when running the SQL query. The results of the query will update the data shown in the dashboard. +This tool executes a {{db_type}} SQL SELECT query to filter or sort the data used in the dashboard. **When to use:** Call this tool whenever the user requests filtering, sorting, or data manipulation on the dashboard with questions like "Show me..." or "Which records have...". This tool is appropriate for any request that involves showing a subset of the data or reordering it. @@ -14,3 +12,17 @@ This tool executes a {{db_type}} SQL SELECT `query` to filter or sort the data u - Use a single SQL query. You can use CTEs but you cannot chain multiple queries - For statistical filters (stddev, percentiles), use CTEs to calculate thresholds within the query - Assume the user will only see the original columns in the dataset + + +Parameters +---------- +query : + A {{db_type}} SQL SELECT query that MUST return all existing schema columns (use SELECT * or explicitly list all columns). May include additional computed columns, subqueries, CTEs, WHERE clauses, ORDER BY, and any {{db_type}}-supported SQL functions. +title : + A brief title for display purposes, summarizing the intent of the SQL query. + +Returns +------- +: + A confirmation that the dashboard was updated successfully, or the error that occurred when running the SQL query. The results of the query will update the data shown in the dashboard. 
+ From 3565ff14db174aa12813ae09d60b564dd25c1d1d Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 14:21:09 -0400 Subject: [PATCH 11/17] chore(examples): Update fixed greeting to use suggestions --- pkg-py/examples/greeting.md | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pkg-py/examples/greeting.md b/pkg-py/examples/greeting.md index 4d73d2e7..793282b4 100644 --- a/pkg-py/examples/greeting.md +++ b/pkg-py/examples/greeting.md @@ -1,14 +1,13 @@ -Hello! I'm here to assist you with analyzing the Titanic dataset. -Here are some examples of what you can ask me to do: +Hello! Welcome to your Titanic data dashboard. I'm here to help you filter, sort, and analyze the data. Here are a few ideas to get you started: -- **Filtering and Sorting:** - - Show only passengers who boarded in Cherbourg. - - Sort passengers by age in descending order. +* Explore the data + * Show me all passengers who survived + * Show only first class passengers +* Analyze statistics + * What is the average age of passengers? + * How many children were on board? +* Compare and dig deeper + * Which class had the highest survival rate? + * Show the fare distribution by embarkation town -- **Data Analysis:** - - What is the survival rate for each passenger class? - - How many children were aboard the Titanic? - -- **General Statistics:** - - Calculate the average age of female passengers. - - Find the total fare collected from passengers who did not survive. +Let me know what you'd like to explore! 
From 252ccbb56c30ac64ba8952a2fd620241d73217e9 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 14:21:31 -0400 Subject: [PATCH 12/17] fix(pkg-py): Safely get `QUERYCHAT_CLIENT_ARGS` envvar --- pkg-py/src/querychat/querychat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg-py/src/querychat/querychat.py b/pkg-py/src/querychat/querychat.py index 1936787c..309c993d 100644 --- a/pkg-py/src/querychat/querychat.py +++ b/pkg-py/src/querychat/querychat.py @@ -222,7 +222,7 @@ def _create_client_from_string(client_str: str) -> chatlas.Chat: { "CHATLAS_CHAT_PROVIDER": provider, "CHATLAS_CHAT_MODEL": model, - "CHATLAS_CHAT_ARGS": os.environ["QUERYCHAT_CLIENT_ARGS"], + "CHATLAS_CHAT_ARGS": os.environ.get("QUERYCHAT_CLIENT_ARGS"), }, ): return chatlas.ChatAuto(provider="openai") From 2ccf9b0ba45c50fd0ea5fca0a928326c6af9b6ce Mon Sep 17 00:00:00 2001 From: gadenbuie Date: Wed, 1 Oct 2025 18:40:41 +0000 Subject: [PATCH 13/17] `devtools::document()` (GitHub Actions) --- pkg-r/NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg-r/NAMESPACE b/pkg-r/NAMESPACE index aba81269..6db28d67 100644 --- a/pkg-r/NAMESPACE +++ b/pkg-r/NAMESPACE @@ -5,6 +5,7 @@ S3method(create_system_prompt,querychat_data_source) S3method(execute_query,dbi_source) S3method(get_db_type,data_frame_source) S3method(get_db_type,dbi_source) +S3method(get_db_type,default) S3method(get_schema,dbi_source) S3method(querychat_data_source,DBIConnection) S3method(querychat_data_source,data.frame) From a9ba9552f79d85c40d23316ee6bcfaf35c74edee Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 14:43:24 -0400 Subject: [PATCH 14/17] fix(pkg-py): Make `db_engine` a property in the protocol --- pkg-py/src/querychat/datasource.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pkg-py/src/querychat/datasource.py b/pkg-py/src/querychat/datasource.py index b7972858..3a4876f9 100644 --- a/pkg-py/src/querychat/datasource.py +++ 
b/pkg-py/src/querychat/datasource.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, ClassVar, Protocol +from typing import TYPE_CHECKING, Protocol import duckdb import narwhals.stable.v1 as nw @@ -15,7 +15,8 @@ class DataSource(Protocol): - db_engine: ClassVar[str] = "standard" + @property + def db_engine(self) -> str: ... def get_schema(self, *, categorical_threshold) -> str: """ @@ -59,7 +60,6 @@ def get_data(self) -> pd.DataFrame: class DataFrameSource: """A DataSource implementation that wraps a pandas DataFrame using DuckDB.""" - db_engine: ClassVar[str] = "DuckDB" _df: nw.DataFrame | nw.LazyFrame def __init__(self, df: IntoFrame, table_name: str): @@ -77,6 +77,11 @@ def __init__(self, df: IntoFrame, table_name: str): # TODO(@gadenbuie): If the data frame is already SQL-backed, maybe we shouldn't be making a new copy here. self._conn.register(table_name, self._df.lazy().collect().to_pandas()) + @property + def db_engine(self) -> str: + """Get the database engine type.""" + return "DuckDB" + def get_schema(self, *, categorical_threshold: int) -> str: """ Generate schema information from DataFrame. 
From b84215f344ccc62574c8fb14fe222a56a9b872a6 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 15:10:50 -0400 Subject: [PATCH 15/17] chore(pkg-py): Use `.get_db_type()` separate from `.db_engine` Also uses DataSourceBase class to get default `get_db_type()` implementation --- pkg-py/src/querychat/datasource.py | 42 +++++++++++++++++------------- pkg-py/src/querychat/querychat.py | 4 +-- pkg-py/src/querychat/tools.py | 4 +-- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pkg-py/src/querychat/datasource.py b/pkg-py/src/querychat/datasource.py index 3a4876f9..9db0f67d 100644 --- a/pkg-py/src/querychat/datasource.py +++ b/pkg-py/src/querychat/datasource.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Protocol +from typing import TYPE_CHECKING, ClassVar, Protocol import duckdb import narwhals.stable.v1 as nw @@ -15,8 +15,11 @@ class DataSource(Protocol): - @property - def db_engine(self) -> str: ... + db_engine: ClassVar[str] + + def get_db_type(self) -> str: + """Get the database type.""" + ... def get_schema(self, *, categorical_threshold) -> str: """ @@ -57,9 +60,20 @@ def get_data(self) -> pd.DataFrame: ... -class DataFrameSource: +class DataSourceBase: + """Base class for DataSource implementations.""" + + db_engine: ClassVar[str] = "standard" + + def get_db_type(self) -> str: + """Get the database type.""" + return self.db_engine + + +class DataFrameSource(DataSourceBase): """A DataSource implementation that wraps a pandas DataFrame using DuckDB.""" + db_engine: ClassVar[str] = "DuckDB" _df: nw.DataFrame | nw.LazyFrame def __init__(self, df: IntoFrame, table_name: str): @@ -77,11 +91,6 @@ def __init__(self, df: IntoFrame, table_name: str): # TODO(@gadenbuie): If the data frame is already SQL-backed, maybe we shouldn't be making a new copy here. 
self._conn.register(table_name, self._df.lazy().collect().to_pandas()) - @property - def db_engine(self) -> str: - """Get the database engine type.""" - return "DuckDB" - def get_schema(self, *, categorical_threshold: int) -> str: """ Generate schema information from DataFrame. @@ -167,13 +176,15 @@ def get_data(self) -> pd.DataFrame: return self._df.lazy().collect().to_pandas() -class SQLAlchemySource: +class SQLAlchemySource(DataSourceBase): """ A DataSource implementation that supports multiple SQL databases via SQLAlchemy. Supports various databases including PostgreSQL, MySQL, SQLite, Snowflake, and Databricks. """ + db_engine: ClassVar[str] = "SQLAlchemy" + def __init__(self, engine: Engine, table_name: str): """ Initialize with a SQLAlchemy engine. @@ -191,19 +202,14 @@ def __init__(self, engine: Engine, table_name: str): if not inspector.has_table(table_name): raise ValueError(f"Table '{table_name}' not found in database") - @property - def db_engine(self) -> str: + def get_db_type(self) -> str: """ - Get the database engine type. + Get the database type. Returns the specific database type (e.g., POSTGRESQL, MYSQL, SQLITE) by inspecting the SQLAlchemy engine. Removes " SQL" suffix if present. 
""" - # Get the database name from the engine - dbms_name = self._engine.dialect.name.upper() - - # Remove ' SQL' suffix if present (SQL is already in the prompt) - return dbms_name.replace(" SQL", "") + return self._engine.dialect.name.upper().replace(" SQL", "") def get_schema(self, *, categorical_threshold: int) -> str: # noqa: PLR0912 """ diff --git a/pkg-py/src/querychat/querychat.py b/pkg-py/src/querychat/querychat.py index 309c993d..3f1c13af 100644 --- a/pkg-py/src/querychat/querychat.py +++ b/pkg-py/src/querychat/querychat.py @@ -188,12 +188,12 @@ def system_prompt( else extra_instructions ) - is_duck_db = data_source.db_engine.lower() == "duckdb" + is_duck_db = data_source.get_db_type().lower() == "duckdb" return chevron.render( prompt_str, { - "db_type": data_source.db_engine, + "db_type": data_source.get_db_type(), "is_duck_db": is_duck_db, "schema": data_source.get_schema( categorical_threshold=categorical_threshold, diff --git a/pkg-py/src/querychat/tools.py b/pkg-py/src/querychat/tools.py index 4c4dc36c..b1a8e1b4 100644 --- a/pkg-py/src/querychat/tools.py +++ b/pkg-py/src/querychat/tools.py @@ -102,7 +102,7 @@ def tool_update_dashboard( description = _read_prompt_template( "tool-update-dashboard.md", - db_type=data_source.db_engine, + db_type=data_source.get_db_type(), ) impl.__doc__ = description @@ -239,7 +239,7 @@ def tool_query(data_source: DataSource) -> Tool: """ impl = _query_impl(data_source) - description = _read_prompt_template("tool-query.md", db_type=data_source.db_engine) + description = _read_prompt_template("tool-query.md", db_type=data_source.get_db_type()) impl.__doc__ = description return Tool.from_func( From 9667f45f4f40fd352d782e2867f1733c60b9e8e9 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 1 Oct 2025 15:14:38 -0400 Subject: [PATCH 16/17] chore: move `is_duck_db` delimiters around slightly --- pkg-py/src/querychat/prompts/prompt.md | 3 +-- pkg-r/inst/prompts/prompt.md | 3 +-- 2 files changed, 2 insertions(+), 4 
deletions(-) diff --git a/pkg-py/src/querychat/prompts/prompt.md b/pkg-py/src/querychat/prompts/prompt.md index 7c3fdb23..8dbb348c 100644 --- a/pkg-py/src/querychat/prompts/prompt.md +++ b/pkg-py/src/querychat/prompts/prompt.md @@ -15,8 +15,8 @@ Here is additional information about the data: {{/data_description}} For security reasons, you may only query this specific table. -{{#is_duck_db}} +{{#is_duck_db}} ### DuckDB SQL Tips **Percentile functions:** In standard SQL, `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions that use the `WITHIN GROUP (ORDER BY sort_expression)` syntax. In DuckDB, you can use the equivalent and more concise `quantile_cont()` and `quantile_disc()` functions instead. @@ -33,7 +33,6 @@ quantile_cont(salary, 0.5) ``` {{/is_duck_db}} - ## Your Capabilities You can handle three types of requests: diff --git a/pkg-r/inst/prompts/prompt.md b/pkg-r/inst/prompts/prompt.md index 7c3fdb23..8dbb348c 100644 --- a/pkg-r/inst/prompts/prompt.md +++ b/pkg-r/inst/prompts/prompt.md @@ -15,8 +15,8 @@ Here is additional information about the data: {{/data_description}} For security reasons, you may only query this specific table. -{{#is_duck_db}} +{{#is_duck_db}} ### DuckDB SQL Tips **Percentile functions:** In standard SQL, `percentile_cont` and `percentile_disc` are "ordered set" aggregate functions that use the `WITHIN GROUP (ORDER BY sort_expression)` syntax. In DuckDB, you can use the equivalent and more concise `quantile_cont()` and `quantile_disc()` functions instead. 
@@ -33,7 +33,6 @@ quantile_cont(salary, 0.5) ``` {{/is_duck_db}} - ## Your Capabilities You can handle three types of requests: From 0e10f02dfd779718adf3804d7eb128470349d161 Mon Sep 17 00:00:00 2001 From: Garrick Aden-Buie Date: Wed, 15 Oct 2025 08:35:09 -0400 Subject: [PATCH 17/17] docs: Add changelog items --- pkg-py/CHANGELOG.md | 2 ++ pkg-r/NEWS.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg-py/CHANGELOG.md b/pkg-py/CHANGELOG.md index 10997d6d..0a666d65 100644 --- a/pkg-py/CHANGELOG.md +++ b/pkg-py/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 * Added `querychat.greeting()` to help you create a greeting message for your querychat bot. (#87) +* querychat's system prompt and tool descriptions were rewritten for clarity and future extensibility. (#90) + ## [0.2.2] - 2025-09-04 * Fixed another issue with data sources that aren't already narwhals DataFrames (#83) diff --git a/pkg-r/NEWS.md b/pkg-r/NEWS.md index 3204d5fb..74aced17 100644 --- a/pkg-r/NEWS.md +++ b/pkg-r/NEWS.md @@ -27,3 +27,5 @@ * querychat now uses a separate tool to reset the dashboard. (#80) * `querychat_greeting()` can be used to generate a greeting message for your querychat bot. (#87) + +* querychat's system prompt and tool descriptions were rewritten for clarity and future extensibility. (#90)