From 956c687b9a662f335435eb9198e0bc14c56c0b06 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Fri, 10 May 2024 10:40:17 -0700 Subject: [PATCH 01/13] Commit at with first working version --- .../llama-index-tools-cassandra/.gitignore | 153 ++++ .../tools/llama-index-tools-cassandra/BUILD | 1 + .../llama-index-tools-cassandra/Makefile | 17 + .../llama-index-tools-cassandra/README.md | 1 + .../examples/casssandra.ipynb | 176 +++++ .../llama_index/tools/cassandra/__init__.py | 4 + .../llama_index/tools/cassandra/base.py | 183 +++++ .../cassandra/cassandra_database_wrapper.py | 675 ++++++++++++++++++ .../pyproject.toml | 49 ++ .../tests/__init__.py | 0 .../tests/test_tools_cassandra.py | 0 11 files changed, 1259 insertions(+) create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/.gitignore create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/Makefile create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/README.md create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/__init__.py create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/tests/__init__.py create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/.gitignore 
b/llama-index-integrations/tools/llama-index-tools-cassandra/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD b/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/Makefile b/llama-index-integrations/tools/llama-index-tools-cassandra/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/README.md b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md new file mode 100644 index 0000000000000..9496b5638247d --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md @@ -0,0 +1 @@ +# LlamaIndex Tools Integration: Cassandra diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb b/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb new file mode 100644 index 0000000000000..b0e36cf65af23 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import necessary libraries\n", + "import os\n", + "\n", + "import cassio\n", + "\n", + "# from llama_index.tools.tavily_research.base import TavilyToolSpec\n", + "from llama_index.tools.cassandra.base import CassandraDatabaseToolSpec\n", + "from llama_index.tools.cassandra.cassandra_database_wrapper import CassandraDatabase\n", + "import openai\n", + "\n", + "from llama_index.agent.openai import OpenAIAgent\n", + "from llama_index.llms.openai import OpenAI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cassandra_db_schema\n", + "cassandra_db_schema(keyspace: str) -> List[llama_index.core.schema.Document]\n", + 
"Input to this tool is a keyspace name, output is a table description\n", + " of Apache Cassandra tables.\n", + " If the query is not correct, an error message will be returned.\n", + " If an error is returned, report back to the user that the keyspace\n", + " doesn't exist and stop.\n", + "\n", + " Args:\n", + " keyspace (str): The name of the keyspace for which to return the schema.\n", + "\n", + " Returns:\n", + " List[Document]: A list of Document objects, each containing a table description.\n", + " \n", + "\n", + "cassandra_db_select_table_data\n", + "cassandra_db_select_table_data(keyspace: str, table: str, predicate: str, limit: int) -> List[llama_index.core.schema.Document]\n", + " Tool for getting data from a table in an Apache Cassandra database.\n", + " Use the WHERE clause to specify the predicate for the query that uses the\n", + " primary key. A blank predicate will return all rows. Avoid this if possible.\n", + " Use the limit to specify the number of rows to return. A blank limit will\n", + " return all rows.\n", + "\n", + " Args:\n", + " keyspace (str): The name of the keyspace containing the table.\n", + " table (str): The name of the table for which to return data.\n", + " predicate (str): The predicate for the query that uses the primary key.\n", + " limit (int): The maximum number of rows to return.\n", + "\n", + " Returns:\n", + " List[Document]: A list of Document objects, each containing a row of data.\n", + " \n", + "\n" + ] + } + ], + "source": [ + "cassio.init(auto=True)\n", + "\n", + "session = cassio.config.resolve_session()\n", + "if not session:\n", + " raise Exception(\n", + " \"Check environment configuration or manually configure cassio connection parameters\"\n", + " )\n", + "\n", + "# Create a CassandraDatabaseToolSpec object\n", + "db = CassandraDatabase()\n", + "\n", + "spec = CassandraDatabaseToolSpec(db=db)\n", + "\n", + "tools = spec.to_tool_list()\n", + "for tool in tools:\n", + " print(tool.metadata.name)\n", + " 
print(tool.metadata.description)\n", + " print(tool.metadata.fn_schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Added user message to memory: What tables are in the keyspace langchain_agent_test?\n", + "=== Calling Function ===\n", + "Calling function: cassandra_db_schema with args: {\"keyspace\":\"langchain_agent_test\"}\n", + "Got output: [Document(id_='ef42ca5a-be46-496b-b8f9-037106182c71', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Table Name: user_credentials\\n- Keyspace: langchain_agent_test\\n- Columns\\n - password (text)\\n - user_email (text)\\n - user_id (uuid)\\n- Partition Keys: (user_email)\\n- Clustering Keys: \\n\\nTable Name: user_videos\\n- Keyspace: langchain_agent_test\\n- Columns\\n - description (text)\\n - title (text)\\n - user_id (uuid)\\n - video_id (uuid)\\n- Partition Keys: (user_id)\\n- Clustering Keys: (video_id asc)\\n\\n\\nTable Name: users\\n- Keyspace: langchain_agent_test\\n- Columns\\n - email (text)\\n - id (uuid)\\n - name (text)\\n- Partition Keys: (id)\\n- Clustering Keys: \\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", + "========================\n", + "\n", + "Added user message to memory: What is the userid for patrick@datastax.com ?\n", + "=== Calling Function ===\n", + "Calling function: cassandra_db_select_table_data with args: {\"keyspace\":\"langchain_agent_test\",\"table\":\"user_credentials\",\"predicate\":\"user_email = 'patrick@datastax.com'\",\"limit\":1}\n", + "Got output: [Document(id_='2b477f24-5db4-454c-871c-17fb3c542719', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email='patrick@datastax.com', password=None, 
user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'))\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", + "========================\n", + "\n" + ] + }, + { + "data": { + "text/plain": [ + "AgentChatResponse(response='The user ID for `patrick@datastax.com` is `522b1fe2-2e36-4cef-a667-cd4237d08b89`.', sources=[ToolOutput(content='[Document(id_=\\'2b477f24-5db4-454c-871c-17fb3c542719\\', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email=\\'patrick@datastax.com\\', password=None, user_id=UUID(\\'522b1fe2-2e36-4cef-a667-cd4237d08b89\\'))\", start_char_idx=None, end_char_idx=None, text_template=\\'{metadata_str}\\\\n\\\\n{content}\\', metadata_template=\\'{key}: {value}\\', metadata_seperator=\\'\\\\n\\')]', tool_name='cassandra_db_select_table_data', raw_input={'args': (), 'kwargs': {'keyspace': 'langchain_agent_test', 'table': 'user_credentials', 'predicate': \"user_email = 'patrick@datastax.com'\", 'limit': 1}}, raw_output=[Document(id_='2b477f24-5db4-454c-871c-17fb3c542719', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email='patrick@datastax.com', password=None, user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'))\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')], is_error=False)], source_nodes=[], is_dummy_stream=False)" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create the Agent with our tools\n", + "llm = OpenAI(model=\"gpt-4-1106-preview\")\n", + "agent = OpenAIAgent.from_tools(tools, llm=llm, verbose=True)\n", + "\n", + "agent.chat(\"What tables are in the keyspace langchain_agent_test?\")\n", + "agent.chat(\"What is the 
userid for patrick@datastax.com ?\")\n", + "agent.chat(\"What videos did user patrick@datastax.com upload?\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llamaindex", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/__init__.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/__init__.py new file mode 100644 index 0000000000000..7811ec18bfc89 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/__init__.py @@ -0,0 +1,4 @@ +from llama_index.tools.cassandra.base import CassandraDatabaseToolSpec + + +__all__ = ["CassandraDatabaseToolSpec"] diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py new file mode 100644 index 0000000000000..6101596f8690e --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py @@ -0,0 +1,183 @@ +"""Tools for interacting with an Apache Cassandra database.""" +from typing import List + +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from llama_index.core.tools.tool_spec.base import BaseToolSpec + + +from pydantic import Field + +from llama_index.tools.cassandra.cassandra_database_wrapper import ( + CassandraDatabase, +) + + +class CassandraDatabaseToolSpec(BaseToolSpec, BaseReader): + """Base tool for interacting with an Apache Cassandra database.""" + + db: CassandraDatabase = Field(exclude=True) + 
+ spec_functions = [ + # "cassandra_db_query", + "cassandra_db_schema", + "cassandra_db_select_table_data", + ] + + def __init__(self, db: CassandraDatabase) -> None: + """DB session in context.""" + self.db = db + + def cassandra_db_query(self, query: str) -> List[Document]: + """Execute a CQL query and return the results as a list of Documents. + + Args: + query (str): A CQL query to execute. + + Returns: + List[Document]: A list of Document objects, each containing data from a row. + """ + documents = [] + result = self.db.run_no_throw(query, fetch="Cursor") + for row in result: + doc_str = ", ".join([str(value) for value in row]) + documents.append(Document(text=doc_str)) + return documents + + def cassandra_db_schema(self, keyspace: str) -> List[Document]: + """Input to this tool is a keyspace name, output is a table description + of Apache Cassandra tables. + If the query is not correct, an error message will be returned. + If an error is returned, report back to the user that the keyspace + doesn't exist and stop. + + Args: + keyspace (str): The name of the keyspace for which to return the schema. + + Returns: + List[Document]: A list of Document objects, each containing a table description. + """ + return [Document(text=self.db.get_keyspace_tables_str_no_throw(keyspace))] + + def cassandra_db_select_table_data( + self, keyspace: str, table: str, predicate: str, limit: int + ) -> List[Document]: + """Tool for getting data from a table in an Apache Cassandra database. + Use the WHERE clause to specify the predicate for the query that uses the + primary key. A blank predicate will return all rows. Avoid this if possible. + Use the limit to specify the number of rows to return. A blank limit will + return all rows. + + Args: + keyspace (str): The name of the keyspace containing the table. + table (str): The name of the table for which to return data. + predicate (str): The predicate for the query that uses the primary key. 
+ limit (int): The maximum number of rows to return. + + Returns: + List[Document]: A list of Document objects, each containing a row of data. + """ + return [ + Document( + text=self.db.get_table_data_no_throw(keyspace, table, predicate, limit) + ) + ] + + +# class QueryCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): +# """Tool for querying an Apache Cassandra database with provided CQL.""" + +# name: str = "cassandra_db_query" +# description: str = """ +# Execute a CQL query against the database and get back the result. +# If the query is not correct, an error message will be returned. +# If an error is returned, rewrite the query, check the query, and try again. +# """ +# args_schema: Type[BaseModel] = _QueryCassandraDatabaseToolInput + +# def _run( +# self, +# query: str, +# run_manager: Optional[CallbackManagerForToolRun] = None, +# ) -> Union[str, Sequence[Dict[str, Any]], ResultSet]: +# """Execute the query, return the results or an error message.""" + + +# class _GetSchemaCassandraDatabaseToolInput(BaseModel): +# keyspace: str = Field( +# ..., +# description=("The name of the keyspace for which to return the schema."), +# ) + + +# class GetSchemaCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): +# """Tool for getting the schema of a keyspace in an Apache Cassandra database.""" + +# name: str = "cassandra_db_schema" +# description: str = """ +# Input to this tool is a keyspace name, output is a table description +# of Apache Cassandra tables. +# If the query is not correct, an error message will be returned. +# If an error is returned, report back to the user that the keyspace +# doesn't exist and stop. 
+# """ + +# args_schema: Type[BaseModel] = _GetSchemaCassandraDatabaseToolInput + +# def _run( +# self, +# keyspace: str, +# run_manager: Optional[CallbackManagerForToolRun] = None, +# ) -> str: +# """Get the schema for a keyspace.""" +# return self.db.get_keyspace_tables_str_no_throw(keyspace) + + +# class _GetTableDataCassandraDatabaseToolInput(BaseModel): +# keyspace: str = Field( +# ..., +# description=("The name of the keyspace containing the table."), +# ) +# table: str = Field( +# ..., +# description=("The name of the table for which to return data."), +# ) +# predicate: str = Field( +# ..., +# description=("The predicate for the query that uses the primary key."), +# ) +# limit: int = Field( +# ..., +# description=("The maximum number of rows to return."), +# ) + + +# class GetTableDataCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): +# """ +# Tool for getting data from a table in an Apache Cassandra database. +# Use the WHERE clause to specify the predicate for the query that uses the +# primary key. A blank predicate will return all rows. Avoid this if possible. +# Use the limit to specify the number of rows to return. A blank limit will +# return all rows. +# """ + +# name: str = "cassandra_db_select_table_data" +# description: str = """ +# Tool for getting data from a table in an Apache Cassandra database. +# Use the WHERE clause to specify the predicate for the query that uses the +# primary key. A blank predicate will return all rows. Avoid this if possible. +# Use the limit to specify the number of rows to return. A blank limit will +# return all rows. 
+# """ +# args_schema: Type[BaseModel] = _GetTableDataCassandraDatabaseToolInput + +# def _run( +# self, +# keyspace: str, +# table: str, +# predicate: str, +# limit: int, +# run_manager: Optional[CallbackManagerForToolRun] = None, +# ) -> str: +# """Get data from a table in a keyspace.""" +# return self.db.get_table_data_no_throw(keyspace, table, predicate, limit) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py new file mode 100644 index 0000000000000..1d1ff146b261f --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py @@ -0,0 +1,675 @@ +"""Apache Cassandra database wrapper.""" +from __future__ import annotations + +import re +import traceback +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +from cassandra.cluster import ResultSet, Session +from pydantic import BaseModel, Field, root_validator + + +IGNORED_KEYSPACES = [ + "system", + "system_auth", + "system_distributed", + "system_schema", + "system_traces", + "system_views", + "datastax_sla", + "data_endpoint_auth", +] + + +class CassandraDatabase: + """Apache Cassandra® database wrapper.""" + + def __init__( + self, + session: Optional[Session] = None, + exclude_tables: Optional[List[str]] = None, + include_tables: Optional[List[str]] = None, + cassio_init_kwargs: Optional[Dict[str, Any]] = None, + ): + self._session = self._resolve_session(session, cassio_init_kwargs) + if not self._session: + raise ValueError("Session not provided and cannot be resolved") + + self._exclude_keyspaces = IGNORED_KEYSPACES + self._exclude_tables = exclude_tables or [] + self._include_tables = include_tables or [] + + def run( + self, + query: str, + fetch: str = "all", + include_columns: bool = False, + **kwargs: Any, + ) -> 
Union[str, Sequence[Dict[str, Any]], ResultSet]: + """Execute a CQL query and return the results.""" + clean_query = self._validate_cql(query, "SELECT") + result = self._session.execute(clean_query, **kwargs) + if fetch == "all": + return list(result) + elif fetch == "one": + return result.one()._asdict() if result else {} + elif fetch == "cursor": + return result + else: + raise ValueError("Fetch parameter must be either 'one', 'all', or 'cursor'") + + def run_no_throw( + self, + query: str, + fetch: str = "all", + include_columns: bool = False, + **kwargs: Any, + ) -> Union[str, Sequence[Dict[str, Any]], ResultSet]: + """Execute a CQL query and return the results or an error message.""" + try: + return self.run(query, fetch, include_columns, **kwargs) + except Exception as e: + """Format the error message""" + return f"Error: {e}\n{traceback.format_exc()}" + + def get_keyspace_tables_str_no_throw(self, keyspace: str) -> str: + """Get the tables for the specified keyspace.""" + try: + return self.get_keyspace_tables_str(keyspace) + except Exception as e: + """Format the error message""" + return f"Error: {e}\n{traceback.format_exc()}" + + def get_keyspace_tables_str(self, keyspace: str) -> str: + """Get the tables for the specified keyspace.""" + tables = self.get_keyspace_tables(keyspace) + schema_string = "" + for table in tables: + schema_string += table.as_markdown() + "\n\n" + + return schema_string + + def get_keyspace_tables(self, keyspace: str) -> List[Table]: + """Get the Table objects for the specified keyspace.""" + schema = self._resolve_schema([keyspace]) + if keyspace in schema: + return schema[keyspace] + else: + return [] + + def get_table_data_no_throw( + self, keyspace: str, table: str, predicate: str, limit: int + ) -> str: + """Get data from the specified table in the specified keyspace. Optionally can + take a predicate for the WHERE clause and a limit. 
+ """ + try: + return self.get_table_data(keyspace, table, predicate, limit) + except Exception as e: + """Format the error message""" + return f"Error: {e}\n{traceback.format_exc()}" + + # This is a more basic string building function that doesn't use a query builder + # or prepared statements + # TODO: Refactor to use prepared statements + def get_table_data( + self, keyspace: str, table: str, predicate: str, limit: int + ) -> str: + """Get data from the specified table in the specified keyspace.""" + query = f"SELECT * FROM {keyspace}.{table}" + + if predicate: + query += f" WHERE {predicate}" + if limit: + query += f" LIMIT {limit}" + + query += ";" + + result = self.run(query, fetch="all") + return "\n".join(str(row) for row in result) + + def get_context(self) -> Dict[str, Any]: + """Return db context that you may want in agent prompt.""" + keyspaces = self._fetch_keyspaces() + return {"keyspaces": ", ".join(keyspaces)} + + def format_keyspace_to_markdown( + self, keyspace: str, tables: Optional[List[Table]] = None + ) -> str: + """ + Generates a markdown representation of the schema for a specific keyspace + by iterating over all tables within that keyspace and calling their + as_markdown method. + + Parameters: + - keyspace (str): The name of the keyspace to generate markdown + documentation for. + - tables (list[Table]): list of tables in the keyspace; it will be resolved + if not provided. + + Returns: + A string containing the markdown representation of the specified + keyspace schema. 
+ """ + if not tables: + tables = self.get_keyspace_tables(keyspace) + + if tables: + output = f"## Keyspace: {keyspace}\n\n" + if tables: + for table in tables: + output += table.as_markdown(include_keyspace=False, header_level=3) + output += "\n\n" + else: + output += "No tables present in keyspace\n\n" + + return output + else: + return "" + + def format_schema_to_markdown(self) -> str: + """ + Generates a markdown representation of the schema for all keyspaces and tables + within the CassandraDatabase instance. This method utilizes the + format_keyspace_to_markdown method to create markdown sections for each + keyspace, assembling them into a comprehensive schema document. + + Iterates through each keyspace in the database, utilizing + format_keyspace_to_markdown to generate markdown for each keyspace's schema, + including details of its tables. These sections are concatenated to form a + single markdown document that represents the schema of the entire database or + the subset of keyspaces that have been resolved in this instance. + + Returns: + A markdown string that documents the schema of all resolved keyspaces and + their tables within this CassandraDatabase instance. This includes keyspace + names, table names, comments, columns, partition keys, clustering keys, + and indexes for each table. + """ + schema = self._resolve_schema() + output = "# Cassandra Database Schema\n\n" + for keyspace, tables in schema.items(): + output += f"{self.format_keyspace_to_markdown(keyspace, tables)}\n\n" + return output + + def _validate_cql(self, cql: str, type: str = "SELECT") -> str: + """ + Validates a CQL query string for basic formatting and safety checks. + Ensures that `cql` starts with the specified type (e.g., SELECT) and does + not contain content that could indicate CQL injection vulnerabilities. + + Parameters: + - cql (str): The CQL query string to be validated. 
+ - type (str): The expected starting keyword of the query, used to verify + that the query begins with the correct operation type + (e.g., "SELECT", "UPDATE"). Defaults to "SELECT". + + Returns: + - str: The trimmed and validated CQL query string without a trailing semicolon. + + Raises: + - ValueError: If the value of `type` is not supported + - DatabaseError: If `cql` is considered unsafe + """ + SUPPORTED_TYPES = ["SELECT"] + if type and type.upper() not in SUPPORTED_TYPES: + raise ValueError( + f"""Unsupported CQL type: {type}. Supported types: + {SUPPORTED_TYPES}""" + ) + + # Basic sanity checks + cql_trimmed = cql.strip() + if not cql_trimmed.upper().startswith(type.upper()): + raise DatabaseError(f"CQL must start with {type.upper()}.") + + # Allow a trailing semicolon, but remove (it is optional with the Python driver) + cql_trimmed = cql_trimmed.rstrip(";") + + # Consider content within matching quotes to be "safe" + # Remove single-quoted strings + cql_sanitized = re.sub(r"'.*?'", "", cql_trimmed) + + # Remove double-quoted strings + cql_sanitized = re.sub(r'".*?"', "", cql_sanitized) + + # Find unsafe content in the remaining CQL + if ";" in cql_sanitized: + raise DatabaseError( + """Potentially unsafe CQL, as it contains a ; at a + place other than the end or within quotation marks.""" + ) + + # The trimmed query, before modifications + return cql_trimmed + + def _fetch_keyspaces(self, keyspace_list: Optional[List[str]] = None) -> List[str]: + """ + Fetches a list of keyspace names from the Cassandra database. The list can be + filtered by a provided list of keyspace names or by excluding predefined + keyspaces. + + Parameters: + - keyspace_list (Optional[List[str]]): A list of keyspace names to specifically + include. If provided and not empty, the method returns only the keyspaces + present in this list. If not provided or empty, the method returns all + keyspaces except those specified in the _exclude_keyspaces attribute. 
+ + Returns: + - List[str]: A list of keyspace names according to the filtering criteria. + """ + all_keyspaces = self.run( + "SELECT keyspace_name FROM system_schema.keyspaces", fetch="all" + ) + + # Type check to ensure 'all_keyspaces' is a sequence of dictionaries + if not isinstance(all_keyspaces, Sequence): + raise TypeError("Expected a sequence of dictionaries from 'run' method.") + + # Filtering keyspaces based on 'keyspace_list' and '_exclude_keyspaces' + filtered_keyspaces = [] + for ks in all_keyspaces: + if not isinstance(ks, Dict): + continue # Skip if the row is not a dictionary. + + keyspace_name = ks["keyspace_name"] + if keyspace_list and keyspace_name in keyspace_list: + filtered_keyspaces.append(keyspace_name) + elif not keyspace_list and keyspace_name not in self._exclude_keyspaces: + filtered_keyspaces.append(keyspace_name) + + return filtered_keyspaces + + def _fetch_schema_data(self, keyspace_list: List[str]) -> Tuple: + """ + Fetches schema data, including tables, columns, and indexes, filtered by a + list of keyspaces. This method constructs CQL queries to retrieve detailed + schema information from the specified keyspaces and executes them to gather + data about tables, columns, and indexes within those keyspaces. + + Parameters: + - keyspace_list (List[str]): A list of keyspace names from which to fetch + schema data. + + Returns: + - Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: A + tuple containing three lists: + - The first list contains dictionaries of table details (keyspace name, + table name, and comment). + - The second list contains dictionaries of column details (keyspace name, + table name, column name, type, kind, and position). + - The third list contains dictionaries of index details (keyspace name, + table name, index name, kind, and options). 
+ + This method allows for efficiently fetching schema information for multiple + keyspaces in a single operation, + enabling applications to programmatically analyze or document the database + schema. + """ + # Construct IN clause for CQL query + keyspace_in_clause = ", ".join([f"'{ks}'" for ks in keyspace_list]) + + # Fetch filtered table details + tables_query = f"""SELECT keyspace_name, table_name, comment + FROM system_schema.tables + WHERE keyspace_name + IN ({keyspace_in_clause})""" + + tables_data = self.run(tables_query, fetch="all") + + # Fetch filtered column details + columns_query = f"""SELECT keyspace_name, table_name, column_name, type, + kind, clustering_order, position + FROM system_schema.columns + WHERE keyspace_name + IN ({keyspace_in_clause})""" + + columns_data = self.run(columns_query, fetch="all") + + # Fetch filtered index details + indexes_query = f"""SELECT keyspace_name, table_name, index_name, + kind, options + FROM system_schema.indexes + WHERE keyspace_name + IN ({keyspace_in_clause})""" + + indexes_data = self.run(indexes_query, fetch="all") + + return tables_data, columns_data, indexes_data + + def _resolve_schema( + self, keyspace_list: Optional[List[str]] = None + ) -> Dict[str, List[Table]]: + """ + Efficiently fetches and organizes Cassandra table schema information, + such as comments, columns, and indexes, into a dictionary mapping keyspace + names to lists of Table objects. + + Returns: + A dictionary with keyspace names as keys and lists of Table objects as values, + where each Table object is populated with schema details appropriate for its + keyspace and table name. 
+ """ + if not keyspace_list: + keyspace_list = self._fetch_keyspaces() + + tables_data, columns_data, indexes_data = self._fetch_schema_data(keyspace_list) + + keyspace_dict: dict = {} + for table_data in tables_data: + keyspace = table_data.keyspace_name + table_name = table_data.table_name + comment = table_data.comment + + if self._include_tables and table_name not in self._include_tables: + continue + + if self._exclude_tables and table_name in self._exclude_tables: + continue + + # Filter columns and indexes for this table + table_columns = [ + (c.column_name, c.type) + for c in columns_data + if c.keyspace_name == keyspace and c.table_name == table_name + ] + + partition_keys = [ + c.column_name + for c in columns_data + if c.kind == "partition_key" + and c.keyspace_name == keyspace + and c.table_name == table_name + ] + + clustering_keys = [ + (c.column_name, c.clustering_order) + for c in columns_data + if c.kind == "clustering" + and c.keyspace_name == keyspace + and c.table_name == table_name + ] + + table_indexes = [ + (c.index_name, c.kind, c.options) + for c in indexes_data + if c.keyspace_name == keyspace and c.table_name == table_name + ] + + table_obj = Table( + keyspace=keyspace, + table_name=table_name, + comment=comment, + columns=table_columns, + partition=partition_keys, + clustering=clustering_keys, + indexes=table_indexes, + ) + + if keyspace not in keyspace_dict: + keyspace_dict[keyspace] = [] + keyspace_dict[keyspace].append(table_obj) + + return keyspace_dict + + def _resolve_session( + self, + session: Optional[Session] = None, + cassio_init_kwargs: Optional[Dict[str, Any]] = None, + ) -> Session: + """ + Attempts to resolve and return a Session object for use in database operations. + + This function follows a specific order of precedence to determine the + appropriate session to use: + 1. `session` parameter if given, + 2. Existing `cassio` session, + 3. A new `cassio` session derived from `cassio_init_kwargs`, + 4. 
`None` + + Parameters: + - session (Optional[Session]): An optional session to use directly. + - cassio_init_kwargs (Optional[Dict[str, Any]]): An optional dictionary of + keyword arguments to `cassio`. + + Returns: + - Session: The resolved session object if successful, or `None` if the session + cannot be resolved. + + Raises: + - ValueError: If `cassio_init_kwargs` is provided but is not a dictionary of + keyword arguments. + """ + # Prefer given session + if session: + return session + + # If a session is not provided, create one using cassio if available + # dynamically import cassio to avoid circular imports + try: + import cassio.config + except ImportError: + raise ValueError( + "cassio package not found, please install with" " `pip install cassio`" + ) + + # Use pre-existing session on cassio + s = cassio.config.resolve_session() + if s: + return s + + # Try to init and return cassio session + if cassio_init_kwargs: + if isinstance(cassio_init_kwargs, dict): + cassio.init(**cassio_init_kwargs) + return cassio.config.check_resolve_session() + else: + raise ValueError("cassio_init_kwargs must be a keyword dictionary") + + # return None if we're not able to resolve + return None + + +class DatabaseError(Exception): + """Exception raised for errors in the database schema. 
+ + Attributes: + message -- explanation of the error + """ + + def __init__(self, message: str): + self.message = message + super().__init__(self.message) + + +class Table(BaseModel): + keyspace: str + """The keyspace in which the table exists.""" + + table_name: str + """The name of the table.""" + + comment: Optional[str] = None + """The comment associated with the table.""" + + columns: List[Tuple[str, str]] = Field(default_factory=list) + partition: List[str] = Field(default_factory=list) + clustering: List[Tuple[str, str]] = Field(default_factory=list) + indexes: List[Tuple[str, str, str]] = Field(default_factory=list) + + class Config: + frozen = True + + @root_validator() + def check_required_fields(cls, class_values: dict) -> dict: + if not class_values["columns"]: + raise ValueError("non-empty column list for must be provided") + if not class_values["partition"]: + raise ValueError("non-empty partition list must be provided") + return class_values + + @classmethod + def from_database( + cls, keyspace: str, table_name: str, db: CassandraDatabase + ) -> Table: + columns, partition, clustering = cls._resolve_columns(keyspace, table_name, db) + return cls( + keyspace=keyspace, + table_name=table_name, + comment=cls._resolve_comment(keyspace, table_name, db), + columns=columns, + partition=partition, + clustering=clustering, + indexes=cls._resolve_indexes(keyspace, table_name, db), + ) + + def as_markdown( + self, include_keyspace: bool = True, header_level: Optional[int] = None + ) -> str: + """ + Generates a Markdown representation of the Cassandra table schema, allowing for + customizable header levels for the table name section. + + Parameters: + - include_keyspace (bool): If True, includes the keyspace in the output. + Defaults to True. + - header_level (Optional[int]): Specifies the markdown header level for the + table name. + If None, the table name is included without a header. Defaults to None + (no header level). 
+ + Returns: + - str: A string in Markdown format detailing the table name + (with optional header level), + keyspace (optional), comment, columns, partition keys, clustering keys + (with optional clustering order), + and indexes. + """ + output = "" + if header_level is not None: + output += f"{'#' * header_level} " + output += f"Table Name: {self.table_name}\n" + + if include_keyspace: + output += f"- Keyspace: {self.keyspace}\n" + if self.comment: + output += f"- Comment: {self.comment}\n" + + output += "- Columns\n" + for column, type in self.columns: + output += f" - {column} ({type})\n" + + output += f"- Partition Keys: ({', '.join(self.partition)})\n" + output += "- Clustering Keys: " + if self.clustering: + cluster_list = [] + for column, clustering_order in self.clustering: + if clustering_order.lower() == "none": + cluster_list.append(column) + else: + cluster_list.append(f"{column} {clustering_order}") + output += f"({', '.join(cluster_list)})\n" + + if self.indexes: + output += "- Indexes\n" + for name, kind, options in self.indexes: + output += f" - {name} : kind={kind}, options={options}\n" + + return output + + @staticmethod + def _resolve_comment( + keyspace: str, table_name: str, db: CassandraDatabase + ) -> Optional[str]: + result = db.run( + f"""SELECT comment + FROM system_schema.tables + WHERE keyspace_name = '{keyspace}' + AND table_name = '{table_name}';""", + fetch="one", + ) + + if isinstance(result, dict): + comment = result.get("comment") + if comment: + return comment + else: + return None # Default comment if none is found + else: + raise ValueError( + f"""Unexpected result type from db.run: + {type(result).__name__}""" + ) + + @staticmethod + def _resolve_columns( + keyspace: str, table_name: str, db: CassandraDatabase + ) -> Tuple[List[Tuple[str, str]], List[str], List[Tuple[str, str]]]: + columns = [] + partition_info = [] + cluster_info = [] + results = db.run( + f"""SELECT column_name, type, kind, clustering_order, position + FROM 
system_schema.columns + WHERE keyspace_name = '{keyspace}' + AND table_name = '{table_name}';""" + ) + # Type check to ensure 'results' is a sequence of dictionaries. + if not isinstance(results, Sequence): + raise TypeError("Expected a sequence of dictionaries from 'run' method.") + + for row in results: + if not isinstance(row, Dict): + continue # Skip if the row is not a dictionary. + + columns.append((row["column_name"], row["type"])) + if row["kind"] == "partition_key": + partition_info.append((row["column_name"], row["position"])) + elif row["kind"] == "clustering": + cluster_info.append( + (row["column_name"], row["clustering_order"], row["position"]) + ) + + partition = [ + column_name for column_name, _ in sorted(partition_info, key=lambda x: x[1]) + ] + + cluster = [ + (column_name, clustering_order) + for column_name, clustering_order, _ in sorted( + cluster_info, key=lambda x: x[2] + ) + ] + + return columns, partition, cluster + + @staticmethod + def _resolve_indexes( + keyspace: str, table_name: str, db: CassandraDatabase + ) -> List[Tuple[str, str, str]]: + indexes = [] + results = db.run( + f"""SELECT index_name, kind, options + FROM system_schema.indexes + WHERE keyspace_name = '{keyspace}' + AND table_name = '{table_name}';""" + ) + + # Type check to ensure 'results' is a sequence of dictionaries + if not isinstance(results, Sequence): + raise TypeError("Expected a sequence of dictionaries from 'run' method.") + + for row in results: + if not isinstance(row, Dict): + continue # Skip if the row is not a dictionary. 
+ + # Convert 'options' to string if it's not already, + # assuming it's JSON-like and needs conversion + index_options = row["options"] + if not isinstance(index_options, str): + # Assuming index_options needs to be serialized or simply converted + index_options = str(index_options) + + indexes.append((row["index_name"], row["kind"], index_options)) + + return indexes diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml new file mode 100644 index 0000000000000..be7c8acc7b521 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml @@ -0,0 +1,49 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Patrick McFadin "] +description = "llama-index tools Apache Cassandra® integration" +license = "MIT" +name = "llama-index-tools-cassandra" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +llama-index-core = "^0.10.0" + +[tool.poetry.group.dev.dependencies] +black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} +codespell = {extras = ["toml"], version = ">=v2.2.6"} +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" 
+types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" # TODO: unpin when mypy>0.991 +types-setuptools = "67.1.0.0" diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/tests/__init__.py b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py new file mode 100644 index 0000000000000..e69de29bb2d1d From 37d8d99a8db624ea72cd988d2715966f43e0b0ee Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Fri, 10 May 2024 15:21:22 -0700 Subject: [PATCH 02/13] Updated notebook and add tests --- .../examples/casssandra.ipynb | 286 +++++++++++++++--- .../tests/test_tools_cassandra.py | 7 + 2 files changed, 259 insertions(+), 34 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb b/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb index b0e36cf65af23..53abde73ffabf 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb @@ -1,21 +1,125 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cassandra Database Tools\n", + "\n", + "Apache Cassandra® is a widely used database for storing transactional application data. The introduction of functions and tooling in Large Language Models has opened up some exciting use cases for existing data in Generative AI applications. The Cassandra Database toolkit enables AI engineers to efficiently integrate Agents with Cassandra data, offering the following features: \n", + " - Fast data access through optimized queries. 
Most queries should run in single-digit ms or less. \n", + " - Schema introspection to enhance LLM reasoning capabilities \n", + " - Compatibility with various Cassandra deployments, including Apache Cassandra®, DataStax Enterprise™, and DataStax Astra™ \n", + " - Currently, the toolkit is limited to SELECT queries and schema introspection operations. (Safety first)\n", + "\n", + "## Quick Start\n", + " - Install the cassio library\n", + " - Set environment variables for the Cassandra database you are connecting to\n", + " - Initialize CassandraDatabase\n", + " - Pass the tools to your agent with spec.to_tool_list()\n", + " - Sit back and watch it do all your work for you\n", + "\n", + "## Theory of Operation\n", + "Cassandra Query Language (CQL) is the primary *human-centric* way of interacting with a Cassandra database. While offering some flexibility when generating queries, it requires knowledge of Cassandra data modeling best practices. LLM function calling gives an agent the ability to reason and then choose a tool to satisfy the request. Agents using LLMs should reason using Cassandra-specific logic when choosing the appropriate tool or chain of tools. This reduces the randomness introduced when LLMs are forced to provide a top-down solution. Do you want an LLM to have complete unfettered access to your database? Yeah. Probably not. To accomplish this, we provide a prompt for use when constructing questions for the agent: \n", + "\n", + "```json\n", + "You are an Apache Cassandra expert query analysis bot with the following features \n", + "and rules:\n", + " - You will take a question from the end user about finding specific \n", + " data in the database.\n", + " - You will examine the schema of the database and create a query path. 
\n", + " - You will provide the user with the correct query to find the data they are looking \n", + " for, showing the steps provided by the query path.\n", + " - You will use best practices for querying Apache Cassandra using partition keys \n", + " and clustering columns.\n", + " - Avoid using ALLOW FILTERING in the query.\n", + " - The goal is to find a query path, so it may take querying other tables to get \n", + " to the final answer. \n", + "\n", + "The following is an example of a query path in JSON format:\n", + "\n", + " {\n", + " \"query_paths\": [\n", + " {\n", + " \"description\": \"Direct query to users table using email\",\n", + " \"steps\": [\n", + " {\n", + " \"table\": \"user_credentials\",\n", + " \"query\": \n", + " \"SELECT userid FROM user_credentials WHERE email = 'example@example.com';\"\n", + " },\n", + " {\n", + " \"table\": \"users\",\n", + " \"query\": \"SELECT * FROM users WHERE userid = ?;\"\n", + " }\n", + " ]\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "## Tools Provided\n", + "\n", + "### `cassandra_db_schema`\n", + "Gathers all schema information for the connected database or a specific keyspace. Critical for the agent when determining actions. \n", + "\n", + "### `cassandra_db_select_table_data`\n", + "Selects data from a specific keyspace and table. The agent can pass parameters for a predicate and limits on the number of returned records. \n", + "\n", + "### `cassandra_db_query`\n", + "Experimental alternative to `cassandra_db_select_table_data` which takes a query string completely formed by the agent instead of parameters. *Warning*: This can lead to unusual queries that may not be as performant (or even work). This may be removed in future releases. If it does something cool, we want to know about that too. You never know!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Environment Setup\n", + "\n", + "Install the following Python modules:\n", + "\n", + "```bash\n", + "pip install ipykernel python-dotenv cassio llama-index llama-index-agent-openai llama-index-llms-openai llama-index-tools-cassandra\n", + "```\n", + "\n", + "### .env file\n", + "Connection is via `cassio` using the `auto=True` parameter, and the notebook uses OpenAI. You should create a `.env` file accordingly.\n", + "\n", + "For Cassandra, set:\n", + "```bash\n", + "CASSANDRA_CONTACT_POINTS\n", + "CASSANDRA_USERNAME\n", + "CASSANDRA_PASSWORD\n", + "CASSANDRA_KEYSPACE\n", + "```\n", + "\n", + "For Astra, set:\n", + "```bash\n", + "ASTRA_DB_APPLICATION_TOKEN\n", + "ASTRA_DB_DATABASE_ID\n", + "ASTRA_DB_KEYSPACE\n", + "```\n", + "\n", + "For example:\n", + "\n", + "```bash\n", + "# Connection to Astra:\n", + "ASTRA_DB_DATABASE_ID=a1b2c3d4-...\n", + "ASTRA_DB_APPLICATION_TOKEN=AstraCS:...\n", + "ASTRA_DB_KEYSPACE=notebooks\n", + "\n", + "# Also set \n", + "OPENAI_API_KEY=sk-....\n", + "```\n", + "\n", + "(You may also modify the below code to directly connect with `cassio`.)" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from dotenv import load_dotenv\n", "\n", @@ -33,15 +137,112 @@ "\n", "import cassio\n", "\n", - "# from llama_index.tools.tavily_research.base import TavilyToolSpec\n", "from llama_index.tools.cassandra.base import CassandraDatabaseToolSpec\n", "from llama_index.tools.cassandra.cassandra_database_wrapper import CassandraDatabase\n", - "import openai\n", "\n", "from llama_index.agent.openai import OpenAIAgent\n", "from llama_index.llms.openai import OpenAI" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Connect to a Cassandra Database" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cassio.init(auto=True)\n", + "\n", + "session = cassio.config.resolve_session()\n", + "if not session:\n", + " raise Exception(\n", + " \"Check environment configuration or manually configure cassio connection parameters\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test data prep\n", + "\n", + "session = cassio.config.resolve_session()\n", + "\n", + "session.execute(\"\"\"DROP KEYSPACE IF EXISTS llamaindex_agent_test; \"\"\")\n", + "\n", + "session.execute(\n", + " \"\"\"\n", + "CREATE KEYSPACE if not exists llamaindex_agent_test \n", + "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};\n", + "\"\"\"\n", + ")\n", + "\n", + "session.execute(\n", + " \"\"\"\n", + " CREATE TABLE IF NOT EXISTS llamaindex_agent_test.user_credentials (\n", + " user_email text PRIMARY KEY,\n", + " user_id UUID,\n", + " password TEXT\n", + ");\n", + "\"\"\"\n", + ")\n", + "\n", + "session.execute(\n", + " \"\"\"\n", + " CREATE TABLE IF NOT EXISTS llamaindex_agent_test.users (\n", + " id UUID PRIMARY KEY,\n", + " name TEXT,\n", + " email TEXT\n", + ");\"\"\"\n", + ")\n", + "\n", + "session.execute(\n", + " \"\"\"\n", + " CREATE TABLE IF NOT EXISTS llamaindex_agent_test.user_videos ( \n", + " user_id UUID,\n", + " video_id UUID,\n", + " title TEXT,\n", + " description TEXT,\n", + " PRIMARY KEY (user_id, video_id)\n", + ");\n", + "\"\"\"\n", + ")\n", + "\n", + "user_id = \"522b1fe2-2e36-4cef-a667-cd4237d08b89\"\n", + "video_id = \"27066014-bad7-9f58-5a30-f63fe03718f6\"\n", + "\n", + "session.execute(\n", + " f\"\"\"\n", + " INSERT INTO llamaindex_agent_test.user_credentials (user_id, user_email) \n", + " VALUES ({user_id}, 'patrick@datastax.com');\n", + "\"\"\"\n", + ")\n", + "\n", + "session.execute(\n", + " f\"\"\"\n", + " INSERT INTO llamaindex_agent_test.users (id, name, email) 
\n", + " VALUES ({user_id}, 'Patrick McFadin', 'patrick@datastax.com');\n", + "\"\"\"\n", + ")\n", + "\n", + "session.execute(\n", + " f\"\"\"\n", + " INSERT INTO llamaindex_agent_test.user_videos (user_id, video_id, title)\n", + " VALUES ({user_id}, {video_id}, 'Use Langflow to Build an LLM Application in 5 Minutes');\n", + "\"\"\"\n", + ")\n", + "\n", + "session.set_keyspace(\"llamaindex_agent_test\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -68,7 +269,7 @@ "\n", "cassandra_db_select_table_data\n", "cassandra_db_select_table_data(keyspace: str, table: str, predicate: str, limit: int) -> List[llama_index.core.schema.Document]\n", - " Tool for getting data from a table in an Apache Cassandra database.\n", + "Tool for getting data from a table in an Apache Cassandra database.\n", " Use the WHERE clause to specify the predicate for the query that uses the\n", " primary key. A blank predicate will return all rows. Avoid this if possible.\n", " Use the limit to specify the number of rows to return. A blank limit will\n", @@ -88,14 +289,6 @@ } ], "source": [ - "cassio.init(auto=True)\n", - "\n", - "session = cassio.config.resolve_session()\n", - "if not session:\n", - " raise Exception(\n", - " \"Check environment configuration or manually configure cassio connection parameters\"\n", - " )\n", - "\n", "# Create a CassandraDatabaseToolSpec object\n", "db = CassandraDatabase()\n", "\n", @@ -108,6 +301,28 @@ " print(tool.metadata.fn_schema)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose the LLM that will drive the agent\n", + "# Only certain models support this\n", + "llm = OpenAI(model=\"gpt-4-1106-preview\")\n", + "\n", + "# Create the Agent with our tools. 
Verbose will echo the agent's actions\n", + "agent = OpenAIAgent.from_tools(tools, llm=llm, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Invoking the agent with tools\n", + "We've created an agent that uses an LLM for reasoning and communication with a tool list for actions. Now we can simply ask questions of the agent and watch it utilize the tools we've given it. " + ] + }, { "cell_type": "code", "execution_count": null, @@ -117,16 +332,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Added user message to memory: What tables are in the keyspace langchain_agent_test?\n", + "Added user message to memory: What tables are in the keyspace llamaindex_agent_test?\n", "=== Calling Function ===\n", - "Calling function: cassandra_db_schema with args: {\"keyspace\":\"langchain_agent_test\"}\n", - "Got output: [Document(id_='ef42ca5a-be46-496b-b8f9-037106182c71', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Table Name: user_credentials\\n- Keyspace: langchain_agent_test\\n- Columns\\n - password (text)\\n - user_email (text)\\n - user_id (uuid)\\n- Partition Keys: (user_email)\\n- Clustering Keys: \\n\\nTable Name: user_videos\\n- Keyspace: langchain_agent_test\\n- Columns\\n - description (text)\\n - title (text)\\n - user_id (uuid)\\n - video_id (uuid)\\n- Partition Keys: (user_id)\\n- Clustering Keys: (video_id asc)\\n\\n\\nTable Name: users\\n- Keyspace: langchain_agent_test\\n- Columns\\n - email (text)\\n - id (uuid)\\n - name (text)\\n- Partition Keys: (id)\\n- Clustering Keys: \\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", + "Calling function: cassandra_db_schema with args: {\"keyspace\":\"llamaindex_agent_test\"}\n", + "Got output: [Document(id_='4b6011e6-62e6-4db2-9198-046534b7c8dd', embedding=None, metadata={}, 
excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Table Name: user_credentials\\n- Keyspace: llamaindex_agent_test\\n- Columns\\n - password (text)\\n - user_email (text)\\n - user_id (uuid)\\n- Partition Keys: (user_email)\\n- Clustering Keys: \\n\\nTable Name: user_videos\\n- Keyspace: llamaindex_agent_test\\n- Columns\\n - description (text)\\n - title (text)\\n - user_id (uuid)\\n - video_id (uuid)\\n- Partition Keys: (user_id)\\n- Clustering Keys: (video_id asc)\\n\\n\\nTable Name: users\\n- Keyspace: llamaindex_agent_test\\n- Columns\\n - email (text)\\n - id (uuid)\\n - name (text)\\n- Partition Keys: (id)\\n- Clustering Keys: \\n\\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", "========================\n", "\n", "Added user message to memory: What is the userid for patrick@datastax.com ?\n", "=== Calling Function ===\n", - "Calling function: cassandra_db_select_table_data with args: {\"keyspace\":\"langchain_agent_test\",\"table\":\"user_credentials\",\"predicate\":\"user_email = 'patrick@datastax.com'\",\"limit\":1}\n", - "Got output: [Document(id_='2b477f24-5db4-454c-871c-17fb3c542719', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email='patrick@datastax.com', password=None, user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'))\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", + "Calling function: cassandra_db_select_table_data with args: {\"keyspace\":\"llamaindex_agent_test\",\"table\":\"user_credentials\",\"predicate\":\"user_email = 'patrick@datastax.com'\",\"limit\":1}\n", + "Got output: [Document(id_='e5620177-c735-46f8-a09a-a0e062efcdec', embedding=None, metadata={}, excluded_embed_metadata_keys=[], 
excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email='patrick@datastax.com', password=None, user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'))\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", + "========================\n", + "\n", + "Added user message to memory: What videos did user patrick@datastax.com upload?\n", + "=== Calling Function ===\n", + "Calling function: cassandra_db_select_table_data with args: {\"keyspace\":\"llamaindex_agent_test\",\"table\":\"user_videos\",\"predicate\":\"user_id = 522b1fe2-2e36-4cef-a667-cd4237d08b89\",\"limit\":10}\n", + "Got output: [Document(id_='e3ecfba1-e8e1-4ce3-b321-3f51e12077a1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'), video_id=UUID('27066014-bad7-9f58-5a30-f63fe03718f6'), description=None, title='Use Langflow to Build an LLM Application in 5 Minutes')\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')]\n", "========================\n", "\n" ] @@ -134,7 +355,7 @@ { "data": { "text/plain": [ - "AgentChatResponse(response='The user ID for `patrick@datastax.com` is `522b1fe2-2e36-4cef-a667-cd4237d08b89`.', sources=[ToolOutput(content='[Document(id_=\\'2b477f24-5db4-454c-871c-17fb3c542719\\', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email=\\'patrick@datastax.com\\', password=None, user_id=UUID(\\'522b1fe2-2e36-4cef-a667-cd4237d08b89\\'))\", start_char_idx=None, end_char_idx=None, text_template=\\'{metadata_str}\\\\n\\\\n{content}\\', metadata_template=\\'{key}: {value}\\', metadata_seperator=\\'\\\\n\\')]', tool_name='cassandra_db_select_table_data', raw_input={'args': (), 
'kwargs': {'keyspace': 'langchain_agent_test', 'table': 'user_credentials', 'predicate': \"user_email = 'patrick@datastax.com'\", 'limit': 1}}, raw_output=[Document(id_='2b477f24-5db4-454c-871c-17fb3c542719', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_email='patrick@datastax.com', password=None, user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'))\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')], is_error=False)], source_nodes=[], is_dummy_stream=False)" + "AgentChatResponse(response='The user `patrick@datastax.com` uploaded the following video in the `llamaindex_agent_test` keyspace:\\n\\n- Title: \"Use Langflow to Build an LLM Application in 5 Minutes\"\\n- Video ID: `27066014-bad7-9f58-5a30-f63fe03718f6`\\n- Description: Not provided', sources=[ToolOutput(content='[Document(id_=\\'e3ecfba1-e8e1-4ce3-b321-3f51e12077a1\\', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_id=UUID(\\'522b1fe2-2e36-4cef-a667-cd4237d08b89\\'), video_id=UUID(\\'27066014-bad7-9f58-5a30-f63fe03718f6\\'), description=None, title=\\'Use Langflow to Build an LLM Application in 5 Minutes\\')\", start_char_idx=None, end_char_idx=None, text_template=\\'{metadata_str}\\\\n\\\\n{content}\\', metadata_template=\\'{key}: {value}\\', metadata_seperator=\\'\\\\n\\')]', tool_name='cassandra_db_select_table_data', raw_input={'args': (), 'kwargs': {'keyspace': 'llamaindex_agent_test', 'table': 'user_videos', 'predicate': 'user_id = 522b1fe2-2e36-4cef-a667-cd4237d08b89', 'limit': 10}}, raw_output=[Document(id_='e3ecfba1-e8e1-4ce3-b321-3f51e12077a1', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text=\"Row(user_id=UUID('522b1fe2-2e36-4cef-a667-cd4237d08b89'), 
video_id=UUID('27066014-bad7-9f58-5a30-f63fe03718f6'), description=None, title='Use Langflow to Build an LLM Application in 5 Minutes')\", start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')], is_error=False)], source_nodes=[], is_dummy_stream=False)" ] }, "execution_count": null, @@ -143,11 +364,8 @@ } ], "source": [ - "# Create the Agent with our tools\n", - "llm = OpenAI(model=\"gpt-4-1106-preview\")\n", - "agent = OpenAIAgent.from_tools(tools, llm=llm, verbose=True)\n", - "\n", - "agent.chat(\"What tables are in the keyspace langchain_agent_test?\")\n", + "# Ask our new agent a series of questions. Watch how the agent uses tools to get the answers.\n", + "agent.chat(\"What tables are in the keyspace llamaindex_agent_test?\")\n", "agent.chat(\"What is the userid for patrick@datastax.com ?\")\n", "agent.chat(\"What videos did user patrick@datastax.com upload?\")" ] diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py index e69de29bb2d1d..07a48e8196d6f 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/test_tools_cassandra.py @@ -0,0 +1,7 @@ +from llama_index.core.tools.tool_spec.base import BaseToolSpec +from llama_index.tools.cassandra.base import CassandraDatabaseToolSpec + + +def test_class() -> None: + names_of_base_classes = [b.__name__ for b in CassandraDatabaseToolSpec.__mro__] + assert BaseToolSpec.__name__ in names_of_base_classes From 7309075f0aadb46ba254bc762feebf866c758c2c Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Fri, 10 May 2024 16:06:07 -0700 Subject: [PATCH 03/13] Updated pyproject and added a README.md --- .../llama-index-tools-cassandra/README.md | 42 ++++++++++++++++++- 
.../pyproject.toml | 3 -- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/README.md b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md index 9496b5638247d..1ce60e4aecfe9 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/README.md +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md @@ -1 +1,41 @@ -# LlamaIndex Tools Integration: Cassandra +# Cassandra Database Tools + +## Overview + +The Cassandra Database Tools project is designed to help AI engineers efficiently integrate Large Language Models (LLMs) with Apache Cassandra® data. It facilitates optimized and safe interactions with Cassandra databases, supporting various deployments like Apache Cassandra®, DataStax Enterprise™, and DataStax Astra™. + +## Key Features + +- **Fast Data Access:** Optimized queries ensure most operations complete in milliseconds. +- **Schema Introspection:** Enhances the reasoning capabilities of LLMs by providing detailed schema information. +- **Compatibility:** Supports various Cassandra deployments, ensuring wide applicability. +- **Safety Measures:** Limits operations to SELECT queries and schema introspection to prioritize data integrity. + +## Installation + +Ensure your system has Python installed and proceed with the following installations via pip: + +```bash +pip install python-dotenv cassio llama-index-tools-cassandra +``` + +Create a `.env` file for environment variables related to Cassandra and Astra configurations, following the example structure provided in the notebook. + +## Environment Setup + +- For Cassandra: Configure `CASSANDRA_CONTACT_POINTS`, `CASSANDRA_USERNAME`, `CASSANDRA_PASSWORD`, and `CASSANDRA_KEYSPACE`. +- For DataStax Astra: Set `ASTRA_DB_APPLICATION_TOKEN`, `ASTRA_DB_DATABASE_ID`, and `ASTRA_DB_KEYSPACE`. 
+ +## How It Works + +The toolkit leverages the Cassandra Query Language (CQL) and integrates with LLMs to determine an efficient query path for the user's requests, ensuring best practices for querying are followed. Using functions, the LLM's decision making can invoke the tool instead of designing custom queries. The result is faster and more efficient access to Cassandra data for agents. + +## Tools Included + +- **`cassandra_db_schema`**: Fetches schema information, essential for the agent’s operation. +- **`cassandra_db_select_table_data`**: Allows selection of data from a specific keyspace and table. +- **`cassandra_db_query`**: An experimental tool that accepts fully formed query strings from the agent. + +## Example Usage + +Initialize the CassandraDatabase and set up the agent with the tools provided. Query the database by interacting with the agent as shown in the example [notebook](examples/casssandra.ipynb). diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml index be7c8acc7b521..9417e7142b192 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml @@ -5,13 +5,10 @@ requires = ["poetry-core"] [tool.codespell] check-filenames = true check-hidden = true -# Feel free to un-skip examples, and experimental, you will just need to -# work through many typos (--write-changes and --interactive will help) skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" [tool.mypy] disallow_untyped_defs = true -# Remove venv skip when integrated with pre-commit exclude = ["_static", "build", "examples", "notebooks", "venv"] ignore_missing_imports = true python_version = "3.8" From 316a26841ab3dbec72943616b4a40cd9b08f55fb Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Mon, 13 May 2024 15:08:47 -0700 Subject: [PATCH 04/13] Added cassio 0.1.7 
dependency to pyproject.toml. Moved example jupyter notebook to docs. --- .../examples => docs/docs/examples/tools}/casssandra.ipynb | 4 +++- .../tools/llama-index-tools-cassandra/pyproject.toml | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) rename {llama-index-integrations/tools/llama-index-tools-cassandra/examples => docs/docs/examples/tools}/casssandra.ipynb (99%) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb b/docs/docs/examples/tools/casssandra.ipynb similarity index 99% rename from llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb rename to docs/docs/examples/tools/casssandra.ipynb index 53abde73ffabf..4c731f1ac730c 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/examples/casssandra.ipynb +++ b/docs/docs/examples/tools/casssandra.ipynb @@ -138,7 +138,9 @@ "import cassio\n", "\n", "from llama_index.tools.cassandra.base import CassandraDatabaseToolSpec\n", - "from llama_index.tools.cassandra.cassandra_database_wrapper import CassandraDatabase\n", + "from llama_index.tools.cassandra.cassandra_database_wrapper import (\n", + " CassandraDatabase,\n", + ")\n", "\n", "from llama_index.agent.openai import OpenAIAgent\n", "from llama_index.llms.openai import OpenAI" diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml index 9417e7142b192..805ab949bd2e2 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml @@ -25,6 +25,7 @@ version = "0.1.0" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" llama-index-core = "^0.10.0" +cassio = "^0.1.7" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"} From e2102262e43a591e65746e2a48660c3c2c8ea733 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: 
Thu, 16 May 2024 09:34:21 -0700 Subject: [PATCH 05/13] Update docs/docs/examples/tools/casssandra.ipynb Typo Co-authored-by: Christophe Bornet --- docs/docs/examples/tools/casssandra.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/examples/tools/casssandra.ipynb b/docs/docs/examples/tools/casssandra.ipynb index 4c731f1ac730c..d8b874a754a18 100644 --- a/docs/docs/examples/tools/casssandra.ipynb +++ b/docs/docs/examples/tools/casssandra.ipynb @@ -67,7 +67,7 @@ "Selects data from a specific keyspace and table. The agent can pass paramaters for a predicate and limits on the number of returned records. \n", "\n", "### `cassandra_db_query`\n", - "Expiriemental alternative to `cassandra_db_select_table_data` which takes a query string completely formed by the agent instead of parameters. *Warning*: This can lead to unusual queries that may not be as performant(or even work). This may be removed in future releases. If it does something cool, we want to know about that too. You never know!" + "Experimental alternative to `cassandra_db_select_table_data` which takes a query string completely formed by the agent instead of parameters. *Warning*: This can lead to unusual queries that may not be as performant(or even work). This may be removed in future releases. If it does something cool, we want to know about that too. You never know!" ] }, { From 23a55098b0fc282d687ef5c25feb2291bd42bff3 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Thu, 16 May 2024 15:07:32 -0700 Subject: [PATCH 06/13] Cleaned up a bunch of commented sections for cleanliness. 
--- .../llama_index/tools/cassandra/base.py | 101 +----------------- 1 file changed, 1 insertion(+), 100 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py index 6101596f8690e..f18008a7134fa 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py @@ -19,7 +19,7 @@ class CassandraDatabaseToolSpec(BaseToolSpec, BaseReader): db: CassandraDatabase = Field(exclude=True) spec_functions = [ - # "cassandra_db_query", + "cassandra_db_query", "cassandra_db_schema", "cassandra_db_select_table_data", ] @@ -82,102 +82,3 @@ def cassandra_db_select_table_data( text=self.db.get_table_data_no_throw(keyspace, table, predicate, limit) ) ] - - -# class QueryCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): -# """Tool for querying an Apache Cassandra database with provided CQL.""" - -# name: str = "cassandra_db_query" -# description: str = """ -# Execute a CQL query against the database and get back the result. -# If the query is not correct, an error message will be returned. -# If an error is returned, rewrite the query, check the query, and try again. 
-# """ -# args_schema: Type[BaseModel] = _QueryCassandraDatabaseToolInput - -# def _run( -# self, -# query: str, -# run_manager: Optional[CallbackManagerForToolRun] = None, -# ) -> Union[str, Sequence[Dict[str, Any]], ResultSet]: -# """Execute the query, return the results or an error message.""" - - -# class _GetSchemaCassandraDatabaseToolInput(BaseModel): -# keyspace: str = Field( -# ..., -# description=("The name of the keyspace for which to return the schema."), -# ) - - -# class GetSchemaCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): -# """Tool for getting the schema of a keyspace in an Apache Cassandra database.""" - -# name: str = "cassandra_db_schema" -# description: str = """ -# Input to this tool is a keyspace name, output is a table description -# of Apache Cassandra tables. -# If the query is not correct, an error message will be returned. -# If an error is returned, report back to the user that the keyspace -# doesn't exist and stop. -# """ - -# args_schema: Type[BaseModel] = _GetSchemaCassandraDatabaseToolInput - -# def _run( -# self, -# keyspace: str, -# run_manager: Optional[CallbackManagerForToolRun] = None, -# ) -> str: -# """Get the schema for a keyspace.""" -# return self.db.get_keyspace_tables_str_no_throw(keyspace) - - -# class _GetTableDataCassandraDatabaseToolInput(BaseModel): -# keyspace: str = Field( -# ..., -# description=("The name of the keyspace containing the table."), -# ) -# table: str = Field( -# ..., -# description=("The name of the table for which to return data."), -# ) -# predicate: str = Field( -# ..., -# description=("The predicate for the query that uses the primary key."), -# ) -# limit: int = Field( -# ..., -# description=("The maximum number of rows to return."), -# ) - - -# class GetTableDataCassandraDatabaseTool(BaseCassandraDatabaseTool, BaseTool): -# """ -# Tool for getting data from a table in an Apache Cassandra database. 
-# Use the WHERE clause to specify the predicate for the query that uses the -# primary key. A blank predicate will return all rows. Avoid this if possible. -# Use the limit to specify the number of rows to return. A blank limit will -# return all rows. -# """ - -# name: str = "cassandra_db_select_table_data" -# description: str = """ -# Tool for getting data from a table in an Apache Cassandra database. -# Use the WHERE clause to specify the predicate for the query that uses the -# primary key. A blank predicate will return all rows. Avoid this if possible. -# Use the limit to specify the number of rows to return. A blank limit will -# return all rows. -# """ -# args_schema: Type[BaseModel] = _GetTableDataCassandraDatabaseToolInput - -# def _run( -# self, -# keyspace: str, -# table: str, -# predicate: str, -# limit: int, -# run_manager: Optional[CallbackManagerForToolRun] = None, -# ) -> str: -# """Get data from a table in a keyspace.""" -# return self.db.get_table_data_no_throw(keyspace, table, predicate, limit) From f0571b5bd7279617bce7ca77b6c70c612c59fce3 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Thu, 16 May 2024 17:41:18 -0700 Subject: [PATCH 07/13] Removed no_throw methods --- .../cassandra/cassandra_database_wrapper.py | 69 ++++++------------- 1 file changed, 22 insertions(+), 47 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py index 1d1ff146b261f..d6603083ad14a 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py @@ -8,7 +8,6 @@ from cassandra.cluster import ResultSet, Session from pydantic import BaseModel, Field, root_validator - 
IGNORED_KEYSPACES = [ "system", "system_auth", @@ -58,37 +57,19 @@ def run( else: raise ValueError("Fetch parameter must be either 'one', 'all', or 'cursor'") - def run_no_throw( - self, - query: str, - fetch: str = "all", - include_columns: bool = False, - **kwargs: Any, - ) -> Union[str, Sequence[Dict[str, Any]], ResultSet]: - """Execute a CQL query and return the results or an error message.""" - try: - return self.run(query, fetch, include_columns, **kwargs) - except Exception as e: - """Format the error message""" - return f"Error: {e}\n{traceback.format_exc()}" - - def get_keyspace_tables_str_no_throw(self, keyspace: str) -> str: + def get_keyspace_tables_str(self, keyspace: str) -> str: """Get the tables for the specified keyspace.""" try: - return self.get_keyspace_tables_str(keyspace) + tables = self.get_keyspace_tables(keyspace) + schema_string = "" + for table in tables: + schema_string += table.as_markdown() + "\n\n" + + return schema_string except Exception as e: """Format the error message""" return f"Error: {e}\n{traceback.format_exc()}" - def get_keyspace_tables_str(self, keyspace: str) -> str: - """Get the tables for the specified keyspace.""" - tables = self.get_keyspace_tables(keyspace) - schema_string = "" - for table in tables: - schema_string += table.as_markdown() + "\n\n" - - return schema_string - def get_keyspace_tables(self, keyspace: str) -> List[Table]: """Get the Table objects for the specified keyspace.""" schema = self._resolve_schema([keyspace]) @@ -97,36 +78,30 @@ def get_keyspace_tables(self, keyspace: str) -> List[Table]: else: return [] - def get_table_data_no_throw( - self, keyspace: str, table: str, predicate: str, limit: int - ) -> str: - """Get data from the specified table in the specified keyspace. Optionally can - take a predicate for the WHERE clause and a limit. 
- """ - try: - return self.get_table_data(keyspace, table, predicate, limit) - except Exception as e: - """Format the error message""" - return f"Error: {e}\n{traceback.format_exc()}" - # This is a more basic string building function that doesn't use a query builder # or prepared statements # TODO: Refactor to use prepared statements def get_table_data( self, keyspace: str, table: str, predicate: str, limit: int ) -> str: - """Get data from the specified table in the specified keyspace.""" - query = f"SELECT * FROM {keyspace}.{table}" + """Get data from the specified table in the specified keyspace. Optionally can + take a predicate for the WHERE clause and a limit. + """ + try: + query = f"SELECT * FROM {keyspace}.{table}" - if predicate: - query += f" WHERE {predicate}" - if limit: - query += f" LIMIT {limit}" + if predicate: + query += f" WHERE {predicate}" + if limit: + query += f" LIMIT {limit}" - query += ";" + query += ";" - result = self.run(query, fetch="all") - return "\n".join(str(row) for row in result) + result = self.run(query, fetch="all") + return "\n".join(str(row) for row in result) + except Exception as e: + """Format the error message""" + return f"Error: {e}\n{traceback.format_exc()}" def get_context(self) -> Dict[str, Any]: """Return db context that you may want in agent prompt.""" From a9e0b9533ab465b285baa0c9c68c48898fcb1869 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Thu, 16 May 2024 18:21:13 -0700 Subject: [PATCH 08/13] Changed comments to use Google Docstring format --- .../cassandra/cassandra_database_wrapper.py | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py index d6603083ad14a..01dfdd3cea086 100644 --- 
a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py @@ -145,21 +145,22 @@ def format_keyspace_to_markdown( def format_schema_to_markdown(self) -> str: """ Generates a markdown representation of the schema for all keyspaces and tables - within the CassandraDatabase instance. This method utilizes the - format_keyspace_to_markdown method to create markdown sections for each - keyspace, assembling them into a comprehensive schema document. + within the CassandraDatabase instance. - Iterates through each keyspace in the database, utilizing - format_keyspace_to_markdown to generate markdown for each keyspace's schema, - including details of its tables. These sections are concatenated to form a - single markdown document that represents the schema of the entire database or - the subset of keyspaces that have been resolved in this instance. + This method utilizes the format_keyspace_to_markdown method to create markdown + sections for each keyspace, assembling them into a comprehensive schema document. + + Iterates through each keyspace in the database, utilizing format_keyspace_to_markdown + to generate markdown for each keyspace's schema, including details of its tables. + These sections are concatenated to form a single markdown document that represents + the schema of the entire database or the subset of keyspaces that have been resolved + in this instance. Returns: - A markdown string that documents the schema of all resolved keyspaces and - their tables within this CassandraDatabase instance. This includes keyspace - names, table names, comments, columns, partition keys, clustering keys, - and indexes for each table. + str: A markdown string that documents the schema of all resolved keyspaces and + their tables within this CassandraDatabase instance. 
This includes keyspace names, + table names, comments, columns, partition keys, clustering keys, and indexes for + each table. """ schema = self._resolve_schema() output = "# Cassandra Database Schema\n\n" @@ -170,21 +171,22 @@ def format_schema_to_markdown(self) -> str: def _validate_cql(self, cql: str, type: str = "SELECT") -> str: """ Validates a CQL query string for basic formatting and safety checks. + Ensures that `cql` starts with the specified type (e.g., SELECT) and does not contain content that could indicate CQL injection vulnerabilities. - Parameters: - - cql (str): The CQL query string to be validated. - - type (str): The expected starting keyword of the query, used to verify - that the query begins with the correct operation type - (e.g., "SELECT", "UPDATE"). Defaults to "SELECT". + Args: + cql (str): The CQL query string to be validated. + type (str): The expected starting keyword of the query, used to verify + that the query begins with the correct operation type + (e.g., "SELECT", "UPDATE"). Defaults to "SELECT". Returns: - - str: The trimmed and validated CQL query string without a trailing semicolon. + str: The trimmed and validated CQL query string without a trailing semicolon. Raises: - - ValueError: If the value of `type` is not supported - - DatabaseError: If `cql` is considered unsafe + ValueError: If the value of `type` is not supported. + DatabaseError: If `cql` is considered unsafe. 
""" SUPPORTED_TYPES = ["SELECT"] if type and type.upper() not in SUPPORTED_TYPES: From 38c7b4f3a54e082dbba2b5299da9dbd9c1cf7a86 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Thu, 16 May 2024 21:09:57 -0600 Subject: [PATCH 09/13] build files --- .../tools/llama-index-tools-cassandra/BUILD | 4 +++- .../llama_index/tools/cassandra/BUILD | 1 + .../tools/llama-index-tools-cassandra/tests/BUILD | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/BUILD create mode 100644 llama-index-integrations/tools/llama-index-tools-cassandra/tests/BUILD diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD b/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD index db46e8d6c978c..0896ca890d8bf 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/BUILD @@ -1 +1,3 @@ -python_sources() +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/BUILD b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/tests/BUILD b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/tests/BUILD @@ -0,0 +1 @@ +python_tests() From 3f1b5226d4a7c090313179e0c2867cd46ef609cd Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Thu, 16 May 2024 21:10:30 -0600 Subject: [PATCH 10/13] nits --- .../llama_index/tools/cassandra/base.py | 
4 +--- .../tools/llama-index-tools-cassandra/pyproject.toml | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py index f18008a7134fa..08e42460cf518 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py @@ -1,13 +1,11 @@ """Tools for interacting with an Apache Cassandra database.""" from typing import List +from llama_index.core.bridge.pydantic import Field from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document from llama_index.core.tools.tool_spec.base import BaseToolSpec - -from pydantic import Field - from llama_index.tools.cassandra.cassandra_database_wrapper import ( CassandraDatabase, ) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml index 805ab949bd2e2..6e2ea0842b00c 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/pyproject.toml @@ -7,6 +7,13 @@ check-filenames = true check-hidden = true skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" +[tool.llamahub] +contains_example = false +import_path = "llama_index.tools.cassandra" + +[tool.llamahub.class_authors] +CassandraDatabaseToolSpec = "pmcfadin" + [tool.mypy] disallow_untyped_defs = true exclude = ["_static", "build", "examples", "notebooks", "venv"] From 58aee1997dfb35ae561f6f3381c985ec002b7776 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Thu, 16 May 2024 21:12:29 -0600 Subject: [PATCH 11/13] link --- .../tools/llama-index-tools-cassandra/README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/README.md b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md index 1ce60e4aecfe9..3fca857df6dbd 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/README.md +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/README.md @@ -38,4 +38,4 @@ The toolkit leverages the Cassandra Query Language (CQL) and integrates with LLM ## Example Usage -Initialize the CassandraDatabase and set up the agent with the tools provided. Query the database by interacting with the agent as shown in the example [notebook](examples/casssandra.ipynb). +Initialize the CassandraDatabase and set up the agent with the tools provided. Query the database by interacting with the agent as shown in the example [notebook](https://docs.llamaindex.ai/en/latest/examples/tools/cassandra/). From 10696c2ff59bcaec278ee1f4210b7f069d67d868 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Fri, 17 May 2024 08:57:52 -0700 Subject: [PATCH 12/13] Removed BaseReader from base.py Not needed. 
--- .../llama_index/tools/cassandra/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py index 08e42460cf518..a7aed552dcb0f 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/base.py @@ -2,7 +2,6 @@ from typing import List from llama_index.core.bridge.pydantic import Field -from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document from llama_index.core.tools.tool_spec.base import BaseToolSpec @@ -11,7 +10,7 @@ ) -class CassandraDatabaseToolSpec(BaseToolSpec, BaseReader): +class CassandraDatabaseToolSpec(BaseToolSpec): """Base tool for interacting with an Apache Cassandra database.""" db: CassandraDatabase = Field(exclude=True) From 7f9f17b426b4cfe4b979b5d3c71854a98581d770 Mon Sep 17 00:00:00 2001 From: Patrick McFadin Date: Mon, 20 May 2024 10:15:36 -0700 Subject: [PATCH 13/13] Changed out Pydantic import to LlamaIndex bridge. 
llama_index.core.bridge.pydantic --- .../llama_index/tools/cassandra/cassandra_database_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py index 01dfdd3cea086..ee7f96e50e879 100644 --- a/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py +++ b/llama-index-integrations/tools/llama-index-tools-cassandra/llama_index/tools/cassandra/cassandra_database_wrapper.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple, Union from cassandra.cluster import ResultSet, Session -from pydantic import BaseModel, Field, root_validator +from llama_index.core.bridge.pydantic import BaseModel, Field, root_validator IGNORED_KEYSPACES = [ "system",