"""
Async functions to query QuestDB over HTTP(S) via CSV into Pandas or Numpy.
"""

__all__ = ['pandas_query', 'numpy_query']

import asyncio
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import aiohttp
import numpy as np
import pandas as pd

from .endpoint import Endpoint
from .errors import QueryError


def _new_session(endpoint):
    auth = None
    if endpoint.username is not None:
        if endpoint.password is None:
            raise ValueError('Username specified without password')
        auth = aiohttp.BasicAuth(endpoint.username, endpoint.password)
    elif endpoint.password is not None:
        raise ValueError('Password specified without username')
    return aiohttp.ClientSession(auth=auth)


async def _pre_query(session: aiohttp.ClientSession, endpoint: Endpoint, query: str) -> tuple[
        list[tuple[str, tuple[str, object]]], int]:
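    # Ask for just the column metadata and the total row count: 'count=true'
    # includes the count in the response and 'limit=0' should keep QuestDB
    # from sending any data rows.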
    url = f'{endpoint.url}/exec'
    params = [('query', query), ('count', 'true'), ('limit', '0')]
    dtypes_map = {
        'STRING': ('STRING', None),
        'SYMBOL': ('SYMBOL', None),
        'DOUBLE': ('DOUBLE', 'float64'),
        'FLOAT': ('FLOAT', 'float32'),
        'CHAR': ('CHAR', None),
        'TIMESTAMP': ('TIMESTAMP', None)
    }
    async with session.get(url=url, params=params) as resp:
        result = await resp.json()
        if resp.status != 200:
            raise QueryError.from_json(result)
        columns = [
            # Fall back to Pandas dtype inference for any type not listed
            # in `dtypes_map` (e.g. INT, LONG, BOOLEAN).
            (col['name'],
             dtypes_map.get(col['type'].upper(), (col['type'].upper(), None)))
            for col in result['columns']]
        count = result['count']
        return columns, count


async def _query_pandas(
        session: aiohttp.ClientSession,
        executor: ThreadPoolExecutor,
        endpoint: Endpoint,
        query: str,
        result_schema: list[tuple[str, tuple[str, object]]],
        limit_range: tuple[int, int]) -> tuple[pd.DataFrame, int]:
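    # Export one 'limit=lo,hi' slice of the result set as CSV via '/exp'.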
    url = f'{endpoint.url}/exp'
    params = [
        ('query', query),
        ('limit', f'{limit_range[0]},{limit_range[1]}')]
    async with session.get(url=url, params=params) as resp:
        if resp.status != 200:
            raise QueryError.from_json(await resp.json())
        buf = await resp.content.read()
        download_bytes = len(buf)
        buf_reader = BytesIO(buf)
    dtypes = {
        col[0]: col[1][1]
        for col in result_schema
        if col[1][1] is not None}

    def _read_csv():
        df = pd.read_csv(buf_reader, dtype=dtypes, engine='pyarrow')
        # Patch up the column types that can't be requested via `dtype=`.
        for col_name, (col_type, _dtype) in result_schema:
            if col_type != 'TIMESTAMP':
                continue
            series = df[col_name]
            try:
                df[col_name] = pd.to_datetime(series)
            except Exception as e:
                raise ValueError(
                    f'Failed to convert column {col_name} to type '
                    f'{col_type}: {e}\n{series}') from e
        return df

    loop = asyncio.get_running_loop()
    df = await loop.run_in_executor(executor, _read_csv)
    return df, download_bytes


async def pandas_query(
        query: str, endpoint: Endpoint | None = None, chunks: int = 1,
        *, stats: bool = False) -> pd.DataFrame | tuple[pd.DataFrame, int]:
    """
    Query QuestDB via CSV to a Pandas DataFrame.
    Downloads `chunks` parallel `limit` ranges; if `stats` is True, also
    returns the total number of bytes downloaded.
    """
    endpoint = endpoint or Endpoint()
    with ThreadPoolExecutor(max_workers=chunks) as executor:
        async with _new_session(endpoint) as session:
            result_schema, row_count = await _pre_query(session, endpoint, query)
            rows_per_spawn = row_count // chunks
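            # Split the result set into `chunks` contiguous row ranges; the
            # last range absorbs any remainder from the integer division.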
            limit_ranges = [
                (
                    i * rows_per_spawn,
                    ((i + 1) * rows_per_spawn) if i < chunks - 1 else row_count
                )
                for i in range(chunks)]
            tasks = [
                asyncio.ensure_future(_query_pandas(
                    session, executor, endpoint, query, result_schema, limit_range))
                for limit_range in limit_ranges]
            results = await asyncio.gather(*tasks)
            sub_dataframes = [result[0] for result in results]
            # `ignore_index=True` re-numbers rows so chunk boundaries don't
            # produce duplicate index labels.
            df = pd.concat(sub_dataframes, ignore_index=True)
            if stats:
                total_downloaded = sum(result[1] for result in results)
                return df, total_downloaded
            else:
                return df


async def numpy_query(
        query: str, endpoint: Endpoint | None = None, chunks: int = 1,
        *, stats: bool = False
) -> dict[str, np.ndarray] | tuple[dict[str, np.ndarray], int]:
    """
    Query and obtain the result as a dict of columns.
    Each column is a numpy array.
    """
    res = await pandas_query(query, endpoint, chunks, stats=stats)
    df, stats_res = res if stats else (res, None)
    # Calling `.to_numpy()` for each column is quite efficient and generally
    # avoids copies: Pandas already stores columns as numpy arrays.
    # We go through Pandas as this allows us to get fast CSV parsing.
    np_arrays = {col_name: df[col_name].to_numpy() for col_name in df}
    return (np_arrays, stats_res) if stats else np_arrays
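

# A minimal usage sketch, kept as a comment (assumptions: this module is part
# of a package importable as `questdb_query`, a QuestDB instance is reachable
# with the default `Endpoint()` settings, and the 'trades' table name is
# purely illustrative):
#
#     import asyncio
#     from questdb_query import pandas_query
#
#     async def main():
#         df = await pandas_query('SELECT * FROM trades', chunks=4)
#         print(df.head())
#
#     asyncio.run(main())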