Commit 9e4adf7

README.md, features and fixes.
1 parent 4411f02 commit 9e4adf7

File tree

9 files changed: +321 −34 lines changed


.gitignore

Lines changed: 3 additions & 0 deletions

```diff
@@ -158,3 +158,6 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+
+# poetry
+.python-version
```

README.md

Lines changed: 198 additions & 0 deletions

# py-questdb-query

Fast query over HTTP(S)/CSV for QuestDB

## Installation

To install (or upgrade):

```shell
python3 -m pip install -U git+https://github.com/questdb/py-questdb-query.git#questdb_query
```

If you need to uninstall it, run:

```shell
python3 -m pip uninstall questdb_query
```

## Basic usage: querying into Numpy

To query a database running on localhost, use the `numpy_query` function.

Here's an example querying CPU utilisation data from a `localhost` database:

```python
from questdb_query import numpy_query

np_arrs = numpy_query('''
    select
        timestamp, hostname, datacenter, usage_user, usage_nice
    from
        cpu
    limit 10''')
```

The `np_arrs` object is a Python `dict` which holds one numpy array per column, keyed by column name:

```python
>>> np_arrs
{'timestamp': array(['2016-01-01T00:00:00.000000000', '2016-01-01T00:00:10.000000000',
       '2016-01-01T00:00:20.000000000', '2016-01-01T00:00:30.000000000',
       '2016-01-01T00:00:40.000000000', '2016-01-01T00:00:50.000000000',
       '2016-01-01T00:01:00.000000000', '2016-01-01T00:01:10.000000000',
       '2016-01-01T00:01:20.000000000', '2016-01-01T00:01:30.000000000'],
      dtype='datetime64[ns]'), 'hostname': array(['host_0', 'host_1', 'host_2', 'host_3', 'host_4', 'host_5',
       'host_6', 'host_7', 'host_8', 'host_9'], dtype=object), 'datacenter': array(['ap-southeast-2b', 'eu-west-1b', 'us-west-1b', 'us-west-2c',
       'us-west-2b', 'eu-west-1b', 'eu-west-1b', 'us-west-1a',
       'ap-southeast-2a', 'us-east-1a'], dtype=object), 'usage_user': array([1.39169048, 0.33846369, 0.        , 1.81511203, 0.84273104,
       0.        , 0.        , 0.28085548, 0.        , 1.37192634]), 'usage_nice': array([0.30603088, 1.21496673, 0.        , 0.16688796, 0.        ,
       2.77319521, 0.40332488, 1.81585253, 1.92844804, 2.12841919])}
```

If we want to calculate a (rather nonsensical) weighted average of `usage_user` and `usage_nice`, we can do so by accessing the `numpy` columns directly:

```python
>>> np_arrs['usage_user'].dot(np_arrs['usage_nice'].T)
4.5700692045031985
```

## Querying a remote database

If your database is running on a remote host, specify an endpoint:

```python
from questdb_query import numpy_query, Endpoint

endpoint = Endpoint(host='your.hostname.com', https=True, username='user', password='pass')

np_arrs = numpy_query('select * from cpu limit 10', endpoint)
```

Note how the example above enables HTTPS and specifies a username and password for authentication.

## Querying into Pandas

You can also query into Pandas:

```python
from questdb_query import pandas_query, Endpoint

endpoint = Endpoint(host='your.hostname.com', https=True, username='user', password='pass')

df = pandas_query('select * from cpu limit 1000', endpoint)
```

This allows you, for example, to pre-aggregate results:

```python
>>> df = df[['region', 'usage_user', 'usage_nice']].groupby('region').mean()
>>> df
                usage_user  usage_nice
region
ap-northeast-1    8.163766    6.492334
ap-southeast-1    6.511215    7.341863
ap-southeast-2    6.788770    6.257839
eu-central-1      7.392642    6.416479
eu-west-1         7.213417    7.185956
sa-east-1         7.143568    5.925026
us-east-1         7.620643    7.243553
us-west-1         6.286770    6.531977
us-west-2         6.228692    6.439672
```

You can then switch over to numpy with a simple and fast conversion:

```python
>>> from questdb_query import pandas_to_numpy
>>> np_arrs = pandas_to_numpy(df)
>>> np_arrs
{'usage_user': array([8.16376556, 6.51121543, 6.78876964, 7.3926419 , 7.21341716,
       7.14356839, 7.62064304, 6.28677006, 6.22869169]), 'usage_nice': array([6.49233392, 7.34186348, 6.25783903, 6.41647863, 7.18595643,
       5.92502642, 7.24355328, 6.53197733, 6.43967247]), 'region': array(['ap-northeast-1', 'ap-southeast-1', 'ap-southeast-2',
       'eu-central-1', 'eu-west-1', 'sa-east-1', 'us-east-1', 'us-west-1',
       'us-west-2'], dtype=object)}
```

## Benchmarking

### From code

Each query result also carries a `Stats` object with a performance summary which you can print:

```python
>>> from questdb_query import numpy_query
>>> np_arrs = numpy_query('select * from cpu', chunks=8)
>>> print(np_arrs.query_stats)
Duration: 2.631s
Millions of lines: 5.000
Millions of lines/s: 1.901
MiB: 1332.144
MiB/s: 506.381
```

You can also extract individual fields:

```python
>>> np_arrs.query_stats
Stats(duration_s=2.630711865, line_count=5000000, byte_count=1396853875, throughput_mbs=506.3814407360216, throughput_mlps=1.900626239810569)
>>> np_arrs.query_stats.throughput_mlps
1.900626239810569
```

### From the command line

To get the best performance, it may be useful to try queries with different hardware setups, chunk counts, etc.

You can run the benchmarking tool from the command line:

```bash
$ python3 -m questdb_query.tool --chunks 8 "select * from cpu"
```

```
       hostname          region       datacenter rack              os arch team service service_version service_environment usage_user usage_system usage_idle usage_nice usage_iowait usage_irq usage_softirq usage_steal usage_guest usage_guest_nice           timestamp
0        host_0  ap-southeast-2  ap-southeast-2b   96     Ubuntu16.10  x86  CHI      11               0                test   1.391690     0.000000   2.644812   0.306031     1.194629  0.000000      0.000000    0.726996    0.000000         0.000000 2016-01-01 00:00:00
1        host_1       eu-west-1       eu-west-1b   52  Ubuntu16.04LTS  x64  NYC       7               0          production   0.338464     1.951409   2.455378   1.214967     2.037935  0.000000      1.136997    1.022753    1.711183         0.000000 2016-01-01 00:00:10
2        host_2       us-west-1       us-west-1b   69  Ubuntu16.04LTS  x64  LON       8               1          production   0.000000     2.800873   2.296324   0.000000     1.754139  1.531160      0.662572    0.000000    0.472402         0.312164 2016-01-01 00:00:20
3        host_3       us-west-2       us-west-2c    8  Ubuntu16.04LTS  x86  LON      11               0                test   1.815112     4.412385   2.056344   0.166888     3.507148  3.276577      0.000000    0.000000    0.000000         1.496152 2016-01-01 00:00:30
4        host_4       us-west-2       us-west-2b   83  Ubuntu16.04LTS  x64  NYC       6               0                test   0.842731     3.141248   2.199520   0.000000     2.943054  5.032342      0.391105    1.375450    0.000000         1.236811 2016-01-01 00:00:40
...         ...             ...              ...  ...             ...  ...  ...     ...             ...                 ...        ...          ...        ...        ...          ...       ...           ...         ...         ...              ...                 ...
624995 host_3995  ap-southeast-2  ap-southeast-2a   30  Ubuntu16.04LTS  x86  CHI      19               1             staging  33.238309    82.647341  17.272531  52.707720    71.718564 45.605728    100.000000   22.907723   78.130846        15.652954 2017-08-01 16:52:30
624996 host_3996       us-west-2       us-west-2a   67     Ubuntu15.10  x64  CHI       9               0          production  33.344070    81.922739  16.653731  52.107537    71.844945 45.880606     99.835977   23.045458   76.468930        17.091646 2017-08-01 16:52:40
624997 host_3997       us-west-2       us-west-2b   63     Ubuntu15.10  x86   SF       8               0          production  32.932095    80.662915  14.708377  53.354277    72.265215 44.803275     99.013038   20.375169   78.043473        17.870002 2017-08-01 16:52:50
624998 host_3998       eu-west-1       eu-west-1b   53  Ubuntu16.04LTS  x86  CHI      11               1             staging  31.199818    80.994859  15.051577  51.923123    74.169828 46.453950     99.107213   21.004499   78.341154        18.880808 2017-08-01 16:53:00
624999 host_3999       us-east-1       us-east-1c   87     Ubuntu16.10  x64   SF       8               1          production  30.310735    81.727637  15.413537  51.417897    74.973555 44.882255     98.821672   19.055040   78.094993        19.263652 2017-08-01 16:53:10

[5000000 rows x 21 columns]

Duration: 2.547s
Millions of lines: 5.000
Millions of lines/s: 1.963
MiB: 1332.144
MiB/s: 522.962
```

## Async operation

The `numpy_query` and `pandas_query` functions are wrappers around `async` variants.

If your application already uses `async`, call those variants directly: this lets other parts of your application do work in parallel while the data downloads.

The async functions take the same arguments as their synchronous counterparts.

```python
import asyncio

from questdb_query import Endpoint
from questdb_query.asynchronous import numpy_query


async def main():
    endpoint = Endpoint(host='your.hostname.com', https=True, username='user', password='pass')
    np_arrs = await numpy_query('select * from cpu limit 10', endpoint)
    print(np_arrs)


if __name__ == '__main__':
    asyncio.run(main())
```

questdb_query/__init__.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -5,5 +5,9 @@

 """

+__version__ = '0.1.0'
+
 from .endpoint import Endpoint
 from .errors import QueryError
+from .synchronous import pandas_query, numpy_query
+from .pandas_util import pandas_to_numpy
```

questdb_query/asynchronous.py

Lines changed: 16 additions & 15 deletions

```diff
@@ -5,6 +5,7 @@
 __all__ = ['pandas_query', 'numpy_query']

 import asyncio
+import time
 from concurrent.futures import ThreadPoolExecutor
 from io import BytesIO

@@ -14,6 +15,8 @@

 from .endpoint import Endpoint
 from .errors import QueryError
+from .pandas_util import pandas_to_numpy
+from .stats import Stats


 def _new_session(endpoint):
@@ -79,7 +82,10 @@ def _read_csv():
     try:
         if col_type == 'TIMESTAMP':
             series = df[col_name]
-            series = pd.to_datetime(series)
+            # Drop the UTC timezone during conversion.
+            # This allows `.to_numpy()` on the series to
+            # yield a `datetime64` dtype column.
+            series = pd.to_datetime(series).dt.tz_convert(None)
             df[col_name] = series
     except Exception as e:
         raise ValueError(
@@ -91,11 +97,12 @@ def _read_csv():
     return df, download_bytes


-async def pandas_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> pd.DataFrame:
+async def pandas_query(query: str, endpoint: Endpoint = None, chunks: int = 1) -> pd.DataFrame:
     """
     Query QuestDB via CSV to a Pandas DataFrame.
     """
     endpoint = endpoint or Endpoint()
+    start_ts = time.perf_counter_ns()
     with ThreadPoolExecutor(max_workers=chunks) as executor:
         async with _new_session(endpoint) as session:
             result_schema, row_count = await _pre_query(session, endpoint, query)
@@ -113,22 +120,16 @@ async def pandas_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *
             results = await asyncio.gather(*tasks)
             sub_dataframes = [result[0] for result in results]
             df = pd.concat(sub_dataframes)
-            if stats:
-                total_downloaded = sum(result[1] for result in results)
-                return df, total_downloaded
-            else:
-                return df
+    end_ts = time.perf_counter_ns()
+    total_downloaded = sum(result[1] for result in results)
+    df.query_stats = Stats(end_ts - start_ts, row_count, total_downloaded)
+    return df


-async def numpy_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> dict[str, np.array]:
+async def numpy_query(query: str, endpoint: Endpoint = None, chunks: int = 1) -> dict[str, np.array]:
     """
     Query and obtain the result as a dict of columns.
     Each column is a numpy array.
     """
-    res = await pandas_query(query, endpoint, chunks, stats=stats)
-    df, stats_res = res if stats else (res, None)
-    # Calling `.to_numpy()` for each column is quite efficient and generally avoids copies.
-    # Pandas already stores columns as numpy.
-    # We go through Pandas as this allows us to get fast CSV parsing.
-    np_arrays = {col_name: df[col_name].to_numpy() for col_name in df}
-    return (np_arrays, stats_res) if stats else np_arrays
+    df = await pandas_query(query, endpoint, chunks)
+    return pandas_to_numpy(df)
```
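The timezone change in the `TIMESTAMP` branch above is the key fix: the CSV export yields tz-aware UTC timestamps, and dropping the timezone lets `.to_numpy()` produce a native `datetime64[ns]` array rather than an object array. A standalone sketch of the difference, using made-up timestamp strings:

```python
import pandas as pd

# ISO-8601 strings with a trailing 'Z', as a CSV parser might produce them.
series = pd.Series(['2016-01-01T00:00:00.000000Z', '2016-01-01T00:00:10.000000Z'])

# Parsing offset-qualified strings yields a tz-aware dtype.
aware = pd.to_datetime(series)
print(aware.dtype)  # datetime64[ns, UTC]

# Dropping the UTC timezone gives a plain datetime64[ns] dtype, so
# `.to_numpy()` returns a numpy datetime64 array instead of objects.
naive = aware.dt.tz_convert(None)
print(naive.dtype)             # datetime64[ns]
print(naive.to_numpy().dtype)  # datetime64[ns]
```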

questdb_query/endpoint.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -2,7 +2,7 @@ class Endpoint:
     """
     HTTP connection parameters into QuestDB
     """
-    def __init__(self, host='127.0.0.1', port=None, https=True, username=None, password=None):
+    def __init__(self, host='127.0.0.1', port=None, https=False, username=None, password=None):
         self.host = host
         self.port = port or (443 if https else 9000)
         self.https = https
```
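With the changed default, a bare `Endpoint()` now points at plain HTTP on QuestDB's default port 9000. A standalone sketch of the port-defaulting logic (mirroring the constructor above, not importing the package):

```python
class Endpoint:
    """Minimal sketch of the connection parameters (mirrors the class above)."""
    def __init__(self, host='127.0.0.1', port=None, https=False,
                 username=None, password=None):
        self.host = host
        # An explicit port wins; otherwise default to 443 for HTTPS, 9000 for HTTP.
        self.port = port or (443 if https else 9000)
        self.https = https
        self.username = username
        self.password = password

print(Endpoint().port)            # → 9000 (plain HTTP, the new default)
print(Endpoint(https=True).port)  # → 443
print(Endpoint(port=9009).port)   # → 9009 (an explicit port always wins)
```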

questdb_query/pandas_util.py

Lines changed: 27 additions & 0 deletions

```python
__all__ = ['pandas_to_numpy']

import numpy as np
import pandas as pd

from .stats import StatsDict


def pandas_to_numpy(df: pd.DataFrame) -> dict[str, np.array]:
    """
    Convert a pandas dataframe into a dict containing numpy arrays, keyed by column name.

    If the index is named, then convert that too.
    """
    # Calling `.to_numpy()` for each column is quite efficient and generally avoids copies.
    # This is because Pandas internally already usually stores columns as numpy.
    np_arrs = {col_name: df[col_name].to_numpy() for col_name in df}

    # If the index is named, then convert that too.
    if df.index.name:
        np_arrs[df.index.name] = df.index.to_numpy()

    # Carry across stats, if these are present.
    if hasattr(df, 'query_stats'):
        np_arrs = StatsDict(np_arrs, df.query_stats)

    return np_arrs
```
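The named-index branch matters after a `groupby`, where the grouping key lives on the DataFrame's index rather than in a column. A hypothetical usage sketch, reimplementing just the core conversion loop (without the `StatsDict` wrapping) so it runs standalone:

```python
import numpy as np
import pandas as pd


def pandas_to_numpy_sketch(df: pd.DataFrame) -> dict[str, np.ndarray]:
    # Per-column `.to_numpy()` generally avoids copies: pandas already
    # stores each column as a numpy array internally.
    np_arrs = {col_name: df[col_name].to_numpy() for col_name in df}
    # A named index (e.g. the key of a groupby) becomes an entry too.
    if df.index.name:
        np_arrs[df.index.name] = df.index.to_numpy()
    return np_arrs


df = pd.DataFrame({'region': ['a', 'a', 'b'], 'usage_user': [1.0, 3.0, 5.0]})
means = df.groupby('region').mean()   # the index is now named 'region'
np_arrs = pandas_to_numpy_sketch(means)
print(sorted(np_arrs))  # ['region', 'usage_user']
```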

questdb_query/stats.py

Lines changed: 60 additions & 0 deletions

```python
__all__ = ['Stats', 'StatsDict']

NS_IN_S = 1e9

STATS_TEMPLATE = '''Duration: {duration_s:.3f}s
Millions of lines: {line_count_millions:.3f}
Millions of lines/s: {throughput_mlps:.3f}
MiB: {byte_count_mib:.3f}
MiB/s: {throughput_mbs:.3f}'''


class Stats:
    def __init__(self, duration_ns: int, line_count: int, byte_count: int):
        self.duration_ns = duration_ns
        self.line_count = line_count
        self.byte_count = byte_count

    @property
    def duration_s(self) -> float:
        """
        How long the query took in seconds.
        """
        return self.duration_ns / NS_IN_S

    @property
    def throughput_mbs(self) -> float:
        """
        How many MiB/s were downloaded and parsed.
        """
        return self.byte_count / self.duration_ns * NS_IN_S / 1024 / 1024

    @property
    def throughput_mlps(self) -> float:
        """
        How many millions of lines per second were parsed.
        """
        return self.line_count / self.duration_ns * NS_IN_S / 1e6

    def __repr__(self) -> str:
        return (f'Stats(duration_s={self.duration_s}, '
                f'line_count={self.line_count}, '
                f'byte_count={self.byte_count}, '
                f'throughput_mbs={self.throughput_mbs}, '
                f'throughput_mlps={self.throughput_mlps})')

    def __str__(self):
        return STATS_TEMPLATE.format(
            duration_s=self.duration_s,
            line_count_millions=self.line_count / 1e6,
            throughput_mbs=self.throughput_mbs,
            byte_count_mib=self.byte_count / 1024 / 1024,
            throughput_mlps=self.throughput_mlps)


class StatsDict(dict):
    """A dict with an additional `.query_stats` attribute."""

    def __init__(self, other: dict, query_stats: Stats):
        super().__init__(other)
        self.query_stats = query_stats
```
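To sanity-check the arithmetic: each property divides a raw counter by the nanosecond duration and rescales. A quick worked example with round numbers (a hypothetical 2-second query):

```python
NS_IN_S = 1e9

duration_ns = 2 * 10**9          # 2 seconds
line_count = 5_000_000           # 5 million lines
byte_count = 1024 * 1024 * 1024  # exactly 1 GiB

# Mirrors Stats.throughput_mlps: millions of lines per second.
throughput_mlps = line_count / duration_ns * NS_IN_S / 1e6
print(throughput_mlps)  # 2.5

# Mirrors Stats.throughput_mbs: MiB per second.
throughput_mbs = byte_count / duration_ns * NS_IN_S / 1024 / 1024
print(throughput_mbs)  # 512.0
```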
