Skip to content

Commit 4411f02

Browse files
committed
chore: scaffolding first version
1 parent f4b06ab commit 4411f02

File tree

11 files changed

+1291
-1
lines changed

11 files changed

+1291
-1
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,4 +157,4 @@ cython_debug/
157157
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160-
#.idea/
160+
.idea/

DEV_NOTES.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Developer's Notes
2+
3+
## Cutting a release
4+
5+
```shell
6+
poetry export --output requirements.txt
7+
```

poetry.lock

Lines changed: 1012 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[tool.poetry]
2+
name = "questdb-query"
3+
version = "0.1.0"
4+
description = "Fast query over HTTP(S)/CSV for QuestDB"
5+
readme = "README.md"
6+
packages = [{include = "questdb_query"}]
7+
authors = ["Adam Cimarosti <adam@questdb.io>"]
8+
license = "Apache License 2.0"
9+
repository = "https://github.com/questdb/py-questdb-query/"
10+
11+
[tool.poetry.dependencies]
12+
python = "^3.9"
13+
numpy = "^1.25.0"
14+
pandas = "^2.0.2"
15+
pyarrow = "^12.0.1"
16+
aiohttp = {extras = ["speedups"], version = "^3.8.4"}
17+
18+
[build-system]
19+
requires = ["poetry-core"]
20+
build-backend = "poetry.core.masonry.api"

questdb_query/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
"""
2+
Query QuestDB over HTTP into Pandas or Numpy arrays.
3+
4+
The primary implementation is in the `asynchronous` module, with a sync wrapper in the `synchronous` module.
5+
6+
"""
7+
8+
from .endpoint import Endpoint
9+
from .errors import QueryError

questdb_query/asynchronous.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
"""
2+
Async functions to query QuestDB over HTTP(S) via CSV into Pandas or Numpy.
3+
"""
4+
5+
__all__ = ['pandas_query', 'numpy_query']
6+
7+
import asyncio
8+
from concurrent.futures import ThreadPoolExecutor
9+
from io import BytesIO
10+
11+
import aiohttp
12+
import numpy as np
13+
import pandas as pd
14+
15+
from .endpoint import Endpoint
16+
from .errors import QueryError
17+
18+
19+
def _new_session(endpoint):
    """
    Create an `aiohttp.ClientSession` for the given endpoint.

    HTTP basic auth is attached to the session when `endpoint.username`
    is set; otherwise the session is created without credentials.

    :param endpoint: Object exposing `username` and `password` attributes.
    :return: A new `aiohttp.ClientSession`.
    :raises ValueError: If a password is supplied without a username.
    """
    auth = None
    if endpoint.username is not None:
        # A username on its own is acceptable for basic auth: use the empty
        # password, which is also the default of `aiohttp.BasicAuth`.
        password = '' if endpoint.password is None else endpoint.password
        auth = aiohttp.BasicAuth(endpoint.username, password)
    elif endpoint.password is not None:
        # Only this combination is inconsistent: a password with nobody to
        # authenticate as.
        raise ValueError('Password specified without username')
    return aiohttp.ClientSession(auth=auth)
26+
27+
28+
async def _pre_query(session: aiohttp.ClientSession, endpoint: Endpoint, query: str) -> tuple[
        list[tuple[str, tuple[str, object]]], int]:
    """
    Fetch the result schema and row count of `query` without its rows.

    Issues a GET to `{endpoint.url}/exec` with `count=true` and `limit=0`
    and reads the `columns` and `count` fields of the JSON response.

    :param session: Session used to perform the request.
    :param endpoint: Connection parameters; only `endpoint.url` is read.
    :param query: The SQL text to inspect.
    :return: `(columns, count)` where `columns` is a list of
        `(name, (TYPE, dtype))` entries. `TYPE` is the upper-cased server
        column type and `dtype` is the dtype to force when parsing CSV, or
        `None` when no dtype is forced.
    :raises QueryError: If the server responds with a non-200 status.
    """
    url = f'{endpoint.url}/exec'
    params = [('query', query), ('count', 'true'), ('limit', '0')]
    # Only these column types get an explicit dtype for the CSV parser.
    # Every other type (including ones not known here) maps to `None`
    # instead of failing with a `KeyError`.
    explicit_dtypes = {
        'DOUBLE': 'float64',
        'FLOAT': 'float32',
    }
    async with session.get(url=url, params=params) as resp:
        result = await resp.json()
        if resp.status != 200:
            raise QueryError.from_json(result)
    columns = []
    for col in result['columns']:
        col_type = col['type'].upper()
        columns.append((col['name'], (col_type, explicit_dtypes.get(col_type))))
    return columns, result['count']
49+
50+
51+
async def _query_pandas(
        session: aiohttp.ClientSession,
        executor: ThreadPoolExecutor,
        endpoint: Endpoint,
        query: str,
        result_schema: list[tuple[str, tuple[str, object]]],
        limit_range: tuple[int, int]) -> tuple[pd.DataFrame, int]:
    """
    Download one slice of the query result as CSV and parse it.

    Issues a GET to `{endpoint.url}/exp` with a `limit` of
    `limit_range[0],limit_range[1]`, reads the whole response body, then
    parses it with `pd.read_csv` on `executor` so that parsing does not
    run on the event loop thread.

    :param session: Session used to perform the request.
    :param executor: Executor on which the CSV parsing runs.
    :param endpoint: Connection parameters; only `endpoint.url` is read.
    :param query: The SQL text to run.
    :param result_schema: `(name, (TYPE, dtype))` entries, one per column.
    :param limit_range: `(lo, hi)` pair forwarded as the `limit` parameter.
    :return: `(dataframe, downloaded_byte_count)`.
    :raises QueryError: If the server responds with a non-200 status.
    :raises ValueError: If a `TIMESTAMP` column cannot be converted.
    """
    url = f'{endpoint.url}/exp'
    params = [
        ('query', query),
        ('limit', f'{limit_range[0]},{limit_range[1]}')]
    async with session.get(url=url, params=params) as resp:
        if resp.status != 200:
            raise QueryError.from_json(await resp.json())
        buf = await resp.content.read()
    download_bytes = len(buf)
    buf_reader = BytesIO(buf)
    # Only force a dtype for the columns whose schema entry names one.
    dtypes = {
        col_name: dtype
        for col_name, (_, dtype) in result_schema
        if dtype is not None}

    def _read_csv():
        df = pd.read_csv(buf_reader, dtype=dtypes, engine='pyarrow')
        # Patch up the column types.
        for col_name, (col_type, _) in result_schema:
            if col_type != 'TIMESTAMP':
                continue
            # Bound before the `try` so the error message below can always
            # refer to it, even when the column lookup itself fails.
            series = None
            try:
                series = df[col_name]
                df[col_name] = pd.to_datetime(series)
            except Exception as e:
                raise ValueError(
                    f'Failed to convert column {col_name} to type {col_type}: {e}\n{series}') from e
        return df

    loop = asyncio.get_running_loop()
    df = await loop.run_in_executor(executor, _read_csv)
    return df, download_bytes
92+
93+
94+
async def pandas_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> pd.DataFrame:
    """
    Query QuestDB via CSV to a Pandas DataFrame.

    The row count is fetched first, then the result is downloaded as up to
    `chunks` concurrent slices which are parsed on a thread pool and
    concatenated in order.

    :param query: The SQL text to run.
    :param endpoint: Connection parameters; a default `Endpoint()` is used
        when omitted.
    :param chunks: Maximum number of concurrent download slices (>= 1).
    :param stats: When true, also return the number of bytes downloaded.
    :return: The DataFrame, or `(DataFrame, total_downloaded_bytes)` when
        `stats` is true.
    :raises ValueError: If `chunks` is smaller than 1.
    :raises QueryError: If the server rejects the query.
    """
    if chunks < 1:
        raise ValueError(f'chunks must be at least 1, got {chunks}')
    endpoint = endpoint or Endpoint()
    with ThreadPoolExecutor(max_workers=chunks) as executor:
        async with _new_session(endpoint) as session:
            result_schema, row_count = await _pre_query(session, endpoint, query)
            # Never split into more slices than there are rows: that would
            # only produce empty `0,0` requests. Always issue at least one
            # request so an empty result still yields its columns.
            n_slices = max(1, min(chunks, row_count))
            rows_per_spawn = row_count // n_slices
            limit_ranges = [
                (
                    i * rows_per_spawn,
                    # The last slice absorbs the division remainder.
                    ((i + 1) * rows_per_spawn) if i < n_slices - 1 else row_count
                )
                for i in range(n_slices)]
            tasks = [
                asyncio.ensure_future(_query_pandas(
                    session, executor, endpoint, query, result_schema, limit_range))
                for limit_range in limit_ranges]
            results = await asyncio.gather(*tasks)
            sub_dataframes = [result[0] for result in results]
            # Each slice is parsed with its own 0-based index; renumber so
            # the combined frame does not carry duplicate index labels.
            df = pd.concat(sub_dataframes, ignore_index=True)
            if stats:
                total_downloaded = sum(result[1] for result in results)
                return df, total_downloaded
            else:
                return df
121+
122+
123+
async def numpy_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> dict[str, np.array]:
    """
    Run `query` and return its result as a mapping of column name to
    numpy array.

    When `stats` is true the return value is a `(columns, stats)` pair,
    where `stats` is whatever `pandas_query` reported alongside the frame.
    """
    outcome = await pandas_query(query, endpoint, chunks, stats=stats)
    if stats:
        frame, downloaded = outcome
    else:
        frame, downloaded = outcome, None
    # Going through Pandas gives us its fast CSV parsing; converting each
    # column with `.to_numpy()` afterwards is cheap and generally copy-free
    # because Pandas already keeps columns as numpy data.
    columns = {}
    for name in frame:
        columns[name] = frame[name].to_numpy()
    if stats:
        return columns, downloaded
    return columns

questdb_query/endpoint.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
class Endpoint:
    """
    Parameters describing how to reach QuestDB over HTTP(S).
    """

    def __init__(self, host='127.0.0.1', port=None, https=True, username=None, password=None):
        # A missing (or otherwise falsy) port selects the default for the
        # chosen scheme.
        if https:
            fallback_port = 443
        else:
            fallback_port = 9000
        self.host = host
        self.port = port if port else fallback_port
        self.https = https
        self.username = username
        self.password = password

    @property
    def url(self):
        """Base URL of the endpoint, without a trailing slash."""
        if self.https:
            protocol = 'https'
        else:
            protocol = 'http'
        return f'{protocol}://{self.host}:{self.port}'

questdb_query/errors.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from typing import Optional
2+
3+
4+
class QueryError(Exception):
    """
    Error raised for a failed query.

    Carries the offending `query` text and, when available, the
    `position` reported for the error.
    """

    def __init__(self, message: str, query: str, position: Optional[int] = None):
        super().__init__(message)
        self.query, self.position = query, position

    @classmethod
    def from_json(cls, json: dict):
        """
        Build the error from a response dict with `error` and `query`
        keys and an optional `position` key.
        """
        fields = {
            'message': json['error'],
            'query': json['query'],
            'position': json.get('position'),
        }
        return cls(**fields)

questdb_query/synchronous.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""
2+
A sync shim around the `asynchronous` module.
3+
"""
4+
5+
__all__ = ['pandas_query', 'numpy_query']
6+
7+
import asyncio
8+
9+
import numpy as np
10+
import pandas as pd
11+
12+
from . import asynchronous as a
13+
from .endpoint import Endpoint
14+
15+
16+
def _syncify(async_fn, *args, **kwargs):
    """
    Run an async call to completion and return its result synchronously.

    :param async_fn: Either a coroutine function, which is invoked as
        `async_fn(*args, **kwargs)`, or an already-created coroutine
        object, which is run as-is.
    :param args: Positional arguments for `async_fn`.
    :param kwargs: Keyword arguments for `async_fn`.
    :return: Whatever the coroutine returns.
    """
    # Local import: only needed for the "loop already running" path.
    from concurrent.futures import ThreadPoolExecutor

    if asyncio.iscoroutine(async_fn):
        # Callers may hand over a coroutine they have already created;
        # calling it again would fail because coroutines are not callable.
        coro = async_fn
    else:
        coro = async_fn(*args, **kwargs)
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: drive the coroutine directly.
        return asyncio.run(coro)
    # A loop is already running in this thread, and it cannot be re-entered
    # with `run_until_complete`. Run the coroutine on a fresh loop in a
    # helper thread and block until it finishes.
    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(asyncio.run, coro).result()
25+
26+
27+
def pandas_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> pd.DataFrame:
    """
    Blocking counterpart of `asynchronous.pandas_query`.

    Takes the same arguments and returns the same result, running the
    async implementation to completion before returning.
    """
    # `_syncify` calls `async_fn(*args, **kwargs)` itself, so it must be
    # given the coroutine function and its arguments separately rather
    # than an already-created coroutine object.
    return _syncify(a.pandas_query, query, endpoint, chunks, stats=stats)
29+
30+
31+
def numpy_query(query: str, endpoint: Endpoint = None, chunks: int = 1, *, stats: bool = False) -> dict[str, np.array]:
    """
    Blocking counterpart of `asynchronous.numpy_query`.

    Takes the same arguments and returns the same result, running the
    async implementation to completion before returning.
    """
    # `_syncify` calls `async_fn(*args, **kwargs)` itself, so it must be
    # given the coroutine function and its arguments separately rather
    # than an already-created coroutine object.
    return _syncify(a.numpy_query, query, endpoint, chunks, stats=stats)

questdb_query/tool.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Benchmarking tool
3+
4+
From the command line, run as::
5+
6+
python3 -m questdb_query.tool --help
7+
8+
"""
9+
10+
from .endpoint import Endpoint
11+
from .synchronous import numpy_query
12+
13+
14+
def _parse_args():
    """
    Parse the benchmarking tool's command line from `sys.argv`.

    Returns the argparse namespace with `host`, `port`, `https`,
    `username`, `password`, `chunks` and the positional `query`.
    """
    from argparse import ArgumentParser

    # (name, add_argument keyword arguments), in the order they are
    # registered with the parser.
    argument_specs = (
        ('--host', {'type': str, 'default': 'localhost'}),
        ('--port', {'type': int}),
        ('--https', {'action': 'store_true'}),
        ('--username', {'type': str}),
        ('--password', {'type': str}),
        ('--chunks', {'type': int, 'default': 1}),
        ('query', {'type': str}),
    )
    cli = ArgumentParser()
    for name, options in argument_specs:
        cli.add_argument(name, **options)
    return cli.parse_args()
25+
26+
27+
def main(args):
    """
    Run the query once and print timing and download throughput.

    :param args: Parsed command line exposing `host`, `port`, `https`,
        `username`, `password`, `chunks` and `query` attributes.
    """
    import time
    endpoint = Endpoint(
        host=args.host,
        port=args.port,
        https=args.https,
        username=args.username,
        password=args.password)
    start_time = time.perf_counter()
    # `numpy_query` takes the query text first and the endpoint second.
    # The arrays themselves are not needed here, only the byte count.
    _, total_downloaded = numpy_query(args.query, endpoint, args.chunks, stats=True)
    elapsed = time.perf_counter() - start_time
    print(f'Elapsed: {elapsed}')
    bytes_throughput = total_downloaded / 1024.0 / 1024.0 / elapsed
    print(
        f'Data throughput: {bytes_throughput:.2f} MiB/sec (of downloaded CSV data)')
42+
43+
44+
if __name__ == "__main__":
    # Script entry point: parse the command line, then run the benchmark.
    args = _parse_args()
    main(args)

0 commit comments

Comments
 (0)