In [45]:
! pip3 uninstall -y vastdb

Found existing installation: vastdb 1.3.4
Uninstalling vastdb-1.3.4:
  Successfully uninstalled vastdb-1.3.4


In [46]:
! pip3 install git+https://github.com/snowch/vastdb_sdk.git

Collecting git+https://github.com/snowch/vastdb_sdk.git
  Cloning https://github.com/snowch/vastdb_sdk.git to /tmp/pip-req-build-2fiaivum
  Running command git clone --filter=blob:none --quiet https://github.com/snowch/vastdb_sdk.git /tmp/pip-req-build-2fiaivum
  Resolved https://github.com/snowch/vastdb_sdk.git to commit 5139d10b2884104b4b7ac6583c6e6cc5464c97b3
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: vastdb
  Building wheel for vastdb (setup.py) ... [?25ldone
[?25h  Created wheel for vastdb: filename=vastdb-1.3.4-py3-none-any.whl size=286994 sha256=01d0996b561373ab9395557d73cbf6cfc36325b89c214434d8562566d7c6498d
  Stored in directory: /tmp/pip-ephem-wheel-cache-gr2sl_0v/wheels/b1/28/4c/4ce72b85d2226169cfcbfd1e7a3612c2f67f9dba92e95dd4e6
Successfully built vastdb
Installing collected packages: vastdb
Successfully installed vastdb-1.3.4


In [47]:
!pip3 install --quiet ray aiohttp aiohttp_cors opencensus 
!pip3 install -U grpcio



In [48]:
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
import backoff
import pyarrow as pa
import vastdb
from vastdb.config import QueryConfig
from ray.data.block import Block, BlockMetadata
from ray.data.datasource.datasource import Datasource, ReadTask

logger = logging.getLogger(__name__)

class VastDBDatasource(Datasource):
    """Datasource for reading from VastDB tables."""

    def __init__(
        self,
        endpoint: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        schema: str,
        table: str,
        columns: Optional[List[str]] = None,
        predicate: Optional[str] = None,
        query_config: Optional[Dict] = None,
    ):
        """Initialize VastDB datasource."""
        self._endpoint = endpoint
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._schema = schema
        self._table = table
        self._columns = columns
        self._predicate = predicate
        self._query_config = query_config or {}
        self._session = None
        self._table_ref = None

    def _get_or_create_session(self):
        if self._session is None:
            self._session = vastdb.connect(
                endpoint=self._endpoint,
                access=self._access_key,
                secret=self._secret_key
            )
            with self._session.transaction() as tx:
                bucket = tx.bucket(self._bucket)
                schema = bucket.schema(self._schema)
                self._table_ref = schema.table(self._table)
                self._table_schema = self._table_ref.columns()

    def estimate_inmemory_data_size(self) -> Optional[int]:
        self._get_or_create_session()
        return None

    def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
        """Create read tasks for parallel processing."""
        self._get_or_create_session()

        def make_block(
            endpoint: str,
            access_key: str,
            secret_key: str,
            bucket: str,
            schema_name: str,
            table: str,
            columns: Optional[List[str]],
            predicate: Optional[str],
            config: QueryConfig,
            split_index: int,
        ) -> Block:
            session = vastdb.connect(
                endpoint=endpoint,
                access=access_key,
                secret=secret_key
            )
            with session.transaction() as tx:
                bucket = tx.bucket(bucket)
                schema = bucket.schema(schema_name)
                table_ref = schema.table(table)

                # Use the monkey-patched method
                reader = table_ref.select_with_splits(
                    columns=columns,
                    predicate=predicate,
                    config=config,
                    split_id=split_index
                )
                batches = reader.read_all()

            return pa.Table.from_batches(batches)

        # Create a read task for each split
        read_tasks = []
        for i in range(parallelism):
            metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=self._table_schema,
                input_files=None,
                exec_stats=None,
            )

            make_block_args = (
                self._endpoint,
                self._access_key,
                self._secret_key,
                self._bucket,
                self._schema,
                self._table,
                self._columns,
                self._predicate,
                QueryConfig(**self._query_config),
                i,  # split_index
            )

            read_task = ReadTask(
                lambda args=make_block_args: [make_block(*args)],
                metadata,
            )
            read_tasks.append(read_task)

        return read_tasks

In [49]:
import ray
from ibis import _

ray.shutdown() if ray.is_initialized() else None
ray.init(_temp_dir="/tmp/ray_spill")

2025-03-02 12:09:53,736	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.10
Ray version:,2.43.0
Dashboard:,http://127.0.0.1:8265


[36m(ReadVastDB pid=637723)[0m rolling back txid=05b7500000001131 due to:
[36m(ReadVastDB pid=637723)[0m Traceback (most recent call last):
[36m(ReadVastDB pid=637723)[0m   File "/tmp/ipykernel_624451/223435526.py", line 85, in make_block
[36m(ReadVastDB pid=637723)[0m   File "/opt/conda/lib/python3.11/site-packages/vastdb/table.py", line 473, in select_with_splits
[36m(ReadVastDB pid=637723)[0m     process_split()
[36m(ReadVastDB pid=637723)[0m   File "/opt/conda/lib/python3.11/site-packages/vastdb/table.py", line 471, in process_split
[36m(ReadVastDB pid=637723)[0m     split_state.process_split(host_api, record_batches_queue, check_stop)
[36m(ReadVastDB pid=637723)[0m   File "/opt/conda/lib/python3.11/site-packages/vastdb/table.py", line 88, in process_split
[36m(ReadVastDB pid=637723)[0m     for stream_id, next_row_id, table_chunk in response_iter:
[36m(ReadVastDB pid=637723)[0m   File "/opt/conda/lib/python3.11/site-packages/vastdb/_internal.py", line 1951, in p

In [50]:
import os

DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={S3_ACCESS_KEY[-4:]}
S3_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



In [51]:
from ibis import _

#predicate = (_.text.contains('BigData'))

vastDB = VastDBDatasource(
            endpoint = VASTDB_ENDPOINT,
            access_key = VASTDB_ACCESS_KEY,
            secret_key = VASTDB_SECRET_KEY,
            bucket = VASTDB_TWITTER_INGEST_BUCKET,
            schema = VASTDB_TWITTER_INGEST_SCHEMA,
            table = VASTDB_TWITTER_INGEST_TABLE,
            columns = None,
            predicate = None,
            query_config = None
        )

ds = ray.data.read_datasource(vastDB)

In [52]:
ds.count()

2025-03-02 12:09:54,941	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray_spill/session_2025-03-02_12-09-52_740788_624451/logs/ray-data
2025-03-02 12:09:54,943	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadVastDB] -> AggregateNumRows[AggregateNumRows]


Running 0: 0.00 row [00:00, ? row/s]

- ReadVastDB 1: 0.00 row [00:00, ? row/s]

- AggregateNumRows 2: 0.00 row [00:00, ? row/s]

2025-03-02 12:09:57,327	ERROR serialization.py:462 -- Failed to unpickle serialized exception
Traceback (most recent call last):
  File "python/ray/_raylet.pyx", line 479, in ray._raylet.ObjectRefGenerator._next_sync
  File "python/ray/_raylet.pyx", line 4847, in ray._raylet.CoreWorker.try_read_next_object_ref_stream
  File "python/ray/includes/common.pxi", line 81, in ray._raylet.check_status
ray.exceptions.ObjectRefStreamEndOfStreamError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 104, in on_data_ready
    meta = ray.get(next(self._streaming_gen))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "python/ray/_raylet.pyx", line 328, in ray._raylet.ObjectRefGenerator.__next__
  File "python/ray/_raylet.pyx", line 497, in ray._raylet.ObjectRefGenerator._next_sync
StopIteration

During handling of the above exce

RaySystemError: System error: Failed to unpickle serialized exception
traceback: Traceback (most recent call last):
  File "python/ray/_raylet.pyx", line 479, in ray._raylet.ObjectRefGenerator._next_sync
  File "python/ray/_raylet.pyx", line 4847, in ray._raylet.CoreWorker.try_read_next_object_ref_stream
  File "python/ray/includes/common.pxi", line 81, in ray._raylet.check_status
ray.exceptions.ObjectRefStreamEndOfStreamError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 104, in on_data_ready
    meta = ray.get(next(self._streaming_gen))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "python/ray/_raylet.pyx", line 328, in ray._raylet.ObjectRefGenerator.__next__
  File "python/ray/_raylet.pyx", line 497, in ray._raylet.ObjectRefGenerator._next_sync
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/ray/exceptions.py", line 51, in from_ray_exception
    return pickle.loads(ray_exception.serialized_exception)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: HttpError.__init__() missing 5 required positional arguments: 'message', 'method', 'url', 'status', and 'headers'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/ray/_private/serialization.py", line 460, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/ray/_private/serialization.py", line 342, in _deserialize_object
    return RayError.from_bytes(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/ray/exceptions.py", line 45, in from_bytes
    return RayError.from_ray_exception(ray_exception)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/ray/exceptions.py", line 54, in from_ray_exception
    raise RuntimeError(msg) from e
RuntimeError: Failed to unpickle serialized exception
