In [1]:
!pip3 install --quiet ray vastdb aiohttp aiohttp_cors opencensus 
!pip3 install -U grpcio



In [13]:
import ray

# Tear down any Ray instance left over from an earlier run of this notebook.
if ray.is_initialized():
    ray.shutdown()

In [None]:
import ray

# Start a local Ray instance. `_temp_dir` places Ray's session directory under
# /tmp/ray_spill (the Ray Data execution logs shown later are written there).
ray.init(_temp_dir="/tmp/ray_spill")

In [15]:
# List the tasks known to the Ray cluster before any dataset work has been submitted.
! ray list tasks

No resource in the cluster
[0m

In [16]:
import os

def mask_tail(value, keep=4):
    """Return the last ``keep`` characters of ``value`` for safe display.

    Returns the placeholder ``"<unset>"`` when ``value`` is ``None`` (i.e. the
    environment variable is not defined), so the summary below can still be
    printed instead of failing with ``TypeError`` on ``None[-4:]``.
    """
    if value is None:
        return "<unset>"
    return value[-keep:]


# Connection settings are read from the environment; nothing is hard-coded.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

# Note: the Python names use the S3_ prefix while the environment variables
# use the S3A_ prefix.
S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

# Echo the effective configuration. Only the last four characters of each
# key are shown so the notebook output does not leak full credentials.
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={mask_tail(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{mask_tail(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={mask_tail(S3_ACCESS_KEY)}
S3_SECRET_KEY=****{mask_tail(S3_SECRET_KEY)}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



In [17]:
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
import backoff

import pyarrow as pa
import vastdb
from vastdb.config import QueryConfig
from ray.data.block import Block, BlockMetadata
from ray.data.datasource.datasource import Datasource, ReadTask

logger = logging.getLogger(__name__)

class VastDBDatasource(Datasource):
    """Ray Data datasource that reads a VastDB table into Arrow blocks.

    The driver-side instance connects once to look up the table and its
    schema; every read task then opens its own VastDB session on the worker
    that executes it and returns the result of ``table.select(...)``.
    """

    def __init__(
        self,
        endpoint: str,
        access_key: str,
        secret_key: str,
        bucket: str,
        schema: str,
        table: str,
        columns: Optional[List[str]] = None,
        predicate: Optional[str] = None,
        query_config: Optional[Dict] = None,
    ):
        """Store the connection and query parameters; no I/O happens here.

        Args:
            endpoint: VastDB endpoint passed to ``vastdb.connect``.
            access_key: Access key passed to ``vastdb.connect``.
            secret_key: Secret key passed to ``vastdb.connect``.
            bucket: Name of the bucket holding the schema.
            schema: Name of the schema holding the table.
            table: Name of the table to read.
            columns: Forwarded unchanged as ``table.select(columns=...)``.
            predicate: Forwarded unchanged as ``table.select(predicate=...)``.
                NOTE(review): annotated as ``str``, but callers in this
                notebook pass an ibis expression -- confirm the intended type.
            query_config: Optional mapping of ``QueryConfig`` attribute names
                to values that override the defaults built in
                ``get_read_tasks``.
        """
        self._endpoint = endpoint
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._schema = schema
        self._table = table
        self._columns = columns
        self._predicate = predicate
        self._query_config = query_config or {}
        self._session = None
        self._table_ref = None
        # Filled in by _get_or_create_session(); initialised here so the
        # attribute always exists.
        self._table_schema = None

    def _get_or_create_session(self):
        """Connect to VastDB and cache the table handle and schema (once).

        State is published on ``self`` only after the whole lookup succeeded,
        so a failure (bad bucket/schema/table name, network error) leaves the
        instance untouched and the next call retries from scratch.
        """
        if self._session is not None:
            return

        import vastdb

        session = vastdb.connect(
            endpoint=self._endpoint,
            access=self._access_key,
            secret=self._secret_key
        )
        with session.transaction() as tx:
            table_ref = tx.bucket(self._bucket).schema(self._schema).table(self._table)
            # Used below as the schema of every block's metadata
            # (presumably a pyarrow.Schema -- TODO confirm).
            table_schema = table_ref.columns()

        self._table_ref = table_ref
        self._table_schema = table_schema
        self._session = session

    def estimate_inmemory_data_size(self) -> Optional[int]:
        """Return ``None`` (size unknown); also triggers the initial connect."""
        self._get_or_create_session()
        return None

    def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
        """Build ``parallelism`` read tasks, one per split index.

        Args:
            parallelism: Number of read tasks to create; also used as
                ``QueryConfig.num_splits``.

        Returns:
            A list of ``ReadTask`` objects. Each task, when run, returns a
            single-element list holding the table produced by ``read_all()``.
        """
        self._get_or_create_session()

        # Defaults; any of these can be overridden through `query_config`.
        base_config = QueryConfig(
            num_splits=parallelism,
            num_sub_splits=4,
            use_semi_sorted_projections=True,
            limit_rows_per_sub_split=128 * 1024,
            num_row_groups_per_sub_split=8
        )

        # Apply user overrides. Unknown keys are still ignored, but no longer
        # silently, so typos in option names become visible.
        for key, value in self._query_config.items():
            if hasattr(base_config, key):
                setattr(base_config, key, value)
            else:
                logger.warning(
                    "Ignoring unknown VastDB QueryConfig option %r", key
                )

        def make_block(
            endpoint: str,
            access_key: str,
            secret_key: str,
            bucket: str,
            schema_name: str,
            table: str,
            columns: Optional[List[str]],
            predicate: Optional[str],
            config: QueryConfig,
            split_index: int,
        ) -> Block:
            """Open a fresh session on the worker and read one block."""
            # Imported here because this function runs inside a Ray worker.
            import dataclasses
            import vastdb

            session = vastdb.connect(
                endpoint=endpoint,
                access=access_key,
                secret=secret_key
            )

            with session.transaction() as tx:
                table_ref = tx.bucket(bucket).schema(schema_name).table(table)

                # Copy the whole config and change only the query id, so every
                # user-supplied QueryConfig field reaches the query (a
                # hand-written field list would drop the ones it omits).
                #
                # NOTE(review): all tasks issue the same select with
                # num_splits=parallelism and differ only in query_id. If
                # `select` returns the full table regardless of query_id, each
                # task yields every row and the dataset holds `parallelism`
                # copies -- confirm against the vastdb SDK.
                split_config = dataclasses.replace(
                    config,
                    query_id=f"{config.query_id}_split_{split_index}",
                )

                reader = table_ref.select(
                    columns=columns,
                    predicate=predicate,
                    config=split_config
                )

                # Materialise inside the transaction, while the reader is valid.
                return reader.read_all()

        # Create a read task for each split
        read_tasks = []
        for i in range(parallelism):
            # Row count and byte size are not known before the read runs.
            metadata = BlockMetadata(
                num_rows=None,
                size_bytes=None,
                schema=self._table_schema,
                input_files=None,
                exec_stats=None,
            )

            # Plain values only: the task must not capture `self`.
            make_block_args = (
                self._endpoint,
                self._access_key,
                self._secret_key,
                self._bucket,
                self._schema,
                self._table,
                self._columns,
                self._predicate,
                base_config,
                i,  # split_index
            )

            # The default argument binds this iteration's args to the lambda.
            read_task = ReadTask(
                lambda args=make_block_args: [make_block(*args)],
                metadata,
            )
            read_tasks.append(read_task)

        return read_tasks

In [18]:
import ray
from ibis import _

# Row filter: keep rows whose `text` column contains "BigData".
predicate = _.text.contains('BigData')

# Describe the source table; connection details come from the config cell.
vastDB = VastDBDatasource(
    endpoint=VASTDB_ENDPOINT,
    access_key=VASTDB_ACCESS_KEY,
    secret_key=VASTDB_SECRET_KEY,
    bucket=VASTDB_TWITTER_INGEST_BUCKET,
    schema=VASTDB_TWITTER_INGEST_SCHEMA,
    table=VASTDB_TWITTER_INGEST_TABLE,
    columns=None,       # no column projection requested
    predicate=predicate,
    query_config=None,  # use the datasource's default query settings
)

# Wrap the datasource in a Ray Dataset.
ds = ray.data.read_datasource(vastDB)

In [19]:
# Display the dataset's column names and types.
ds.schema()

Column      Type
------      ----
created_at  int64
id          int64
id_str      string
text        string

In [20]:
# Pull a single row to confirm the read path works end to end.
ds.take(1)

2025-02-17 23:02:26,615	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray_spill/session_2025-02-17_23-02-05_547675_111390/logs/ray-data
2025-02-17 23:02:26,616	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadVastDB] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- ReadVastDB 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

[{'created_at': 1732218546737,
  'id': 2385707629080799740,
  'id_str': '2385707629080799740',
  'text': 'finally got how brilliant BigData is!'}]

In [21]:
# Show node status, resource usage and pending demands after the read.
! ray status

Node status
---------------------------------------------------------------
Active:
 1 node_0f3c8e09fc6b32909715b64ea93f59dce9ae3c4099ba907a1e4d7c6a
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 6.0/16.0 CPU
 0B/13.77GiB memory
 740.14KiB/6.89GiB object_store_memory

Demands:
 {'CPU': 1}: 20+ from request_resources()
[0m

In [22]:
# List the tasks again now that the dataset read has run, including their states and error types.
! ray list tasks


Stats:
------------------------------
Total: 44

Table:
------------------------------
    TASK_ID                                             ATTEMPT_NUMBER  NAME                                                STATE       JOB_ID  ACTOR_ID                          TYPE                 FUNC_OR_CLASS_NAME                      PARENT_TASK_ID                                    NODE_ID                                                   WORKER_ID                                                   WORKER_PID  ERROR_TYPE
 0  0e79562c1e26791dbdda415d322550205aa23b2901000000                 0  _StatsActor.update_execution_metrics                FAILED    01000000  bdda415d322550205aa23b2901000000  ACTOR_TASK           _StatsActor.update_execution_metrics    ffffffffffffffffffffffffffffffffffffffff01000000                                                                                                                                    OUT_OF_MEMORY
 1  10d0edfb9cfb2004ffffffffffffffffffffffff01000