In [1]:
# pip uninstall -y "daft[ray]" "ray[client]==2.41.0"

In [2]:
%%capture --no-stderr
%pip install "ray[client]==2.41.0"
%pip install "daft[ray]" --no-deps
%pip install vastdb

In [3]:
!pip list | grep -E "(ray|daft)"

daft                      0.4.8
ray                       2.41.0


In [4]:
import pyarrow as pa
import vastdb
import daft
import ray
import traceback
import sys
import time
from typing import Dict, Any, Optional

@ray.remote
class VastDataSource:
    def __init__(
        self, 
        endpoint: str, 
        access_key: str, 
        secret_key: str,
        bucket_name: str, 
        schema_name: str, 
        table_name: str
    ):
        """
        Initialize VAST data source connection to be run on Ray cluster
        """
        self.endpoint = endpoint
        self.access_key = access_key
        self.secret_key = secret_key
        self.bucket_name = bucket_name
        self.schema_name = schema_name
        self.table_name = table_name
        
        # Verbose print for initialization
        print(f"[INIT] VastDataSource initialized for table: {table_name}", file=sys.stderr)
    
    def verbose_batch_reading(self, reader: pa.RecordBatchReader):
        """
        Verbose batch reading with detailed diagnostics
        """
        print(f"[DEBUG] Starting batch reading for {self.table_name}", file=sys.stderr)
        
        try:
            import socket
            import os
        
            # Capture host information
            hostname = socket.gethostname()
            ip_address = socket.gethostbyname(hostname)
            pid = os.getpid()
            
            # Print detailed host information
            print(f"[HOST] Hostname: {hostname}", file=sys.stderr)
            print(f"[DEBUG] Reader type: {type(reader)}", file=sys.stderr)
            print(f"[DEBUG] Reader schema: {reader.schema}", file=sys.stderr)
            
            # Batch collection with verbose logging
            batches = []
            batch_count = 0
            start_time = time.time()
            
            try:
                # Attempt different iteration methods
                for batch in reader:
                    print(f"[DEBUG] Collected batch {batch_count}", file=sys.stderr)
                    batches.append(batch)
                    batch_count += 1
                    
                    # Safety break to prevent infinite loops
                    if batch_count >= 1000:  # Adjust as needed
                        print(f"[WARNING] Reached batch limit of 1000", file=sys.stderr)
                        break
            except Exception as iter_err:
                print(f"[ERROR] Iteration error: {iter_err}", file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
            
            # Timing and summary
            end_time = time.time()
            print(f"[DEBUG] Batch reading summary:", file=sys.stderr)
            print(f"  Total batches: {batch_count}", file=sys.stderr)
            print(f"  Total time: {end_time - start_time:.2f} seconds", file=sys.stderr)
            
            return batches
        
        except Exception as e:
            print(f"[CRITICAL] Batch reading failed: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            return []

    def get_vast_data(self, columns=None, predicate=None, internal_row_id=False):
        """
        Retrieve data from VAST database with verbose error handling
        """
        print(f"[METHOD] get_vast_data called for {self.table_name}", file=sys.stderr)
        print(f"[PARAMS] Columns: {columns}, Predicate: {predicate}", file=sys.stderr)

        try:
            from vastdb.config import QueryConfig

            # config = QueryConfig(
            #   num_splits=1,                	 
            #   num_sub_splits=1,              
            #   limit_rows_per_sub_split=10,   
            # )

            # Connection establishment with verbose logging
            try:
                session = vastdb.connect(
                    endpoint=self.endpoint,
                    access=self.access_key,
                    secret=self.secret_key
                )
                print("[CONNECTION] VAST database connection established", file=sys.stderr)
            except Exception as conn_err:
                print(f"[ERROR] Connection failed: {conn_err}", file=sys.stderr)
                traceback.print_exc(file=sys.stderr)
                raise
            
            with session.transaction() as tx:
                bucket = tx.bucket(self.bucket_name)
                schema = bucket.schema(self.schema_name)
                table = schema.table(self.table_name)
                
                try:
                    # Select data with comprehensive error handling
                    reader: pa.RecordBatchReader = table.select(
                        columns=columns, 
                        # config=config
                    )
                    
                    # Verbose batch reading
                    batches = self.verbose_batch_reading(reader)
                    
                    if not batches:
                        print("[WARNING] No batches collected", file=sys.stderr)
                        return pa.Table.from_batches([])
                    
                    # Convert to table
                    result_table = pa.Table.from_batches(batches)
                    
                    print(f"[SUCCESS] Retrieved table with {result_table.num_rows} rows", file=sys.stderr)
                    return result_table
                
                except Exception as select_err:
                    print(f"[ERROR] Data selection failed: {select_err}", file=sys.stderr)
                    traceback.print_exc(file=sys.stderr)
                    raise
        
        except Exception as e:
            print(f"[CRITICAL] Comprehensive error in get_vast_data: {e}", file=sys.stderr)
            traceback.print_exc(file=sys.stderr)
            raise ValueError(f"Error reading data from VAST: {e}")

def create_vast_table(
    endpoint: str, 
    access_key: str, 
    secret_key: str,
    bucket_name: str, 
    schema_name: str, 
    table_name: str,
    columns: Optional[list] = None,
    predicate: Optional[Any] = None,
    internal_row_id: Optional[bool] = None,
    context = None
) -> 'daft.DataFrame':
    """
    Create a Daft DataFrame from a VAST data source running on Ray cluster
    """
    print(f"[FUNCTION] create_vast_table called for {table_name}", file=sys.stderr)
    
    try:
        # Create a Ray actor for the VAST data source
        vast_actor = VastDataSource.remote(
            endpoint, access_key, secret_key, 
            bucket_name, schema_name, table_name
        )
        
        # Retrieve Daft DataFrame via Ray with optional column and predicate filtering
        daft_dataframe = ray.get(vast_actor.get_vast_data.remote(columns, predicate, internal_row_id))
        
        print(f"[SUCCESS] Created Daft DataFrame from {table_name}", file=sys.stderr)
        return daft.from_arrow(daft_dataframe)
    
    except Exception as e:
        print(f"[ERROR] Failed to create VAST table: {e}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        raise

In [5]:
import ray
from daft import context

# Initialize Ray with the specified address
if not ray.is_initialized():
    ray.init(address="ray://10.143.11.241:30001", runtime_env={"pip": ["daft", "vastdb", "pyarrow"]})

    if not context.get_context():
        context.set_runner_ray(address="ray://10.143.11.241:30001")


2025-03-26 11:24:05,639	INFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: log_to_driver
SIGTERM handler is not set because current thread is not the main thread.
[36m(VastDataSource pid=5119)[0m [INIT] VastDataSource initialized for table: tweets
[36m(VastDataSource pid=5119)[0m [METHOD] get_vast_data called for tweets
[36m(VastDataSource pid=5119)[0m [PARAMS] Columns: None, Predicate: None
[36m(VastDataSource pid=5119)[0m [CONNECTION] VAST database connection established
[36m(VastDataSource pid=5119)[0m [DEBUG] Starting batch reading for tweets
[36m(VastDataSource pid=5119)[0m [HOST] Hostname: raycluster-kuberay-head-l7vdr
[36m(VastDataSource pid=5119)[0m [DEBUG] Reader type: <class 'pyarrow.lib.RecordBatchReader'>
[36m(VastDataSource pid=5119)[0m [DEBUG] Reader schema: created_at: int64
[36m(VastDataSource pid=5119)[0m   -- field metadata --
[36m(VastDataSource pid=5119)[0m   VAST:column_id: '64'
[36m(VastDataSource pid=5119)

In [6]:
# Create Daft DataFrame from VAST
df = create_vast_table(
    endpoint='http://172.200.204.2:80',
    access_key='Y5101AQQTB1PUAEKQXN5',
    secret_key='bsqwYOcsvfXxsvtTYruCT24c3w1E1Y8iBpmyoLGr',
    bucket_name='csnow-db',
    schema_name='social_media',
    table_name='tweets',
    internal_row_id=True
)

[FUNCTION] create_vast_table called for tweets
[SUCCESS] Created Daft DataFrame from tweets


In [7]:
df.show()

created_at Int64,id Int64,id_str Utf8,text Utf8
1732192043316,-3857228969640172604,-3857228969640172604,can't believe how perfect Cybersecurity is!
1732192043316,-7563449468421012864,-7563449468421012864,ready to see how kind Cybersecurity is!
1732192043316,2924690658133042582,2924690658133042582,looking forward to see how smart Security is!
1732192043316,-1541924140557640657,-1541924140557640657,so excited about how brilliant OpenSource is!
1732192043316,-6694075954799652567,-6694075954799652567,impressed with how interesting CloudEngineer is!
1732192043316,7817961198609213798,7817961198609213798,motivated by how clever BigData is!
1732192043316,6125889167885277710,6125889167885277710,inspired by how interesting DevOpsEngineering is!
1732192043316,2565220354201594939,2565220354201594939,can't wait to see how cool MachineLearning is!


In [8]:
df.count().show()

count UInt64
2588006


In [9]:
# ray.shutdown()