# Query Vast DB

## Vast DB endpoint

In [1]:
import os

# Connection settings are read from the environment so that no credentials
# are stored in the notebook; each value is None when its variable is unset.
VASTDB_ENDPOINT = os.environ.get("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.environ.get("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.environ.get("VASTDB_SECRET_KEY")

# Use the NYT bucket (database) for now.
VASTDB_NYT_BUCKET = os.environ.get("VASTDB_NYT_BUCKET")

# Location of the table queried below: bucket -> schema -> table.
bucket_name, schema_name, table_name = VASTDB_NYT_BUCKET, "cosmology", "particles"

## Python SDK Connection

In [2]:
# Source: https://vast-data.github.io/data-platform-field-docs/vast_database/ingestion/python_sdk_parquet_import.html

import io
import os
import pyarrow as pa
from pyarrow import csv as pa_csv
import pyarrow.parquet as pq
from io import StringIO
import numpy as np
import pandas as pd
import vastdb
from vastdb.config import QueryConfig

def connect_to_vastdb(endpoint, access_key, secret_key):
    """Open a VastDB session and return it.

    Args:
        endpoint: Passed to ``vastdb.connect`` as ``endpoint``.
        access_key: Passed to ``vastdb.connect`` as ``access``.
        secret_key: Passed to ``vastdb.connect`` as ``secret``.

    Returns:
        Whatever ``vastdb.connect`` returns (the session object).

    Raises:
        RuntimeError: If anything in the connection attempt raises; the
            original exception is chained as ``__cause__``.
    """
    credentials = {"endpoint": endpoint, "access": access_key, "secret": secret_key}
    try:
        vast_session = vastdb.connect(**credentials)
        print("Connected to VastDB")
    except Exception as exc:
        # Re-raise with context so the failure is recognisable in notebook output.
        raise RuntimeError(f"Failed to connect to VastDB: {exc}") from exc
    return vast_session

def query_vastdb(session, bucket_name, schema_name, table_name, limit=None):
    """Read rows from a VastDB table into a pandas DataFrame.

    This is a read-only helper: it never creates a schema or table. The
    schema and table are looked up with the SDK's default behaviour, so a
    missing schema or table surfaces as the SDK's own lookup error instead
    of being silently created.

    Args:
        session: Session object as returned by ``connect_to_vastdb``.
        bucket_name: Name of the bucket (database) holding the schema.
        schema_name: Name of the schema holding the table.
        table_name: Name of the table to read.
        limit: Maximum number of rows to return. ``None`` (or any other
            falsy value such as ``0``) reads the whole table.

    Returns:
        pandas.DataFrame with the selected rows. When ``limit`` is given and
        the query yields no batches, an empty DataFrame built from the
        reader's schema is returned.
    """
    with session.transaction() as tx:
        table = tx.bucket(bucket_name).schema(schema_name).table(table_name)

        if not limit:
            return table.select().read_all().to_pandas()

        # See: https://vast-data.github.io/data-platform-field-docs/vast_database/sdk_ref/limit_n.html
        config = QueryConfig(
            num_splits=1,                     # Manually specify 1 split
            num_sub_splits=1,                 # The split is divided into 1 sub-split
            limit_rows_per_sub_split=limit,   # Each sub-split processes at most `limit` rows at a time
        )
        batches = table.select(config=config)

        # An exhausted reader must not leak StopIteration to the caller.
        first_batch = next(batches, None)
        if first_batch is None:
            return batches.schema.empty_table().to_pandas()

        # Defensive cap: never hand back more rows than were asked for.
        return first_batch.slice(0, limit).to_pandas()

In [3]:
# Open the session that the query cells below share.
session = connect_to_vastdb(
    endpoint=VASTDB_ENDPOINT,
    access_key=VASTDB_ACCESS_KEY,
    secret_key=VASTDB_SECRET_KEY,
)

Connected to VastDB


### Compute the Total Mass (PartType0)

Only PartType0 (gas) particles have been loaded into the database so far, so the total below covers gas particles only.

In [4]:
import duckdb
# In-memory DuckDB connection used to aggregate the rows streamed from VastDB.
conn = duckdb.connect()

# Everything that touches the table runs inside the transaction block.
with session.transaction() as tx:
    bucket = tx.bucket(bucket_name)
    table = bucket.schema(schema_name).table(table_name)

    # Only the Mass column is requested. The SQL below refers to the Python
    # variable `batches` by name, so that name must stay as it is.
    batches = table.select(columns=['Mass'])
    total_mass_sql = """
    SELECT SUM(Mass) FROM batches
    """
    total_mass = conn.execute(total_mass_sql).arrow()
    print(total_mass)


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

pyarrow.Table
sum(Mass): double
----
sum(Mass): [[89401.89030224655]]


### Average Velocity of Gas Particles (PartType0)

In [5]:
import duckdb
# In-memory DuckDB connection used to aggregate the rows streamed from VastDB.
conn = duckdb.connect()

# Everything that touches the table runs inside the transaction block.
with session.transaction() as tx:
    bucket = tx.bucket(bucket_name)
    table = bucket.schema(schema_name).table(table_name)

    # Only the Velocity column is requested. The SQL below refers to the Python
    # variable `batches` by name, so that name must stay as it is.
    batches = table.select(columns=['Velocity'])

    # Velocity is indexed per component (elements 1, 2, 3 -> X, Y, Z).
    avg_velocity_sql = """
    SELECT 
       AVG(Velocity[1]) AS AvgVelocity_X,
       AVG(Velocity[2]) AS AvgVelocity_Y,
       AVG(Velocity[3]) AS AvgVelocity_Z
    FROM batches
    """
    avg_velocity = conn.execute(avg_velocity_sql).arrow()
    print(avg_velocity)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

pyarrow.Table
AvgVelocity_X: double
AvgVelocity_Y: double
AvgVelocity_Z: double
----
AvgVelocity_X: [[12.27964513190702]]
AvgVelocity_Y: [[9.441816902891075]]
AvgVelocity_Z: [[123.35455341785347]]
