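# Query the VAST Catalog with the Python SDK

This notebook connects to a VAST cluster with the `vastdb` Python SDK, counts the elements in the VAST Catalog by type, and lists catalog entries owned by a given user.

See:
- VAST support article: https://support.vastdata.com/s/article/UUID-b7cf8427-3f50-ac3a-b1fe-c92649823ee7
- `vastdb` SDK documentation: https://vastdb-sdk.readthedocs.io/en/latest/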

In [1]:
! pip3 install --quiet vastdb pandas==2.0.3 numpy==1.25.0
! pip3 show vastdb

Name: vastdb
Version: 1.3.3
Summary: VAST Data SDK
Home-page: https://github.com/vast-data/vastdb_sdk
Author: VAST DATA
Author-email: hello@vastdata.com
License: Copyright (C) VAST Data Ltd.
Location: /opt/conda/lib/python3.11/site-packages
Requires: aws-requests-auth, backoff, flatbuffers, ibis-framework, pyarrow, requests, xmltodict
Required-by: 


In [2]:
import os


def mask_tail(value, prefix=""):
    """Return ``prefix`` followed by the last four characters of ``value``.

    Args:
        value: The string to abbreviate, or ``None``.
        prefix: Text placed in front of the visible tail (e.g. ``"****"``).

    Returns:
        ``"<not set>"`` when ``value`` is ``None`` or empty, so a missing
        environment variable is reported instead of failing on ``None[-4:]``;
        otherwise ``prefix`` plus the final four characters of ``value``.
    """
    if not value:
        return "<not set>"
    return f"{prefix}{value[-4:]}"


# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# Only the tail of each key is echoed; the output format is unchanged when
# every variable is set.
access_key_display = mask_tail(VASTDB_ACCESS_KEY)
secret_key_display = mask_tail(VASTDB_SECRET_KEY, prefix="****")

print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={access_key_display}
VASTDB_SECRET_KEY={secret_key_display}
---
""")

# Name any missing variable explicitly so the cause is visible here.
missing_vars = [
    name
    for name, value in (
        ("VASTDB_ENDPOINT", VASTDB_ENDPOINT),
        ("VASTDB_ACCESS_KEY", VASTDB_ACCESS_KEY),
        ("VASTDB_SECRET_KEY", VASTDB_SECRET_KEY),
    )
    if not value
]
if missing_vars:
    print(f"WARNING: environment variable(s) not set: {', '.join(missing_vars)}")


---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
---



In [4]:
import pyarrow as pa
import pyarrow.compute as pc
import vastdb

# Open a session against the endpoint using the credentials loaded earlier.
session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=VASTDB_ACCESS_KEY,
    secret=VASTDB_SECRET_KEY)

# Only the read needs the transaction; read_all() materialises the result, so
# the analysis below runs after the transaction has been closed.
with session.transaction() as tx:
    table = tx.catalog().select(['element_type']).read_all()

# The statistics are computed directly on the Arrow column rather than on a
# pandas DataFrame: with tens of millions of rows, converting the string column
# to pandas objects costs far more memory than the three aggregates need.
# NOTE(review): assumes read_all() yields a pyarrow.Table — the original code
# called .to_pandas() on it; confirm against the SDK docs.
element_types = table['element_type']

total_elements = table.num_rows
print(f"Total elements in the catalog: {total_elements}")

# min_count=0 makes the sum 0 (not null) when there are no matching rows.
file_count = pc.sum(pc.equal(element_types, 'FILE'), min_count=0).as_py()
print(f"Number of files/objects: {file_count}")

# Converted to a NumPy object array so it prints in the same form as before.
distinct_elements = element_types.unique().to_numpy(zero_copy_only=False)
print("Distinct element types on the system:")
print(distinct_elements)

# Drop the references to the large catalog column so its memory can be freed.
del element_types
del table

Total elements in the catalog: 79701906
Number of files/objects: 79615136
Distinct element types on the system:
['FILE' 'DIR' 'TABLE' 'SCHEMA']


In [21]:
import pandas as pd
from ibis import _

# Show complete paths and file names rather than truncating long cell values.
pd.set_option('display.max_colwidth', None)

# Catalog columns to fetch, and the row filter: entries owned by this user.
COLUMNS = ['creation_time', 'search_path', 'name']
PREDICATE = _.owner_name == "Chris Snow"

# Run the filtered projection inside a transaction and load the result
# into a DataFrame.
with session.transaction() as tx:
    reader = tx.catalog().select(columns=COLUMNS, predicate=PREDICATE)
    table = reader.read_all()
    df = table.to_pandas()

# Preview the first rows of the result.
df.head()

Unnamed: 0,creation_time,search_path,name
0,2024-11-19 12:53:59.945971367,/csnow-bucket/iceberg/twitter_data-00749267c908442790921da3789cf6a0/metadata/,20241119_125357_00007_ne63c-28eaed13-c1c4-43ad-be97-288642eba558.stats
1,2024-11-19 12:54:01.073948904,/csnow-bucket/iceberg/twitter_data-00749267c908442790921da3789cf6a0/data/,20241119_125400_00009_ne63c-23e8463b-ace9-41c9-ac46-851f47a669cf.parquet
2,2024-11-21 07:11:10.901219362,/csnow-bucket/iceberg/twitter_data-d2fa3b2406904396b6ec891519a9d32f/metadata/,snap-6030622649746191853-1-aefc2b38-fc4b-4786-a117-475cd2bd28db.avro
3,2024-11-27 13:50:26.442020791,/csnow-bucket/iceberg/twitter_data-d2fa3b2406904396b6ec891519a9d32f/metadata/,b85c6a67-fea1-444c-aaa4-fe35b188cd96-m0.avro
4,2024-11-21 07:11:12.521849576,/csnow-bucket/iceberg/twitter_data-d2fa3b2406904396b6ec891519a9d32f/data/,20241121_071113_00838_hitua-9e17b194-5cea-4951-a10f-32e158cac854.parquet
