# Catalog - Python SDK Examples

In [1]:
!pip install --quiet vastdb

## S3 Metadata (Tags)

In [2]:
! s3cmd_configure.sh # custom script to setup s3cmd connection detail

In [3]:
! s3cmd ls s3://csnow-bucket/nyt/ | awk 'NR<=10' # just show a few rows (10)

2025-01-22 14:57    481274128  s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet
2024-12-09 15:54    458846485  s3://csnow-bucket/nyt/yellow_tripdata_2009-02.parquet
2024-12-09 15:55    498225013  s3://csnow-bucket/nyt/yellow_tripdata_2009-03.parquet
2024-12-09 15:56    494119681  s3://csnow-bucket/nyt/yellow_tripdata_2009-04.parquet
2024-12-09 15:56    515203538  s3://csnow-bucket/nyt/yellow_tripdata_2009-05.parquet
2024-12-09 15:57    491071155  s3://csnow-bucket/nyt/yellow_tripdata_2009-06.parquet
2024-12-09 15:57    471955931  s3://csnow-bucket/nyt/yellow_tripdata_2009-07.parquet
2024-12-09 15:58    477035993  s3://csnow-bucket/nyt/yellow_tripdata_2009-08.parquet
2024-12-09 15:58    488439674  s3://csnow-bucket/nyt/yellow_tripdata_2009-09.parquet
2024-12-09 15:59    543738743  s3://csnow-bucket/nyt/yellow_tripdata_2009-10.parquet


### Remove previous user tags

In [4]:
! s3cmd modify --remove-header='x-amz-meta-foo' s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet

modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]


In [5]:
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet

s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:43 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL


### Add user tags

In [6]:
! s3cmd modify --add-header=x-amz-meta-foo:bar s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet

modify: 's3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet'  [1 of 1]


In [7]:
! s3cmd info s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet

s3://csnow-bucket/nyt/yellow_tripdata_2009-01.parquet (object):
   File size: 481274128
   Last mod:  Wed, 22 Jan 2025 15:52:45 GMT
   MIME type: binary/octet-stream
   Storage:   STANDARD
   MD5 sum:   d41d8cd98f00b204e9800998ecf8427e-58
   SSE:       none
   Policy:    none
   CORS:      none
   ACL:       Chris Snow: FULL_CONTROL
   x-amz-meta-foo: bar


### Create VAST DB Catalog Session

In [8]:
import os

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
""")


---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
---



In [9]:
import pyarrow as pa
import vastdb

session = vastdb.connect(
    endpoint=VASTDB_ENDPOINT,
    access=VASTDB_ACCESS_KEY,
    secret=VASTDB_SECRET_KEY)

In [10]:
field_names = ['element_type'] # Only need the element_type field for counting

with session.transaction() as tx:

    # we will work with patable functionality
    pa_table = tx.catalog().select(columns=field_names).read_all()

    # you can also work with pandas df, but pa_table is probably quicker
    pdf = pa_table.to_pandas()

### How many elements are in the catalog


In [11]:
total_elements = pa_table.num_rows
print(f"Total elements in the catalog: {total_elements}")

Total elements in the catalog: 84836469


### How many files/objects?


In [12]:
file_count = pa_table.filter(pa.compute.field("element_type") == "FILE").num_rows
print(f"Number of files/objects: {file_count}")

Number of files/objects: 84692655


### How many directories?


In [13]:
dir_count = pa_table.filter(pa.compute.field("element_type") == "DIR").num_rows
print(f"Number of directories: {dir_count}")

Number of directories: 143671


### How many Database tables?

In [14]:
dir_count = pa_table.filter(pa.compute.field("element_type") == "TABLE").num_rows
print(f"Number of directories: {dir_count}")

Number of directories: 25


### What are all of the elements on my system anyway?


In [15]:
group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([])
group_counts.to_pandas()

Unnamed: 0,element_type
0,FILE
1,DIR
2,SYMLINK
3,SCHEMA
4,TABLE


### Alternative count approach

In [16]:
group_counts = pa.TableGroupBy(pa_table,"element_type").aggregate([([], "count_all")])
group_counts.to_pandas()

Unnamed: 0,element_type,count_all
0,FILE,84692655
1,DIR,143671
2,SYMLINK,106
3,TABLE,25
4,SCHEMA,12


### Simplified example of count of elements returned from parallel execution

The query_iterator iteratively executes a query on a database table, returning results in chunks as PyArrow RecordBatches, enabling efficient handling of large datasets by processing data in smaller, manageable segments.
Simplified example of count of elements returned from parallel execution.

In [17]:
def query_and_count_elements(session, field_names):

    with session.transaction() as tx:
        # batch reader
        reader = tx.catalog().select(columns=field_names)

        elements_count = 0
        for record_batch in reader:
            elements_count += record_batch.num_rows

        return elements_count

# Query Parameters
field_names = ['element_type']  # Only need the element_type field for counting

# Perform the query
total_elements = query_and_count_elements(session, field_names)
print(f"Total elements in the catalog: {total_elements}")

Total elements in the catalog: 84836469


### Simple Filtering


- first filter by pushing down to the DB predicates to search only for changes since today

In [43]:
import ibis
import time
import pyarrow.compute as pc
from datetime import datetime
from ibis import _


# today's date - should pick
date_str = time.strftime('%Y-%m-%d')

# Convert date string to epoch timestamp (in seconds)
epoch_seconds = int(time.mktime(time.strptime(date_str, '%Y-%m-%d')))

# Create an Ibis literal with epoch time and the correct data type
predicate = (_.mtime >= ibis.literal(epoch_seconds, type='timestamp'))

field_names = ['name', 'creation_time', 'uid', 'owner_name', 'size', 'user_metadata', 'user_tags']

with session.transaction() as tx:
    # batch reader
    table = tx.catalog().select(columns=field_names, predicate=predicate).read_all()
    df = table.to_pandas()

df

Unnamed: 0,name,creation_time,uid,owner_name,size,user_metadata,user_tags
0,file_15624.txt,2025-01-22 14:45:21.141552984,5069,John Gorski,20480,,
1,file_2386.txt,2025-01-22 14:46:03.517637761,5069,John Gorski,20480,,
2,file_21513.txt,2025-01-22 14:45:35.738343786,5069,John Gorski,20480,,
3,subfolder_1,2025-01-22 14:45:52.032227101,5069,John Gorski,4096,,
4,file_14682.txt,2025-01-22 14:46:06.120785539,5069,John Gorski,20480,,
...,...,...,...,...,...,...,...
84177,file_27536.txt,2025-01-22 15:21:04.826954108,5069,John Gorski,20480,,
84178,file_15986.txt,2025-01-22 15:20:28.161570268,5069,John Gorski,20480,,
84179,file_29581.txt,2025-01-22 15:19:00.046412901,5069,John Gorski,20480,,
84180,file_8016.txt,2025-01-22 15:19:17.964053479,5069,John Gorski,20480,,


- then post filter the returned dataset to search for user_meta data not null
- we should see the file we updated using s3cmd

In [44]:
df[df['user_tags'].notnull()]

Unnamed: 0,name,creation_time,uid,owner_name,size,user_metadata,user_tags
24735,yellow_tripdata_2009-01.parquet,2024-12-09 15:54:27.143172336,5102,Chris Snow,481274128,"[(foo, bar)]","[(key1, value1), (key2, value2)]"


# Unmigrated Code

All of the following code needs to be migrated to the new VastDB api

### Query for Specific File Types Across Different Users:


In [14]:
field_names = ['uid', 'owner_name', 'element_type']
filters = {
    'element_type': ['eq FILE', 'eq TABLE', 'eq DIR'],
    'uid': ['eq 500', 'eq 1000']
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)

Unnamed: 0,uid,owner_name,element_type
0,1000,vastdata,FILE
1,1000,vastdata,FILE
2,1000,vastdata,FILE
3,1000,vastdata,FILE
4,1000,vastdata,FILE
...,...,...,...
477787,1000,vastdata,FILE
477788,1000,vastdata,FILE
477789,1000,vastdata,FILE
477790,1000,vastdata,FILE


### Query for Objects Based on User and Specific Extensions


In [16]:
field_names = ['uid', 'extension', 'size']
filters = {
    'uid': ['eq 1000', 'eq 555'],
    'extension': ['eq log', 'eq ldb']  # looking for log and ldb files
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)

Unnamed: 0,uid,extension,size
0,1000,log,45511
1,1000,log,4050387
2,1000,log,1730
3,1000,log,104
4,1000,log,240974
5,1000,log,47233
6,1000,log,73391
7,1000,log,77396
8,1000,log,47334
9,1000,log,40836


### Query for Specific File Types with Size Constraints


In [63]:
field_names = ['element_type', 'size', 'name']
filters = {
    'element_type': ['eq FILE'],
    'size': ['gt 50000', 'lt 1000000']  # size between 50 KB and 1 MB
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
display(df)

Unnamed: 0,element_type,size,name
0,FILE,442558237,2012-02.data.parquet
1,FILE,16777216,upload
2,FILE,146758259,2018-04.data.parquet
3,FILE,144467145,2018-10.data.parquet
4,FILE,390453487,2013-09.data.parquet
...,...,...,...
158,FILE,67108864,upload
159,FILE,346955670,2014-01.data.parquet
160,FILE,376498312,2013-01.data.parquet
161,FILE,145782225,2017-11.data.parquet


### Query for Large TABLE Objects by Specific Users


In [None]:
field_names = ['uid', 'owner_name', 'size', 'element_type']
filters = {
    'uid': ['eq 555'],
    'element_type': ['eq TABLE'],
    'size': ['gt 10000000']  # greater than 10 MB
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
print(df)

### Timestamp Filtering
Query by birthdate: VAST uses a “creation_time” column to indicate when a new element is created:
This will output all objects linked after noon on September 1st. It will not output files that have been moved to a new path.

NOTE : Same method can be applied for acces-time (atime), modification-time (mtime) & metadata-update-times (ctime).


In [None]:
# i.e: SELECT CONCAT(parent_path, name) FROM vast_big_catalog_table WHERE creation_time > TIMESTAMP '2023-09-01 12:00:01'

# Set the timestamp for comparison
timestamp_birthdate = pd.Timestamp('2023-09-01 12:00:01')

# Convert the timestamp to an integer
timestamp_birthdate_int = int(timestamp_birthdate.timestamp())

# Query the database
field_names = ['creation_time', 'parent_path', 'name']
filters = {'creation_time': [f'gt {timestamp_birthdate_int}']}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()

# Filter and concatenate paths
df_filtered = df[df['creation_time'] > timestamp_birthdate]
df_filtered['full_path'] = df_filtered['parent_path'] + df_filtered['name']

# Print result
print("Objects created after 2023-09-01 12:00:01:")
display(df_filtered['full_path'])

### Reporting
Simple queries to tell you basic statistics on a section of the namespace
Report statistics on parts of the namespace - Summarizing files of a certain type (FILE), belonging to a specific user (uid=555), and located in a certain path (/parquet-files-bucket)

In [62]:
import numpy as np

# Query the database
field_names = ['uid', 'used', 'size']
filters = {
    'search_path': ['eq /nyc-taxi'],
    'element_type': ['eq FILE']
}
table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()

# Check if DataFrame is empty
if df.empty:
    print("No data returned from query. Please check filters and field names.")
else:
    # Perform aggregations
    users_count = df['uid'].nunique()
    files_count = len(df)
    kb_used_sum = df['used'].sum() / 1000
    avg_size_kb = df['size'].mean() / 1000

    # Formatting results
    formatted_results = {
        'users': f"{users_count:,d}",
        'Files': f"{files_count:,d}",
        'KB_Used': f"{kb_used_sum:,.0f}",
        'Avg_Size_KB': f"{avg_size_kb:,.2f}"
    }

    # Print formatted results
    print("Aggregated Results:")
    print(formatted_results)

No data returned from query. Please check filters and field names.


### Capacity Grouping & Usage report
Here’s a report on all the users on the system:
Get Files across whole system('/'), group by owner_name, sum files, total and average size in kilobytes, oldest creation time, and most recent access time for each file owner.
Note - display is a IPython function which aggregates results in table format


In [59]:
from IPython.display import display
import pandas as pd
import numpy as np

# Querying the database
filters = {
    'element_type': ['eq FILE'],
    'search_path': ['eq /']
}
field_names = ['owner_name', 'used', 'size', 'creation_time', 'atime']

table = vastdb_session.query('vast-big-catalog-bucket', 'vast_big_catalog_schema', 'vast_big_catalog_table', filters=filters, field_names=field_names, num_sub_splits=8)
df = table.to_pandas()
pd.options.display.max_columns = None

# Aggregating data
aggregated_data = df.groupby('owner_name').agg(
    Files=('owner_name', 'count'),
    KB_Used=('used', lambda x: np.sum(x)/1000),
    Avg_Size_KB=('size', lambda x: np.mean(x)/1000),
    Oldest_data=('creation_time', 'min'),
    Last_access=('atime', 'max')
)

# Formatting results
aggregated_data['Files'] = aggregated_data['Files'].apply(lambda x: f"{x:,d}")
aggregated_data['KB_Used'] = aggregated_data['KB_Used'].apply(lambda x: f"{x:,.0f}")
aggregated_data['Avg_Size_KB'] = aggregated_data['Avg_Size_KB'].apply(lambda x: f"{x:,.2f}")

display(aggregated_data)

Unnamed: 0_level_0,Files,KB_Used,Avg_Size_KB,Oldest_data,Last_access
owner_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,40,0,0.0,2024-03-05 15:21:31.916288773,2024-03-05 15:25:42.213137875
trinos3,123,30963037,251732.01,2024-03-05 15:00:39.594759193,2024-03-05 15:25:42.864619921


### Catalog Snapshots Comparisons
You can access catalog snapshot by navigating the schema space.
The most obvious use of snapshot comparisons is delete detection, followed by move detection.
Delete detection
Query Returns: This script compares the current state with a specific historical snapshot, identifying files present in the current table but not in the snapshot, based on their element_type and search_path.
Access to Snapshot: Access to a snapshot works by querying a specific schema directory (representing the snapshot) within the bucket


In [None]:
def query_table(schema):
    table = vastdb_session.query('vast-big-catalog-bucket', schema, 'vast_big_catalog_table', filters=filters, num_sub_splits=8)
    df = table.to_pandas()
    df['full_path'] = df['parent_path'] + df['name']
    return set(df['full_path'])

# Query Filters
filters = {
    'element_type': ['eq FILE'],
    'search_path': ['eq /']
}

# Query the current table and the snapshot
current_set = query_table('vast_big_catalog_schema')
snapshot_set = query_table('.snapshot/bc_table_2023-12-10_13_53_36/vast_big_catalog_schema')

# Find differences (Current Table vs Snapshot)
difference = current_set - snapshot_set

# Output
if difference:
    print(f"[INFO] Found {len(difference)} files in the current table but not in the snapshot:")
    for item in difference:
        print(item)
else:
    print("[INFO] No differences found")