# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables were set when your Docker container was created.

In [1]:
import os

def last4(value):
    """Return the last four characters of ``value`` for masked display.

    Args:
        value: The string to abbreviate, or None when the environment
            variable it came from is not set.

    Returns:
        ``value[-4:]`` for a string, or the placeholder ``"<unset>"`` when
        ``value`` is None, so the summary below prints instead of raising
        ``TypeError`` on a missing variable.
    """
    return "<unset>" if value is None else value[-4:]


# Address used further down to build the Hive metastore thrift URI.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

# Vast DB endpoint and credentials.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# Bucket / schema / table names for the Twitter ingest data set.
VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

# S3 settings; note the environment variables carry an "S3A_" prefix.
S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

# Print a summary; only the last four characters of each key are shown.
# The S3 secret line reports S3_SECRET_KEY (it previously echoed the Vast DB
# secret by mistake).
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={last4(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{last4(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={last4(S3_ACCESS_KEY)}
S3_SECRET_KEY=****{last4(S3_SECRET_KEY)}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



## Specify other Environment Variables

In [2]:
# Application name handed to the SparkSession builder's appName() call.
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
# Widen pandas column rendering so long values are not truncated early.
pd.set_option("max_colwidth", 150)

# Assemble every Spark setting in one list and apply it in a single call.
conf = SparkConf().setAll([
    # Driver address: resolve this host's name to an IP.
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    # Vast DB catalog ("ndb"), its credentials, and the connector jars.
    ("spark.sql.catalog.ndb", "spark.sql.catalog.ndb.VastCatalog"),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", "/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*"),
    ("spark.executor.extraClassPath", "/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*"),
    ("spark.sql.extensions", "ndb.NDBSparkSessionExtension"),
    # Iceberg catalog backed by the Hive metastore on the Docker host.
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", f"thrift://{DOCKER_HOST_OR_IP}:9083"),
    # S3A filesystem access (plain HTTP, SSL disabled).
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    # Hive metastore location.
    ("hive.metastore.uris", f"thrift://{DOCKER_HOST_OR_IP}:9083"),
])

# Local-mode session with Hive support, built from the settings above.
spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the SparkContext and turn its log level up to DEBUG.
sc = spark.sparkContext
sc.setLogLevel("DEBUG")

import logging

# Raise one named Python logger to DEBUG.
# NOTE(review): the name looks like a Java class; Python's logging module
# presumably has no effect on JVM-side loggers -- confirm this is intended.
logging.getLogger("com.example.HelloWorldCatalog").setLevel(logging.DEBUG)

print("Spark successfully loaded")

Spark successfully loaded


In [4]:
# Bare expression as the cell's last line, so the notebook displays the
# SparkSession object bound to `spark`.
spark

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from datetime import date
import io
from contextlib import redirect_stdout
import re

def create_test_table(spark, table_name="ndb.`csnow-db`.nyt.ppdtest"):
    """Create (or overwrite) a three-row fruit table for pushdown testing.

    Args:
        spark: Session object providing ``createDataFrame``.
        table_name: Fully qualified name of the table to write. Defaults to
            the name this notebook originally hard-coded, so existing
            single-argument calls behave as before.

    Returns:
        The table name that was written.
    """
    columns = ["id", "fruit", "price", "date", "is_ripe"]
    rows = [
        (1, "apple", 10.5, date(2023, 10, 26), True),
        (2, "banana", 20.0, date(2023, 10, 27), False),
        (3, "orange", 15.7, date(2023, 10, 28), True),
    ]
    df = spark.createDataFrame(rows, columns)
    # "overwrite" replaces any table left behind by a previous run.
    df.write.mode("overwrite").saveAsTable(table_name)
    return table_name

def get_explain_plan(df):
    """Return the text that ``df.explain(True)`` writes to standard output.

    Standard output is redirected into an in-memory buffer for the duration
    of the call, and the buffer's contents are returned as a string.
    """
    with io.StringIO() as captured, redirect_stdout(captured):
        df.explain(True)
        return captured.getvalue()

def extract_individual_predicates(text):
    """Pull the pushed-down predicates out of an explain-plan string.

    Finds the ``pushed_down_predicates=[...]`` section (terminated by ``}``
    or `` RuntimeFilters``), splits its contents on commas that are neither
    inside square brackets nor inside quotes, and returns every piece except
    the first one.

    Args:
        text: Explain-plan text to search.

    Returns:
        A list of predicate strings; empty when the section is missing,
        empty, or holds a single entry.
    """
    found = re.search(r"pushed_down_predicates=\[(.*?)\](?:}| RuntimeFilters)", text, re.DOTALL)
    if found is None:
        return []

    raw = found.group(1)
    if not raw:
        return []

    pieces = []
    buffer = []
    depth = 0        # current nesting level of square brackets
    quoted = False   # toggled by either quote character
    for ch in raw:
        if ch == ',' and depth == 0 and not quoted:
            # Top-level separator: close off the current piece.
            pieces.append("".join(buffer).strip())
            buffer = []
            continue
        if ch == '[':
            depth += 1
        elif ch == ']':
            depth -= 1
        elif ch in ("'", '"'):
            quoted = not quoted
        buffer.append(ch)
    pieces.append("".join(buffer).strip())

    # The leading entry is always skipped.
    return pieces[1:]

def check_pushdown(explain_str, query):
    """Decide heuristically whether ``query``'s filter was pushed down.

    The plan must mention both ``Filter`` and ``VastScan`` and must yield at
    least one predicate from ``extract_individual_predicates``. The check
    then depends on the query text:

    * ``LIKE '%app%'``: some predicate contains ``like`` and ``'%app%'``.
      This case is tested first; it used to sit behind the generic ``LIKE``
      test and could never run.
    * contains ``year`` or ``LIKE``: some predicate contains ``IS NOT NULL``.
    * ``NOT is_ripe``: both the not-null predicate and the
      ``is_ripe > true, is_ripe < true`` pair are present.
    * otherwise: every ``AND``-separated term of the query appears in some
      predicate, comparing lower-cased text with single quotes removed from
      both sides.

    Args:
        explain_str: Explain-plan text for the query.
        query: The WHERE-clause text that was executed.

    Returns:
        True when the heuristic finds the expected predicates, else False.
    """
    if "Filter" not in explain_str or "VastScan" not in explain_str:
        return False

    predicates_list = extract_individual_predicates(explain_str)
    if not predicates_list:
        return False

    # Most specific pattern first so the generic LIKE test cannot shadow it.
    if "LIKE '%app%'" in query:
        return any("like" in pred.lower() and "'%app%'" in pred.lower() for pred in predicates_list)
    if "year" in query or "LIKE" in query:
        return any("IS NOT NULL" in pred for pred in predicates_list)
    if "NOT is_ripe" in query:
        return any("is_ripe IS NOT NULL" in pred for pred in predicates_list) and any("is_ripe > true, is_ripe < true" in pred for pred in predicates_list)

    # Strip quotes from both sides; otherwise a quoted literal in the
    # predicate (e.g. [fruit = 'apple']) can never match the cleaned query.
    normalized_preds = [pred.replace("'", "").lower() for pred in predicates_list]
    cleaned_query = query.replace("'", "").lower()
    # Split on the whole word "and" only, not on "and" inside another word.
    query_terms = [term.strip() for term in re.split(r"\band\b", cleaned_query)]
    return all(any(term in pred for pred in normalized_preds) for term in query_terms)

def analyze_predicate_pushdown(spark, table_name):
    """Run a fixed set of filter queries and record what was pushed down.

    For each WHERE clause in the built-in list, the query is executed as
    ``SELECT * FROM {table_name} WHERE {clause}``, its explain plan is
    captured, and the pushed-down predicates are extracted from it.

    Args:
        spark: Session object providing ``table`` and ``sql``.
        table_name: Fully qualified name of the table to query. The schema is
            read from this same table (it was previously read from a
            hard-coded name regardless of this argument).

    Returns:
        A list of tuples ``(column, column_type, predicate_type, query,
        predicates, pushed_down)``, one per query. ``column_type`` is
        ``"N/A"`` when the column is not found in the table schema.
    """
    queries = [("id = 1", "id", "equality"),
               ("fruit = 'apple'", "fruit", "equality"),
               ("price > 15", "price", "greater than"),
               ("date = '2023-10-26'", "date", "equality"),
               ("is_ripe = true", "is_ripe", "equality"),
               ("fruit LIKE 'a%'", "fruit", "LIKE"),
               ("price >= 10 AND price <= 20", "price", "between"),
               ("year(date) = 2023", "date", "year function"),
               ("NOT is_ripe", "is_ripe", "NOT"),
               ("fruit LIKE '%app%'", "fruit", "substr match (LIKE)")]

    results = []
    # Look up column types on the table that is actually being queried.
    df_schema = spark.table(table_name).schema

    for query, column, predicate_type in queries:
        df = spark.sql(f"SELECT * FROM {table_name} WHERE {query}")
        explain_str = get_explain_plan(df)
        pushed_down = check_pushdown(explain_str, query)
        predicates = extract_individual_predicates(explain_str)
        try:
            column_type = df_schema[column].dataType.typeName()
        except KeyError:
            # Column missing from the schema: report a placeholder type.
            column_type = "N/A"
        results.append((column, column_type, predicate_type, query, predicates, pushed_down))

    return results

def print_matrix(results):
    """Print the analysis results as a fixed-width, pipe-delimited table.

    Args:
        results: Iterable of 6-tuples ``(column, column_type, predicate_type,
            query, predicates, pushed_down)``. The last element is unpacked
            but not printed.
    """
    rule = "-" * 141  # horizontal rule; same width as one table row

    print("\nPredicate Pushdown Analysis Matrix")
    print(rule)
    print(f"| {'Column':<15} | {'Col Type':<10} | {'Predicate Type':<20} | {'Query':<30} | {'Pushed Down Predicates':<50} |")
    print(rule)
    for row in results:
        column, column_type, predicate_type, query, predicates, pushed_down = row
        predicates_str = ", ".join(str(item) for item in predicates)
        print(f"| {column:<15} | {column_type:<10} | {predicate_type:<20} | {query:<30} | {predicates_str:<50} |")
    print(rule)


# Driver: create the test table, analyse pushdown for each query, print the
# matrix, then stop the session.
if __name__ == "__main__":
    # Rebinds the module-level name `spark` to whatever getOrCreate() returns.
    spark = SparkSession.builder.getOrCreate()
    table_name = create_test_table(spark)
    results = analyze_predicate_pushdown(spark, table_name)
    print_matrix(results)
    # NOTE(review): this stops the session bound to `spark`; any code run
    # afterwards presumably needs a new session -- confirm this is intended.
    spark.stop()


Predicate Pushdown Analysis Matrix
---------------------------------------------------------------------------------------------------------------------------------------------
| Column          | Col Type   | Predicate Type       | Query                          | Pushed Down Predicates                             |
---------------------------------------------------------------------------------------------------------------------------------------------
| id              | long       | equality             | id = 1                         | [id = 1]                                           |
| fruit           | string     | equality             | fruit = 'apple'                | [fruit = 'apple']                                  |
| price           | double     | greater than         | price > 15                     | [price > 15.0]                                     |
| date            | date       | equality             | date = '2023-10-26'            | [date = 19656]         