# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables were set when your Docker container was created.

In [4]:
import os

def mask_tail(value, prefix=""):
    """Return a display-safe form of a credential: its last four characters.

    Args:
        value: The credential string, or None when the variable is not set.
        prefix: Text placed before the visible tail (e.g. "****").

    Returns:
        "<unset>" when ``value`` is None, otherwise ``prefix`` followed by the
        last four characters of ``value`` (the whole value if it is shorter).
    """
    if value is None:
        return "<unset>"
    return f"{prefix}{value[-4:]}"


# Host (or IP) of the Docker host; used later to reach the Hive metastore.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

# Vast DB endpoint and credentials.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# Location of the ingested tweets table inside Vast DB.
VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

# S3 endpoint and credentials (note: read from the S3A_* variables).
S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

# Only the tail of each credential is printed, and each line shows the
# variable it is labelled with (the S3 secret line reports S3_SECRET_KEY).
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={mask_tail(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY={mask_tail(VASTDB_SECRET_KEY, '****')}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={mask_tail(S3_ACCESS_KEY)}
S3_SECRET_KEY={mask_tail(S3_SECRET_KEY, '****')}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=10.143.11.241
---
VASTDB_ENDPOINT=http://172.200.204.2:80
VASTDB_ACCESS_KEY=QXN5
VASTDB_SECRET_KEY=****oLGr
VASTDB_TWITTER_INGEST_BUCKET=csnow-db
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.204.2:80
S3_ACCESS_KEY=QXN5
S3_SECRET_KEY=****oLGr
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



## Specify Other Settings

In [5]:
# Application name given to the Spark session via appName().
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [6]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
pd.set_option("max_colwidth", 150)

# Fail fast when an endpoint setting is missing. SparkConf.set() stringifies
# its value, so an unset variable would otherwise reach Spark as the literal
# text "None" and only surface later as a confusing connection error.
required_settings = {
    "DOCKER_HOST_OR_IP": DOCKER_HOST_OR_IP,
    "VASTDB_ENDPOINT": VASTDB_ENDPOINT,
    "VASTDB_ACCESS_KEY": VASTDB_ACCESS_KEY,
    "VASTDB_SECRET_KEY": VASTDB_SECRET_KEY,
    "S3_ENDPOINT": S3_ENDPOINT,
    "S3_ACCESS_KEY": S3_ACCESS_KEY,
    "S3_SECRET_KEY": S3_SECRET_KEY,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        "Cannot start Spark; these settings are empty or unset: "
        + ", ".join(missing_settings)
    )

# Values that appear in more than one configuration entry.
vast_jars_classpath = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'
hive_metastore_uri = f"thrift://{DOCKER_HOST_OR_IP}:9083"

conf = SparkConf()
conf.setAll([
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    # VASTDB: registers the `ndb` catalog used by the queries below.
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", vast_jars_classpath),
    ("spark.executor.extraClassPath", vast_jars_classpath),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),
    # ICEBERG: Hive-backed Iceberg catalog served by the metastore.
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", hive_metastore_uri),
    # S3A
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    # Hive
    ("hive.metastore.uris", hive_metastore_uri),
])

spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext
# DEBUG is very verbose; lower this (e.g. "WARN") once connectivity is confirmed.
sc.setLogLevel("DEBUG")

# NOTE(review): the former logging.getLogger("com.example.HelloWorldCatalog")
# call was removed. Python's `logging` module does not control JVM-side
# loggers, and the name was a placeholder. The import is kept in case other
# cells rely on `logging` being in scope.
import logging

print("Spark successfully loaded")

Spark successfully loaded


In [36]:
# Display the active SparkSession object as this cell's output.
spark

In [10]:
# Show tweets whose text starts with "a". The table reference is built from
# the VASTDB_TWITTER_INGEST_* settings loaded in the first cell instead of
# being hard-coded to one deployment's bucket/schema/table.
spark.sql(f"""
SELECT * FROM ndb.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`
WHERE text like 'a%'
""").show(truncate=False)

+-------------+--------------------+--------------------+----------------------------------------------------+
|created_at   |id                  |id_str              |text                                                |
+-------------+--------------------+--------------------+----------------------------------------------------+
|1732208274971|882386532907502832  |882386532907502832  |amazed by how funny WebDevelopment is!              |
|1732208275072|-4676456788502095242|-4676456788502095242|amazed by how beautiful SoftwareDevelopment is!     |
|1732208275174|-5688720706227868441|-5688720706227868441|amazed by how nice MachineLearningEngineering is!   |
|1732208275376|2218971738474188819 |2218971738474188819 |amazed by how nice CloudEngineer is!                |
|1732208275579|-887772990636552065 |-887772990636552065 |amazed by how awesome Cybersecurity is!             |
|1732208275680|-6375634967166087454|-6375634967166087454|amazed by how nice IoT is!                          |
|

In [13]:
# Range predicate on the bigint `created_at` column. In the recorded physical
# plan there is no Spark-side Filter node above the Vast scan.
# The table reference comes from the VASTDB_TWITTER_INGEST_* settings loaded
# in the first cell rather than a hard-coded bucket/schema/table.
df = spark.sql(f"""
SELECT * FROM ndb.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`
WHERE created_at > 123456
""")

# extended=True prints the parsed, analyzed and optimized logical plans as
# well as the physical plan.
df.explain(True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('created_at > 123456)
   +- 'UnresolvedRelation [ndb, csnow-db, social_media, tweets], [], false

== Analyzed Logical Plan ==
created_at: bigint, id: bigint, id_str: string, text: string
Project [created_at#126L, id#127L, id_str#128, text#129]
+- Filter (created_at#126L > cast(123456 as bigint))
   +- SubqueryAlias ndb.`csnow-db`.social_media.tweets
      +- RelationV2[created_at#126L, id#127L, id_str#128, text#129] ndb.`csnow-db`.social_media.tweets csnow-db/social_media/tweets

== Optimized Logical Plan ==
Filter (isnotnull(created_at#126L) AND (created_at#126L > 123456))
+- RelationV2[created_at#126L, id#127L, id_str#128, text#129] csnow-db/social_media/tweets

== Physical Plan ==
*(1) ColumnarToRow
+- BatchScan csnow-db/social_media/tweets[created_at#126L, id#127L, id_str#128, text#129] VastScan{schemed_name=(csnow-db/social_media/tweets, 1329443212), pushed_down_limit=null, pushed_down_predicates=[[created_at IS NOT NULL], [create

In [12]:
# Equality predicate on the string `text` column. The recorded physical plan
# lists [text = 'a'] among the scan's pushed_down_predicates.
# The table reference comes from the VASTDB_TWITTER_INGEST_* settings loaded
# in the first cell rather than a hard-coded bucket/schema/table.
df = spark.sql(f"""
SELECT * FROM ndb.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`
WHERE text = 'a'
""")

# extended=True prints the parsed, analyzed and optimized logical plans as
# well as the physical plan.
df.explain(True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('text = a)
   +- 'UnresolvedRelation [ndb, csnow-db, social_media, tweets], [], false

== Analyzed Logical Plan ==
created_at: bigint, id: bigint, id_str: string, text: string
Project [created_at#114L, id#115L, id_str#116, text#117]
+- Filter (text#117 = a)
   +- SubqueryAlias ndb.`csnow-db`.social_media.tweets
      +- RelationV2[created_at#114L, id#115L, id_str#116, text#117] ndb.`csnow-db`.social_media.tweets csnow-db/social_media/tweets

== Optimized Logical Plan ==
Filter (isnotnull(text#117) AND (text#117 = a))
+- RelationV2[created_at#114L, id#115L, id_str#116, text#117] csnow-db/social_media/tweets

== Physical Plan ==
*(1) ColumnarToRow
+- BatchScan csnow-db/social_media/tweets[created_at#114L, id#115L, id_str#116, text#117] VastScan{schemed_name=(csnow-db/social_media/tweets, 1806513283), pushed_down_limit=null, pushed_down_predicates=[[text IS NOT NULL], [text = 'a']]} RuntimeFilters: []



In [11]:
# Prefix match with LIKE 'a%'. The recorded plans show Spark rewriting it to
# StartsWith and keeping that Filter on the Spark side; only the
# `text IS NOT NULL` predicate appears in the scan's pushed_down_predicates.
# The table reference comes from the VASTDB_TWITTER_INGEST_* settings loaded
# in the first cell rather than a hard-coded bucket/schema/table.
df = spark.sql(f"""
SELECT * FROM ndb.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`
WHERE text like 'a%'
""")

# extended=True prints the parsed, analyzed and optimized logical plans as
# well as the physical plan.
df.explain(True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter 'text LIKE a%
   +- 'UnresolvedRelation [ndb, csnow-db, social_media, tweets], [], false

== Analyzed Logical Plan ==
created_at: bigint, id: bigint, id_str: string, text: string
Project [created_at#102L, id#103L, id_str#104, text#105]
+- Filter text#105 LIKE a%
   +- SubqueryAlias ndb.`csnow-db`.social_media.tweets
      +- RelationV2[created_at#102L, id#103L, id_str#104, text#105] ndb.`csnow-db`.social_media.tweets csnow-db/social_media/tweets

== Optimized Logical Plan ==
Filter (isnotnull(text#105) AND StartsWith(text#105, a))
+- RelationV2[created_at#102L, id#103L, id_str#104, text#105] csnow-db/social_media/tweets

== Physical Plan ==
*(1) Filter StartsWith(text#105, a)
+- *(1) ColumnarToRow
   +- BatchScan csnow-db/social_media/tweets[created_at#102L, id#103L, id_str#104, text#105] VastScan{schemed_name=(csnow-db/social_media/tweets, -350936998), pushed_down_limit=null, pushed_down_predicates=[[text IS NOT NULL]]} RuntimeFilters:

In [35]:
# Same first-character test written with substring(). The recorded physical
# plan keeps the substring comparison as a Spark-side Filter above the scan.
# The table reference comes from the VASTDB_TWITTER_INGEST_* settings loaded
# in the first cell rather than a hard-coded bucket/schema/table.
df = spark.sql(f"""
SELECT * FROM ndb.`{VASTDB_TWITTER_INGEST_BUCKET}`.`{VASTDB_TWITTER_INGEST_SCHEMA}`.`{VASTDB_TWITTER_INGEST_TABLE}`
WHERE substring(text, 1, 1) = 'a'
""")

# extended=True prints the parsed, analyzed and optimized logical plans as
# well as the physical plan.
df.explain(True)

== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('substring('text, 1, 1) = a)
   +- 'UnresolvedRelation [ndb, csnow-db, social_media, tweets], [], false

== Analyzed Logical Plan ==
created_at: bigint, id: bigint, id_str: string, text: string
Project [created_at#150L, id#151L, id_str#152, text#153]
+- Filter (substring(text#153, 1, 1) = a)
   +- SubqueryAlias ndb.`csnow-db`.social_media.tweets
      +- RelationV2[created_at#150L, id#151L, id_str#152, text#153] ndb.`csnow-db`.social_media.tweets csnow-db/social_media/tweets

== Optimized Logical Plan ==
Filter (isnotnull(text#153) AND (substring(text#153, 1, 1) = a))
+- RelationV2[created_at#150L, id#151L, id_str#152, text#153] csnow-db/social_media/tweets

== Physical Plan ==
*(1) Filter (substring(text#153, 1, 1) = a)
+- *(1) ColumnarToRow
   +- BatchScan csnow-db/social_media/tweets[created_at#150L, id#151L, id_str#152, text#153] VastScan{schemed_name=(csnow-db/social_media/tweets, 1672398646), pushed_down_limit=null, pushed_down_p