# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables were set when your Docker container was created.

In [1]:
import os

def tail4(value):
    """Return the last four characters of ``value`` for display.

    Returns the placeholder ``"<unset>"`` when ``value`` is ``None`` or empty,
    so the summary below can still be printed (instead of raising ``TypeError``
    on ``None[-4:]``) when an environment variable is missing.
    """
    return value[-4:] if value else "<unset>"


# Host name or IP of the docker host; used later to build the Hive metastore URI.
DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

# Vast DB endpoint and credentials.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

# Location (bucket / schema / table) of the Twitter ingest data in Vast DB.
VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

# S3 endpoint and credentials (read from the S3A_* environment variables).
S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

# Print a summary; only the last four characters of each key are shown.
# The S3 secret line reads S3_SECRET_KEY (it previously echoed VASTDB_SECRET_KEY).
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={tail4(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{tail4(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={tail4(S3_ACCESS_KEY)}
S3_SECRET_KEY=****{tail4(S3_SECRET_KEY)}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=se-var-vastdb-ingest
---
VASTDB_ENDPOINT=http://172.200.201.1:80
VASTDB_ACCESS_KEY=DYF8
VASTDB_SECRET_KEY=****Usuu
---
S3_ENDPOINT=http://172.200.201.1:80
S3_ACCESS_KEY=DYF8
S3_SECRET_KEY=****Usuu
---



## Specify other Environment Variables

In [2]:
# Application name passed to the SparkSession builder.
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
# Allow up to 150 characters per column when pandas renders a frame.
pd.set_option("max_colwidth", 150)

# Resolve this host's own address for the Spark driver.
driver_host = socket.gethostbyname(socket.gethostname())

# Directory of Vast connector jars, added to both driver and executor classpaths.
vast_connector_jars = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'

spark_settings = [
    ("spark.driver.host", driver_host),
    ("spark.sql.execution.arrow.pyspark.enabled", "false"),
    # Vast DB: catalog registered under the name "ndb"
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", vast_connector_jars),
    ("spark.executor.extraClassPath", vast_connector_jars),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),
    # Iceberg: catalog registered under the name "iceberg", backed by a Hive metastore
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", f"thrift://{DOCKER_HOST_OR_IP}:9083"),
    # S3A filesystem settings
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),
]

conf = SparkConf()
conf.setAll(spark_settings)

# Build (or reuse) a local-mode session with Hive support enabled.
spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext
# NOTE(review): DEBUG is the most verbose log level used here; confirm it is intended.
sc.setLogLevel("DEBUG")

print("Spark successfully loaded")

Spark successfully loaded


In [4]:
# Display the SparkContext created above as this cell's output.
sc

## Connect to Vast DB

### Specify Environment

In [1]:
# Alias the Twitter-ingest bucket, schema and table under shorter names.
# These VASTDB_TWITTER_INGEST_* values must already be defined in the kernel.
BUCKET_NAME, DATABASE_SCHEMA, TABLE_NAME = (
    VASTDB_TWITTER_INGEST_BUCKET,
    VASTDB_TWITTER_INGEST_SCHEMA,
    VASTDB_TWITTER_INGEST_TABLE,
)

NameError: name 'VASTDB_TWITTER_INGEST_BUCKET' is not defined

### Connect and run a query

In [6]:
# Fail fast when the bucket or schema name is missing: interpolating None would
# otherwise build the identifier ndb.`None`.`None` and create a schema under it.
if not BUCKET_NAME or not DATABASE_SCHEMA:
    raise ValueError(
        "BUCKET_NAME and DATABASE_SCHEMA must be set "
        f"(got BUCKET_NAME={BUCKET_NAME!r}, DATABASE_SCHEMA={DATABASE_SCHEMA!r})"
    )

# Fully qualified schema name in the "ndb" catalog; each part is backtick-quoted.
DATABASE_FULLNAME = f"ndb.`{BUCKET_NAME}`.`{DATABASE_SCHEMA}`"

spark.sql(f"create schema if not exists {DATABASE_FULLNAME}")

# Optional (left disabled): set the database name so we don't have to fully
# qualify all object names.
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
# spark.sql(f"use {DATABASE_FULLNAME}")

print(f"Using {DATABASE_FULLNAME=}")

Using DATABASE_FULLNAME='ndb.csnowdb.twitter_import'


In [7]:
# Backtick-quote the table name the same way the bucket and schema are quoted in
# DATABASE_FULLNAME, so a table name containing special characters still parses.
TABLE_FULLNAME = f"{DATABASE_FULLNAME}.`{TABLE_NAME}`"

# Fetch one row (lowest id) among the rows carrying the latest created_at value.
df = spark.sql(f"""
SELECT 
  *
FROM 
  {TABLE_FULLNAME}
WHERE
  created_at in (
    SELECT 
      MAX(created_at) latest_created_at
    FROM 
      {TABLE_FULLNAME}
  )
ORDER BY id
LIMIT 1
""")

In [8]:
# Convert the query result to a pandas DataFrame and show it as the cell output.
df.toPandas()

Unnamed: 0,created_at,id,id_str,text
0,1730802162133,-8030495663380004494,-8030495663380004494,impressed with how wonderful MachineLearningEngineering is!


## Connect to Iceberg on Vast S3

### List Namespaces and Tables

In [9]:
# List the namespaces available in the "iceberg" catalog.
iceberg_namespaces = spark.sql("SHOW DATABASES in iceberg")
iceberg_namespaces.toPandas()

Unnamed: 0,namespace
0,default
1,social_media


In [10]:
# List the tables in the iceberg.social_media namespace.
social_media_tables = spark.sql("SHOW TABLES in iceberg.social_media")
social_media_tables.toPandas()

Unnamed: 0,namespace,tableName,isTemporary
0,social_media,twitter_data,False


In [11]:
# Show the column and metadata description of the twitter_data table.
twitter_data_description = spark.sql("DESCRIBE EXTENDED iceberg.social_media.twitter_data")
twitter_data_description.toPandas()

Unnamed: 0,col_name,data_type,comment
0,created_at,bigint,
1,id,bigint,
2,id_str,string,
3,text,string,
4,,,
5,# Metadata Columns,,
6,_spec_id,int,
7,_partition,struct<>,
8,_file,string,
9,_pos,bigint,


In [12]:
# Maximum number of rows collected to the driver for the preview below.
ICEBERG_PREVIEW_ROWS = 100

# Lazy DataFrame over the whole Iceberg table; nothing is collected yet.
result_df = spark.sql("""
SELECT 
    * 
FROM 
    iceberg.social_media.twitter_data
""")

# toPandas() loads every selected row into driver memory, so bound the preview
# instead of collecting the full table. result_df itself stays unrestricted.
result_df.limit(ICEBERG_PREVIEW_ROWS).toPandas()

Unnamed: 0,created_at,id,id_str,text
0,1730801736976,-3640497172244038836,-3640497172244038836,totally in love with how lovely Cybersecurity is!
1,1730801736465,-9106780994739652073,-9106780994739652073,looking forward to see how incredible MachineLearningEngineering is!
2,1730801736465,-5100121538830561777,-5100121538830561777,totally in love with how cool WebDevelopment is!
3,1730801736465,-5739059549601626058,-5739059549601626058,just discovered how awesome WebDevelopment is!
4,1730801736567,-3162480303247874826,-3162480303247874826,prepared for how incredible SecurityEngineering is!
...,...,...,...,...
95,1730801738513,116197036578620183,116197036578620183,prepared for how great MobileDevelopment is!
96,1730801738513,7501468125245342077,7501468125245342077,blown away by how kind Security is!
97,1730801738717,3062939358194572779,3062939358194572779,impressed with how nice DevOps is!
98,1730801738921,-6440107327848405237,-6440107327848405237,totally in love with how perfect Cloud is!
