# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables have been set when your docker container was created.

In [1]:
import os

DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={S3_ACCESS_KEY[-4:]}
S3_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
""")


---
DOCKER_HOST_OR_IP=se-var-vastdb-ingest
---
VASTDB_ENDPOINT=http://172.200.201.1:80
VASTDB_ACCESS_KEY=DYF8
VASTDB_SECRET_KEY=****Usuu
---
S3_ENDPOINT=http://172.200.201.1:80
S3_ACCESS_KEY=DYF8
S3_SECRET_KEY=****Usuu
---



## Specify other Environment Variables

In [2]:
SPARK_APPLICATION_NAME='Spark Demo'

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.setAll([
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
     # VASTDB
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    ("spark.executor.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),
    # ICEBERG
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", f"thrift://{DOCKER_HOST_OR_IP}:9083"),
    # S3A
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    # HIVE METASTORE
    ("hive.metastore.uris", f"thrift://{DOCKER_HOST_OR_IP}:9083")
])

spark = SparkSession.builder \
    .master("local") \
    .appName(SPARK_APPLICATION_NAME) \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded")

Spark successfully loaded


In [4]:
sc

## Connect to Vast DB

### Specify Environment

In [5]:
BUCKET_NAME = 'csnowdb'
DATABASE_SCHEMA='twitter_import'
TABLE_NAME='twitter_data'

### Connect and run a query

In [6]:
DATABASE_FULLNAME = f"ndb.{BUCKET_NAME}.{DATABASE_SCHEMA}"

spark.sql(f"create schema if not exists {DATABASE_FULLNAME}")

# Set the database name so we don't have to fully qualify all object names
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
# spark.sql(f"use {DATABASE_FULLNAME}")

print(f"Using {DATABASE_FULLNAME=}")

Using DATABASE_FULLNAME='ndb.csnowdb.twitter_import'


In [7]:
df = spark.sql(f"""
SELECT 
  *
FROM 
  {DATABASE_FULLNAME}.{TABLE_NAME}
WHERE
  created_at in (
    SELECT 
      MAX(created_at) latest_created_at
    FROM 
      {DATABASE_FULLNAME}.{TABLE_NAME}
  )
ORDER BY id
LIMIT 1
""")

In [8]:
df.show()

+--------------------+--------------------+--------------------+--------------------+
|          created_at|                  id|              id_str|                text|
+--------------------+--------------------+--------------------+--------------------+
|2024/11/02 19:30:...|-9199571530558861734|-9199571530558861734|eager to see how ...|
+--------------------+--------------------+--------------------+--------------------+



## Connect to Vast S3

### Specify Environment

In [9]:
spark.sql("SHOW DATABASES in iceberg").show()

+------------+
|   namespace|
+------------+
|     default|
|social_media|
+------------+



In [10]:
spark.sql("SHOW TABLES in iceberg.social_media").show()

+------------+------------+-----------+
|   namespace|   tableName|isTemporary|
+------------+------------+-----------+
|social_media|twitter_data|      false|
+------------+------------+-----------+



In [11]:
spark.sql("DESCRIBE EXTENDED iceberg.social_media.twitter_data").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|          created_at|              string|   null|
|                  id|              bigint|   null|
|              id_str|              string|   null|
|                text|              string|   null|
|                    |                    |       |
|  # Metadata Columns|                    |       |
|            _spec_id|                 int|       |
|          _partition|            struct<>|       |
|               _file|              string|       |
|                _pos|              bigint|       |
|            _deleted|             boolean|       |
|                    |                    |       |
|# Detailed Table ...|                    |       |
|                Name|iceberg.social_me...|       |
|                Type|             MANAGED|       |
|            Location|s3a://csnow-bucke...|       |
|           

In [12]:
result_df = spark.sql("""
SELECT 
    * 
FROM 
    iceberg.social_media.twitter_data
""")
result_df.show(10)

+--------------------+--------------------+--------------------+--------------------+
|          created_at|                  id|              id_str|                text|
+--------------------+--------------------+--------------------+--------------------+
|                   1|                   1|                   1|                Yay!|
|2024/11/02 16:15:...|-1659833618039073746|-1659833618039073746|impressed with ho...|
|2024/11/02 16:15:...| 6512778609215664072| 6512778609215664072|motivated by how ...|
|2024/11/02 16:15:...| 8555625018826030287| 8555625018826030287|motivated by how ...|
|2024/11/02 16:15:...| 8614357796825723058| 8614357796825723058|finally got how w...|
|2024/11/02 16:15:...| 3520836456545401736| 3520836456545401736|totally in love w...|
|2024/11/02 16:15:...| 4371851260384616541| 4371851260384616541|can't believe how...|
|2024/11/02 16:15:...| -671018250379665149| -671018250379665149|impressed with ho...|
|2024/11/02 16:15:...| 1433228438804513320| 1433228438