# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables have been set when your docker container was created.

In [1]:
import os 

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

print(f"""
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={S3_ACCESS_KEY[-4:]}
S3_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
---
""")


---
VASTDB_ENDPOINT=http://172.200.201.1:80
VASTDB_ACCESS_KEY=DYF8
VASTDB_SECRET_KEY=****Usuu
---
S3_ENDPOINT=http://172.200.201.1:80
S3_ACCESS_KEY=DYF8
S3_SECRET_KEY=****Usuu
---



## Specify other Environment Variables

In [2]:
SPARK_APPLICATION_NAME='Spark Demo'

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.setAll([
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    ("spark.executor.extraClassPath", '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'),
    # ("spark.jars.packages", "com.amazonaws:aws-java-sdk:1.12.742,org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-client:3.3.4"),
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension')
])

spark = SparkSession.builder \
    .master("local") \
    .appName(SPARK_APPLICATION_NAME) \
    .config(conf=conf) \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded")

Spark successfully loaded


## Connect to Vast DB

### Specify Environment

In [4]:
BUCKET_NAME = 'csnowdb'
DATABASE_SCHEMA='twitter_import'
TABLE_NAME='twitter_data'

### Connect and run a query

In [5]:
DATABASE_FULLNAME = f"ndb.{BUCKET_NAME}.{DATABASE_SCHEMA}"

spark.sql(f"create schema if not exists {DATABASE_FULLNAME}")

# Set the database name so we don't have to fully qualify all object names
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
# spark.sql(f"use {DATABASE_FULLNAME}")

print(f"Using {DATABASE_FULLNAME=}")

Using DATABASE_FULLNAME='ndb.csnowdb.twitter_import'


In [6]:
df = spark.sql(f"""
SELECT 
  *
FROM 
  {DATABASE_FULLNAME}.{TABLE_NAME}
WHERE
  created_at in (
    SELECT 
      MAX(created_at) latest_created_at
    FROM 
      {DATABASE_FULLNAME}.{TABLE_NAME}
  )
ORDER BY id
LIMIT 1
""")

In [7]:
df.show()

+--------------------+--------------------+--------------------+--------------------+
|          created_at|                  id|              id_str|                text|
+--------------------+--------------------+--------------------+--------------------+
|2024/11/02 13:48:...|-9204228685057172416|-9204228685057172416|so excited about ...|
+--------------------+--------------------+--------------------+--------------------+



## Connect to Vast S3

### Specify Environment

In [8]:
S3_PATH = 's3a://csnow-bucket/csnow_spark'

**WARNING**: the next cell does not return - it just hangs.

In [9]:
data = [("Alice", 1), ("Bob", 2), ("Cathy", 3)]
cols = ["Name", "Id"]

s3_df = spark.createDataFrame(data, cols)
s3_df.show()

+-----+---+
| Name| Id|
+-----+---+
|Alice|  1|
|  Bob|  2|
|Cathy|  3|
+-----+---+



In [10]:
s3_df.write.mode('append').parquet(S3_PATH)

In [12]:
spark.sql(f"""
CREATE EXTERNAL TABLE IF NOT EXISTS spark_demo (
    Name STRING,
    Id INT
)
STORED AS PARQUET
LOCATION '{S3_PATH}';
""")

AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT).;
'CreateTable `spark_catalog`.`default`.`spark_demo`, org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe, Ignore


In [None]:
spark.sql(f"""
INSERT INTO TABLE spark_demo VALUES ('abc', 123);
""")

In [None]:
# Verify that the table has been created and query it
result_df = spark.sql("SELECT * FROM spark_demo")
result_df.show()