# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

In [2]:
!pip install --quiet python-dotenv

In [3]:
from dotenv import load_dotenv, find_dotenv
import os

# Locate a file literally named "dotenv" and load it, letting its values
# override anything already present in the process environment.
# load_dotenv() reports False when nothing was loaded, which is treated as
# "file not found".
dotenv_path = find_dotenv('dotenv')
if not load_dotenv(dotenv_path, override=True):
    raise Exception("Error: dotenv file not found.")

# Every variable the rest of the notebook relies on.
required_vars = [
    "VASTDB_ENDPOINT",
    "VASTDB_ACCESS_KEY",
    "VASTDB_SECRET_KEY",
    "S3_ENDPOINT",
    "S3_ACCESS_KEY",
    "S3_SECRET_KEY",
]

# Fail fast, naming every variable that is unset or empty.
missing = [name for name in required_vars if not os.getenv(name)]
if missing:
    raise Exception(f"Missing required environment variables: {', '.join(missing)}")

# Vast DB connection settings (guaranteed present by the check above).
VASTDB_ENDPOINT = os.environ["VASTDB_ENDPOINT"]
VASTDB_ACCESS_KEY = os.environ["VASTDB_ACCESS_KEY"]
VASTDB_SECRET_KEY = os.environ["VASTDB_SECRET_KEY"]

# Vast S3 connection settings (guaranteed present by the check above).
S3_ENDPOINT = os.environ["S3_ENDPOINT"]
S3_ACCESS_KEY = os.environ["S3_ACCESS_KEY"]
S3_SECRET_KEY = os.environ["S3_SECRET_KEY"]

## Specify other Environment Variables

In [4]:
# Application name passed to the SparkSession builder.
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [5]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Classpath glob for the Vast DB Spark connector jars; the same value is used
# for the driver and for the executors.
VAST_CONNECTOR_JARS = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'

conf = SparkConf()
conf.setAll([
    # Advertise the address this machine's hostname resolves to as the driver host.
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),

    # Vast DB (NDB) connector: endpoint, credentials, jars, catalog and SQL extension.
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", VAST_CONNECTOR_JARS),
    ("spark.executor.extraClassPath", VAST_CONNECTOR_JARS),
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),

    # S3A settings MUST carry the "spark.hadoop." prefix: Spark copies only
    # spark.hadoop.* properties (prefix stripped) into the Hadoop Configuration
    # that the S3A filesystem reads. Bare "fs.s3a.*" keys are never applied, so
    # S3A would ignore the custom endpoint/credentials and try its defaults.
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT),
    ("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY),
    ("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY),
    # Address buckets as <endpoint>/<bucket> instead of <bucket>.<endpoint>;
    # S3-compatible stores usually have no per-bucket DNS names.
    # NOTE(review): confirm path-style access is what this Vast cluster expects.
    ("spark.hadoop.fs.s3a.path.style.access", "true"),
])

# Single-threaded local master; getOrCreate() reuses an already running session,
# in which case restart the kernel for configuration changes to take effect.
spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .getOrCreate()
)

# Keep notebook output readable: only log errors from Spark.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print("Spark successfully loaded")

Spark successfully loaded


## Connect to Vast DB

### Specify Environment

In [None]:
# Vast DB location used by the query cells below: bucket, schema and table.
BUCKET_NAME = "vastdb"
DATABASE_SCHEMA = "demo_twitter_import"
TABLE_NAME = "twitter_data"

### Connect and run a query

In [None]:
# Three-part name: the "ndb" catalog, then the bucket, then the schema.
DATABASE_FULLNAME = f"ndb.{BUCKET_NAME}.{DATABASE_SCHEMA}"

# Create the schema when it is missing, then make it the session default so
# later statements can use unqualified object names.
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
create_schema_sql = f"create schema if not exists {DATABASE_FULLNAME}"
use_schema_sql = f"use {DATABASE_FULLNAME}"
spark.sql(create_schema_sql)
spark.sql(use_schema_sql)

print(f"Using {DATABASE_FULLNAME=}")

In [None]:
# Select every column of TABLE_NAME, resolved against the current schema.
select_all_sql = f"SELECT * FROM {TABLE_NAME}"
df = spark.sql(select_all_sql)

In [None]:
# Print the first rows of the query result (show()'s defaults, spelled out).
df.show(n=20, truncate=True)

## Connect to Vast S3

### Specify Environment

In [6]:
# Bucket on the Vast S3 endpoint that the test data is written to.
S3_BUCKET = "datastore"

**WARNING**: the next cell may never return. The Parquet write to S3 typically hangs when Spark does not pick up the S3A endpoint or credentials.

In [None]:
# Small in-memory sample: three (name, id) rows and the matching column names.
data = [("Alice", 1), ("Bob", 2), ("Cathy", 3)]
cols = ["Name", "Id"]
df = spark.createDataFrame(data, schema=cols)

# Destination prefix inside the S3 bucket, addressed through the s3a:// scheme.
s3_path = f"s3a://{S3_BUCKET}/csnow_spark"

# Write the rows as Parquet, replacing anything already stored under s3_path.
df.write.parquet(s3_path, mode="overwrite")

In [None]:
# Fully qualified name in Spark's built-in session catalog. An earlier cell may
# have switched the current catalog/schema with `use ndb...`; an unqualified
# name would then resolve against the Vast DB catalog instead.
S3_TABLE_NAME = "spark_catalog.default.spark_demo"

# Register the Parquet files under s3_path as a table.
#  - USING PARQUET + LOCATION is Spark's data-source syntax. It creates an
#    external (unmanaged) table and, unlike the Hive-format
#    `CREATE EXTERNAL TABLE ... STORED AS`, does not require Hive support,
#    which the session built in this notebook never enables.
#  - IF NOT EXISTS keeps the cell re-runnable within the same session.
#  - Id is BIGINT because the data is written from Python ints, which Spark
#    infers as 64-bit longs; declaring INT mismatches the Parquet files.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {S3_TABLE_NAME} (
    Name STRING,
    Id BIGINT
)
USING PARQUET
LOCATION '{s3_path}'
""")

# Verify that the table has been created and query it
result_df = spark.sql(f"SELECT * FROM {S3_TABLE_NAME}")
result_df.show()
