# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables were set when your Docker container was created.

In [1]:
import os

DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={VASTDB_ACCESS_KEY[-4:]}
VASTDB_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={S3_ACCESS_KEY[-4:]}
S3_SECRET_KEY=****{VASTDB_SECRET_KEY[-4:]}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=se-var-vastdb-ingest
---
VASTDB_ENDPOINT=http://172.200.201.1:80
VASTDB_ACCESS_KEY=DYF8
VASTDB_SECRET_KEY=****Usuu
VASTDB_TWITTER_INGEST_BUCKET=csnowdb
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.201.1:80
S3_ACCESS_KEY=DYF8
S3_SECRET_KEY=****Usuu
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



## Specify other Environment Variables

In [2]:
# Application name passed to the SparkSession builder.
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
# Let pandas show up to 150 characters per column when rendering frames.
pd.set_option("max_colwidth", 150)

# All session settings in one chained call; order matches the sections below.
conf = SparkConf().setAll([
    # Driver address: the resolved IP of this machine's host name.
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    # ("spark.sql.execution.arrow.pyspark.enabled", "false"),

    # --- Vast DB ("ndb" catalog) ---
    ("spark.sql.catalog.ndb", "spark.sql.catalog.ndb.VastCatalog"),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", "/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*"),
    ("spark.executor.extraClassPath", "/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*"),
    ("spark.sql.extensions", "ndb.NDBSparkSessionExtension"),

    # --- Iceberg catalog backed by the Hive metastore ---
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", f"thrift://{DOCKER_HOST_OR_IP}:9083"),

    # --- S3A file system ---
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),

    # --- Hive metastore ---
    ("hive.metastore.uris", f"thrift://{DOCKER_HOST_OR_IP}:9083"),
])

# Build (or reuse) a local-mode session with Hive support enabled.
spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

# Keep a handle on the SparkContext and switch its log level to DEBUG.
sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded")

Spark successfully loaded


In [4]:
# Bare expression: the notebook displays the SparkContext as this cell's output.
sc

## Connect to Vast DB

### Specify Environment

In [5]:
# Vast DB target for this demo: bucket (from the environment), schema, and table.
BUCKET_NAME = VASTDB_TWITTER_INGEST_BUCKET
DATABASE_SCHEMA = "NYC_DATA"
TABLE_NAME = "yellow_tripdata_2024"

### Connect and run a query

In [6]:
# Fully qualified schema name in the "ndb" catalog; bucket and schema are
# back-quoted so they are treated as single identifiers.
DATABASE_FULLNAME = f"ndb.`{BUCKET_NAME}`.`{DATABASE_SCHEMA}`"

# Create the schema only when it is missing, so the cell can be re-run.
spark.sql(
    f"create schema if not exists {DATABASE_FULLNAME}"
)

# Optional: make this schema the default so object names need no qualification.
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
# spark.sql(f"use {DATABASE_FULLNAME}")

print(f"Using {DATABASE_FULLNAME=}")

Using DATABASE_FULLNAME='ndb.`csnowdb`.`NYC_DATA`'


## Read Parquet on Vast S3 and write to Vast DB

Check that the S3 mount is working.

In [7]:
# List the top level of the ../s3 directory.
! ls ../s3

dst1_test-suite  iceberg  import-files	nyc-data  userdata


In [8]:
# Create ../s3/nyc-data/ if it is missing (-p makes this a no-op when it exists).
! mkdir -p ../s3/nyc-data/

In [9]:
! pip install --quiet s3cmd
! s3cmd --access_key=${S3A_ACCESS_KEY} \
        --secret_key=${S3A_SECRET_KEY} \
        --host=$(echo $S3A_ENDPOINT | sed -s 's@http://@@' | sed -s 's@:.*@@g') \
        --host-bucket=${S3A_BUCKET} \
        --no-check-certificate \
        ls s3://${S3A_BUCKET}/

                          DIR  s3://csnow-bucket/dst1_test-suite/
                          DIR  s3://csnow-bucket/iceberg/
                          DIR  s3://csnow-bucket/import-files/
                          DIR  s3://csnow-bucket/nyc-data/
                          DIR  s3://csnow-bucket/userdata/


In [10]:
# Download the yellow_tripdata Parquet files for 2024-01 and 2024-02 into the
# current directory. -nc skips files that already exist locally; --quiet
# suppresses wget's progress output.
! wget -nc --continue \
       --quiet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet
! wget -nc --continue \
       --quiet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet

In [11]:
# Upload every .parquet file in the current directory to the nyc-data/ prefix
# of the bucket named by S3A_BUCKET. The --host value is S3A_ENDPOINT with the
# http:// scheme and the :port suffix stripped by sed.
! s3cmd --access_key=${S3A_ACCESS_KEY} \
        --secret_key=${S3A_SECRET_KEY} \
        --host=$(echo $S3A_ENDPOINT | sed -s 's@http://@@' | sed -s 's@:.*@@g') \
        --host-bucket=${S3A_BUCKET} \
        --no-check-certificate \
        --quiet \
        put *.parquet s3://${S3A_BUCKET}/nyc-data/



In [12]:
# List ../s3/nyc-data to check that the uploaded Parquet files are visible there.
! ls ../s3/nyc-data

yellow_tripdata_2024-01.parquet  yellow_tripdata_2024-02.parquet


In [13]:
from pyspark.sql.functions import col

# Load every Parquet file under the nyc-data/ prefix through S3A.
df = (
    spark.read
    .format("parquet")
    .load("s3a://csnow-bucket/nyc-data/")
)

# cast unsupported field types - just use string for now
df = (
    df
    .withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime").cast("string"))
    .withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime").cast("string"))
)

In [14]:
# Print each column's name, type, and nullability as a tree.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [15]:
# Register df under the view name "temp_trips" so SQL statements can select from it.
df.createOrReplaceTempView("temp_trips")

In [16]:
# Drop any previous copy of the target table so the notebook can be re-run.
# The result is intentionally NOT assigned to `df`: rebinding `df` to this
# empty DDL result would make a later re-run of the cells that use `df`
# operate on the wrong DataFrame. The trailing `;` suppresses the cell output.
spark.sql(f"""
DROP TABLE IF EXISTS {DATABASE_FULLNAME}.{TABLE_NAME}
""");

In [17]:
# This next cell can take a few minutes

In [18]:
# Create the Vast DB table from the rows of the temp_trips view (CTAS).
# The result is intentionally NOT assigned to `df`: rebinding `df` to this
# DDL result would make a later re-run of the cells that use `df` operate on
# the wrong DataFrame. The trailing `;` suppresses the cell output.
spark.sql(f"""
CREATE TABLE {DATABASE_FULLNAME}.{TABLE_NAME}
AS
SELECT * FROM temp_trips
""");

In [19]:
# Read back up to 100 rows from the new table and show them as a pandas frame.
(
    spark.sql(f"""
SELECT * FROM {DATABASE_FULLNAME}.{TABLE_NAME} LIMIT 100
""")
    .toPandas()
)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-02-06 14:26:56,2024-02-06 14:30:52,1,1.06,1,N,262,75,2,7.2,0.0,0.5,0.00,0.0,1.0,11.20,2.5,0.0
1,2,2024-02-06 14:33:53,2024-02-06 14:40:17,1,1.24,1,N,75,236,2,8.6,0.0,0.5,0.00,0.0,1.0,12.60,2.5,0.0
2,2,2024-02-06 14:00:20,2024-02-06 14:11:24,2,1.03,1,N,237,233,1,11.4,0.0,0.5,0.00,0.0,1.0,15.40,2.5,0.0
3,2,2024-02-06 14:32:39,2024-02-06 14:39:57,2,1.26,1,N,164,161,1,9.3,0.0,0.5,2.66,0.0,1.0,15.96,2.5,0.0
4,2,2024-02-06 14:15:02,2024-02-06 14:50:55,1,4.65,1,N,161,42,1,32.4,0.0,0.5,7.28,0.0,1.0,43.68,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2024-02-06 14:35:55,2024-02-06 14:47:55,1,3.34,1,N,263,170,1,17.7,0.0,0.5,5.42,0.0,1.0,27.12,2.5,0.0
96,2,2024-02-06 14:56:23,2024-02-06 15:09:11,1,0.86,1,N,161,230,2,11.4,0.0,0.5,0.00,0.0,1.0,15.40,2.5,0.0
97,2,2024-02-06 14:48:21,2024-02-06 14:58:00,1,1.77,1,N,114,224,1,11.4,0.0,0.5,4.62,0.0,1.0,20.02,2.5,0.0
98,2,2024-02-06 14:13:01,2024-02-06 14:36:35,2,4.09,1,N,232,246,1,24.7,0.0,0.5,5.74,0.0,1.0,34.44,2.5,0.0
