# Spark Connectivity Test to Vast DB and Vast S3

## Load Endpoint Environment Variables

These environment variables were set when your Docker container was created.

In [1]:
import os

def tail_chars(value, keep=4):
    """Return the last ``keep`` characters of ``value`` for display.

    Args:
        value: The string to abbreviate, or ``None`` when the environment
            variable it came from is not set.
        keep: Number of trailing characters to return.

    Returns:
        The trailing characters of ``value``, or the placeholder
        ``"<unset>"`` when ``value`` is ``None`` (slicing ``None`` would
        otherwise raise ``TypeError`` and hide which variable is missing).
    """
    if value is None:
        return "<unset>"
    return value[-keep:]


DOCKER_HOST_OR_IP = os.getenv("DOCKER_HOST_OR_IP")

# Vast DB endpoint and credentials.
VASTDB_ENDPOINT = os.getenv("VASTDB_ENDPOINT")
VASTDB_ACCESS_KEY = os.getenv("VASTDB_ACCESS_KEY")
VASTDB_SECRET_KEY = os.getenv("VASTDB_SECRET_KEY")

VASTDB_TWITTER_INGEST_BUCKET = os.getenv("VASTDB_TWITTER_INGEST_BUCKET")
VASTDB_TWITTER_INGEST_SCHEMA = os.getenv("VASTDB_TWITTER_INGEST_SCHEMA")
VASTDB_TWITTER_INGEST_TABLE = os.getenv("VASTDB_TWITTER_INGEST_TABLE")

# S3 endpoint and credentials (read from the S3A_* variables).
S3_ENDPOINT = os.getenv("S3A_ENDPOINT")
S3_ACCESS_KEY = os.getenv("S3A_ACCESS_KEY")
S3_SECRET_KEY = os.getenv("S3A_SECRET_KEY")

S3A_ICEBERG_URI = os.getenv("S3A_ICEBERG_URI")

# Only the last four characters of each key are printed so the full
# credentials never end up in the saved notebook output.
print(f"""
---
DOCKER_HOST_OR_IP={DOCKER_HOST_OR_IP}
---
VASTDB_ENDPOINT={VASTDB_ENDPOINT}
VASTDB_ACCESS_KEY={tail_chars(VASTDB_ACCESS_KEY)}
VASTDB_SECRET_KEY=****{tail_chars(VASTDB_SECRET_KEY)}
VASTDB_TWITTER_INGEST_BUCKET={VASTDB_TWITTER_INGEST_BUCKET}
VASTDB_TWITTER_INGEST_SCHEMA={VASTDB_TWITTER_INGEST_SCHEMA}
VASTDB_TWITTER_INGEST_TABLE={VASTDB_TWITTER_INGEST_TABLE}
---
S3_ENDPOINT={S3_ENDPOINT}
S3_ACCESS_KEY={tail_chars(S3_ACCESS_KEY)}
S3_SECRET_KEY=****{tail_chars(S3_SECRET_KEY)}
S3A_ICEBERG_URI={S3A_ICEBERG_URI}
---
""")


---
DOCKER_HOST_OR_IP=se-var-vastdb-ingest
---
VASTDB_ENDPOINT=http://172.200.201.1:80
VASTDB_ACCESS_KEY=DYF8
VASTDB_SECRET_KEY=****Usuu
VASTDB_TWITTER_INGEST_BUCKET=csnowdb
VASTDB_TWITTER_INGEST_SCHEMA=social_media
VASTDB_TWITTER_INGEST_TABLE=tweets
---
S3_ENDPOINT=http://172.200.201.1:80
S3_ACCESS_KEY=DYF8
S3_SECRET_KEY=****Usuu
S3A_ICEBERG_URI=s3a://csnow-bucket/iceberg/
---



## Specify other Environment Variables

In [2]:
# Application name handed to the SparkSession builder.
SPARK_APPLICATION_NAME = "Spark Demo"

## Start Spark Session

In [3]:
import socket
import os
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
# Let pandas show up to 150 characters per column.
pd.set_option("max_colwidth", 150)

# Values that appear in more than one setting are named once here.
vast_connector_classpath = '/usr/local/spark/jars/spark3-vast-3.4.1-f93839bfa38a/*'
metastore_thrift_uri = f"thrift://{DOCKER_HOST_OR_IP}:9083"

spark_settings = [
    # Driver address: the IP that this machine's own hostname resolves to.
    ("spark.driver.host", socket.gethostbyname(socket.gethostname())),
    # ("spark.sql.execution.arrow.pyspark.enabled", "false"),

    # VASTDB: catalog "ndb", with the same endpoint for control and data.
    ("spark.sql.catalog.ndb", 'spark.sql.catalog.ndb.VastCatalog'),
    ("spark.ndb.endpoint", VASTDB_ENDPOINT),
    ("spark.ndb.data_endpoints", VASTDB_ENDPOINT),
    ("spark.ndb.access_key_id", VASTDB_ACCESS_KEY),
    ("spark.ndb.secret_access_key", VASTDB_SECRET_KEY),
    ("spark.driver.extraClassPath", vast_connector_classpath),
    ("spark.executor.extraClassPath", vast_connector_classpath),
    ("spark.sql.extensions", 'ndb.NDBSparkSessionExtension'),

    # ICEBERG: catalog "iceberg" of type hive, reached over thrift.
    ("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog"),
    ("spark.sql.catalog.iceberg.type", "hive"),
    ("spark.sql.catalog.iceberg.uri", metastore_thrift_uri),

    # S3A: endpoint and credentials, with SSL switched off.
    ("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("fs.s3a.endpoint", S3_ENDPOINT),
    ("fs.s3a.access.key", S3_ACCESS_KEY),
    ("fs.s3a.secret.key", S3_SECRET_KEY),
    ("fs.s3a.endpoint.region", "vast"),
    ("fs.s3a.connection.ssl.enabled", "false"),

    # Hive: same thrift URI as the Iceberg catalog.
    ("hive.metastore.uris", metastore_thrift_uri),
]

conf = SparkConf()
conf.setAll(spark_settings)

# Single local master, Hive support enabled; an existing session is reused.
spark = (
    SparkSession.builder
    .master("local")
    .appName(SPARK_APPLICATION_NAME)
    .config(conf=conf)
    .enableHiveSupport()
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("DEBUG")

print("Spark successfully loaded")

Spark successfully loaded


In [4]:
# Bare expression so the notebook displays the SparkContext held in `sc`.
sc

## Connect to Vast DB

### Specify Environment

In [5]:
# The bucket is the one loaded from VASTDB_TWITTER_INGEST_BUCKET;
# the schema and table names are fixed for this notebook.
BUCKET_NAME = VASTDB_TWITTER_INGEST_BUCKET
DATABASE_SCHEMA = "NYC_DATA"
TABLE_NAME = "yellow_tripdata_2024"

### Connect and run a query

In [6]:
# Fully qualified schema name: the "ndb" catalog, then the back-quoted
# bucket and schema names, joined with dots.
DATABASE_FULLNAME = ".".join(["ndb", f"`{BUCKET_NAME}`", f"`{DATABASE_SCHEMA}`"])

# Safe to re-run: the schema is only created when it does not exist yet.
create_schema_sql = f"create schema if not exists {DATABASE_FULLNAME}"
spark.sql(create_schema_sql)

# Optional: make this schema the default so object names need no qualifier.
# https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-usedb.html
# spark.sql(f"use {DATABASE_FULLNAME}")

print(f"Using {DATABASE_FULLNAME=}")

Using DATABASE_FULLNAME='ndb.`csnowdb`.`NYC_DATA`'


## Read Parquet on Vast S3 and write to Vast DB

Configure s3cmd tool

In [7]:
# Run the s3cmd_configure.sh script installed under /usr/local/bin.
! /usr/local/bin/s3cmd_configure.sh

Check that S3 access is working

In [8]:
# List nyc-data/ in the bucket named by the S3A_BUCKET variable.
! s3cmd ls s3://$S3A_BUCKET/nyc-data/

2024-11-14 13:30     49961641  s3://csnow-bucket/nyc-data/yellow_tripdata_2024-01.parquet
2024-11-14 13:30     50349284  s3://csnow-bucket/nyc-data/yellow_tripdata_2024-02.parquet


In [9]:
# Download the January and February 2024 yellow-taxi trip files.
# -nc tells wget not to overwrite a file that is already present, so
# re-running this cell does not download again; --quiet suppresses output.
# NOTE(review): with -nc an existing partial file is presumably skipped
# rather than resumed by --continue - confirm against the wget manual.
! wget -nc --continue \
       --quiet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet
! wget -nc --continue \
       --quiet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet

In [10]:
# Upload every .parquet file matched by the glob to nyc-data/ in the
# bucket named by the S3A_BUCKET variable.
! s3cmd put *.parquet s3://${S3A_BUCKET}/nyc-data/

upload: 'yellow_tripdata_2024-01.parquet' -> 's3://csnow-bucket/nyc-data/yellow_tripdata_2024-01.parquet' (49961641 bytes in 0.4 seconds, 124.90 MB/s) [1 of 2]
upload: 'yellow_tripdata_2024-02.parquet' -> 's3://csnow-bucket/nyc-data/yellow_tripdata_2024-02.parquet' (50349284 bytes in 0.4 seconds, 134.88 MB/s) [2 of 2]


In [11]:
# List nyc-data/ again to check what the bucket now holds.
! s3cmd ls s3://$S3A_BUCKET/nyc-data/

2024-11-14 13:31     49961641  s3://csnow-bucket/nyc-data/yellow_tripdata_2024-01.parquet
2024-11-14 13:31     50349284  s3://csnow-bucket/nyc-data/yellow_tripdata_2024-02.parquet


In [12]:
from pyspark.sql.functions import col

# Bucket holding the Parquet files. It is read from the same S3A_BUCKET
# environment variable that the s3cmd cells use, instead of being hard-coded,
# and falls back to the bucket this notebook was originally written against.
# The Python name is deliberately NOT `S3A_BUCKET`: IPython expands `{name}`
# in `!` shell lines from the Python namespace, which would change how the
# `${S3A_BUCKET}` reference in the upload cell is interpreted.
TRIPS_SOURCE_BUCKET = os.getenv("S3A_BUCKET", "csnow-bucket")

# Load every Parquet file under nyc-data/ as one DataFrame.
df = spark.read.format("parquet") \
    .load(f"s3a://{TRIPS_SOURCE_BUCKET}/nyc-data/")

# cast unsupported field types - just use string for now
df = df.withColumn("tpep_pickup_datetime", col("tpep_pickup_datetime").cast("string")) \
       .withColumn("tpep_dropoff_datetime", col("tpep_dropoff_datetime").cast("string"))

In [13]:
# Print the column names and types of df.
df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [14]:
# Register df under the name "temp_trips" so SQL statements can select from it.
df.createOrReplaceTempView("temp_trips")

In [25]:
# Drop any earlier copy of the target table so the notebook can be re-run.
# The result is intentionally not assigned to `df`: `df` is the DataFrame
# registered as the "temp_trips" view, and overwriting it would make a re-run
# of that registration cell register the wrong (empty) DataFrame.
# The trailing semicolon keeps the notebook from displaying the result.
spark.sql(f"""
DROP TABLE IF EXISTS {DATABASE_FULLNAME}.`{TABLE_NAME}`
""");

In [26]:
# This next cell can take a few minutes

In [27]:
# Copy every row of the "temp_trips" view into a new table.
# The result is intentionally not assigned to `df`: `df` is the DataFrame
# registered as "temp_trips", and overwriting it would break a re-run of the
# cells that use it. The table name is back-quoted to match the way the
# DROP TABLE statement quotes it.
# The trailing semicolon keeps the notebook from displaying the result.
spark.sql(f"""
CREATE TABLE {DATABASE_FULLNAME}.`{TABLE_NAME}`
AS
SELECT * FROM temp_trips
""");

In [28]:
# Fetch at most 100 rows of the table and show them as a pandas DataFrame.
preview_query = f"""
SELECT * FROM {DATABASE_FULLNAME}.{TABLE_NAME} LIMIT 100
"""
spark.sql(preview_query).toPandas()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2024-02-04 22:14:32,2024-02-04 22:24:19,2,1.81,1,N,68,79,1,12.1,1.0,0.5,3.42,0.00,1.0,20.52,2.5,0.00
1,2,2024-02-04 22:28:46,2024-02-04 23:00:12,1,21.22,1,N,132,254,2,78.6,1.0,0.5,0.00,6.94,1.0,89.79,0.0,1.75
2,2,2024-02-04 22:04:42,2024-02-04 22:14:28,1,2.51,1,N,142,229,1,12.8,1.0,0.5,3.56,0.00,1.0,21.36,2.5,0.00
3,2,2024-02-04 22:25:09,2024-02-04 22:38:36,3,7.27,1,N,138,256,1,30.3,6.0,0.5,7.56,0.00,1.0,47.11,0.0,1.75
4,2,2024-02-04 22:01:59,2024-02-04 22:18:09,1,9.37,1,N,70,49,1,36.6,6.0,0.5,5.00,0.00,1.0,50.85,0.0,1.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2024-02-04 22:07:25,2024-02-04 22:40:24,1,18.25,2,N,132,211,1,70.0,0.0,0.5,14.80,0.00,1.0,90.55,2.5,1.75
96,2,2024-02-04 22:25:27,2024-02-04 22:35:11,1,2.50,1,N,158,50,1,12.8,1.0,0.5,3.56,0.00,1.0,21.36,2.5,0.00
97,2,2024-02-04 22:47:08,2024-02-04 22:52:17,1,0.93,1,N,90,114,1,7.2,1.0,0.5,2.44,0.00,1.0,14.64,2.5,0.00
98,2,2024-02-04 22:05:34,2024-02-04 22:16:31,2,2.09,1,N,239,262,1,13.5,1.0,0.5,3.70,0.00,1.0,22.20,2.5,0.00
