In [1]:
from pyspark.sql import *
from pyspark import SparkConf
import os

# Maven coordinates that spark-submit resolves before the JVM starts.
# PYSPARK_SUBMIT_ARGS must be set before the first SparkSession is created,
# otherwise the packages are not picked up.
SPARK_PACKAGES = [
    "com.amazonaws:aws-java-sdk:1.10.34",
    "org.apache.spark:spark-hadoop-cloud_2.12:3.4.0",
    "org.apache.iceberg:iceberg-spark:1.5.0",
    "org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.5.0",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.4_2.12:0.80.0",
    "org.apache.iceberg:iceberg-aws-bundle:1.5.0",
]
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + ','.join(SPARK_PACKAGES) + ' pyspark-shell'

conf = SparkConf()

# Arrow-based Spark <-> pandas conversion. The key used previously
# ("spark.sql.execution.pyarrow.enabled") is not a Spark setting and was
# silently ignored; this is the documented key for Spark 3.x.
conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Register an Iceberg catalog named `arctic`, backed by a Nessie server.
conf.set("spark.sql.catalog.arctic", "org.apache.iceberg.spark.SparkCatalog")
conf.set("spark.sql.catalog.arctic.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")

# Table data location and the (non-default) S3 endpoint it is served from.
conf.set("spark.sql.catalog.arctic.warehouse", "s3://warehouse")
conf.set("spark.sql.catalog.arctic.s3.endpoint", "http://storage:9000")

# Nessie API endpoint and the reference (branch) the catalog reads and writes.
conf.set("spark.sql.catalog.arctic.uri", "http://catalog:19120/api/v1")
conf.set("spark.sql.catalog.arctic.ref", "main")

# Use Iceberg's native S3 FileIO for the catalog's data and metadata files.
conf.set("spark.sql.catalog.arctic.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

# The catalog server is accessed without authentication.
conf.set("spark.sql.catalog.arctic.authentication.type", "NONE")

# SQL extensions: Iceberg first, then Nessie (comma-separated, order preserved).
conf.set(
    "spark.sql.extensions",
    "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
    "org.projectnessie.spark.extensions.NessieSparkSessionExtensions",
)

print("Spark will be instantiated. This may take a while specially the first time for downloading. Wait until it says Spark is running...")

# Start the session, or reuse one that already exists in this kernel.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print("Spark is Running!")

Spark will be instantiated. This may take a while specially the first time for downloading. Wait until it says Spark is running...
Spark is Running!


In [2]:
# List the namespaces registered in the `arctic` catalog.
spark.sql("SHOW DATABASES IN arctic").show()

+---------+
|namespace|
+---------+
|     test|
+---------+



In [3]:
# Print the column names and types of the orders table.
spark.sql("DESCRIBE arctic.test.orders").show()

+----------+---------+-------+
|  col_name|data_type|comment|
+----------+---------+-------+
|  order_id|   bigint|   NULL|
|first_name|   string|   NULL|
| last_name|   string|   NULL|
|     email|   string|   NULL|
|     brand|   string|   NULL|
|     model|   string|   NULL|
|sale_price|   bigint|   NULL|
|    rating|   double|   NULL|
+----------+---------+-------+



In [4]:
# Preview rows of the orders table (show() prints only the first rows).
spark.sql("SELECT * from arctic.test.orders").show()

+--------+----------+-----------+--------------------+--------------------+--------------------+----------+------+
|order_id|first_name|  last_name|               email|               brand|               model|sale_price|rating|
+--------+----------+-----------+--------------------+--------------------+--------------------+----------+------+
|   10068|   Kristan|    Dalgety|  grighy5k@yandex.ru|        Harvey-Kuhic|    Max Sidekick 734|      7495|   0.0|
|   10069|     Mozes|     Gillio| mkubas73@cdbaby.com|        Torphy Group|   Perf Sidekick 251|     18995|   4.8|
|   10070|    Nanete|  Bartalini|  bthireau6b@cdc.gov|    Langworth-Little|        Perf Pro 804|     10497|   0.0|
|   10071|  Ruthanne|     Little|  lhubane@joomla.org|  Ankunding and Sons|        Pro Perf 959|      7995|   5.0|
|   10072|    Dwight|     Vassel|   wgorsse6j@umn.edu|        Torphy Group| Pro TrailRunner 582|     14995|   4.6|
|   10073|    Danica|       Medd|carmytage1i@netlo...|    Langworth-Little|Impre

In [6]:
# Total number of rows in the orders table.
spark.sql("SELECT COUNT(*) from arctic.test.orders").show()

+--------+
|count(1)|
+--------+
|    9315|
+--------+

