In [1]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import pandas as pd

# Lakekeeper REST catalog endpoint. The "lakekeeper" hostname matches the
# "docker compose" testing and development environment; replace it when running
# elsewhere (e.g. use "localhost" if Lakekeeper is running locally).
CATALOG_URL = "http://lakekeeper:8181/catalog"
WAREHOUSE = "sepahram"

# Version strings used to build the Iceberg runtime coordinates further down.
ICEBERG_VERSION = "1.9.2"
SPARK_VERSION = pyspark.__version__
# Keep only the first two dot-separated components, i.e. "<major>.<minor>".
SPARK_MINOR_VERSION = ".".join(SPARK_VERSION.split(".", 2)[:2])

In [2]:
# Echo the Iceberg version configured in the first cell.
ICEBERG_VERSION

'1.9.2'

In [3]:
# Echo the version reported by the installed pyspark package.
SPARK_VERSION


'3.5.6'

In [4]:
# Echo the "<major>.<minor>" prefix derived from SPARK_VERSION.
SPARK_MINOR_VERSION

'3.5'

# Connect with Spark

In [5]:
# Maven coordinates for the Iceberg Spark runtime, the Iceberg AWS bundle and the
# Hadoop S3A connector. Only the AWS bundle is listed (no Azure/GCP bundles).
# This string is only consumed by the "spark.jars.packages" option, which is
# currently disabled in `config` below.
# NOTE(review): hadoop-aws 3.4.x appears to be built against AWS SDK v2, so the
# v1 `aws-java-sdk-bundle` pin may be unnecessary or mismatched - confirm against
# the jars baked into the Spark Dockerfile before re-enabling the option.
HADOOP_VERSION = "3.4.2"

spark_jars_packages = (
    f"org.apache.iceberg:iceberg-spark-runtime-{SPARK_MINOR_VERSION}_2.12:{ICEBERG_VERSION},"
    f"org.apache.iceberg:iceberg-aws-bundle:{ICEBERG_VERSION},"
    f"org.apache.hadoop:hadoop-aws:{HADOOP_VERSION},"
    "com.amazonaws:aws-java-sdk-bundle:1.12.698"
)

# Register Lakekeeper as an Iceberg REST catalog named "lakekeeper" and make it
# the session's default catalog.
config = {
    "spark.sql.catalog.lakekeeper": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.lakekeeper.type": "rest",
    "spark.sql.catalog.lakekeeper.uri": CATALOG_URL,
    "spark.sql.catalog.lakekeeper.warehouse": WAREHOUSE,
    "spark.sql.defaultCatalog": "lakekeeper",
    # Disabled: the required jars are already downloaded in the Spark Dockerfile.
    # "spark.jars.packages": spark_jars_packages,
}

In [8]:
# Assemble the Spark configuration in one fluent chain: local master, app name,
# then every catalog setting from `config`.
spark_config = (
    SparkConf()
    .setMaster("local[*]")
    .setAppName("Iceberg-REST-Cluster")
    .setAll(list(config.items()))
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=spark_config).getOrCreate()

# Point subsequent SQL statements at "lakekeeper".
spark.sql("USE lakekeeper")

25/09/10 15:44:03 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(Unknown Source)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(Unknown Source)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Unknown Source)
java.base/java.lang.reflect.Constructor.newInstance(Unknown Source)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java

DataFrame[]

## Read and Write Tables

In [None]:
# Optional setup step, left disabled: create a namespace for demo tables.
# spark.sql(f"CREATE NAMESPACE IF NOT EXISTS spark_namespace")
# List the namespaces visible through the current catalog as a pandas DataFrame.
spark.sql("SHOW NAMESPACES").toPandas()

In [None]:
# One-row sample frame (an int, a string and a float column) for the write demo below.
data = pd.DataFrame(
    {
        "id": [1],
        "strings": ["a-string"],
        "floats": [2.2],
    }
)
# Write demo, left disabled: convert to a Spark DataFrame and (re)create an Iceberg table.
# sdf = spark.createDataFrame(data)
# sdf.writeTo(f"spark_namespace.cluster_table").createOrReplace()

In [9]:
# Preview the source table. toPandas() collects the full result onto the driver,
# so cap the row count instead of pulling an unbounded table into local memory.
PREVIEW_ROWS = 1000
spark.sql("SELECT * FROM banking.source_transactions").limit(PREVIEW_ROWS).toPandas()

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
                                                                                

Unnamed: 0,transactionId,userId,timestamp,amount,currency,city,country,merchantName,paymentMethod,ipAddress,voucherCode,affiliateId
0,c95abc35-707a-4f40-8e10-720d93831b2d,garrettglenn,2025-09-10 14:15:23.588611,659.45,USD,North Emilyport,Nicaragua,Bates-Hoffman,online_transfer,104.229.233.98,,03888a74-8ef8-4b1a-b4b5-c4ce2f4ad32c
1,338c4e2b-d069-4301-aa27-7137a70fa0a1,alvarezdaniel,2025-09-10 14:15:24.235241,27.43,GBP,East Nicholaschester,Uruguay,Nichols-Schmidt,online_transfer,160.159.181.20,DISCOUNT10,0de178f0-a709-4e76-82bb-36b401daad96
2,7f997c27-4ddd-4ec5-b37a-8df62a181528,madeline98,2025-09-10 14:15:25.088741,863.54,GBP,South Williamhaven,Finland,Velez-King,credit_card,42.16.204.243,DISCOUNT10,1a890798-eb4d-44ed-a2d9-49b648755370
3,35cbdfb1-0f94-48e2-a8f9-60c2f6776255,amanda52,2025-09-10 14:15:25.932779,905.04,GBP,Port Meganburgh,Botswana,Velasquez PLC,credit_card,185.109.30.22,,f4a9e070-337e-4a88-8c80-4ab54a266c83
4,2bff4afc-95b1-4d39-ab43-d61e9a197c42,ugardner,2025-09-10 14:15:26.451489,304.14,GBP,Yangfort,Malta,Shah-Ruiz,online_transfer,59.90.182.58,DISCOUNT10,a8291a3f-bb59-4319-b0ac-aa3566afe53e
...,...,...,...,...,...,...,...,...,...,...,...,...
995,6cbcbd2a-22f0-42f9-951e-c3e8a2ed02cc,wardkatherine,2025-09-10 14:12:56.654949,574.86,USD,Rossland,Taiwan,Phillips-Harrington,credit_card,120.113.87.15,,c09de8a0-aa39-4cd3-beb5-527d42c0f8f0
996,3a0d2561-65ce-453b-9ffe-1c1be2b09e8e,vazquezkatie,2025-09-10 14:12:57.094817,452.81,GBP,Simontown,Haiti,Brown PLC,credit_card,169.4.157.162,,d2ef291d-9aff-4e01-baac-68e5b9dcb40b
997,c8449c01-7c69-4ab4-b587-bb96f1b6b9f3,briannahutchinson,2025-09-10 14:12:57.834892,231.34,GBP,Jeromechester,Chad,Marquez LLC,debit_card,64.63.254.187,,679fd474-f9d9-4ba7-9ad5-21350a7c2ae7
998,bb39cabe-615b-4551-b19a-47e3ad611757,emilygreen,2025-09-10 14:12:58.353552,10.94,GBP,Davisland,Papua New Guinea,Thomas Inc,debit_card,113.181.97.178,DISCOUNT10,4c1078bb-d5f1-44b0-99ac-fc9c4771898a


In [11]:
# Ten merchants with the highest summed transaction amount, plus how many
# transactions each one has.
top_merchants_sql = """
SELECT 
    merchantName, 
    SUM(amount) AS total_sales, 
    COUNT(*) AS transaction_count
FROM banking.source_transactions
GROUP BY merchantName
ORDER BY total_sales DESC
LIMIT 10;
"""
top_merchants = spark.sql(top_merchants_sql)
top_merchants.toPandas()

                                                                                

Unnamed: 0,merchantName,total_sales,transaction_count
0,Nguyen PLC,2508.52,3
1,Lopez Group,1964.38,3
2,Hill PLC,1754.41,3
3,Reyes LLC,1564.77,2
4,Jenkins PLC,1552.13,2
5,Harris LLC,1386.27,2
6,Turner Ltd,1302.92,2
7,Williams Inc,1289.28,2
8,Henderson Inc,1276.77,2
9,Martinez PLC,1233.47,2


In [12]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()