In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import dayofweek, to_date, to_timestamp, year, hour, minute, month, when, dayofmonth, dayofweek
from pyspark.sql.functions import concat_ws, lpad, lit
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from pyspark.sql import functions, types

#from sklearn.ensemble import IsolationForest
#from sklearn.preprocessing import StandardScaler
#from sklearn.decomposition import PCA

#import pandas as pd
import os

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Kubernetes namespace the demo is deployed into.
# Read from the NAMESPACE environment variable (the same lookup the Spark
# session cells use) and fall back to "default"; set the variable, or edit the
# fallback, when deploying the demo into a non-default product namespace.
NAMESPACE: str = os.environ.get("NAMESPACE", "default")

In [3]:
# Manual check via pyarrow.fs: list everything under the raw gas-sensor prefix
# without going through Spark.
import pyarrow.fs as fs

s3 = fs.S3FileSystem(
    endpoint_override="http://minio:9000/",
    access_key="admin",
    secret_key="adminadmin",
    scheme="http",
)
files = s3.get_file_info(
    fs.FileSelector("demo/gas-sensor/raw/", recursive=True)
)
for f in files:
    print("Found file:", f.path)

Found file: demo/gas-sensor/raw/20160930_203718.csv


In [None]:
# Environment check via shell commands: list the spark-core jar(s) found in
# /usr/local/spark/jars, then print the Python, Java and PySpark versions
# available in this notebook container.
! ls -al /usr/local/spark/jars | grep spark-core
! python3 -V
! java --version
! pyspark --version

In [4]:
from pyspark.sql import SparkSession

# Local-mode sanity check: start an in-process session, print its version and
# show a two-row DataFrame.
spark = SparkSession.builder.appName("local-test").master("local[*]").getOrCreate()
try:
    print("Spark version:", spark.version)
    df = spark.createDataFrame([("a", 1), ("b", 2)], ["col1", "col2"])
    df.show()
finally:
    # Always stop the local session, even when the check fails: a session left
    # running would be handed back by the next getOrCreate() call and its
    # master/cluster settings would silently win over the ones requested there.
    spark.stop()

Spark version: 3.5.1
+----+----+
|col1|col2|
+----+----+
|   a|   1|
|   b|   2|
+----+----+



In [None]:
import os
from pyspark.sql import SparkSession

# Namespace and driver pod name come from the environment when available;
# otherwise fall back to "default" and a name derived from USER and NAMESPACE.
NAMESPACE = os.getenv("NAMESPACE", "default")
POD_NAME = os.getenv("HOSTNAME", f"jupyter-{os.getenv('USER', 'default')}-{NAMESPACE}")

# Executor images tried so far, with the observed outcome:
#
#   oci.stackable.tech/sdp/spark-k8s:3.5.0-stackable24.3.0
#       works with python-3.11 notebook image
#   oci.stackable.tech/sdp/spark-k8s:3.5.2-stackable24.11.1
#       jars differ in size, 17.0.12 vs. 17.0.13, 3.11.10 vs. 3.11.9;
#       SerializableBuffer conflict
#   apache/spark:3.5.2-java17-python3
#       java, jars match; Python in worker has different version (3, 10) than
#       that in driver 3.11, PySpark cannot run with different minor versions
#   spark:3.5.2-scala2.12-java17-python3-ubuntu
#       java, jars match; Python in worker has different version (3, 10) than
#       that in driver 3.11, PySpark cannot run with different minor versions
#   bitnami/spark:3.5.2
#       Python in worker has different version (3, 12) than that in driver
#       3.11, PySpark cannot run with different minor versions
#
# Selected: custom image with python 3.11 - works!
# (based off: spark:3.5.2-scala2.12-java17-ubuntu)
EXECUTOR_IMAGE = "spark:3.5.2-python311"

# Build the session against the in-cluster Kubernetes API server. All active
# options are passed in one mapping; insertion order matches the order in
# which they were previously chained.
spark = (
    SparkSession.builder
    .master(f'k8s://https://{os.environ["KUBERNETES_SERVICE_HOST"]}:{os.environ["KUBERNETES_SERVICE_PORT"]}')
    .appName("process-s3-data")
    .config(map={
        "spark.kubernetes.container.image": EXECUTOR_IMAGE,
        "spark.kubernetes.container.image.pullPolicy": "IfNotPresent",
        "spark.kubernetes.namespace": NAMESPACE,
        "spark.kubernetes.authenticate.driver.serviceAccountName": "spark",
        "spark.kubernetes.authenticate.executor.serviceAccountName": "spark",
        "spark.driver.port": "2222",
        "spark.driver.blockManager.port": "7777",
        "spark.executor.instances": "1",
        "spark.executor.memory": "1g",
        "spark.executor.cores": "1",
        "spark.kubernetes.driver.pod.name": POD_NAME,
        # --- options tried while debugging, currently disabled ---
        # "spark.driver.host": f"driver-service.{NAMESPACE}.svc.cluster.local",
        # "spark.driver.bindAddress": "0.0.0.0",
        # bitnami. See https://github.com/bitnami/containers/issues/52698#issuecomment-2275913474
        # "spark.executorEnv.LD_PRELOAD": "/opt/bitnami/common/lib/libnss_wrapper.so",
        # "spark.jars": "/usr/local/spark/jars/spark-core_2.12-3.5.2.jar",
        # "spark.driver.extraClassPath": "/usr/local/spark/jars/spark-core_2.12-3.5.2.jar",
        # "spark.executor.extraClassPath": "/stackable/spark/spark-core_2.12-3.5.2.jar",
        # "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.6,org.apache.hadoop:hadoop-common:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.767",
        # "spark.driver.extraClassPath": "/usr/local/spark/jars/spark-core_2.12-3.5.1.jar:/usr/local/spark/jars/spark-sql_2.12-3.5.1.jar:/usr/local/spark/jars/hadoop-aws-3.3.6.jar:/usr/local/spark/jars/hadoop-common-3.3.6.jar",
        # "spark.executor.extraClassPath": "/opt/spark/jars/spark-core_2.12-3.5.1.jar:/opt/spark/jars/spark-sql_2.12-3.5.1.jar:/opt/spark/jars/hadoop-aws-3.3.6.jar:/opt/spark/jars/hadoop-common-3.3.6.jar",
        # "spark.hadoop.fs.s3a.endpoint": "http://minio:9000/",
        # "spark.hadoop.fs.s3a.path.style.access": "true",
        # "spark.hadoop.fs.s3a.access.key": "admin",
        # "spark.hadoop.fs.s3a.secret.key": "adminadmin",
        # "spark.hadoop.fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
        # "spark.submit.deployMode": "client",
        # Log classloader source
        # "spark.driver.extraJavaOptions": "-verbose:class",
        # "spark.executor.extraJavaOptions": "-verbose:class",
    })
    .getOrCreate()
)

In [None]:
# Smoke test: build a two-row, two-column DataFrame and render it.
df = spark.createDataFrame(
    data=[("a", 1), ("b", 2)],
    schema=["col1", "col2"],
)
df.show()

In [None]:
def log_classpath_simple(partition):
    """Report classpath-related details of the worker processing a partition.

    The function runs inside the Python worker, so it relies on the standard
    library only (imports are local so that nothing from the notebook's
    namespace has to be shipped along with it).

    Args:
        partition: Iterator over the partition's elements. It is drained, but
            the values themselves are not used.

    Yields:
        One dict with the worker's hostname, its Python version, the values of
        the SPARK_HOME, SPARK_CLASSPATH and CLASSPATH environment variables
        (None when unset) and the sorted spark-core jar paths found under
        ``$SPARK_HOME/jars`` (empty when SPARK_HOME is unset).
    """
    import glob
    import os
    import platform
    import socket

    for _ in partition:
        pass

    spark_home = os.environ.get("SPARK_HOME")
    core_jars = []
    if spark_home:
        core_jars = sorted(glob.glob(os.path.join(spark_home, "jars", "spark-core*.jar")))

    yield {
        "host": socket.gethostname(),
        "python": platform.python_version(),
        "SPARK_HOME": spark_home,
        "SPARK_CLASSPATH": os.environ.get("SPARK_CLASSPATH"),
        "CLASSPATH": os.environ.get("CLASSPATH"),
        "spark_core_jars": core_jars,
    }


# Run the probe on a single-element RDD and bring the report back to the driver.
spark.sparkContext.parallelize([1]).mapPartitions(log_classpath_simple).collect()

In [None]:
# Test: report the session's Spark version, then run a trivial job end to end.
print("Spark version:", spark.version)
df = spark.createDataFrame(
    data=list(zip(("a", "b"), (1, 2))),
    schema=["col1", "col2"],
)
df.show()

In [None]:
# getOrCreate() returns a session that is still alive instead of building a new
# one, in which case the master and container image requested below would be
# ignored. Stop any session left over from an earlier cell first.
if "spark" in globals():
    spark.stop()

# Kubernetes-backed session using the Stackable executor image, with the driver
# reachable through the driver-service in NAMESPACE.
# NOTE(review): the image is tagged 3.5.2 while the recorded output reports a
# 3.5.1 driver; the InvalidClassException recorded for the following test cell
# is consistent with such a driver/executor mismatch - confirm and align.
spark = (
    SparkSession
    .builder
    .master(f'k8s://https://{os.environ["KUBERNETES_SERVICE_HOST"]}:{os.environ["KUBERNETES_SERVICE_PORT"]}')
    .config("spark.kubernetes.container.image", "oci.stackable.tech/sdp/spark-k8s:3.5.2-stackable24.11.1")
    #.config("spark.kubernetes.driver.container.image", "quay.io/jupyter/pyspark-notebook:python-3.11.8")
    .config("spark.driver.port", "2222")
    .config("spark.driver.blockManager.port", "7777")
    .config("spark.driver.host", f"driver-service.{NAMESPACE}.svc.cluster.local")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.kubernetes.namespace", NAMESPACE)
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.kubernetes.authenticate.serviceAccountName", "spark")
    .config("spark.executor.instances", "1")
    .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
    # S3A / MinIO settings, currently disabled:
    #.config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000/")
    #.config("spark.hadoop.fs.s3a.path.style.access", "true")
    #.config("spark.hadoop.fs.s3a.access.key", "admin")
    #.config("spark.hadoop.fs.s3a.secret.key", "adminadmin")
    #.config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    #.config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.6,org.apache.hadoop:hadoop-common:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.767")
    .appName("process-s3-data")
    .getOrCreate()
)

In [6]:
# Verify Spark version: print the version reported by the current `spark`
# session (this is the driver-side Spark version).
print("Spark version:", spark.version)

Spark version: 3.5.1


In [7]:
# Minimal action: creating and showing a tiny DataFrame forces a job to run,
# which exercises driver-to-executor task serialization.
df = spark.createDataFrame(
    data=[("a", 1), ("b", 2)],
    schema=["col1", "col2"],
)
df.show()

Py4JJavaError: An error occurred while calling o107.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3) (10.244.0.154 executor 1): java.io.InvalidClassException: org.apache.spark.rdd.RDD; local class incompatible: stream classdesc serialVersionUID = 3516924559342767982, local class serialVersionUID = 823754013007382808
	at java.base/java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:560)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2020)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1870)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2020)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1870)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2201)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:86)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3537)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
Caused by: java.io.InvalidClassException: org.apache.spark.rdd.RDD; local class incompatible: stream classdesc serialVersionUID = 3516924559342767982, local class serialVersionUID = 823754013007382808
	at java.base/java.io.ObjectStreamClass.initNonProxy(ObjectStreamClass.java:560)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2020)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1870)
	at java.base/java.io.ObjectInputStream.readNonProxyDesc(ObjectInputStream.java:2020)
	at java.base/java.io.ObjectInputStream.readClassDesc(ObjectInputStream.java:1870)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2201)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2496)
	at java.base/java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:2390)
	at java.base/java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:2228)
	at java.base/java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1687)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:489)
	at java.base/java.io.ObjectInputStream.readObject(ObjectInputStream.java:447)
	at org.apache.spark.serializer.JavaDeserializationStream.readObject(JavaSerializer.scala:87)
	at org.apache.spark.serializer.JavaSerializerInstance.deserialize(JavaSerializer.scala:129)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:86)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [None]:
# Build the small DataFrame that the next cell writes out as a test CSV, and
# display it.
test_df = spark.createDataFrame(
    data=[("a", 1), ("b", 2)],
    schema=["col1", "col2"],
)
test_df.show()

In [None]:
# Write test_df as CSV under the test prefix, replacing whatever is there.
test_df.write.mode("overwrite").csv("s3a://demo/gas-sensor/test/")

In [None]:
# Read the test CSV back from the same prefix and display it.
df = spark.read.format("csv").load("s3a://demo/gas-sensor/test/")
df.show()

In [None]:
# Spark version of the driver.
print(spark.version)
# Hadoop version on the driver's classpath. "hadoop.version" is not a Hadoop
# configuration key, so looking it up in hadoopConfiguration() yields None;
# Hadoop's own VersionInfo utility reports the actual library version.
print(spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())

In [None]:
# Load the raw gas-sensor CSV through s3a and display the first rows.
df = spark.read.load(
    "s3a://demo/gas-sensor/raw/20160930_203718.csv",
    format="csv",
)
df.show()

In [None]:
# Ask the JVM-side SparkContext for its registered JARs; the result is a Scala
# collection, so render it through its toString().
jars_vector = spark.sparkContext._jsc.sc().listJars()
jars_str = jars_vector.toString()
print("Loaded JARs (raw string):", jars_str, sep="\n")

# Also report the Spark version of this session.
print("Spark version:", spark.version)

In [None]:
# Read the driver JVM's "java.class.path" system property through the Py4J
# gateway and print it under a heading.
classpath = spark.sparkContext._jvm.java.lang.System.getProperty("java.class.path")
print("Full JVM Classpath:", classpath, sep="\n")

In [None]:
# traceback is used in the except branch below; import it here so the cell
# does not depend on a later cell having been run first.
import traceback

# Print full classpath of the driver JVM
classpath = spark.sparkContext._jvm.java.lang.System.getProperty("java.class.path")
print("Full JVM Classpath:")
print(classpath)

# Verify Spark version
print("Spark version:", spark.version)

# Set log level (very verbose; intended for debugging only)
spark.sparkContext.setLogLevel("DEBUG")

# Test minimal action: show() forces a job to run
print("Creating DataFrame...")
df = spark.createDataFrame([("a", 1), ("b", 2)], ["col1", "col2"])
print("Attempting show...")
try:
    df.show()
except Exception as e:
    # Report the failure instead of aborting, so the remaining diagnostics run.
    print("Error:", e)
    print("Full stack trace:")
    traceback.print_exc()

# Check executor classpath (if possible)
print("Executor classpath (may require logs):")
# The call's result is the cell's displayed value; it does not itself print a
# classpath. Presumably meant to trigger executor communication - verify.
spark.sparkContext._jsc.sc().getExecutorMemoryStatus()  # Trigger executor communication

In [None]:
import pyarrow.fs as fs

# Remove leftover Spark temporary/shuffle objects from the raw gas-sensor prefix.
s3 = fs.S3FileSystem(endpoint_override="http://minio:9000/", access_key="admin", secret_key="adminadmin", scheme="http")
files = s3.get_file_info(fs.FileSelector("demo/gas-sensor/raw/", recursive=True))
for f in files:
    # A recursive listing also returns directory entries (e.g. "_temporary"
    # itself); delete_file() only accepts regular files, so skip anything else
    # rather than failing midway through the cleanup.
    if not f.is_file:
        continue
    if "_temporary" in f.path or "shuffle" in f.path:
        print("Deleting:", f.path)
        s3.delete_file(f.path)

In [None]:
from pyspark.sql import SparkSession
import traceback

# Start fresh: getOrCreate() would otherwise return the session that is already
# running and ignore the settings below.
if 'spark' in globals():
    spark.stop()

# Session configured for S3A access to the MinIO endpoint. No master is set
# here, so Spark's default master applies.
# NOTE(review): spark.jars.packages also pulls spark-core/spark-sql 3.5.1 in
# addition to the hadoop-aws dependencies - confirm this is intended.
spark = SparkSession.builder \
    .appName("S3AExample") \
    .config("spark.jars.packages", "org.apache.spark:spark-core_2.12:3.5.1,org.apache.spark:spark-sql_2.12:3.5.1,org.apache.hadoop:hadoop-aws:3.3.6,org.apache.hadoop:hadoop-common:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.767") \
    .config("spark.driver.extraClassPath", "/usr/local/spark/jars/spark-core_2.12-3.5.1.jar:/usr/local/spark/jars/spark-sql_2.12-3.5.1.jar") \
    .config("spark.executor.extraClassPath", "/usr/local/spark/jars/spark-core_2.12-3.5.1.jar:/usr/local/spark/jars/spark-sql_2.12-3.5.1.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000/") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.access.key", "admin") \
    .config("spark.hadoop.fs.s3a.secret.key", "adminadmin") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .getOrCreate()
# The secret key above is "adminadmin", matching the MinIO credentials used by
# every other cell in this notebook (it previously read "admin").

# Verify Spark version
print("Spark version:", spark.version)

# Set log level (very verbose; intended for debugging only)
spark.sparkContext.setLogLevel("DEBUG")

In [None]:
# Run a dummy action to log executor classpath
spark.sparkContext.parallelize([1, 2, 3]).foreachPartition(lambda _: 
    print(spark.sparkContext._jvm.java.lang.System.getProperty("java.class.path")))

In [None]:
# Minimal end-to-end check with progress messages; a failure in show() is
# reported together with its Python stack trace instead of aborting the cell.
print("Creating DataFrame...")
df = spark.createDataFrame(
    data=[("a", 1), ("b", 2)],
    schema=["col1", "col2"],
)
print("Attempting show...")
try:
    df.show()
except Exception as exc:
    print("Error:", exc)
    print("Full stack trace:")
    traceback.print_exc()

In [None]:
import os
from pyspark.sql import SparkSession

NAMESPACE = os.environ.get("NAMESPACE", "default")  # Adjust if needed
POD_NAME = os.environ.get("HOSTNAME", f"jupyter-{os.environ.get('USER', 'default')}-{NAMESPACE}")  # Use actual pod name

# getOrCreate() returns a session that is still alive instead of building a new
# one, in which case the Kubernetes master, image and S3A settings below would
# be ignored. Stop any session left over from an earlier cell first.
if "spark" in globals():
    spark.stop()

# Kubernetes-backed session in client mode, using the notebook image for the
# executors and S3A access to the MinIO endpoint.
spark = (
    SparkSession.builder
    .master(f'k8s://https://{os.environ["KUBERNETES_SERVICE_HOST"]}:{os.environ["KUBERNETES_SERVICE_PORT"]}')
    .appName("process-s3-data")
    .config("spark.kubernetes.container.image", "quay.io/jupyter/pyspark-notebook:python-3.11.8")
    .config("spark.kubernetes.container.image.pullPolicy", "IfNotPresent")
    .config("spark.kubernetes.namespace", NAMESPACE)
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.kubernetes.authenticate.executor.serviceAccountName", "spark")
    .config("spark.driver.host", f"driver-service.{NAMESPACE}.svc.cluster.local")
    .config("spark.driver.port", "2222")
    .config("spark.driver.blockManager.port", "7777")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.cores", "1")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.6,org.apache.hadoop:hadoop-common:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.767")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000/")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.access.key", "admin")
    .config("spark.hadoop.fs.s3a.secret.key", "adminadmin")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.submit.deployMode", "client")  # Explicitly enforce client mode
    .config("spark.kubernetes.driver.pod.name", POD_NAME)  # Match current Jupyter pod
    .getOrCreate()
)

# Verify Spark version
print("Spark version:", spark.version)

# Test minimal action: show() forces a job to run on the executors
df = spark.createDataFrame([("a", 1), ("b", 2)], ["col1", "col2"])
df.show()

In [None]:
# Print the Spark version reported by the current `spark` session.
print("Spark version:", spark.version)