### Select an image:
https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html#jupyter-base-notebook

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import os

# MinIO connection settings. They are read from the environment so real
# credentials never have to be committed with the notebook; the fallbacks are
# the values this notebook previously hardcoded (local development setup).
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio.airflow:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

# Spark session whose S3A filesystem points at MinIO instead of AWS S3.
spark = (
    SparkSession.builder
    .appName("spark-minio")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    # Address buckets as <endpoint>/<bucket> rather than <bucket>.<endpoint>.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)

In [5]:
# Validate the versions (Spark 3.5 is more compatible with MinIO).
# jupyter-pyspark image: quay.io/jupyter/pyspark-notebook:spark-3.5.1
import pyspark

print("PySpark:", pyspark.__version__)
print("Spark:", spark.version)
# "hadoop.version" is not set in hadoopConfiguration(), so looking it up there
# prints None; ask Hadoop's own VersionInfo class through the JVM gateway.
print("Hadoop:", spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())
# Confirm the S3A endpoint configured on the session actually reached Hadoop.
print(spark.sparkContext._jsc.hadoopConfiguration().get("fs.s3a.endpoint"))

PySpark: 3.5.1
Spark: 3.5.1
Hadoop: None
http://minio.airflow:9000


In [6]:
# Sample rows, one tuple per person, in the column order declared by `schema`.
data = [
    ("James", "Smith", "36636", "M", 3000),
    ("Michael", "Rose", "40288", "M", 4000),
    ("Robert", "Williams", "42114", "M", 4000),
    ("Maria", "Jones", "39192", "F", 4000),
    ("Jen", "Brown", "39193", "F", -1),  # NOTE(review): -1 looks like a placeholder salary; confirm
]

# Explicit schema: every column is nullable; only `salary` is numeric.
schema = StructType(
    [
        StructField(column, dtype, True)
        for column, dtype in (
            ("first_name", StringType()),
            ("last_name", StringType()),
            ("id", StringType()),
            ("gender", StringType()),
            ("salary", IntegerType()),
        )
    ]
)

df = spark.createDataFrame(data=data, schema=schema)

In [7]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
df.toPandas()

Unnamed: 0,first_name,last_name,id,gender,salary
0,James,Smith,36636,M,3000
1,Michael,Rose,40288,M,4000
2,Robert,Williams,42114,M,4000
3,Maria,Jones,39192,F,4000
4,Jen,Brown,39193,F,-1


In [8]:
# Write the DataFrame to MinIO as Parquet through the s3a:// connector,
# replacing anything already stored at that path.
(
    df.write
    .mode("overwrite")
    .parquet("s3a://datalake/bronze/notebook_minio_test")
)

In [9]:
# Read the Parquet data back from MinIO and print it.
df = (
    spark.read
    .format("parquet")
    .load("s3a://datalake/bronze/notebook_minio_test")
)
df.show()

+----------+---------+-----+------+------+
|first_name|last_name|   id|gender|salary|
+----------+---------+-----+------+------+
|    Robert| Williams|42114|     M|  4000|
|   Michael|     Rose|40288|     M|  4000|
|     James|    Smith|36636|     M|  3000|
|     Maria|    Jones|39192|     F|  4000|
|       Jen|    Brown|39193|     F|    -1|
+----------+---------+-----+------+------+

