In [5]:
import os
from pyspark.sql import SparkSession

# Set environment variables
os.environ['SPARK_VERSION'] = '3.1'

# Paths to JAR files
iceberg_jar_path = "/opt/glue/jars/iceberg-spark3-runtime-0.12.0.jar"
hadoop_aws_jar_path = "/opt/glue/jars/hadoop-aws-3.2.0.jar"
aws_sdk_jar_path = "/opt/glue/jars/aws-java-sdk-bundle-1.11.375.jar"
pydeequ_jar_path = "/opt/glue/jars/deequ-glue-1.0-SNAPSHOT-jar-with-dependencies.jar"

# Initialize Spark session with Iceberg, S3, and PyDeequ configurations
spark = SparkSession.builder \
    .config("spark.jars", ",".join([iceberg_jar_path, hadoop_aws_jar_path, aws_sdk_jar_path, pydeequ_jar_path])) \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.master_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.master_catalog.type", "hadoop") \
    .config("spark.sql.catalog.master_catalog.warehouse", "s3://ecommerce-data-lake-us-east-1-dev/04_master/") \
    .config("spark.sql.catalog.curated_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.curated_catalog.type", "hadoop") \
    .config("spark.sql.catalog.curated_catalog.warehouse", "s3://ecommerce-data-lake-us-east-1-dev/06_curated/") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .getOrCreate()

print("Spark session initialized successfully")


24/06/26 20:30:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/26 20:30:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark session initialized successfully


In [3]:
import subprocess
import sys

# Install Deequ
subprocess.check_call([sys.executable, "-m", "pip", "install", "--target=/tmp", "pydeequ"])
sys.path.insert(0, '/tmp')

Collecting pydeequ
  Using cached pydeequ-1.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting numpy>=1.14.1 (from pydeequ)
  Using cached numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pandas>=0.23.0 (from pydeequ)
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting python-dateutil>=2.8.2 (from pandas>=0.23.0->pydeequ)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas>=0.23.0->pydeequ)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=0.23.0->pydeequ)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas>=0.23.0->pydeequ)
  Using cached six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Using cached pydeequ-1.3.0-py3-none-any.whl (37 kB)
Using cached numpy-2.0.0-cp310-cp310-manylinu

[0m

In [6]:
from pydeequ.checks import *
from pydeequ.verification import *

# Create a simple DataFrame
data = [("Alice", 34), ("Bob", 45), ("Charlie", 29)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

# Perform data quality checks
check = Check(spark, CheckLevel.Warning, "example-check")
check = check.hasSize(lambda x: x >= 3) \
             .hasMin("age", lambda x: x >= 0) \
             .isComplete("name") \
             .isUnique("name")

# Run the checks
verification_result = VerificationSuite(spark) \
    .onData(df) \
    .addCheck(check) \
    .run()

verification_result_df = VerificationResult.checkResultsAsDataFrame(spark, verification_result)
verification_result_df.show()


Python Callback server started!


Py4JError: An error occurred while calling o71.hasMin. Trace:
py4j.Py4JException: Method hasMin([class java.lang.String, class jdk.proxy3.$Proxy39, class scala.None$, class scala.None$]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:321)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:329)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Unknown Source)

