In [1]:
import sys
import os
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
import subprocess

# Define parameters in a dictionary
parameters = {
    'JOB_NAME': 'FullLoadAndCDCProcesserJob',
    'RAW_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/01_raw/',
    'STAGING_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/02_staging/',
    'PREPROCESSED_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/03_preprocessed/',
    'MASTER_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/04_master/',
    'CURATED_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/06_curated/',
    'TABLE_NAME': 'orders',
    'PROCESS_TYPE': 'full_load',
    'SOURCE': 'netSuite'
}

# Mock function to simulate getResolvedOptions
def get_resolved_options(args, keys):
    return {key: parameters[key] for key in keys}

# Mock sys.argv to simulate Glue job arguments
sys.argv = [
    '--JOB_NAME', parameters['JOB_NAME'],
    '--RAW_S3_PATH', parameters['RAW_S3_PATH'],
    '--STAGING_S3_PATH', parameters['STAGING_S3_PATH'],
    '--PREPROCESSED_S3_PATH', parameters['PREPROCESSED_S3_PATH'],
    '--MASTER_S3_PATH', parameters['MASTER_S3_PATH'],
    '--CURATED_S3_PATH', parameters['CURATED_S3_PATH'],
    '--TABLE_NAME', parameters['TABLE_NAME'],
    '--PROCESS_TYPE', parameters['PROCESS_TYPE'],
    '--SOURCE', parameters['SOURCE']
]

# Use the mock get_resolved_options function
args = get_resolved_options(sys.argv, [
    'JOB_NAME', 
    'RAW_S3_PATH', 
    'STAGING_S3_PATH', 
    'PREPROCESSED_S3_PATH', 
    'MASTER_S3_PATH',
    'CURATED_S3_PATH', 
    'TABLE_NAME', 
    'PROCESS_TYPE',
    'SOURCE'
])

# Set environment variables
os.environ['SPARK_VERSION'] = '3.1'

# Path to the Iceberg JAR file
iceberg_jar_path = "/opt/glue/jars/iceberg-spark-3.1_2.12-1.3.1.jar"
hadoop_aws_jar_path = "/opt/glue/jars/hadoop-aws-3.2.0.jar"
aws_sdk_jar_path = "/opt/glue/jars/aws-java-sdk-bundle-1.11.375.jar"

# Initialize Spark session with Iceberg and S3 configurations
spark = SparkSession.builder \
    .config("spark.jars", ",".join([iceberg_jar_path, hadoop_aws_jar_path, aws_sdk_jar_path])) \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.master_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.master_catalog.type", "hadoop") \
    .config("spark.sql.catalog.master_catalog.warehouse", "s3://ecommerce-data-lake-us-east-1-dev/04_master/") \
    .config("spark.sql.catalog.curated_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.curated_catalog.type", "hadoop") \
    .config("spark.sql.catalog.curated_catalog.warehouse", "s3://ecommerce-data-lake-us-east-1-dev/06_curated/") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .getOrCreate()

24/06/26 16:37:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/26 16:37:53 WARN SparkSession: Cannot use org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions to configure session extensions.
java.lang.ClassNotFoundException: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
	at java.base/java.net.URLClassLoader.findClass(Unknown Source)
	at java.base/java.lang.ClassLoader.loadClass(Unknown Source)
	at java.base/java.lang.ClassLoader.loadClass(Unknown Source)
	at java.base/java.lang.Class.forName0(Native Method)
	at java.base/java.lang.Class.forName(Unknown Source)
	at org.apache.spark.util.Utils$.classForName(Utils.scala:225)
	at org.apache.spark.sql.SparkSession$.$anonfun$applyExtensions$1(SparkSession.scala:1294)
	at org.apache.spark.sql.Spark

In [2]:
# Install Deequ
subprocess.check_call([sys.executable, "-m", "pip", "install", "--target=/tmp", "pydeequ"])
sys.path.insert(0, '/tmp')

Collecting pydeequ
  Using cached pydeequ-1.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting numpy>=1.14.1 (from pydeequ)
  Using cached numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting pandas>=0.23.0 (from pydeequ)
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting python-dateutil>=2.8.2 (from pandas>=0.23.0->pydeequ)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas>=0.23.0->pydeequ)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=0.23.0->pydeequ)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas>=0.23.0->pydeequ)
  Using cached six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Using cached pydeequ-1.3.0-py3-none-any.whl (37 kB)
Using cached numpy-2.0.0-cp310-cp310-manylinu

[0m

In [4]:
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite

In [5]:
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import col, lit, current_date, expr, when
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [None]:
# Read source data
source_df = spark.read.format("csv").option("header", "true").load(args['RAW_S3_PATH'] + args['SOURCE'] + '/' + args['TABLE_NAME'] + '/' + args['PROCESS_TYPE'])

# Show the data
source_df.show()

glueContext = SparkSession.builder.getOrCreate()
print(args)