In [2]:
import sys
import os
from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

# Define parameters in a dictionary
parameters = {
    'JOB_NAME': 'FullLoadAndCDCProcesserJob',
    'RAW_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/01_raw/',
    'STAGING_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/02_staging/',
    'PREPROCESSED_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/03_preprocessed/',
    'MASTER_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/04_master/',
    'CURATED_S3_PATH': 's3://ecommerce-data-lake-us-east-1-dev/06_curated/',
    'TABLE_NAME': 'orders',
    'PROCESS_TYPE': 'full_load',
    'SOURCE': 'netSuite'
}

# Mock function to simulate getResolvedOptions
def get_resolved_options(args, keys):
    return {key: parameters[key] for key in keys}

# Mock sys.argv to simulate Glue job arguments
sys.argv = [
    '--JOB_NAME', parameters['JOB_NAME'],
    '--RAW_S3_PATH', parameters['RAW_S3_PATH'],
    '--STAGING_S3_PATH', parameters['STAGING_S3_PATH'],
    '--PREPROCESSED_S3_PATH', parameters['PREPROCESSED_S3_PATH'],
    '--MASTER_S3_PATH', parameters['MASTER_S3_PATH'],
    '--CURATED_S3_PATH', parameters['CURATED_S3_PATH'],
    '--TABLE_NAME', parameters['TABLE_NAME'],
    '--PROCESS_TYPE', parameters['PROCESS_TYPE'],
    '--SOURCE', parameters['SOURCE']
]

# Use the mock get_resolved_options function
args = get_resolved_options(sys.argv, [
    'JOB_NAME', 
    'RAW_S3_PATH', 
    'STAGING_S3_PATH', 
    'PREPROCESSED_S3_PATH', 
    'MASTER_S3_PATH',
    'CURATED_S3_PATH', 
    'TABLE_NAME', 
    'PROCESS_TYPE',
    'SOURCE'
])

# Paths to JAR files
hadoop_aws_jar_path = "/opt/glue/jars/hadoop-aws-3.2.0.jar"
aws_sdk_jar_path = "/opt/glue/jars/aws-java-sdk-bundle-1.11.375.jar"

# Initialize Spark session with S3 configurations
spark = SparkSession.builder \
    .config("spark.jars", ",".join([hadoop_aws_jar_path, aws_sdk_jar_path])) \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .config("spark.driver.extraJavaOptions", "--add-opens java.base/java.nio=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED") \
    .getOrCreate()

print("Spark session initialized successfully")

# Read source data
source_df = spark.read.format("csv").option("header", "true").load('s3a://ecommerce-data-lake-us-east-1-dev/customers.csv')

# Show the data
source_df.show()


Spark session initialized successfully
+-----------------+---------------+--------------------+--------------------+-----------+
|      customer_id|           name|               email|             address|signup_date|
+-----------------+---------------+--------------------+--------------------+-----------+
|             5376|    Jaime Weber|  adam81@example.org|98843 Justin Squares|       null|
|  West Jayborough|      PA 27422"|          2021-01-26|                null|       null|
|             9429|   Leonard Ruiz|rachel38@example.com|38493 Dougherty K...|       null|
| West Thomashaven|      DC 70645"|          2021-11-01|                null|       null|
|             7584| Brittany Perez|  esmith@example.net|30683 Anderson Vista|       null|
|    South Brandon|      RI 34383"|          2020-12-05|                null|       null|
|             1781|   Robert Moore|kentkaufman@examp...| 959 Jill Throughway|       null|
|       Thomasberg|      RI 27772"|          2022-04-03|     