In [None]:
import sys  # Used to access arguments passed to the Glue job at runtime

# AWS Glue-specific libraries
from awsglue.transforms import *  # Provides built-in Glue transformations like ApplyMapping, Join, etc.
from awsglue.utils import getResolvedOptions  # Used to retrieve job arguments passed from Glue job parameters
from awsglue.context import GlueContext  # Glue-specific context wrapper for SparkContext
from awsglue.job import Job  # Provides structure for defining and managing the Glue job lifecycle
from awsglue.gluetypes import *  # Contains Glue-compatible data types for schema definitions

# Spark context initialization
from pyspark.context import SparkContext  # Entry point for Spark functionality in Glue

# AWS Glue Data Quality libraries
from awsgluedq.transforms import EvaluateDataQuality  # Used to run DQ rulesets and evaluate data quality

# DynamicFrame abstraction used in AWS Glue
from awsglue import DynamicFrame  # Glue-specific abstraction over Spark DataFrames with built-in transformations


In [None]:
def _find_null_fields(ctx, schema, path, output, nullStringSet, nullIntegerSet, frame):
    """
    Walk a Glue schema depth-first and collect the paths of "null-like" fields.

    Handling per schema node type:
      * StructType: recurse into every child field, extending the dotted path.
      * ArrayType: recurse into the element type only when it is a StructType
        (the path is not extended); arrays of anything else are ignored.
      * NullType: the path is recorded unconditionally.
      * any other type (leaf): the distinct values of the column are collected
        to the driver, strings are stripped, list values are flattened one
        level, and the path is recorded when every observed value belongs to
        the matching null set. StringType is checked against nullStringSet;
        IntegerType, LongType and DoubleType against nullIntegerSet. Leaves of
        any other type are never recorded.

    Notes:
        * A None read from the column is kept as an observed value, so it only
          counts as null-like when the null set itself contains None.
        * A leaf with no rows produces an empty value set, which is a subset of
          any null set, so such a field is recorded.
        * Every leaf converts the frame with toDF() and runs its own
          distinct/collect.

    Args:
        ctx: Glue context; only forwarded to the recursive calls.
        schema: Schema node currently being inspected.
        path (str): Dot-separated path of `schema` inside the frame ("" at the root).
        output (list): Accumulator of null-like field paths; appended to in place.
        nullStringSet: Values treated as null for string fields.
        nullIntegerSet: Values treated as null for integer/long/double fields.
        frame (DynamicFrame): Frame whose data is inspected.

    Returns:
        list: `output`, with the null-like paths found under `schema` appended.
    """

    def _clean(item):
        # Strings are compared after trimming surrounding whitespace
        return item.strip() if isinstance(item, str) else item

    # Struct: descend into each child with the extended path
    if isinstance(schema, StructType):
        for child in schema:
            prefix = path if path == "" else path + "."
            output = _find_null_fields(
                ctx, child.dataType, prefix + child.name, output,
                nullStringSet, nullIntegerSet, frame
            )
        return output

    # Array: only arrays of structs are inspected, under the array's own path
    if isinstance(schema, ArrayType):
        if isinstance(schema.elementType, StructType):
            output = _find_null_fields(
                ctx, schema.elementType, path, output,
                nullStringSet, nullIntegerSet, frame
            )
        return output

    # Fields typed as NullType are null-like by definition
    if isinstance(schema, NullType):
        output.append(path)
        return output

    # Leaf: gather the normalized distinct values of this column
    observed = set()
    for record in frame.toDF().select(path).distinct().collect():
        # The selected column is named after the last path segment
        cell = record[path.split('.')[-1]]
        if isinstance(cell, list):
            observed.update({_clean(item) for item in cell})
        else:
            observed.add(_clean(cell))

    # Pick the null set that applies to this leaf type, if any
    if isinstance(schema, StringType):
        null_like = nullStringSet
    elif isinstance(schema, (IntegerType, LongType, DoubleType)):
        null_like = nullIntegerSet
    else:
        return output

    if observed.issubset(null_like):
        output.append(path)
    return output


In [None]:
def drop_nulls(glueContext, frame, nullStringSet, nullIntegerSet, transformation_ctx) -> DynamicFrame:
    """
    Remove every field of `frame` that `_find_null_fields` reports as null-like.

    Args:
        glueContext (GlueContext): Accepted for call-site symmetry; not read by this
            function (the context is taken from `frame.glue_ctx`).
        frame (DynamicFrame): Input frame to inspect and prune.
        nullStringSet (set): String values treated as null indicators (e.g., {"", "null", "none"}).
        nullIntegerSet (set): Numeric values treated as null indicators (e.g., {0, -1}).
        transformation_ctx (str): Transformation context passed to DropFields.

    Returns:
        DynamicFrame: Result of DropFields applied with the null-like field paths.
    """
    # Scan from the schema root with an empty path and a fresh accumulator
    null_like_paths = _find_null_fields(
        frame.glue_ctx, frame.schema(), "", [], nullStringSet, nullIntegerSet, frame
    )

    # Drop everything the scan reported in a single DropFields pass
    pruned = DropFields.apply(
        frame=frame,
        paths=null_like_paths,
        transformation_ctx=transformation_ctx,
    )
    return pruned


In [None]:
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    """
    Run a Spark SQL query over a set of DynamicFrames and return the result as a DynamicFrame.

    Each frame in `mapping` is registered as a temporary view under its alias, the
    query is executed on the Spark session of `glueContext`, and the resulting
    DataFrame is converted back to a DynamicFrame.

    Args:
        glueContext (GlueContext): Glue context whose Spark session runs the query and
            which is used to build the returned DynamicFrame.
        query (str): SQL text referring to the aliases in `mapping`.
        mapping (dict): Maps view alias (str) to the DynamicFrame exposed under that alias.
        transformation_ctx (str): Transformation context given to the returned DynamicFrame.

    Returns:
        DynamicFrame: The query result.
    """

    # Expose each input frame to SQL under its alias (replaces any existing view of that name)
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)

    # Use the session of the context we were given rather than a module-level `spark`
    # global, so the function does not depend on a name defined elsewhere in the script.
    result = glueContext.spark_session.sql(query)

    # Convert the resulting Spark DataFrame back into a Glue DynamicFrame
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)


# ---- Glue Job Initialization Section ---- #

# Resolve the job arguments from the command line; only JOB_NAME is requested here
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create the Spark context and wrap it in a Glue context
sc = SparkContext()
glueContext = GlueContext(sc)

# Spark session exposed by the Glue context, kept as a module-level name for SQL operations
spark = glueContext.spark_session

# Create the Glue job object and initialize it with the job name and the resolved arguments
job = Job(glueContext)
job.init(args['JOB_NAME'], args)


# ---- Data Quality Default Ruleset ---- #

# Ruleset text handed to the EvaluateDataQuality calls in this script.
# It currently holds a single rule, `ColumnCount > 0`; further rules can be
# added inside the `Rules = [...]` list as needed per dataset.
DEFAULT_DATA_QUALITY_RULESET = """
    Rules = [
        ColumnCount > 0
    ]
"""


In [None]:
# Script generated for node airline_raw_data
# Pipeline source: reads table "ad_raw_data" of database "airline_raw_data_db"
# from the AWS Glue Data Catalog into a DynamicFrame.
airline_raw_data_node1744438827213 = glueContext.create_dynamic_frame.from_catalog(
    database="airline_raw_data_db",
    table_name="ad_raw_data",
    transformation_ctx="airline_raw_data_node1744438827213",
)



In [None]:
# Script generated for node Drop Null Fields
# Removes whole fields (columns), not rows: drop_nulls drops a field when all of its
# distinct values are null-like.
# nullStringSet lists the string values treated as null: the empty string, "null" and "NaN".
# nullIntegerSet is empty, so no numeric value is treated as a null placeholder.
DropNullFields_node1744439376908 = drop_nulls(
    glueContext,
    frame=airline_raw_data_node1744438827213,
    nullStringSet={"", "null", "NaN"},
    nullIntegerSet=set(),  # `{}` would be an empty dict, not a set
    transformation_ctx="DropNullFields_node1744439376908"
)



In [None]:
# Script generated for node Drop Fields
# Removes fields that later steps do not use: estimated/runway times, codeshare
# details, partition fields and airline identity columns.
DropFields_node1744453341635 = DropFields.apply(
    frame=DropNullFields_node1744439376908,
    paths=[
        # Estimated and runway times
        "departure_estimated",
        "departure_estimated_runway",
        "departure_actual_runway",
        "arrival_estimated",
        "arrival_estimated_runway",
        "arrival_actual_runway",
        # Codeshare details
        "flight_codeshared_airline_name",
        "flight_codeshared_airline_iata",
        "flight_codeshared_airline_icao",
        "flight_codeshared_flight_number",
        "flight_codeshared_flight_iata",
        "flight_codeshared_flight_icao",
        "flight_codeshared",
        # Partition fields
        "partition_1",
        "partition_2",
        "partition_3",
        "partition_0",
        # Airline identity
        "airline_name",
        "airline_iata",
        "airline_icao",
    ],
    transformation_ctx="DropFields_node1744453341635",
)



In [None]:
# Script generated for node fill arrival_actual and departure_actual
# Query text only; nothing is executed in this cell. The query reads from a view
# named "american_airlines".
# For departure_actual and arrival_actual, each CASE expression yields the matching
# *_scheduled value when the actual value IS NULL, and the actual value otherwise.
# NOTE(review): the select list is `*` plus two expressions aliased departure_actual and
# arrival_actual. If `*` already contains columns with those names, the result carries
# two columns per name — confirm which one downstream steps read, or list the columns
# explicitly / use distinct aliases.
SqlQuery0 = '''
SELECT
*,
    CASE
        WHEN departure_actual IS NULL THEN departure_scheduled
        ELSE departure_actual
    END AS departure_actual,
    CASE
        WHEN arrival_actual IS NULL THEN arrival_scheduled
        ELSE arrival_actual
    END AS arrival_actual
FROM american_airlines;
'''



In [None]:
# Runs SqlQuery0 with the output of the Drop Fields step exposed to SQL under the
# alias "american_airlines"; the query result becomes the input of the next step.
fillarrival_actualanddeparture_actual_node1744452588102 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"american_airlines": DropFields_node1744453341635},
    transformation_ctx="fillarrival_actualanddeparture_actual_node1744452588102",
)





In [None]:
# Script generated for node Fix Data Types
# Casts selected fields to their target types. Each mapping tuple reads
# (source field, source type, target field, target type); every field keeps its name.
# Only these types change:
#   flight_date                      string -> date
#   *_scheduled, *_actual            string -> timestamp
#   departure_delay, arrival_delay   double -> int
#   flight_number                    string -> int
# NOTE(review): fields not listed here are presumably not carried into the output of
# ApplyMapping — confirm before relying on any unlisted column downstream.
FixDataTypes_node1744455298016 = ApplyMapping.apply(
    frame=fillarrival_actualanddeparture_actual_node1744452588102,
    mappings=[
        ("flight_date", "string", "flight_date", "date"),
        ("flight_status", "string", "flight_status", "string"),
        ("departure_airport", "string", "departure_airport", "string"),
        ("departure_timezone", "string", "departure_timezone", "string"),
        ("departure_iata", "string", "departure_iata", "string"),
        ("departure_icao", "string", "departure_icao", "string"),
        ("departure_terminal", "string", "departure_terminal", "string"),
        ("departure_gate", "string", "departure_gate", "string"),
        ("departure_delay", "double", "departure_delay", "int"),
        ("departure_scheduled", "string", "departure_scheduled", "timestamp"),
        ("departure_actual", "string", "departure_actual", "timestamp"),
        ("arrival_airport", "string", "arrival_airport", "string"),
        ("arrival_timezone", "string", "arrival_timezone", "string"),
        ("arrival_iata", "string", "arrival_iata", "string"),
        ("arrival_icao", "string", "arrival_icao", "string"),
        ("arrival_terminal", "string", "arrival_terminal", "string"),
        ("arrival_gate", "string", "arrival_gate", "string"),
        ("arrival_baggage", "string", "arrival_baggage", "string"),
        ("arrival_scheduled", "string", "arrival_scheduled", "timestamp"),
        ("arrival_delay", "double", "arrival_delay", "int"),
        ("arrival_actual", "string", "arrival_actual", "timestamp"),
        ("flight_number", "string", "flight_number", "int"),
        ("flight_iata", "string", "flight_iata", "string"),
        ("flight_icao", "string", "flight_icao", "string")
    ],
    transformation_ctx="FixDataTypes_node1744455298016"
)



In [None]:
# Script generated for node flights
# Splits the typed frame on the flight-level fields (number, date, status and codes).
# SplitFields returns a collection of two frames, registered under name1 and name2.
# NOTE(review): presumably name1 holds the listed paths and name2 the remaining
# fields — confirm against the SplitFields documentation.
flights_node1744464879178 = SplitFields.apply(
    frame=FixDataTypes_node1744455298016,
    paths=[
        "flight_number",
        "flight_date",
        "flight_status",
        "flight_iata",
        "flight_icao",
    ],
    name1="flights_node17444648791780",
    name2="flights_node17444648791781",
    transformation_ctx="flights_node1744464879178",
)



In [None]:
# Script generated for node departures
# Splits the typed frame on the departure fields; flight_number is included as well,
# so the departure data carries the flight number alongside airport, terminal, gate,
# delay and scheduled/actual times.
# SplitFields returns a collection of two frames, registered under name1 and name2.
departures_node1744466056302 = SplitFields.apply(
    frame=FixDataTypes_node1744455298016,
    paths=[
        "flight_number",
        "departure_airport",
        "departure_timezone",
        "departure_iata",
        "departure_icao",
        "departure_terminal",
        "departure_gate",
        "departure_delay",
        "departure_scheduled",
        "departure_actual",
    ],
    name1="departures_node17444660563020",
    name2="departures_node17444660563021",
    transformation_ctx="departures_node1744466056302",
)



In [None]:
# Script generated for node arrivals
# Splits the typed frame on the arrival fields; flight_number is included as well,
# so the arrival data carries the flight number alongside airport, terminal, gate,
# baggage, delay and scheduled/actual times.
# SplitFields returns a collection of two frames, registered under name1 and name2.
arrivals_node1744477694485 = SplitFields.apply(
    frame=FixDataTypes_node1744455298016,
    paths=[
        "flight_number",
        "arrival_airport",
        "arrival_timezone",
        "arrival_iata",
        "arrival_icao",
        "arrival_terminal",
        "arrival_gate",
        "arrival_baggage",
        "arrival_scheduled",
        "arrival_delay",
        "arrival_actual",
    ],
    name1="arrivals_node17444776944850",
    name2="arrivals_node17444776944851",
    transformation_ctx="arrivals_node1744477694485",
)



In [None]:
# Script generated for node flights_convert_to_single_dynamic_frame
# Picks a single DynamicFrame out of the flights split collection: the one stored
# under the collection's first key. Nothing is merged or flattened here.
# NOTE(review): presumably the first key is the split's name1 frame (the one holding
# the listed flight fields) — confirm; selecting by the explicit name would not
# depend on key order.
flights_convert_to_single_dynamic_frame_node1744465833856 = SelectFromCollection.apply(
    dfc=flights_node1744464879178,
    key=list(flights_node1744464879178.keys())[0],
    transformation_ctx="flights_convert_to_single_dynamic_frame_node1744465833856"
)



In [None]:
# Script generated for node departures_convert_to_single_dynamic_frame
# Picks a single DynamicFrame out of the departures split collection: the one stored
# under the collection's first key. Nothing is merged or combined here.
# NOTE(review): presumably the first key is the split's name1 frame (the one holding
# the listed departure fields) — confirm; selecting by the explicit name would not
# depend on key order.
departures_convert_to_single_dynamic_frame_node1744477596454 = SelectFromCollection.apply(
    dfc=departures_node1744466056302,
    key=list(departures_node1744466056302.keys())[0],
    transformation_ctx="departures_convert_to_single_dynamic_frame_node1744477596454"
)



In [None]:
# Script generated for node arrivals_convert_to_single_dynamic_frame
# Picks a single DynamicFrame out of the arrivals split collection: the one stored
# under the collection's first key. Nothing is merged or flattened here.
# NOTE(review): presumably the first key is the split's name1 frame (the one holding
# the listed arrival fields) — confirm; selecting by the explicit name would not
# depend on key order.
arrivals_convert_to_single_dynamic_frame_node1744477730878 = SelectFromCollection.apply(
    dfc=arrivals_node1744477694485,
    key=list(arrivals_node1744477694485.keys())[0],
    transformation_ctx="arrivals_convert_to_single_dynamic_frame_node1744477730878"
)



In [None]:
# Script generated for node Amazon S3 - Flights Data Quality Evaluation
# Evaluates the default ruleset against the flights frame with results publishing
# enabled. The return value is discarded, so this cell does not gate the S3 write
# of the flights data: the write runs regardless of the rule outcomes.
EvaluateDataQuality().process_rows(
    frame=flights_convert_to_single_dynamic_frame_node1744465833856,
    ruleset=DEFAULT_DATA_QUALITY_RULESET,
    publishing_options={
        "dataQualityEvaluationContext": "EvaluateDataQuality_node1744462146370",
        "enableDataQualityResultsPublishing": True,
    },
    additional_options={
        "dataQualityResultsPublishing.strategy": "BEST_EFFORT",
        "observations.scope": "ALL",
    },
)


In [None]:
# Writes the flights frame to S3 using the "glueparquet" format with Snappy
# compression and no partition keys.
AmazonS3_node1744465688208 = glueContext.write_dynamic_frame.from_options(
    frame=flights_convert_to_single_dynamic_frame_node1744465833856,
    connection_type="s3",
    connection_options={
        "path": "s3://airline-dataset-quaser/ad_cleaned_data/ad_flights/",
        "partitionKeys": [],
    },
    format="glueparquet",
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1744465688208",
)



In [None]:
# Script generated for node Amazon S3 - Departures Data Quality Evaluation
# Evaluates the default ruleset against the departures frame with results publishing
# enabled. The return value is discarded, so this cell does not gate the S3 write
# of the departures data: the write runs regardless of the rule outcomes.
EvaluateDataQuality().process_rows(
    frame=departures_convert_to_single_dynamic_frame_node1744477596454,
    ruleset=DEFAULT_DATA_QUALITY_RULESET,
    publishing_options={
        "dataQualityEvaluationContext": "EvaluateDataQuality_node1744477388388",
        "enableDataQualityResultsPublishing": True,
    },
    additional_options={
        "dataQualityResultsPublishing.strategy": "BEST_EFFORT",
        "observations.scope": "ALL",
    },
)


In [None]:
# Writes the departures frame to S3 using the "glueparquet" format with Snappy
# compression and no partition keys.
AmazonS3_node1744477630245 = glueContext.write_dynamic_frame.from_options(
    frame=departures_convert_to_single_dynamic_frame_node1744477596454,
    connection_type="s3",
    connection_options={
        "path": "s3://airline-dataset-quaser/ad_cleaned_data/ad_departures/",
        "partitionKeys": [],
    },
    format="glueparquet",
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1744477630245",
)



In [None]:
# Script generated for node Amazon S3 - Arrivals Data Quality Evaluation
# Evaluates the default ruleset against the arrivals frame with results publishing
# enabled. The return value is discarded, so this cell does not gate the S3 write
# of the arrivals data: the write runs regardless of the rule outcomes.
# NOTE(review): the evaluation context below is the same string the departures
# evaluation uses ("EvaluateDataQuality_node1744477388388"), so both runs publish
# under one context — confirm whether the arrivals node should have its own ID.
EvaluateDataQuality().process_rows(
    frame=arrivals_convert_to_single_dynamic_frame_node1744477730878,
    ruleset=DEFAULT_DATA_QUALITY_RULESET,
    publishing_options={
        "dataQualityEvaluationContext": "EvaluateDataQuality_node1744477388388",
        "enableDataQualityResultsPublishing": True,
    },
    additional_options={
        "dataQualityResultsPublishing.strategy": "BEST_EFFORT",
        "observations.scope": "ALL",
    },
)


In [None]:
# Writes the arrivals frame to S3 using the "glueparquet" format with Snappy
# compression and no partition keys.
AmazonS3_node1744477762725 = glueContext.write_dynamic_frame.from_options(
    frame=arrivals_convert_to_single_dynamic_frame_node1744477730878,
    connection_type="s3",
    connection_options={
        "path": "s3://airline-dataset-quaser/ad_cleaned_data/ad_arrivals/",
        "partitionKeys": [],
    },
    format="glueparquet",
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1744477762725",
)



In [None]:
# Commit the Glue job as the last step of the script.
# NOTE(review): presumably this records job completion / bookmark state with Glue;
# the S3 output is produced by the write_dynamic_frame calls earlier in the script,
# not by this call — confirm against the Glue Job API documentation.
job.commit()
