In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests
import json
import time

# Schema of a single OpenSky state record as stored in the bronze table.
# Every (name, type) pair below becomes a nullable StructField; the names
# match the row keys assembled in bronze_opensky_states.
opensky_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        # Aircraft identity
        ("icao24", StringType()),
        ("callsign", StringType()),
        ("origin_country", StringType()),
        # Report times as delivered by the API (integer values)
        ("time_position", LongType()),
        ("last_contact", LongType()),
        # Position and movement
        ("longitude", DoubleType()),
        ("latitude", DoubleType()),
        ("baro_altitude", DoubleType()),
        ("on_ground", BooleanType()),
        ("velocity", DoubleType()),
        ("true_track", DoubleType()),
        ("vertical_rate", DoubleType()),
        ("sensors", ArrayType(LongType())),
        ("geo_altitude", DoubleType()),
        # Transponder / source metadata
        ("squawk", StringType()),
        ("spi", BooleanType()),
        ("position_source", IntegerType()),
        # Added by the pipeline when the API response is fetched
        ("ingestion_time", TimestampType()),
    )
])

@dlt.table(
    name="bronze_opensky_states",
    comment="Raw flight states data from OpenSky Network API",
    table_properties={
        "quality": "bronze",
        "pipelines.autoOptimize.managed": "true"
    }
)
def bronze_opensky_states():
    """Fetch the current OpenSky state vectors and return them as a DataFrame.

    One HTTP GET is issued against the OpenSky ``/states/all`` endpoint.
    Each positional state vector in the response is converted to a row
    matching ``opensky_schema`` and stamped with the local fetch time.

    Returns:
        A DataFrame with schema ``opensky_schema``. It is empty when the
        request fails, returns a non-200 status, or contains no states, so
        the pipeline keeps running (best-effort ingestion).
    """
    # Local import keeps this block self-contained; the file-level imports
    # only provide ``time``, which yields strings rather than datetimes.
    from datetime import datetime

    def to_float(value):
        """Coerce a JSON number to float, preserving None.

        JSON values such as ``0`` are parsed as int, which DoubleType
        rejects during schema verification.
        """
        return None if value is None else float(value)

    def state_to_row(state, ingestion_time):
        """Map one positional state vector to a dict keyed by column name."""
        return {
            "icao24": state[0],
            "callsign": state[1].strip() if state[1] else None,
            "origin_country": state[2],
            "time_position": state[3],
            "last_contact": state[4],
            "longitude": to_float(state[5]),
            "latitude": to_float(state[6]),
            "baro_altitude": to_float(state[7]),
            "on_ground": state[8],
            "velocity": to_float(state[9]),
            "true_track": to_float(state[10]),
            "vertical_rate": to_float(state[11]),
            "sensors": state[12],
            "geo_altitude": to_float(state[13]),
            "squawk": state[14],
            "spi": state[15],
            "position_source": state[16],
            "ingestion_time": ingestion_time,
        }

    def fetch_data():
        """Return a DataFrame of current states, or None if unavailable."""
        url = "https://opensky-network.org/api/states/all"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                # Surface throttling/outages instead of failing silently.
                print(f"Error fetching data: HTTP {response.status_code}")
                return None

            data = response.json()
            # "states" may be missing or null; treat both as "no rows".
            states = (data or {}).get("states") or []

            # TimestampType requires a datetime object, not a formatted
            # string. datetime.now() is naive local time, the same clock
            # the previous strftime-based value used.
            ingestion_time = datetime.now()
            rows = [state_to_row(state, ingestion_time) for state in states]
            if rows:
                return spark.createDataFrame(rows, opensky_schema)
        except Exception as e:
            # Deliberate best-effort: log and fall back to an empty table.
            print(f"Error fetching data: {str(e)}")
        return None

    api_df = fetch_data()
    if api_df is not None:
        return api_df
    # Nothing fetched: return an empty DataFrame with the expected schema.
    return spark.createDataFrame([], opensky_schema)

@dlt.table(
    name="silver_flights_cleaned",
    comment="Cleaned and validated flight data",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.managed": "true"
    }
)
# Both conditions must hold: with OR, an empty-string callsign satisfied
# "IS NOT NULL" and slipped through the expectation.
@dlt.expect_or_drop("valid_callsign", "callsign IS NOT NULL AND callsign != ''")
@dlt.expect_or_drop("valid_position", "longitude IS NOT NULL AND latitude IS NOT NULL")
def silver_flights_cleaned():
    """Return airborne flights from the bronze table above 100 geo_altitude.

    Rows are read from ``bronze_opensky_states`` and kept only when
    ``on_ground`` is false and ``geo_altitude`` exceeds 100. Rows where
    either column is NULL are excluded, since the predicate evaluates to
    NULL for them. The expectations on this table additionally drop rows
    without a non-empty callsign or without both coordinates.
    """
    airborne = ~col("on_ground")               # Only airborne flights
    above_minimum = col("geo_altitude") > 100  # Minimum altitude filter
    return dlt.read("bronze_opensky_states").where(airborne & above_minimum)

@dlt.table(
    name="gold_flight_aggregations",
    comment="Aggregated flight statistics by country",
    table_properties={
        "quality": "gold",
        "pipelines.autoOptimize.managed": "true"
    }
)
def gold_flight_aggregations():
    """Summarise the cleaned flights per ``origin_country``.

    Produces one row per country with the number of records, the mean
    ``geo_altitude``, the mean ``velocity`` and the number of distinct
    ``icao24`` values, ordered by record count from highest to lowest.
    """
    cleaned_flights = dlt.read("silver_flights_cleaned")

    # Per-country metrics, named as they appear in the output table.
    metrics = [
        count("*").alias("flight_count"),
        avg("geo_altitude").alias("avg_altitude"),
        avg("velocity").alias("avg_speed"),
        countDistinct("icao24").alias("unique_aircraft"),
    ]

    per_country = cleaned_flights.groupBy("origin_country").agg(*metrics)
    return per_country.orderBy(col("flight_count").desc())

@dlt.table(
    name="gold_active_flights",
    comment="Current active flights with positions",
    table_properties={
        "quality": "gold",
        "pipelines.autoOptimize.managed": "true"
    }
)
def gold_active_flights():
    """Return position columns for flights ingested within the last 24 hours.

    Reads ``silver_flights_cleaned``, keeps the identity, position and
    movement columns plus ``ingestion_time``, adds a ``current_timestamp``
    column holding the evaluation time, and keeps only rows whose
    ``ingestion_time`` is later than one day before now.
    """
    # Subtract an interval from the timestamp itself. date_sub() would first
    # truncate to a DATE, putting the cutoff at midnight of the previous day
    # (a 24-48 hour window) rather than a rolling 24 hours.
    cutoff = current_timestamp() - expr("INTERVAL 1 DAY")

    return (
        dlt.read("silver_flights_cleaned")
        .select(
            "icao24",
            "callsign",
            "origin_country",
            "longitude",
            "latitude",
            "geo_altitude",
            "velocity",
            "true_track",
            "ingestion_time"
        )
        .withColumn("current_timestamp", current_timestamp())
        .where(col("ingestion_time") > cutoff)
    )