In [1]:
import os

# Location of the one-day telemetry dump (newline-delimited JSON records).
# Set the ONE_DAY_DATA_PATH environment variable to point the notebook at a
# different file; when it is unset the original location is used unchanged.
one_day_path = os.environ.get(
    "ONE_DAY_DATA_PATH", "D:/Project/Spark_project/one_day_data.txt"
)


In [2]:
import duckdb

# Open a DuckDB connection; connect() is called without a database file.
con = duckdb.connect()

# The path is embedded in the SQL text as a single-quoted literal, so any
# single quote inside it must be doubled or the statement no longer parses.
one_day_path_sql = str(one_day_path).replace("'", "''")

# (Re)create the raw table straight from the JSON file.
#   union_by_name=true : options passed through to read_json_auto as written
#   sample_size=-1     : in the original call; per DuckDB docs this samples the
#                        whole input for schema detection
con.execute(f"""
CREATE OR REPLACE TABLE duck_one_day_raw AS
SELECT * 
FROM read_json_auto('{one_day_path_sql}', union_by_name=true, sample_size=-1);
""")


<duckdb.duckdb.DuckDBPyConnection at 0x2b109fbdbb0>

In [3]:
# Peek at the first few raw rows: the record timestamp and the nested payload.
raw_preview_sql = "SELECT dt, payload FROM duck_one_day_raw LIMIT 5"
con.execute(raw_preview_sql).fetchdf()



Unnamed: 0,dt,payload
0,2024-12-31 00:00:00.211996,"{'hex': 'c0884e', 'flight': 'POE135 ', 'alt_b..."
1,2024-12-31 00:00:00.220373,"{'hex': '4080bf', 'flight': 'BAW6AC ', 'alt_b..."
2,2024-12-31 00:00:00.220540,"{'hex': 'aa80c0', 'flight': 'EJA776 ', 'alt_b..."
3,2024-12-31 00:00:00.220652,"{'hex': 'a18380', 'flight': 'EDV5489 ', 'alt_b..."
4,2024-12-31 00:00:00.220729,"{'hex': 'c05403', 'flight': 'WJA598 ', 'alt_b..."


In [4]:
# Flatten duck_one_day_raw into one column per payload field.
# Each field is pulled out of `payload` with `->>`; numeric and boolean fields
# go through TRY_CAST, while hex, flight, squawk, emergency, category and
# sil_type are kept as extracted. `dt` is exposed as `timestamp` and `gs` as
# `ground_speed`.
flatten_payload_sql = """
CREATE OR REPLACE TABLE duck_one_day_flat AS
SELECT 
    dt AS timestamp,
    payload ->> 'hex' AS hex,
    payload ->> 'flight' AS flight,
    TRY_CAST(payload ->> 'alt_baro' AS DOUBLE) AS alt_baro,
    TRY_CAST(payload ->> 'alt_geom' AS DOUBLE) AS alt_geom,
    TRY_CAST(payload ->> 'gs' AS DOUBLE) AS ground_speed,
    TRY_CAST(payload ->> 'track' AS DOUBLE) AS track,
    TRY_CAST(payload ->> 'baro_rate' AS DOUBLE) AS baro_rate,
    payload ->> 'squawk' AS squawk,
    payload ->> 'emergency' AS emergency,
    payload ->> 'category' AS category,
    TRY_CAST(payload ->> 'nav_qnh' AS DOUBLE) AS nav_qnh,
    TRY_CAST(payload ->> 'nav_altitude_mcp' AS DOUBLE) AS nav_altitude_mcp,
    TRY_CAST(payload ->> 'nav_heading' AS DOUBLE) AS nav_heading,
    TRY_CAST(payload ->> 'lat' AS DOUBLE) AS lat,
    TRY_CAST(payload ->> 'lon' AS DOUBLE) AS lon,
    TRY_CAST(payload ->> 'nic' AS INTEGER) AS nic,
    TRY_CAST(payload ->> 'rc' AS INTEGER) AS rc,
    TRY_CAST(payload ->> 'seen_pos' AS DOUBLE) AS seen_pos,
    TRY_CAST(payload ->> 'version' AS INTEGER) AS version,
    TRY_CAST(payload ->> 'nic_baro' AS INTEGER) AS nic_baro,
    TRY_CAST(payload ->> 'nac_p' AS INTEGER) AS nac_p,
    TRY_CAST(payload ->> 'nac_v' AS INTEGER) AS nac_v,
    TRY_CAST(payload ->> 'sil' AS INTEGER) AS sil,
    payload ->> 'sil_type' AS sil_type,
    TRY_CAST(payload ->> 'gva' AS INTEGER) AS gva,
    TRY_CAST(payload ->> 'sda' AS INTEGER) AS sda,
    TRY_CAST(payload ->> 'mlat' AS BOOLEAN) AS mlat,
    TRY_CAST(payload ->> 'tisb' AS BOOLEAN) AS tisb,
    TRY_CAST(payload ->> 'messages' AS INTEGER) AS messages,
    TRY_CAST(payload ->> 'seen' AS DOUBLE) AS seen,
    TRY_CAST(payload ->> 'rssi' AS DOUBLE) AS rssi
FROM duck_one_day_raw;
"""

con.execute(flatten_payload_sql)



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x2b109fbdbb0>

In [None]:
# Peek at the first few rows of the flattened table as a pandas DataFrame.
flat_preview_sql = "SELECT * FROM duck_one_day_flat LIMIT 5"
con.execute(flat_preview_sql).fetchdf()


Unnamed: 0,timestamp,hex,flight,alt_baro,lat,lon,ground_speed
0,2024-12-31 00:00:00.211996,c0884e,POE135,28025.0,44.909225,-75.22195,458.3
1,2024-12-31 00:00:00.220373,4080bf,BAW6AC,36975.0,45.040421,-75.155443,538.6
2,2024-12-31 00:00:00.220540,aa80c0,EJA776,23850.0,44.989014,-74.94376,435.2
3,2024-12-31 00:00:00.220652,a18380,EDV5489,19700.0,45.061077,-74.762097,422.2
4,2024-12-31 00:00:00.220729,c05403,WJA598,17000.0,45.164474,-74.54858,424.9


In [None]:
# Load full telemetry and extract all payload fields in Spark

# 🚀 Initialize Spark Session


from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) a local Spark session that uses every available core.
# With master("local[*]") all work runs inside the driver JVM, so the heap is
# governed by spark.driver.memory; spark.executor.memory is kept for parity
# with the original configuration. Both settings only take effect if no JVM /
# SparkContext is already running in this kernel, because getOrCreate() reuses
# an existing session as-is.
spark = (
    SparkSession.builder
    .appName("SparkDuckDB_Comparison")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

print("✅ Spark Session created")

# Read the one-day telemetry file with Spark's JSON reader.
df_spark_small = spark.read.json(one_day_path)

# Fields taken from the nested `payload` struct, in output column order.
payload_fields = [
    "hex", "alt_baro", "alt_geom", "gs", "track", "baro_rate",
    "squawk", "emergency", "category",
    "nav_qnh", "nav_altitude_mcp", "nav_heading",
    "lat", "lon", "nic", "rc", "seen_pos", "version",
    "nic_baro", "nac_p", "nac_v", "sil", "sil_type", "gva", "sda",
    "mlat", "tisb", "messages", "seen", "rssi", "flight",
]

# Only fields whose output name differs from the payload key are listed here.
payload_renames = {"gs": "ground_speed"}

# `dt` comes first as `timestamp`, followed by one column per payload field.
flat_columns = [col("dt").alias("timestamp")]
flat_columns += [
    col(f"payload.{field}").alias(payload_renames.get(field, field))
    for field in payload_fields
]

df_spark_small_flat = df_spark_small.select(*flat_columns)

df_spark_small_flat.show(5)
