In [0]:
%pip install --force-reinstall /Volumes/unitygo/telebricks/package/pyspark_data_sources-0.1.10-py3-none-any.whl
%restart_python

Processing /Volumes/unitygo/telebricks/package/pyspark_data_sources-0.1.10-py3-none-any.whl
Collecting mkdocstrings<0.29.0,>=0.28.0 (from mkdocstrings[python]<0.29.0,>=0.28.0->pyspark-data-sources==0.1.10)
  Using cached mkdocstrings-0.28.3-py3-none-any.whl.metadata (8.4 kB)
Collecting pyarrow>=11.0.0 (from pyspark-data-sources==0.1.10)
  Using cached pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting requests<3.0.0,>=2.31.0 (from pyspark-data-sources==0.1.10)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting requests-oauthlib<2.0.0,>=1.3.1 (from pyspark-data-sources==0.1.10)
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting Jinja2>=2.11.1 (from mkdocstrings<0.29.0,>=0.28.0->mkdocstrings[python]<0.29.0,>=0.28.0->pyspark-data-sources==0.1.10)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting Markdown>=3.6 (from mkdocstrings<0.29.0,>=0.28.0->mkdocstrings[python]<0.29.0,

# OpenSky Network - Flight Tracker

Fetch real-time flight data using REST data source.

**Available Regions:**
- `EUROPE`, `NORTH_AMERICA`, `SOUTH_AMERICA`, `ASIA`, `AUSTRALIA`, `AFRICA`

**Note:** Make sure to install the `pyspark-data-sources` package before running this notebook.

In [0]:
# Setup
from pyspark.sql import SparkSession
from pyspark_datasources import rest_api_call, rest_api_call_csv, parse_array_response, parse_array_response_streaming, RestDataSource
import json

# Use getOrCreate to work with Databricks cluster
spark = SparkSession.builder.appName("OpenSky").getOrCreate()

# Define regions (from opensky.py)
regions = {
    "EUROPE": {"lamin": 35.0, "lamax": 72.0, "lomin": -25.0, "lomax": 45.0},
    "NORTH_AMERICA": {"lamin": 7.0, "lamax": 72.0, "lomin": -168.0, "lomax": -60.0},
    "SOUTH_AMERICA": {"lamin": -56.0, "lamax": 15.0, "lomin": -90.0, "lomax": -30.0},
    "ASIA": {"lamin": -10.0, "lamax": 82.0, "lomin": 45.0, "lomax": 180.0},
    "AUSTRALIA": {"lamin": -50.0, "lamax": -10.0, "lomin": 110.0, "lomax": 180.0},
    "AFRICA": {"lamin": -35.0, "lamax": 37.0, "lomin": -20.0, "lomax": 52.0},
}

# Column names for OpenSky flight arrays
column_names = [
    "icao24", "callsign", "origin_country", "time_position", "last_contact",
    "longitude", "latitude", "geo_altitude", "on_ground", "velocity",
    "true_track", "vertical_rate", "sensors", "baro_altitude",
    "squawk", "spi", "category"
]

print("✓ Ready!")

✓ Ready!


In [0]:
# Register REST data source
spark.dataSource.register(RestDataSource)

# Choose region
region = "NORTH_AMERICA"
bbox = regions[region]

# Create input DataFrame
input_df = spark.createDataFrame([{"region": region, **bbox}])
input_json = json.dumps([{"region": region, **bbox}])

print(f"Setting up streaming for region: {region}")
print(f"Input data: {input_json}\n")

# Configure streaming
# IMPORTANT: Set dataField="" to return the whole response as JSON string
url = "https://opensky-network.org/api/states/all?lamin={lamin}&lamax={lamax}&lomin={lomin}&lomax={lomax}"

print("Creating streaming DataFrame...")
stream_df = spark.readStream.format("rest") \
    .option("url", url) \
    .option("method", "GET") \
    .option("streaming", "true") \
    .option("inputData", input_json) \
    .option("queryType", "querystring") \
    .option("streamingInterval", "10") \
    .option("offsetType", "timestamp") \
    .option("offsetField", "time") \
    .option("initialOffset", "0") \
    .option("dataField", "") \
    .load()

print(f"Stream schema: {stream_df.schema}")

# Parse the output field as JSON and explode the states array
from pyspark.sql.functions import from_json, explode, col
from pyspark.sql.types import StructType, StructField, LongType, ArrayType, StringType

# Define schema for OpenSky response
response_schema = StructType([
    StructField("time", LongType(), True),
    StructField("states", ArrayType(ArrayType(StringType())), True)
])

print("\nParsing response and exploding states...")

# Parse the output field and extract data
parsed_df = stream_df.withColumn("parsed", from_json(col("output"), response_schema))

# Extract and explode states
with_fields = parsed_df.select(
    col("region"),
    col("parsed.time").alias("time"),
    explode(col("parsed.states")).alias("state_array")
)

# Map array positions to column names
flights_display = with_fields.select(
    "region",
    "time",
    col("state_array")[0].alias("icao24"),
    col("state_array")[1].alias("callsign"),
    col("state_array")[2].alias("origin_country"),
    col("state_array")[5].cast("double").alias("longitude"),
    col("state_array")[6].cast("double").alias("latitude")
)

print(f"✓ Flights DataFrame created")
print(f"Schema: {flights_display.schema}\n")

print("Starting streaming query...")
print("Polling every 10 seconds. Use 'Stop' button to stop.\n")

# Start streaming
display(flights_display, streamName="opensky_flights")

region,time,icao24,callsign,origin_country,longitude,latitude
NORTH_AMERICA,1761449694,aae316,AAL1882,United States,-77.2028,37.3656
NORTH_AMERICA,1761449694,acdfa3,ASA898,United States,-125.431,44.9576
NORTH_AMERICA,1761449694,a0e250,AAL903,United States,-77.0789,38.6237
NORTH_AMERICA,1761449694,ab0356,ASA9,United States,-122.6014,45.5894
NORTH_AMERICA,1761449694,e8024d,ARE4280,Chile,-81.7327,12.5712
NORTH_AMERICA,1761449694,a963f7,FFT1954,United States,-80.7393,28.1796
NORTH_AMERICA,1761449694,ac3688,SWA2912,United States,-117.1233,37.012
NORTH_AMERICA,1761449694,3455d5,IBE03HB,Spain,-80.3466,25.7847
NORTH_AMERICA,1761449694,a6f23d,DAL571,United States,-117.1358,32.7205
NORTH_AMERICA,1761449694,0d11f9,VIV2022,Mexico,-87.7787,20.8174


In [0]:
# Register REST data source
spark.dataSource.register(RestDataSource)

# Choose region
region = "NORTH_AMERICA"
bbox = regions[region]

# Create input DataFrame
input_df = spark.createDataFrame([{"region": region, **bbox}])
input_json = json.dumps([{"region": region, **bbox}])

print(f"Setting up streaming for region: {region}")
print(f"Bounding box: {bbox}\n")

# Configure streaming
# IMPORTANT: Set dataField="" to return the whole response as JSON string
url = "https://opensky-network.org/api/states/all?lamin={lamin}&lamax={lamax}&lomin={lomin}&lomax={lomax}"

print("Creating streaming DataFrame...")
stream_df = spark.readStream.format("rest") \
    .option("url", url) \
    .option("method", "GET") \
    .option("streaming", "true") \
    .option("inputData", input_json) \
    .option("queryType", "querystring") \
    .option("streamingInterval", "10") \
    .option("offsetType", "timestamp") \
    .option("offsetField", "time") \
    .option("initialOffset", "0") \
    .option("dataField", "") \
    .load()

print(f"✓ Stream DataFrame created")
print(f"Schema: {stream_df.schema}\n")

# Parse array response using helper function
print("Parsing array response using parse_array_response_streaming...")
flights = parse_array_response_streaming(
    stream_df, 
    array_path="states", 
    column_names=column_names, 
    timestamp_field="time"
)

print(f"✓ Flights DataFrame created")
print(f"Schema: {flights.schema}\n")

# Select columns to display
flights_display = flights.select(
    "region", "time", "icao24", "callsign", "origin_country", 
    "longitude", "latitude", "geo_altitude", "velocity"
)

print("Starting streaming query...")
print("Polling every 10 seconds. Use 'Stop' button to stop.\n")

# Start streaming
display(flights_display, streamName="opensky_flights")

region,time,icao24,callsign,origin_country,longitude,latitude,geo_altitude,velocity
NORTH_AMERICA,1761449720,aae316,AAL1882,United States,-77.2166,37.3829,480.06,88.78
NORTH_AMERICA,1761449720,acdfa3,ASA898,United States,-125.3522,44.9734,9585.96,256.97
NORTH_AMERICA,1761449720,a0e250,AAL903,United States,-77.0549,38.6334,632.46,94.32
NORTH_AMERICA,1761449720,ab0356,ASA9,United States,-122.5989,45.5885,,5.14
NORTH_AMERICA,1761449720,e8024d,ARE4280,Chile,-81.7172,12.5803,38.1,74.73
NORTH_AMERICA,1761449720,a963f7,FFT1954,United States,-80.6936,28.1516,3848.1,215.2
NORTH_AMERICA,1761449720,ac3688,SWA2912,United States,-117.1535,37.0582,9753.6,220.22
NORTH_AMERICA,1761449720,3455d5,IBE03HB,Spain,-80.3287,25.7855,76.2,68.49
NORTH_AMERICA,1761449720,a6f23d,DAL571,United States,-117.1537,32.7249,160.02,67.27
NORTH_AMERICA,1761449720,0d11f9,VIV2022,Mexico,-87.8301,20.814,6614.16,208.88
