In [11]:
%pip install Shapely
import json, pandas as pd

# JSON text is UTF-8; name the encoding explicitly so non-ASCII property
# values (e.g. ward names containing an en dash) decode the same way no
# matter what the runtime's locale default happens to be.
with open(
    "/lakehouse/default/Files/GIS Data/City Wards Data TopoJSON - 4326.json",
    encoding="utf-8",
) as f:
    topo = json.load(f)

# Convert the first named object (layer) found in the topology.
layer_name = next(iter(topo["objects"]))
features = topo["objects"][layer_name]["geometries"]

# `arcs` is the shared pool of line segments that every geometry refers to by
# index. Each arc is a list of [dx, dy] steps: the first step is measured from
# (0, 0) and each later step is a delta from the previous position.
# `transform` converts those accumulated values to real-world coordinates:
#   coordinate = accumulated_value * scale + translate
# where `scale` is the per-axis multiplier and `translate` is the per-axis
# offset (the origin point).
arcs = topo["arcs"]
transform = topo["transform"]
scale = transform["scale"]
translate = transform["translate"]

# Turn one delta-encoded arc into absolute (lon, lat) positions
def decode_arc(arc):
    """Return the absolute coordinates for a single delta-encoded arc.

    Every ``[dx, dy]`` pair in *arc* is added to a running total that starts
    at ``(0, 0)``. After each step the running total is multiplied by the
    module-level ``scale`` and shifted by the module-level ``translate``,
    yielding one ``(lon, lat)`` tuple per input pair.

    Example with scale = [0.01, 0.01] and translate = [-79.5, 43.6]:
        [[1, 0], [1, 0], [0, 1]]
        -> [(-79.49, 43.6), (-79.48, 43.6), (-79.48, 43.61)]
    """
    running_x = running_y = 0  # Accumulated position before the transform
    decoded = []

    for step_x, step_y in arc:
        running_x, running_y = running_x + step_x, running_y + step_y
        decoded.append(
            (
                running_x * scale[0] + translate[0],  # longitude
                running_y * scale[1] + translate[1],  # latitude
            )
        )

    return decoded


# Resolve arc indices to coordinates, including reversed (negative) arcs
def resolve_arcs(arc_indices):
    """Stitch the arcs referenced by *arc_indices* into one coordinate list.

    Geometries refer to arcs by their position in the module-level ``arcs``
    list. In TopoJSON a negative reference is the one's complement of the
    real index and means "walk that arc backwards": ``-1`` is arc 0 reversed,
    ``-2`` is arc 1 reversed, and so on.

    Args:
        arc_indices: Sequence of integer arc references, e.g. ``[0, -1, 2]``.

    Returns:
        A list of ``(lon, lat)`` tuples tracing the referenced arcs in order.
        The point shared by two consecutive arcs appears only once.
    """
    coords = []

    for idx in arc_indices:  # Example: arc_indices = [0, -1, 2]
        is_reversed = idx < 0
        # ~idx == -idx - 1, so -1 -> 0 and -2 -> 1. Using abs() here would
        # select the wrong arc for every reversed reference.
        points = decode_arc(arcs[~idx if is_reversed else idx])

        if is_reversed:
            # Reverse only AFTER decoding: the raw arc is delta-encoded, so
            # reversing the deltas first would accumulate different positions.
            points = points[::-1]

        # Consecutive arcs meet at a shared point; keep a single copy of it.
        if coords and points and points[0] == coords[-1]:
            points = points[1:]

        coords.extend(points)

    return coords


# Build WKT geometries (Polygon with holes, and MultiPolygon) for every feature
from shapely.geometry import MultiPolygon, Polygon


def _polygon_from_rings(rings):
    """Build a Polygon from a list of rings given as arc-index lists.

    The first ring is the exterior boundary; any further rings are holes.
    """
    if not rings:
        return Polygon()  # No rings at all -> empty polygon
    shell, *holes = (resolve_arcs(ring) for ring in rings)
    return Polygon(shell, holes)


def _geometry_from_feature(feature):
    """Return the Shapely geometry described by one TopoJSON geometry object."""
    geom_type = feature.get("type")
    feature_arcs = feature["arcs"]

    if geom_type == "Polygon":
        # arcs = [exterior_ring, hole_ring, ...]
        return _polygon_from_rings(feature_arcs)

    if geom_type == "MultiPolygon":
        # arcs = [[exterior_ring, hole_ring, ...], ...] - one entry per part
        return MultiPolygon(
            [_polygon_from_rings(rings) for rings in feature_arcs]
        )

    # Any other (or missing) type: treat the first ring, or the flat list of
    # arc references, as a single exterior boundary.
    arc_indices = (
        feature_arcs[0] if isinstance(feature_arcs[0], list) else feature_arcs
    )
    return Polygon(resolve_arcs(arc_indices))


rows = []  # One flat dictionary per feature: its properties plus geometry_wkt

for feature in features:
    # Copy the properties so the loaded topology is not modified, and tolerate
    # features that carry no "properties" member.
    # Example: {"WARD_ID": "07", "AREA_NAME": "York South–Weston"}
    props = dict(feature.get("properties") or {})

    # Example: 'POLYGON ((-79.49 43.6, -79.48 43.6, -79.48 43.61, ...))'
    props["geometry_wkt"] = _geometry_from_feature(feature).wkt

    rows.append(props)

# Persist the enriched rows: pandas -> PySpark -> Delta table -> CSV export
SILVER_TABLE = "silver_03_dim_ward_topojson_toronto"
SILVER_CSV_PATH = "/lakehouse/default/Files/silver_03_dim_ward_topojson_toronto.csv"

# Build the pandas frame first, then convert it to PySpark and rename the
# ward code column in the same step.
df = pd.DataFrame(rows)
spark_df = spark.createDataFrame(df).withColumnRenamed("AREA_SHORT_CODE", "ward_id")

# Overwrite the silver table with the flattened properties and geometry
# of every feature.
(
    spark_df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(SILVER_TABLE)
)

# Read the table back and export it as a CSV file.
silver_df = spark.read.table(SILVER_TABLE)
silver_pdf = silver_df.toPandas()
silver_pdf.to_csv(SILVER_CSV_PATH, index=False)

display(silver_pdf)  # Rendered output offers a CSV download option

StatementMeta(, a6a548c1-05d3-4936-a258-927bbcb3ae53, 72, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

