In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # POI Extraction from OSM - Bronze Layer
# MAGIC
# MAGIC Extracts raw Point of Interest (POI) data from OpenStreetMap PBF files.
# MAGIC
# MAGIC **Purpose**: Raw extraction of POI nodes with their tags.
# MAGIC
# MAGIC **Input**: OSM PBF file from Bronze volume
# MAGIC
# MAGIC **Output**: Bronze table with raw POI data (osm_id, osm_type, latitude, longitude, tags)
# MAGIC

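%md
## Prerequisites

**Required Library**: `pyosmium` (installed from PyPI as `osmium` and imported as `osmium`).

This library must be available on the cluster; the cells below install it together with `pyyaml`.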

In [None]:
!pip install pyyaml

In [None]:
!pip install osmium

In [None]:
# Standard library
import os
from datetime import datetime

# Third-party
import osmium
import yaml
from pyspark.sql import functions as F
# NOTE(review): wildcard import kept because the DataFrame cell uses StructType,
# StructField, StringType, DoubleType and MapType without importing them itself.
from pyspark.sql.types import *

# Notebook parameters (all default to an empty string and must be supplied by the caller)
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("bronze_schema", "")
dbutils.widgets.text("osm_region", "")
dbutils.widgets.text("config_path", "")

# Extract parameters; surrounding whitespace is stripped so that a blank-looking
# value is treated as missing instead of producing a malformed path/table name.
catalog = dbutils.widgets.get("catalog").strip()
bronze_schema = dbutils.widgets.get("bronze_schema").strip()
osm_region = dbutils.widgets.get("osm_region").strip()
config_path = dbutils.widgets.get("config_path").strip()

# Explicit validation (not `assert`, which is removed under `python -O`) that
# reports exactly which parameters were left empty.
_missing_params = [
    name
    for name, value in (
        ("catalog", catalog),
        ("bronze_schema", bronze_schema),
        ("osm_region", osm_region),
        ("config_path", config_path),
    )
    if not value
]
if _missing_params:
    raise ValueError(f"Missing required parameters: {', '.join(_missing_params)}")

# Load configuration
with open(config_path, 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# yaml.safe_load returns None for an empty document; fail with a clear message
# instead of a TypeError on the first subscript below.
if not isinstance(config, dict):
    raise ValueError(f"Configuration file {config_path} is empty or is not a YAML mapping")

_missing_sections = [
    section
    for section in ('poi_extraction', 'table_names', 'paths')
    if section not in config
]
if _missing_sections:
    raise KeyError(
        f"Configuration file {config_path} is missing sections: {', '.join(_missing_sections)}"
    )

poi_config = config['poi_extraction']
table_config = config['table_names']
paths_config = config['paths']

# Define paths
osm_file_path = f"/Volumes/{catalog}/{bronze_schema}/osm_data/{osm_region}-latest.osm.pbf"
output_table = f"{catalog}.{bronze_schema}.bronze_{table_config['bronze_raw_suffix']}"
temp_path = paths_config['temp_path']

In [None]:
# Fail fast if the expected PBF file cannot be listed at its volume path.
# Any listing failure is reported as a missing file; the original error is
# chained so the underlying cause remains visible in the traceback.
_osm_missing_message = (
    f"OSM file not found: {osm_file_path}. "
    "Please ensure OSM download task completed successfully."
)
try:
    dbutils.fs.ls(osm_file_path)
except Exception as listing_error:
    raise RuntimeError(_osm_missing_message) from listing_error

%md
## Define POI Handler

Extracts nodes that have POI tags (amenity, shop, leisure, etc.).
Only nodes with relevant POI tags are extracted; nodes without POI tags are skipped.

In [None]:
class POIHandler(osmium.SimpleHandler):
    """Handler to extract POI nodes from OSM data based on tag categories.

    Every node that has a valid location and at least one tag whose key is in
    the configured categories is appended to ``self.pois`` as a plain dict with
    the keys ``osm_id``, ``osm_type``, ``latitude``, ``longitude`` and ``tags``.

    Args:
        extract_all: When True, the default tag categories are used and
            ``poi_tag_categories`` is ignored.
        poi_tag_categories: Tag keys to match when ``extract_all`` is False.
            May be a list/tuple of keys or a single key given as a string.
            If empty or None, the default categories are used as a fallback.
    """

    # Tag keys treated as POI markers when no explicit selection applies.
    DEFAULT_POI_TAGS = (
        'amenity', 'shop', 'leisure', 'tourism', 'office',
        'public_transport', 'railway', 'natural', 'building'
    )

    def __init__(self, extract_all=True, poi_tag_categories=None):
        super().__init__()
        self.pois = []

        # A bare string (e.g. YAML `poi_tag_categories: amenity`) would otherwise
        # be iterated character by character; treat it as a single category.
        if isinstance(poi_tag_categories, str):
            poi_tag_categories = [poi_tag_categories] if poi_tag_categories else []

        if extract_all or not poi_tag_categories:
            # extract_all=True, or no usable explicit selection: use the defaults.
            self.poi_tag_categories = list(self.DEFAULT_POI_TAGS)
        else:
            # Copy so later changes to the caller's list do not affect matching.
            self.poi_tag_categories = list(poi_tag_categories)

    def _has_poi_tag(self, tags):
        """Return True if any tag key in ``tags`` is a configured POI category.

        ``tags`` must be iterable and yield objects exposing the key as ``.k``.
        Stops at the first match and builds no intermediate set per element.
        """
        wanted_keys = self.poi_tag_categories
        return any(tag.k in wanted_keys for tag in tags)

    def node(self, n):
        """Record node ``n`` when it has a valid location and a POI tag."""
        if not n.location.valid():
            return

        if not self._has_poi_tag(n.tags):
            return

        # Copy the data into plain Python values; a matching tag guarantees the
        # resulting tags dict is non-empty, so no extra emptiness check is needed.
        self.pois.append({
            'osm_id': str(n.id),
            'osm_type': 'node',
            'latitude': n.location.lat,
            'longitude': n.location.lon,
            'tags': dict(n.tags)
        })

%md## Extract POIs from OSM File

In [None]:
# Extraction settings from the `poi_extraction` config section, with defaults
# applied when a key is absent.
extract_all = poi_config.get('extract_all', True)
poi_tag_categories = poi_config.get('poi_tag_categories', [])

# Build the handler that accumulates matching nodes in `handler.pois`.
handler = POIHandler(
    poi_tag_categories=poi_tag_categories,
    extract_all=extract_all,
)

# The file is read straight from the Unity Catalog volume: with SINGLE_USER
# mode, volumes are FUSE-mounted and accessible as POSIX paths, so osmium can
# open /Volumes/ paths directly. Confirm the path is visible before parsing.
if not os.path.exists(osm_file_path):
    raise RuntimeError(f"OSM file not found at: {osm_file_path}")

# Run the handler over the whole file.
handler.apply_file(osm_file_path)

# An empty result is treated as an error rather than writing an empty table.
poi_count = len(handler.pois)
if not poi_count:
    raise RuntimeError("No POIs found in OSM file. Check if file contains POI data with matching tags.")

In [None]:
# Explicit schema for the extracted records: identifiers are non-nullable
# strings, coordinates are nullable doubles, and tags are a string-to-string map.
schema = (
    StructType()
    .add("osm_id", StringType(), False)
    .add("osm_type", StringType(), False)
    .add("latitude", DoubleType(), True)
    .add("longitude", DoubleType(), True)
    .add("tags", MapType(StringType(), StringType()), True)
)

# Build the Spark DataFrame directly from the handler's list of dictionaries.
poi_df = spark.createDataFrame(handler.pois, schema=schema)

# Preview a small sample only.
display(poi_df.limit(10))

%md## Write to Bronze Table

In [None]:
# Replace the Bronze table (data and schema) with the freshly extracted POIs.
(
    poi_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.autoOptimize.optimizeWrite", "true")
    .saveAsTable(output_table)
)

# Read the table back and compute row, uniqueness, coordinate and tag counts
# as a quick validation of what was written.
summary_sql = f"""
    SELECT 
        COUNT(*) as total_pois,
        COUNT(DISTINCT osm_id) as unique_pois,
        COUNT(DISTINCT osm_type) as osm_types,
        COUNT(CASE WHEN latitude IS NOT NULL AND longitude IS NOT NULL THEN 1 END) as pois_with_coords,
        COUNT(CASE WHEN tags IS NOT NULL THEN 1 END) as pois_with_tags,
        COUNT(CASE WHEN tags IS NULL THEN 1 END) as pois_without_tags
    FROM {output_table}
"""
summary = spark.sql(summary_sql)

display(summary)