In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # POI Cleaning - Silver Layer
# MAGIC
# MAGIC Cleans and structures raw POI data from Bronze layer.
# MAGIC
# MAGIC **Purpose**: Transform raw OSM POI data into cleaned structured format.
# MAGIC
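# MAGIC **Input**: Bronze POI table (e.g. `{catalog}.bronze.osm_pois_raw`; taken from the `input_table` parameter when set, otherwise built from the `table_names` config)
# MAGIC
# MAGIC **Output**: Silver table with cleaned POI data (poi_id, name, poi_category, poi_subcategory, latitude, longitude, address, osm_id, osm_type, ingestion_timestamp)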
# MAGIC

In [None]:
import yaml
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime

# Notebook parameters. Every widget defaults to "" so that a parameter the
# caller forgot to pass is caught by the validation below.
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("bronze_schema", "")
dbutils.widgets.text("silver_schema", "")
dbutils.widgets.text("config_path", "")
dbutils.widgets.text("input_table", "")
dbutils.widgets.text("output_table", "")

# Extract parameters
catalog = dbutils.widgets.get("catalog")
bronze_schema = dbutils.widgets.get("bronze_schema")
silver_schema = dbutils.widgets.get("silver_schema")
config_path = dbutils.widgets.get("config_path")
input_table_widget = dbutils.widgets.get("input_table")
output_table_widget = dbutils.widgets.get("output_table")

# Validate with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, and the error now names what is missing.
required_params = {
    "catalog": catalog,
    "bronze_schema": bronze_schema,
    "silver_schema": silver_schema,
    "config_path": config_path,
}
missing_params = [
    param_name
    for param_name, param_value in required_params.items()
    if not (param_value and param_value.strip())
]
if missing_params:
    raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")

# Load configuration (YAML is read as UTF-8 regardless of the driver locale)
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# An empty `poi_cleaning:` section parses to None; fall back to an empty dict
# so the `.get(..., default)` lookups further down use their defaults.
poi_cleaning_config = config['poi_cleaning'] or {}
table_config = config['table_names']

# Use explicit tables if provided, otherwise construct from config.
# The config suffix is only looked up when the corresponding widget is blank.
input_table = (input_table_widget or "").strip() or (
    f"{catalog}.{bronze_schema}.bronze_{table_config['bronze_raw_suffix']}"
)
output_table = (output_table_widget or "").strip() or (
    f"{catalog}.{silver_schema}.silver_{table_config['silver_cleaned_suffix']}"
)

In [None]:
# Load the raw POI rows from the resolved Bronze table
pois_raw = spark.read.table(input_table)

# The row count feeds both the diagnostic display and the emptiness check
poi_count = pois_raw.count()

# Diagnostic: show which table was read and how many rows it holds
info_rows = [
    ("Input table", input_table),
    ("Row count", str(poi_count)),
]
table_info = spark.createDataFrame(info_rows, ["info", "value"])
display(table_info)

# Stop the run here instead of producing an empty Silver table
if not poi_count:
    raise RuntimeError(f"No POIs found in input table: {input_table}")

%md
## Extract and Clean Columns


In [None]:
# Category and subcategory are extracted from tags with a Python UDF (more
# reliable for Python dict handling). This list holds the tag keys that UDF
# checks, in order; it comes from the `category_priority` config entry and
# falls back to the default below.
default_category_priority = [
    'shop',
    'amenity',
    'leisure',
    'tourism',
    'office',
    'public_transport',
    'railway',
]
category_priority = poi_cleaning_config.get('category_priority', default_category_priority)

def get_category_and_subcategory(tags):
    """Pick the POI category and subcategory out of a tags dict.

    The keys in the module-level ``category_priority`` list are tried in
    order; the first one whose value is non-empty after stripping whitespace
    supplies the result.

    Args:
        tags: dict of tag key -> tag value. Anything that is not a dict
            (including None) yields ``(None, None)``.

    Returns:
        ``(category, subcategory)`` where category is the matching tag key and
        subcategory is its stripped value, e.g.
        ``{'amenity': 'pharmacy'} -> ('amenity', 'pharmacy')``;
        ``(None, None)`` when no priority key has a usable value.
    """
    if not isinstance(tags, dict):
        return (None, None)

    for key in category_priority:
        raw_value = tags.get(key)
        if not raw_value:
            # Key absent or value empty: try the next priority key
            continue
        cleaned = str(raw_value).strip()
        if cleaned:
            return (key, cleaned)

    return (None, None)

# Struct returned by the category UDF: two nullable string fields
category_schema = (
    StructType()
    .add("category", StringType(), True)
    .add("subcategory", StringType(), True)
)
get_category_udf = F.udf(get_category_and_subcategory, returnType=category_schema)

# Tag keys that make up an address, in output order; taken from the
# `address_fields` config entry with the list below as the fallback
default_address_fields = [
    'addr:housenumber',
    'addr:street',
    'addr:city',
    'addr:state',
    'addr:postcode',
]
address_fields = poi_cleaning_config.get('address_fields', default_address_fields)

def build_address(tags):
    """Build a single-line address string from a tags dict.

    The keys in the module-level ``address_fields`` list are read in order and
    the values that are present are joined with ``", "``. Each value is
    converted with ``str()`` and stripped of surrounding whitespace; values
    that are empty or whitespace-only are skipped so they cannot leave blank
    components or dangling separators in the result.

    Args:
        tags: dict of tag key -> tag value. Anything that is not a dict
            (including None) yields None.

    Returns:
        The joined address string, or None when no address field has a
        usable value.
    """
    if not isinstance(tags, dict):
        return None

    parts = []
    for field in address_fields:
        value = tags.get(field)
        if not value:
            continue
        # Strip so that " Main St " and whitespace-only values are handled
        # the same way the category extraction handles its tag values.
        text = str(value).strip()
        if text:
            parts.append(text)

    return ', '.join(parts) if parts else None

# Spark UDF wrapper around build_address, returning a string column
build_address_udf = F.udf(build_address, returnType=StringType())

# Derive the cleaned columns from the raw rows
poi_id_prefix = poi_cleaning_config.get('poi_id_prefix', 'poi_')
tags_col = F.col("tags")
pois_with_category = (
    pois_raw
    # poi_id is the configured prefix followed by the OSM id
    .withColumn("poi_id", F.concat(F.lit(poi_id_prefix), F.col("osm_id")))
    .withColumn("name", tags_col["name"])
    # The UDF returns a struct; its two fields are unpacked right after
    .withColumn("category_struct", get_category_udf(tags_col))
    .withColumn("poi_category", F.col("category_struct.category"))
    .withColumn("poi_subcategory", F.col("category_struct.subcategory"))
    .withColumn("latitude", F.col("latitude").cast("double"))
    .withColumn("longitude", F.col("longitude").cast("double"))
    .withColumn("address", build_address_udf(tags_col))
    # Timestamp literal taken on the driver when this cell runs
    .withColumn("ingestion_timestamp", F.lit(datetime.now()))
)

# Columns kept in the Silver output, in order
final_columns = [
    "poi_id",
    "name",
    "poi_category",
    "poi_subcategory",
    "latitude",
    "longitude",
    "address",
    "osm_id",
    "osm_type",
    "ingestion_timestamp",
]

# A row is kept only when it has both coordinates and a resolved category
has_coordinates_and_category = (
    F.col("latitude").isNotNull()
    & F.col("longitude").isNotNull()
    & F.col("poi_category").isNotNull()
)

pois_cleaned = pois_with_category.select(*final_columns).filter(has_coordinates_and_category)

%md
## Validate Data Quality


In [None]:
# Validate coordinate bounds using configured bounds.
# The configured values are merged over the defaults, so a config that sets
# only some of the four keys (or leaves the section or a key empty) still
# yields a complete set of bounds instead of failing with a KeyError.
default_coord_bounds = {
    'latitude_min': -90,
    'latitude_max': 90,
    'longitude_min': -180,
    'longitude_max': 180
}
configured_coord_bounds = poi_cleaning_config.get('coordinate_bounds') or {}
coord_bounds = {
    **default_coord_bounds,
    **{key: value for key, value in configured_coord_bounds.items() if value is not None},
}

# `between` is inclusive on both ends, matching the >= / <= comparisons
pois_cleaned = pois_cleaned.filter(
    F.col("latitude").between(coord_bounds['latitude_min'], coord_bounds['latitude_max'])
    & F.col("longitude").between(coord_bounds['longitude_min'], coord_bounds['longitude_max'])
)

# Preview a few of the rows that passed validation
display(pois_cleaned.limit(10))

%md
## Write to Silver Table


In [None]:
# Configure the Delta writer: overwrite mode with the options passed through
# to the writer as given
silver_writer = (
    pois_cleaned.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.autoOptimize.optimizeWrite", "true")
    .option("delta.autoOptimize.autoCompact", "true")
)
silver_writer.saveAsTable(output_table)

# Summary statistics, read back from the table that was just written
summary_query = f"""
    SELECT 
        COUNT(*) as total_pois,
        COUNT(DISTINCT poi_category) as poi_categories,
        COUNT(DISTINCT poi_subcategory) as poi_subcategories,
        COUNT(DISTINCT osm_type) as osm_types,
        COUNT(CASE WHEN name IS NOT NULL THEN 1 END) as pois_with_name,
        COUNT(CASE WHEN address IS NOT NULL THEN 1 END) as pois_with_address
    FROM {output_table}
"""
summary = spark.sql(summary_query)

display(summary)