In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Urbanicity-Based Routing with Valhalla
# MAGIC
# MAGIC Generates drive-time isochrones based on urbanicity classification.
# MAGIC
# MAGIC **Drive Times:** Urban=10min, Suburban=20min (default), Rural=30min

%md
## Parameters

In [None]:
%pip install -q pyyaml h3

In [None]:
# Notebook parameters: register the widgets, read their current values, and
# derive the filesystem locations used by the Valhalla cells further down.

# (name, default[, label]) for every free-text widget, in display order.
_TEXT_WIDGETS = [
    ("catalog", "geo_site_selection"),
    ("bronze_schema", "bronze"),
    ("silver_schema", "silver"),
    ("gold_schema", "gold"),
    ("config_path", "/Workspace/resources/configs/isochrone_config.yml"),
    ("input_table", "", "Input Table (optional)"),
    ("output_table_override", "", "Output Table (optional)"),
]
for _widget_args in _TEXT_WIDGETS:
    dbutils.widgets.text(*_widget_args)
dbutils.widgets.dropdown("skip_setup", "yes", ["yes", "no"], "Skip Valhalla Setup")

_read_widget = dbutils.widgets.get

catalog = _read_widget("catalog")
bronze_schema = _read_widget("bronze_schema")
silver_schema = _read_widget("silver_schema")
gold_schema = _read_widget("gold_schema")
config_path = _read_widget("config_path")
input_table_override = _read_widget("input_table")
output_table_override = _read_widget("output_table_override")
# The dropdown yields "yes"/"no"; downstream Python cells expect a boolean.
skip_setup = _read_widget("skip_setup") == "yes"

# Local build directory and the Valhalla config file inside it.
BUILD_PATH = "/local_disk0/valhalla_build"
VALHALLA_CONFIG = BUILD_PATH + "/valhalla.json"

# Unity Catalog volume paths derived from the widget values.
OSM_VOLUME = "/Volumes/" + catalog + "/" + bronze_schema + "/osm_data"
PERSIST_VOLUME = "/Volumes/" + catalog + "/" + silver_schema + "/valhalla_data"

In [None]:
# Echo the value read from the "input_table" widget so the run output shows
# whether an input-table override is in effect (empty string = no override).
input_table_override

In [None]:
import yaml

# Load the YAML run configuration and resolve the input/output table names.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# An empty YAML file parses to None; fail with a clear message instead of an
# opaque "'NoneType' object is not subscriptable" on the lookups below.
if not isinstance(config, dict):
    raise ValueError(f"Config file {config_path} is empty or is not a YAML mapping")

urbanicity_config = config['urbanicity_routing']
perf_config = config['performance']
output_config = config['output']

h3_features_table = urbanicity_config['h3_features_table']
drive_times = urbanicity_config['drive_times']
repartition_factor = perf_config.get('repartition_factor', 8)

# Use parameter override if provided, otherwise use config.
# `or {}` tolerates sections that are present in the YAML but set to null.
if input_table_override and input_table_override.strip():
    locations_table = input_table_override.strip()
else:
    input_tables = (config.get('isochrone') or {}).get('input_tables') or {}
    locations_table = input_tables.get(
        'rmc',
        f"{catalog}.{bronze_schema}.rmc_retail_locations_grocery",
    )

if output_table_override and output_table_override.strip():
    output_table = output_table_override.strip()
else:
    output_table = urbanicity_config['output_table']

print(f"Input: {locations_table}")
# The write cell saves to "<catalog>.<silver_schema>.silver_<output_table>";
# report that exact name so the log matches the table that gets written.
print(f"Output: {catalog}.{silver_schema}.silver_{output_table}")

%md
## Setup Valhalla (One-Time)

In [None]:
%%sh
# Install the OS packages needed to compile Valhalla.
# The guard reads the *shell* variable `skip_setup`.
# NOTE(review): the Python variable of the same name is not visible here unless
# it is exported to the environment - confirm how it is meant to be set.
case "$skip_setup" in
    yes) exit 0 ;;
esac

# Output (including errors) is discarded to keep the cell quiet.
sudo apt-get update -y >/dev/null 2>&1
sudo apt-get install -y cmake build-essential git curl wget >/dev/null 2>&1

In [None]:
%%sh
# Clone, compile and install Valhalla (with Python bindings) from source.

# Skip when the shell variable `skip_setup` is set to anything other than "no".
case "${skip_setup}" in
    "" | no) ;;
    *) exit 0 ;;
esac

# Nothing to do when both the CLI tools and the Python module are present.
if command -v valhalla_build_config >/dev/null 2>&1 \
    && python3 -c "import valhalla" 2>/dev/null; then
    exit 0
fi

# From here on any failing step aborts the cell.
set -e

# Run a command with stdout and stderr discarded.
quiet() {
    "$@" > /dev/null 2>&1
}

BUILD_DIR="/local_disk0/tmp/valhalla_build"

# Always start from a clean checkout.
rm -rf "$BUILD_DIR"
mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"

git clone --quiet --recurse-submodules https://github.com/valhalla/valhalla.git
cd valhalla

quiet sudo ./scripts/install-linux-deps.sh

quiet cmake -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DENABLE_PYTHON_BINDINGS=ON \
  -DCMAKE_CXX_FLAGS="-Wno-error=format-truncation"

quiet make -C build -j$(nproc)
quiet sudo make -C build install
sudo ldconfig

In [None]:
%%sh
# Build Valhalla routing tiles (and the tile extract) from an OSM .pbf file.

# Skip when the shell variable `skip_setup` is set to anything other than "no".
if [ -n "${skip_setup}" ] && [ "${skip_setup}" != "no" ]; then exit 0; fi

BUILD_DIR="/local_disk0/valhalla_build"
# NOTE(review): this path is hard-coded and differs from the OSM_VOLUME the
# Python parameter cell derives from the catalog/bronze_schema widgets - confirm
# which location is authoritative.
OSM_VOLUME="/Volumes/retail_consumer_goods/geospatial_site_selection/osm_data"

# An existing extract is treated as "tiles already built".
if [ -f "$BUILD_DIR/valhalla_tiles.tar" ]; then exit 0; fi

mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"

# Use the first .osm.pbf found in the volume.
PBF_FILE=$(ls "$OSM_VOLUME"/*.osm.pbf 2>/dev/null | head -1)
if [ -z "$PBF_FILE" ]; then
    echo "No .osm.pbf file found in $OSM_VOLUME"
    exit 1
fi

mkdir -p valhalla_tiles

valhalla_build_config \
  --mjolnir-tile-dir "${BUILD_DIR}/valhalla_tiles" \
  --mjolnir-tile-extract "${BUILD_DIR}/valhalla_tiles.tar" \
  --mjolnir-timezone "${BUILD_DIR}/valhalla_tiles/timezones.sqlite" \
  --mjolnir-admin "${BUILD_DIR}/valhalla_tiles/admins.sqlite" > valhalla.json

# Timezone and admin databases are built as before; a failure here does not
# stop the tile build.
valhalla_build_timezones > valhalla_tiles/timezones.sqlite
valhalla_build_admins -c valhalla.json "$PBF_FILE"

# The tile build and the extract must succeed. Otherwise a missing or partial
# valhalla_tiles.tar would satisfy the "already built" check above on the next
# run and the broken state would never be repaired.
if ! valhalla_build_tiles -c valhalla.json "$PBF_FILE"; then
    echo "valhalla_build_tiles failed" >&2
    exit 1
fi

if ! valhalla_build_extract -c valhalla.json -v; then
    echo "valhalla_build_extract failed" >&2
    rm -f "${BUILD_DIR}/valhalla_tiles.tar"
    exit 1
fi

In [None]:
import os
import shutil

# After a fresh build (skip_setup == "no"), copy the generated Valhalla config
# and tile extract from local disk into the persistent volume.
if not skip_setup:
    # The copies below write into PERSIST_VOLUME, so create exactly that volume.
    # Previously a differently named volume ("silver_valhalla_data") was created
    # while the files were copied to PERSIST_VOLUME, which could leave the copy
    # targeting a volume that does not exist.
    persist_volume_name = os.path.basename(PERSIST_VOLUME.rstrip("/"))
    spark.sql(
        f"CREATE VOLUME IF NOT EXISTS {catalog}.{silver_schema}.{persist_volume_name}"
    )

    config_source = f"{BUILD_PATH}/valhalla.json"
    config_dest = f"{PERSIST_VOLUME}/valhalla.json"
    tiles_source = f"{BUILD_PATH}/valhalla_tiles.tar"
    tiles_dest = f"{PERSIST_VOLUME}/valhalla_tiles.tar"

    for artifact_source, artifact_dest in (
        (config_source, config_dest),
        (tiles_source, tiles_dest),
    ):
        if os.path.exists(artifact_source):
            # Map a "dbfs:" URI to its local "/dbfs" form; this is a no-op for
            # "/Volumes/..." paths.
            local_dest = artifact_dest.replace("dbfs:", "/dbfs")
            shutil.copy(artifact_source, local_dest)
            print(f"Persisted {artifact_source} -> {local_dest}")
        else:
            # Stay best-effort, but make the skipped copy visible.
            print(f"Not persisted (missing build artifact): {artifact_source}")

In [None]:
import os
import json
import valhalla

# Create the Valhalla actor used for all isochrone requests in this notebook.
config_missing = not os.path.exists(VALHALLA_CONFIG)
if config_missing:
    raise FileNotFoundError(f"Config not found: {VALHALLA_CONFIG}. Set skip_setup=no and rerun.")

actor = valhalla.Actor(VALHALLA_CONFIG)

# status() may hand back either a JSON string or an already-parsed object;
# normalise to the parsed form before reading the version.
status_json = actor.status()
if isinstance(status_json, str):
    status = json.loads(status_json)
else:
    status = status_json

print(f"Valhalla {status.get('version', 'unknown')} ready")

%md
## Load Data

In [None]:
from pyspark.sql.functions import col, expr, coalesce, monotonically_increasing_id, broadcast, lit

# Read the locations table and map its columns onto the canonical names used by
# the rest of the notebook: store_number, latitude, longitude, store_type,
# city, state.
df = spark.read.table(locations_table)
columns = df.columns

# Flexible column mapping: the first table column (in table order) whose name
# is one of the accepted aliases is used; None when no alias is present.
id_col = next((c for c in columns if c in ['store_number', 'point_id', 'id', 'location_id']), None)
lat_col = next((c for c in columns if c in ['latitude', 'lat', 'y']), None)
lon_col = next((c for c in columns if c in ['longitude', 'lon', 'lng', 'x']), None)
type_col = next((c for c in columns if c in ['store_type', 'type', 'category']), None)
city_col = next((c for c in columns if c in ['city', 'municipality']), None)
state_col = next((c for c in columns if c in ['state', 'region']), None)

if not lat_col or not lon_col:
    raise ValueError(f"Could not find latitude/longitude columns in {locations_table}. Available: {columns}")

# Fallback identifier for rows without an id (or tables without an id column).
generated_id = monotonically_increasing_id().cast("string")
# Typed NULL so optional columns are STRING rather than untyped NullType.
null_string = lit(None).cast("string")

# Build select with available columns.
# Types are normalised here because the isochrone step rebuilds a DataFrame
# with a strict schema (store_number STRING, latitude/longitude DOUBLE): an
# integer id or a string/decimal/integer coordinate column would otherwise
# fail schema verification there.
select_cols = [
    (coalesce(col(id_col).cast("string"), generated_id) if id_col else generated_id).alias("store_number"),
    col(lat_col).cast("double").alias("latitude"),
    col(lon_col).cast("double").alias("longitude"),
    (col(type_col) if type_col else null_string).alias("store_type"),
    (col(city_col) if city_col else null_string).alias("city"),
    (col(state_col) if state_col else null_string).alias("state"),
]

# Rows without usable coordinates (NULL, or not castable to DOUBLE) are dropped.
locations = df.select(*select_cols).filter(col("latitude").isNotNull() & col("longitude").isNotNull())

location_count = locations.count()
print(f"{location_count} locations")

In [None]:
# Urbanicity attributes keyed by H3 cell id.
h3_features = spark.read.table(f'{catalog}.{gold_schema}.silver_h3_features').select(
    "h3_cell_id",
    "urbanicity_category",
    "urbanicity_score",
)

# Tag every location with the H3 cell (resolution 8) containing its coordinates.
h3_cell_expr = expr("h3_longlatash3string(longitude, latitude, 8)")
locations_with_h3 = locations.withColumn("h3_cell_id", h3_cell_expr)

# Locations whose cell has no feature row fall back to these values.
urbanicity_defaults = {"urbanicity_category": "suburban", "urbanicity_score": 0.0}

locations_with_urbanicity = (
    locations_with_h3
    .join(broadcast(h3_features), on="h3_cell_id", how="left")
    .na.fill(urbanicity_defaults)
)

In [None]:
# SQL CASE mapping each urbanicity category to its configured drive time;
# any other category gets the suburban value.
drive_time_case_sql = f"""
        CASE
            WHEN urbanicity_category = 'urban' THEN {drive_times['urban']}
            WHEN urbanicity_category = 'suburban' THEN {drive_times['suburban']}
            WHEN urbanicity_category = 'rural' THEN {drive_times['rural']}
            ELSE {drive_times['suburban']}
        END
    """

locations_with_drive_time = locations_with_urbanicity.withColumn(
    "drive_time_minutes", expr(drive_time_case_sql)
)

# Sanity check: number of locations per category / drive-time pair.
drive_time_summary = (
    locations_with_drive_time
    .groupBy("urbanicity_category", "drive_time_minutes")
    .count()
    .orderBy("urbanicity_category")
)
display(drive_time_summary)

In [None]:
# Preview five rows of the locations enriched with urbanicity and drive time.
display(locations_with_drive_time.limit(5))

%md
## Generate Isochrones

In [None]:
import json
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType


def _ring_to_wkt(ring):
    """Format one GeoJSON linear ring as a WKT ring: '(x y, x y, ...)'.

    Only the first two ordinates of each position are used, so positions that
    carry a third (elevation) value no longer break the conversion.
    """
    return "(" + ", ".join(f"{position[0]} {position[1]}" for position in ring) + ")"


def geojson_to_wkt(geojson_geom):
    """Convert a GeoJSON Polygon or MultiPolygon geometry to WKT.

    Args:
        geojson_geom: dict with a 'type' key and a 'coordinates' key.

    Returns:
        The WKT string. A geometry with no coordinates yields
        'POLYGON EMPTY' / 'MULTIPOLYGON EMPTY'.

    Raises:
        ValueError: if the geometry type is neither Polygon nor MultiPolygon.
    """
    geom_type = geojson_geom.get('type')
    coords = geojson_geom.get('coordinates', [])

    if geom_type == 'Polygon':
        if not coords:
            return "POLYGON EMPTY"
        rings = [_ring_to_wkt(ring) for ring in coords]
        return f"POLYGON ({', '.join(rings)})"

    if geom_type == 'MultiPolygon':
        if not coords:
            return "MULTIPOLYGON EMPTY"
        polygons = [
            "(" + ", ".join(_ring_to_wkt(ring) for ring in polygon) + ")"
            for polygon in coords
        ]
        return f"MULTIPOLYGON ({', '.join(polygons)})"

    raise ValueError(f"Unsupported geometry type: {geom_type}")


def generate_isochrone(row):
    """Request a drive-time isochrone for one location row.

    Uses the notebook-level Valhalla `actor`. `row` must expose store_number,
    latitude, longitude, store_type, city, state, urbanicity_category,
    urbanicity_score and drive_time_minutes as attributes.

    Returns:
        A tuple (store_number, latitude, longitude, store_type, city, state,
        urbanicity_category, urbanicity_score, drive_time_minutes, wkt), or
        None when no isochrone geometry could be produced. Failures stay
        non-fatal but are printed so that dropped locations are visible.
    """
    try:
        query = {
            "locations": [{"lat": float(row.latitude), "lon": float(row.longitude)}],
            "costing": "auto",
            "contours": [{"time": float(row.drive_time_minutes)}],
            "polygons": True
        }

        result_json = actor.isochrone(json.dumps(query))
        # The actor may return a JSON string or an already-parsed object.
        result = json.loads(result_json) if isinstance(result_json, str) else result_json

        if result and 'features' in result and len(result['features']) > 0:
            geometry = result['features'][0].get('geometry')
            if geometry:
                return (
                    row.store_number,
                    # Cast like urbanicity_score below so the values always
                    # satisfy a DOUBLE schema field.
                    float(row.latitude),
                    float(row.longitude),
                    row.store_type,
                    row.city,
                    row.state,
                    row.urbanicity_category,
                    float(row.urbanicity_score),
                    int(row.drive_time_minutes),
                    geojson_to_wkt(geometry)
                )
        return None
    except Exception as e:
        # Best-effort: one bad location must not abort the run, but report it
        # instead of dropping it silently.
        store_id = getattr(row, "store_number", "<unknown>")
        print(f"Isochrone failed for store {store_id}: {type(e).__name__}: {e}")
        return None

In [None]:
# (column name, Spark type, nullable) for every field of an isochrone tuple,
# in the order generate_isochrone() returns them.
_ISOCHRONE_FIELDS = [
    ("store_number", StringType(), False),
    ("latitude", DoubleType(), False),
    ("longitude", DoubleType(), False),
    ("store_type", StringType(), True),
    ("city", StringType(), True),
    ("state", StringType(), True),
    ("urbanicity_category", StringType(), True),
    ("urbanicity_score", DoubleType(), True),
    ("drive_time_minutes", IntegerType(), False),
    ("geometry_wkt", StringType(), False),
]
isochrone_schema = StructType(
    [StructField(name, dtype, nullable) for name, dtype, nullable in _ISOCHRONE_FIELDS]
)

# Bring all locations to the driver and request the isochrones one by one.
location_rows = locations_with_drive_time.collect()
total_locations = len(location_rows)

results = []
for idx, location_row in enumerate(location_rows):
    # Progress marker every 100 locations.
    if idx % 100 == 0:
        print(f"{idx}/{total_locations}")
    isochrone_row = generate_isochrone(location_row)
    # Locations for which no isochrone came back are left out.
    if isochrone_row:
        results.append(isochrone_row)

isochrones = spark.createDataFrame(results, schema=isochrone_schema)

generated_count = len(results)
print(f"{generated_count} isochrones generated")

%md
## Write to Silver

In [None]:
from pyspark.sql.functions import current_timestamp

# Turn the WKT strings into geometries, compute each isochrone's area, and
# save the result as a Delta table in the silver schema.
isochrones_final = (
    isochrones
    .withColumn("geometry", expr("ST_GeomFromText(geometry_wkt, 4326)"))
    # ST_Area on an SRID-4326 GEOMETRY is a planar area in square degrees, so
    # dividing it by 1e6 does not give km². Measuring the same WKT as a
    # GEOGRAPHY gives square metres, which the division converts to km².
    .withColumn("area_sqkm", expr("ST_Area(ST_GeogFromText(geometry_wkt)) / 1000000"))
    .withColumn("created_timestamp", current_timestamp())
    # The select both orders the output columns and drops geometry_wkt.
    .select(
        "store_number",
        "latitude",
        "longitude",
        "store_type",
        "city",
        "state",
        "urbanicity_category",
        "urbanicity_score",
        "drive_time_minutes",
        "geometry",
        "area_sqkm",
        "created_timestamp"
    )
)

full_table_name = f"{catalog}.{silver_schema}.silver_{output_table}"
write_mode = output_config['write_mode']

(
    isochrones_final
    .write
    .format("delta")
    .mode(write_mode)
    .option("overwriteSchema", "true")
    .saveAsTable(full_table_name)
)

print(f"Written {generated_count} isochrones to {full_table_name}")

In [None]:
# Per category / drive-time pair: number of isochrones and their average area,
# read back from the table that was just written.
area_summary_sql = f"""
  SELECT
    urbanicity_category,
    drive_time_minutes,
    COUNT(*) as count,
    ROUND(AVG(area_sqkm), 2) as avg_area_sqkm
  FROM {full_table_name}
  GROUP BY urbanicity_category, drive_time_minutes
  ORDER BY urbanicity_category, drive_time_minutes
"""
area_summary = spark.sql(area_summary_sql)
display(area_summary)

In [None]:
!pip install folium

In [None]:
import folium

In [None]:
import pyspark.sql.functions as F

# Draw up to 200 isochrones on an interactive folium map.
h3_sample = isochrones_final.limit(200)

# Serialise each geometry to GeoJSON in Spark, then collect to the driver.
geojson_column = F.expr("ST_AsGeoJSON(geometry)")
h3_geojson = (
    h3_sample
    .withColumn("geojson", geojson_column)
    .select("store_number", "geojson")
    .collect()
)

# One GeoJSON Feature per collected isochrone.
features = [
    {"type": "Feature", "geometry": json.loads(sample_row["geojson"])}
    for sample_row in h3_geojson
]
feature_collection = {"type": "FeatureCollection", "features": features}


def _isochrone_style(_feature):
    """Return the fixed style applied to every isochrone polygon."""
    return {"fillColor": "blue", "color": "black", "weight": 1, "fillOpacity": 0.3}


# Fixed initial view of the map.
m = folium.Map(location=[42.40, -71.38], zoom_start=6)
folium.GeoJson(feature_collection, style_function=_isochrone_style).add_to(m)
m