In [None]:
# Databricks notebook source
# MAGIC %md
# MAGIC # OSM Road Network Download - Bronze Layer
# MAGIC
# MAGIC Downloads OpenStreetMap road network data from Geofabrik and stores it in a Unity Catalog volume.
# MAGIC
# MAGIC **Data Source:** Geofabrik OSM Extracts  
# MAGIC **Format:** PBF (Protocolbuffer Binary Format)  
# MAGIC **Region:** Massachusetts
# MAGIC
# MAGIC **Output:**
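# MAGIC - Volume: file named after the last path segment of `osm_url`, stored under the `osm_data_volume` path (e.g. `/Volumes/{catalog}/bronze/osm_data/massachusetts-latest.osm.pbf`)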
# MAGIC - Table: `{catalog}.{bronze_schema}.bronze_osm_downloads` (tracking metadata)
# MAGIC

In [None]:
import requests
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime
import uuid

# Notebook parameters.
# Each value is exposed as a Databricks text widget with an empty default, so a
# job (or an interactive user) has to supply it explicitly.
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("bronze_schema", "")
dbutils.widgets.text("osm_data_volume", "")
dbutils.widgets.text("osm_url", "")
dbutils.widgets.text("region", "")

# Extract parameters (surrounding whitespace from the widget input is dropped
# so it cannot leak into table names, paths or the download URL).
catalog = dbutils.widgets.get("catalog").strip()
bronze_schema = dbutils.widgets.get("bronze_schema").strip()
osm_data_volume = dbutils.widgets.get("osm_data_volume").strip()
osm_url = dbutils.widgets.get("osm_url").strip()
region = dbutils.widgets.get("region").strip()

# Fail fast and name the offending widgets. osm_data_volume is required too:
# the download cell builds the target file path from it.
_required_params = {
    "catalog": catalog,
    "bronze_schema": bronze_schema,
    "osm_data_volume": osm_data_volume,
    "osm_url": osm_url,
}
_missing_params = [name for name, value in _required_params.items() if not value]
assert not _missing_params, f"Missing required parameters: {', '.join(_missing_params)}"

In [None]:
import os
import shutil  # no longer used in this cell; import kept in case later cells rely on it

# Download the OSM extract into the target volume (skipped when the file is
# already there) and set the values the tracking-table cell reads afterwards:
# download_id, volume_file_path, file_size_mb, status, duration_seconds,
# download_start and download_end.
download_id = str(uuid.uuid4())
download_start = datetime.now()

# Extract filename from URL
osm_filename = osm_url.split('/')[-1]
if not osm_filename:
    # A URL ending in "/" would otherwise make the existence check list the
    # volume directory itself and mistake its first entry for the OSM file.
    raise ValueError(f"Cannot derive a file name from osm_url: {osm_url!r}")

# os.path.join inserts the separator only when osm_data_volume has no trailing "/".
volume_file_path = os.path.join(osm_data_volume, osm_filename)

# Check if file already exists (idempotency).
# A failing lookup is treated as "file not present" (presumably dbutils.fs.ls
# raises for a missing path), but KeyboardInterrupt/SystemExit are not swallowed.
try:
    existing_files = dbutils.fs.ls(volume_file_path)
except Exception:
    existing_files = []

if existing_files:
    file_size_bytes = existing_files[0].size
    file_size_mb = file_size_bytes / (1024 * 1024)
    status = "existing"
    download_end = download_start
else:
    # File doesn't exist, download it
    status = "downloading"

    # Create volume if needed
    # NOTE(review): the volume name is hard-coded here while the target path
    # comes from the osm_data_volume parameter - confirm both refer to the same volume.
    spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{bronze_schema}.bronze_osm_data")

    # Path used for plain Python file I/O ("dbfs:" URIs are mapped to "/dbfs").
    local_volume_path = volume_file_path.replace("dbfs:", "/dbfs")
    file_size_bytes = 0

    # Single streamed download straight into the volume (no temp file).
    with requests.get(osm_url, stream=True, timeout=600) as response:
        response.raise_for_status()
        try:
            with open(local_volume_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192 * 1024):
                    if chunk:
                        f.write(chunk)
                        # Count what was actually written; Content-Length may be absent.
                        file_size_bytes += len(chunk)
        except BaseException:
            # Do not leave a truncated file behind: the next run would treat
            # it as an already-downloaded extract.
            try:
                os.remove(local_volume_path)
            except OSError:
                pass  # best-effort cleanup; the original error is re-raised below
            raise

    file_size_mb = file_size_bytes / (1024 * 1024)
    download_end = datetime.now()
    status = "completed"

# Computed for both branches so the tracking cell always finds it
# (0.0 when the file already existed).
duration_seconds = (download_end - download_start).total_seconds()

In [None]:
# Display the download status set by the previous cell as this cell's output
# ("existing" when the file was already present, "completed" after a download).
status

In [None]:
# Append one row describing this run to the download-tracking table in
# Unity Catalog.
osm_table = f"{catalog}.{bronze_schema}.bronze_osm_downloads"

# (column name, Spark type) pairs in table column order; every column is
# declared non-nullable below.
tracking_columns = [
    ("download_id", StringType()),
    ("download_date", DateType()),
    ("region", StringType()),
    ("osm_file_path", StringType()),
    ("file_size_mb", LongType()),
    ("download_status", StringType()),
    ("duration_seconds", DoubleType()),
    ("download_start", TimestampType()),
    ("download_end", TimestampType()),
]

# Values for the single tracking row, positionally matching tracking_columns.
tracking_row = (
    download_id,
    datetime.now().date(),
    region,
    volume_file_path,
    int(file_size_mb),  # stored as whole megabytes (fraction truncated)
    status,
    duration_seconds,
    download_start,
    download_end,
)
data = [tracking_row]

schema = StructType(
    [StructField(col_name, col_type, False) for col_name, col_type in tracking_columns]
)

osm_df = spark.createDataFrame(data, schema)

tracking_writer = osm_df.write.mode("append")
tracking_writer.saveAsTable(osm_table)