# Installations

In [0]:
%pip install "numpy<2.0.0" pyrosm

Collecting pyrosm
  Downloading pyrosm-0.6.2.tar.gz (2.5 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/2.5 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m1.8/2.5 MB[0m [31m23.7 MB/s[0m eta [36m0:00:01[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m26.7 MB/s[0m eta [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies: started
  Installing build dependencies: still running...
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.to

In [0]:
# Restart the Python process so that the packages installed by the %pip cell
# above are the ones picked up by the imports that follow.
# NOTE(review): this clears all notebook Python state, so it must stay ahead
# of any cell that defines variables.
dbutils.library.restartPython()

# Imports & Configuration

In [0]:
# Imports
from pyrosm import OSM
import pandas as pd
import time
import os

# Parameters
REGION_NAME = "Cyprus"
INPUT_FILE_PATH = "/dbfs/FileStore/tables/cyprus_latest_osm.pbf"


def delta_table_name(region_name):
    """Return the output Delta table name for ``region_name``.

    The region is lower-cased and every character that is not an ASCII letter
    or digit is replaced by an underscore, so multi-word or hyphenated regions
    (e.g. "New Zealand") still produce an identifier that needs no quoting.

    Args:
        region_name: Human-readable region label, e.g. "Cyprus".

    Returns:
        The table name, e.g. "cyprus_data_delta".
    """
    slug = "".join(
        ch if (ch.isascii() and ch.isalnum()) else "_"
        for ch in region_name.lower()
    )
    return f"{slug}_data_delta"


OUTPUT_TABLE_NAME = delta_table_name(REGION_NAME)
# Filter passed to the POI parser: tag key -> list of accepted tag values.
TAGS = {'amenity': ['bar', 'pub', 'biergarten', 'nightclub']}

print(f"Configuration loaded - Processing region: {REGION_NAME}")
print(f"Reading from: {INPUT_FILE_PATH}")
print(f"Output Delta table: {OUTPUT_TABLE_NAME}")
print(f"Filtering OpenStreetMap data for amenity types: {TAGS}\n")

Configuration loaded - Processing region: Cyprus
Reading from: /dbfs/FileStore/tables/cyprus_latest_osm.pbf
Output Delta table: cyprus_data_delta
Filtering OpenStreetMap data for amenity types: {'amenity': ['bar', 'pub', 'biergarten', 'nightclub']}



# Data Processing

In [0]:
# Data Acquisition
# Fail fast when the PBF extract is missing, so the rest of the cell can stay
# flat instead of being nested under the existence check.
if not os.path.exists(INPUT_FILE_PATH):
    raise FileNotFoundError(f"File not found at: {INPUT_FILE_PATH}")

print("Acquiring data from PBF source...")
osm = OSM(INPUT_FILE_PATH)
print(f"Data source loaded: {INPUT_FILE_PATH}\n")

# Parse the points of interest that match TAGS and report how long it took.
print("Parsing data...")
start_time = time.time()
gdf = osm.get_pois(custom_filter=TAGS)
end_time = time.time()
print(f"Data parsing took {end_time - start_time:.2f} seconds.\n")

# Geometry Processing
# NOTE(review): pyrosm appears to return None (rather than an empty
# GeoDataFrame) when nothing matches the filter, so guard against both cases
# before touching `.empty`.
if gdf is not None and not gdf.empty:
    print("Processing geometries (converting to centroids)...")
    # Project to UTM -> Centroid -> Back to WGS84
    # (centroids are computed in a projected CRS, then converted back to
    # latitude/longitude degrees)
    gdf_projected = gdf.to_crs(gdf.estimate_utm_crs())
    gdf_projected['geometry'] = gdf_projected.geometry.centroid
    gdf = gdf_projected.to_crs(epsg=4326)

    # Data Extraction
    print("Extracting latitude and longitude coordinates...")
    gdf['latitude'] = gdf.geometry.y
    gdf['longitude'] = gdf.geometry.x

    # Selecting relevant columns
    print("Selecting relevant columns...\n")
    cols = ["name", "amenity", "latitude", "longitude"]
    # Ensure columns exist before selection to avoid errors
    existing_cols = [c for c in cols if c in gdf.columns]
    pdf = gdf[existing_cols]

    # Output / Export preview
    print(f"Found {len(pdf)} locations in {REGION_NAME}.")
    print(pdf.head())

else:
    # Keep `pdf` defined (as an empty frame) so the export cell can detect
    # that there is nothing to save.
    print("No locations found using the specified filter.")
    pdf = pd.DataFrame()

Acquiring data from PBF source...
Data source loaded: /dbfs/FileStore/tables/cyprus_latest_osm.pbf

Parsing data...
Data parsing took 48.68 seconds.

Processing geometries (converting to centroids)...
Extracting latitude and longitude coordinates...
Selecting relevant columns...

Found 685 locations in Cyprus.
           name amenity   latitude  longitude
0  Orange Grove     pub  35.082584  33.876064
1    Robin Hood     pub  34.757374  32.416904
2       Rainbow     pub  34.757816  32.416504
3   Water whole     pub  34.758175  32.415543
4       Bubbles     pub  34.756954  32.417984


# Save to Delta

In [0]:
# Output / Export to Delta Table
# Bail out early when the processing cell produced no rows (or never ran);
# otherwise convert to Spark, overwrite the Delta table and show the result.
if 'pdf' not in locals() or pdf.empty:
    print("Nothing to save (DataFrame is empty).")

else:
    print("Converting Pandas DataFrame to Spark DataFrame...")

    # Hand the pandas frame over to Spark
    spark_df = spark.createDataFrame(pdf)

    # Persist as a Delta table, replacing any previous contents
    print(f"Saving results to Delta table: {OUTPUT_TABLE_NAME}...")
    delta_writer = spark_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(OUTPUT_TABLE_NAME)
    print(f"Saved results to table: {OUTPUT_TABLE_NAME}")

    # Read the table back and render it to confirm what was written
    saved_table = spark.table(OUTPUT_TABLE_NAME)
    display(saved_table)

Converting Pandas DataFrame to Spark DataFrame...
Saving results to Delta table: cyprus_data_delta...
Saved results to table: cyprus_data_delta


name,amenity,latitude,longitude
Orange Grove,pub,35.08258438110352,33.87606430053711
Robin Hood,pub,34.75737380981446,32.41690444946289
Rainbow,pub,34.757816314697266,32.41650390625
Water whole,pub,34.758174896240234,32.41554260253906
Bubbles,pub,34.75695419311524,32.41798400878906
LOFT,nightclub,34.75644302368163,32.41977310180664
Steve's,bar,34.75543975830078,32.407928466796875
Notos,pub,34.75553512573242,32.40788650512695
Starsky and Hutch,pub,34.75633239746094,32.41862869262695
Boogies,bar,34.75717544555663,32.41771697998047
