# Amenities and Walkability

**Objective:** Add amenity proximity data and calculate walkability scores for all Portugal buildings 

### Data Sources
 - OpenStreetMap via Geofabrik

### Amenities
1. Supermarkets (grocery access)
2. Hospitals (healthcare access)
3. Pharmacies (health services)
4. Schools (education access)
5. Metro stations (public transportation)

### Deliverables
 - Distance to nearest amenity of each type (meters)
 - Component scores for each amenity (0-100)
 - Overall walkability score (0-100)

In [1]:
import pyrosm
import geopandas as gpd
import pandas as pd
import numpy as np
import os

In [2]:
#buildings
buildings = gpd.read_file('../data/portugal_buildings_processed.geojson')

#amenities from Portugal .pbf file
pbf_file = "../data/portugal-latest.osm.pbf"

if not os.path.exists(pbf_file):
    print(f"File not found: {pbf_file}")
else:
    print("Found .pbf file")

osm = pyrosm.OSM(pbf_file)

Found .pbf file


In [3]:
#buildings
buildings = gpd.read_file('../data/portugal_buildings_processed.geojson')

#amenities from Portugal .pbf file
pbf_file = "../data/portugal-latest.osm.pbf"

if not os.path.exists(pbf_file):
    print(f"File not found: {pbf_file}")
else:
    print("Found .pbf file")

osm = pyrosm.OSM(pbf_file)

Found .pbf file


In [4]:
# extracting amenities

amenities = {}

# 1. Supermarkets
amenities['supermarket'] = osm.get_data_by_custom_criteria(
    custom_filter = {'shop' : ['supermarket']},
    filter_type = 'keep'
)
print(f"Supermarket: {len(amenities['supermarket'])}")

# 2. Hospitals
amenities['hospital'] = osm.get_data_by_custom_criteria(
    custom_filter = {'amenity' : ['hospital']},
    filter_type = 'keep'
)
print(f"Hospital: {len(amenities['hospital'])}")

# 3. Pharmacies
amenities['pharmacy'] = osm.get_data_by_custom_criteria(
    custom_filter = {'amenity' : ['pharmacy']},
    filter_type = 'keep'
)
print(f"Pharmacy: {len(amenities['pharmacy'])}")

# 4. Schools
amenities['school'] = osm.get_data_by_custom_criteria(
    custom_filter = {'amenity' : ['school']},
    filter_type = 'keep'
)
print(f"School: {len(amenities['school'])}")

# 5. Metro stations
amenities['subway'] = osm.get_data_by_custom_criteria(
    custom_filter = {'station' : ['subway']},
    filter_type = 'keep'
)
print(f"Subway Station: {len(amenities['subway'])}")

print("All amenities extracted.")

Supermarket: 3733
Hospital: 332
Pharmacy: 2680
School: 6955
Subway Station: 78
All amenities extracted.


In [5]:
# crs convertion for distance calculation

buildings_m = buildings.to_crs('EPSG:3763')
buildings_m['centroid'] = buildings_m.geometry.centroid

In [6]:
# distance calculation

print("Calculating distances to nearest amenities...")

# distance calculation for each amenity type
for amenity_type, amenity_data in amenities.items():
    print(f"\nProcessing {amenity_type}...")
    print(f"Amenities: {len(amenity_data):,}")

    # convert amenities to meters crs
    amenity_m = amenity_data.to_crs('EPSG:3763')
    amenity_centroids = amenity_m.geometry.centroid

    # calculate distance from each building to nearest amenity
    distances = []

    total_buildings = len(buildings_m)
    for idx, building in buildings_m.iterrows():
        # progress indicator
        if (idx + 1) % 100000 == 0:
            print(f"   Progress: {idx+1:,} / {total_buildings:,} ({(idx+1)/total_buildings*100:.1f}%)")

        # calculate distance to all amenities of this type
        dists = building['centroid'].distance(amenity_centroids)
        min_dist = dists.min()
        distances.append(min_dist)

    # Add distance columns to buildings
    column_name = f'dist_{amenity_type}_m'
    buildings_m[column_name] = distances

    #statistics
    avg_dist = sum(distances) / len(distances)
    print(f"Average: {avg_dist:.0f}m, Min: {min(distances):.0f}m, Max: {max(distances):.0f}m")

print("All distances calculated.")

Calculating distances to nearest amenities...

Processing supermarket...
Amenities: 3,733
   Progress: 100,000 / 2,017,059 (5.0%)
   Progress: 200,000 / 2,017,059 (9.9%)
   Progress: 300,000 / 2,017,059 (14.9%)
   Progress: 400,000 / 2,017,059 (19.8%)
   Progress: 500,000 / 2,017,059 (24.8%)
   Progress: 600,000 / 2,017,059 (29.7%)
   Progress: 700,000 / 2,017,059 (34.7%)
   Progress: 800,000 / 2,017,059 (39.7%)
   Progress: 900,000 / 2,017,059 (44.6%)
   Progress: 1,000,000 / 2,017,059 (49.6%)
   Progress: 1,100,000 / 2,017,059 (54.5%)
   Progress: 1,200,000 / 2,017,059 (59.5%)
   Progress: 1,300,000 / 2,017,059 (64.5%)
   Progress: 1,400,000 / 2,017,059 (69.4%)
   Progress: 1,500,000 / 2,017,059 (74.4%)
   Progress: 1,600,000 / 2,017,059 (79.3%)
   Progress: 1,700,000 / 2,017,059 (84.3%)
   Progress: 1,800,000 / 2,017,059 (89.2%)
   Progress: 1,900,000 / 2,017,059 (94.2%)
   Progress: 2,000,000 / 2,017,059 (99.2%)
Average: 2424m, Min: 0m, Max: 42951m

Processing hospital...
Amenities

In [8]:
# extracting bus stops
amenities['bus'] = osm.get_data_by_custom_criteria(
    custom_filter = {'highway' : ['bus_stop']},
    filter_type = 'keep'
)

print(f"Bus stops: {len(amenities['bus']):,}")

Bus stops: 48,181


In [10]:
# calculate distance to bus stops

print("Calculating distances to bus stops...")

amenity_type = 'bus'
amenity_data = amenities['bus']

amenity_m = amenity_data.to_crs('EPSG:3763')
amenity_centroids = amenity_m.geometry.centroid

distances = []
total_buildings = len(buildings_m)

for idx, building in buildings_m.iterrows():
    #progress indicator
    if(idx + 1) % 100000 == 0:
        print(f"   Progress: {idx+1:,} / {total_buildings:,} ({(idx+1)/total_buildings*100:.1f}%)")

    dists = building['centroid'].distance(amenity_centroids)
    min_dist = dists.min()
    distances.append(min_dist)

# add column
buildings_m['dist_bus_m'] = distances

avg_dist = sum(distances) / len(distances)
print(f"Average: {avg_dist:.0f}m, Min: {min(distances):.0f}m, Max: {max(distances):.0f}m")


Calculating distances to bus stops...
   Progress: 100,000 / 2,017,059 (5.0%)
   Progress: 200,000 / 2,017,059 (9.9%)
   Progress: 300,000 / 2,017,059 (14.9%)
   Progress: 400,000 / 2,017,059 (19.8%)
   Progress: 500,000 / 2,017,059 (24.8%)
   Progress: 600,000 / 2,017,059 (29.7%)
   Progress: 700,000 / 2,017,059 (34.7%)
   Progress: 800,000 / 2,017,059 (39.7%)
   Progress: 900,000 / 2,017,059 (44.6%)
   Progress: 1,000,000 / 2,017,059 (49.6%)
   Progress: 1,100,000 / 2,017,059 (54.5%)
   Progress: 1,200,000 / 2,017,059 (59.5%)
   Progress: 1,300,000 / 2,017,059 (64.5%)
   Progress: 1,400,000 / 2,017,059 (69.4%)
   Progress: 1,500,000 / 2,017,059 (74.4%)
   Progress: 1,600,000 / 2,017,059 (79.3%)
   Progress: 1,700,000 / 2,017,059 (84.3%)
   Progress: 1,800,000 / 2,017,059 (89.2%)
   Progress: 1,900,000 / 2,017,059 (94.2%)
   Progress: 2,000,000 / 2,017,059 (99.2%)
Average: 1601m, Min: 0m, Max: 31478m


In [11]:
# combine bus and subway -> take the nearest one. 
buildings_m['dist_transport_m'] = buildings_m[['dist_subway_m', 'dist_bus_m']].min(axis=1)

print("Transport Distance Statistics:")
print(f"Average: {buildings_m['dist_transport_m'].mean():.0f}m")
print(f"Min: {buildings_m['dist_transport_m'].min():.0f}m")
print(f"Max: {buildings_m['dist_transport_m'].max():.0f}m")

#Comparison
print("Comparison:")
print(f"Subway Only Average: {buildings_m['dist_subway_m'].mean():.0f}m")
print(f"Bus Only Average: {buildings_m['dist_bus_m'].mean():.0f}m")
print(f"Combined Average (nearest): {buildings_m['dist_transport_m'].mean():.0f}m")

Transport Distance Statistics:
Average: 1601m
Min: 0m
Max: 31478m
Comparison:
Subway Only Average: 296477m
Bus Only Average: 1601m
Combined Average (nearest): 1601m


In [13]:
# walkability scores

max_distances = {
    'supermarket': 1000,
    'hospital': 3000,
    'pharmacy': 500,
    'school': 1000,
    'transport': 1500
}

weights = {
    'supermarket': 0.25,
    'hospital': 0.15,
    'pharmacy': 0.20,
    'school': 0.20,
    'transport': 0.20
}

# component scores
amenity_types = ['supermarket', 'hospital', 'pharmacy', 'school', 'transport']

for amenity_type in amenity_types:
    dist_col = f'dist_{amenity_type}_m'
    score_col = f'score_{amenity_type}'
    max_dist = max_distances[amenity_type]

    # score formula: 100 * (1 - distance/max_distance)
    buildings_m[score_col] = 100 * (1 - buildings_m[dist_col] / max_dist)
    buildings_m[score_col] = buildings_m[score_col].clip(lower=0)

    avg_score = buildings_m[score_col].mean()
    print(f"{amenity_type}: avg score = {avg_score:.1f}")

#calculate weighted walkability score 
walkability_score = 0
for amenity_type in amenity_types:
    score_col = f'score_{amenity_type}'
    weight = weights[amenity_type]
    walkability_score += buildings_m[score_col] * weight

buildings_m['walkability_score'] = walkability_score.round(1)

print("Walkability scores calculated.")

#statistics
print("Walkability Score Statistics:")
print(f"Mean: {buildings_m['walkability_score'].mean():.1f}")
print(f"Max: {buildings_m['walkability_score'].max():.1f}")
print(f"Min: {buildings_m['walkability_score'].min():.1f}")

#distribution
excellent = (buildings_m['walkability_score'] >= 80).sum()
good = ((buildings_m['walkability_score'] >= 60) & (buildings_m['walkability_score'] < 80)).sum()
fair = ((buildings_m['walkability_score'] >= 40) & (buildings_m['walkability_score'] < 60)).sum()
poor = (buildings_m['walkability_score'] < 40).sum()

print("Score Distribution:")
print(f"Excellent (80-100): {excellent:,} ({excellent/len(buildings_m)*100:.1f}%)")
print(f"Good (60-80): {good:,} ({good/len(buildings_m)*100:.1f}%)")
print(f"Fair (40-60): {fair:,} ({fair/len(buildings_m)*100:.1f}%)")
print(f"Poor (<40): {poor:,} ({poor/len(buildings_m)*100:.1f}%)")

supermarket: avg score = 31.8
hospital: avg score = 21.8
pharmacy: avg score = 16.3
school: avg score = 44.4
transport: avg score = 61.3
Walkability scores calculated.
Walkability Score Statistics:
Mean: 35.7
Max: 96.8
Min: 0.0
Score Distribution:
Excellent (80-100): 67,340 (3.3%)
Good (60-80): 347,021 (17.2%)
Fair (40-60): 457,117 (22.7%)
Poor (<40): 1,145,581 (56.8%)


In [15]:
# saving the final dataset

buildings_final = buildings_m.to_crs('EPSG:4326')
buildings_final = buildings_final.drop(columns=['centroid'])
output_file = '../data/portugal_buildings_with_amenities.geojson'
buildings_final.to_file(output_file, driver='GeoJSON')

print("Dataset saved.")
print(f"File: {output_file}\nSize: {os.path.getsize(output_file)}")

Dataset saved.
File: ../data/portugal_buildings_with_amenities.geojson
Size: 3392811589


In [20]:
#columns added
distance_cols = [col for col in buildings_final.columns if 'dist_' in col]
score_cols = [col for col in buildings_final.columns if 'score_' in col]
walkability_cols = [col for col in buildings_final.columns if 'walkability' in col]

print("New columns added:")
print(f"Distance columns ({len(distance_cols)}): \n{distance_cols}")
print(f"Score columns ({len(score_cols)}): \n{score_cols}")
print(f"Walkability: \n{walkability_cols}")

New columns added:
Distance columns (7): 
['dist_supermarket_m', 'dist_hospital_m', 'dist_pharmacy_m', 'dist_school_m', 'dist_subway_m', 'dist_bus_m', 'dist_transport_m']
Score columns (5): 
['score_supermarket', 'score_hospital', 'score_pharmacy', 'score_school', 'score_transport']
Walkability: 
['walkability_score']
