# Full Portugal Dataset with Geofabrik

**Date:** December 15, 2025

**Objective:** Extract ALL buildings in Portugal using the Geofabrik `.pbf` extract

In [1]:
# Import libraries
import os
import urllib.request
from datetime import datetime

import geopandas as gpd
import pandas as pd
import pyrosm

# Record when this run started so the outputs below can be dated
run_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Current time: {run_started_at}")

Current time: 2025-12-15 08:56:56


In [2]:
# Download the Portugal .pbf extract from Geofabrik.
# The transfer is skipped when the file is already on disk, so that
# "Restart Kernel -> Run All" does not re-fetch the whole extract each time.

# Create data folder if doesn't exist
os.makedirs('../data', exist_ok=True)

# URL for Portugal data
url = "https://download.geofabrik.de/europe/portugal-latest.osm.pbf"
output_file = "../data/portugal-latest.osm.pbf"

# Set to True to refresh an extract that was downloaded on an earlier run.
FORCE_DOWNLOAD = False

print(f"URL: {url}")
print(f"Saving to: {output_file}")

if FORCE_DOWNLOAD or not os.path.exists(output_file):
    # Fetch into a temporary name and rename afterwards, so an interrupted
    # transfer never leaves a truncated file under the final name.
    partial_file = output_file + ".part"
    urllib.request.urlretrieve(url, partial_file)
    os.replace(partial_file, output_file)
    print("\n Download complete!")
else:
    print("\n Using cached file (set FORCE_DOWNLOAD = True to refresh)")

# Check file size
file_size_mb = os.path.getsize(output_file) / (1024 * 1024)

print(f"File size: {file_size_mb:.1f} MB")
print(f"File location: {output_file}")

URL: https://download.geofabrik.de/europe/portugal-latest.osm.pbf
Saving to: ../data/portugal-latest.osm.pbf

 Download complete!
File size: 374.3 MB
File location: ../data/portugal-latest.osm.pbf


In [3]:
# Open the downloaded .pbf extract with pyrosm
osm = pyrosm.OSM("../data/portugal-latest.osm.pbf")

# Pull every building feature out of the extract
portugal_buildings = osm.get_buildings()

# Summarise the result: row count, column count and in-memory footprint (MB)
n_buildings = len(portugal_buildings)
n_columns = len(portugal_buildings.columns)
memory_mb = portugal_buildings.memory_usage(deep=True).sum() / (1024**2)

print(f"\nExtraction complete!")
print(f"Total buildings extracted: {n_buildings:,}")
print(f"Total columns: {n_columns}")
print(f"Memory usage: {memory_mb:.1f} MB")
print(portugal_buildings.columns.tolist())


Extraction complete!
Total buildings extracted: 2,065,315
Total columns: 42
Memory usage: 2040.3 MB
['addr:city', 'addr:country', 'addr:full', 'addr:housenumber', 'addr:housename', 'addr:postcode', 'addr:place', 'addr:street', 'email', 'name', 'opening_hours', 'operator', 'phone', 'ref', 'url', 'visible', 'website', 'building', 'amenity', 'building:flats', 'building:levels', 'building:material', 'building:max_level', 'building:min_level', 'building:use', 'craft', 'height', 'internet_access', 'landuse', 'levels', 'office', 'shop', 'source', 'start_date', 'wikipedia', 'id', 'timestamp', 'version', 'tags', 'osm_type', 'geometry', 'changeset']


In [4]:
# Preview the first five extracted buildings (rich display of the last expression)
portugal_buildings.head(n=5)

Unnamed: 0,addr:city,addr:country,addr:full,addr:housenumber,addr:housename,addr:postcode,addr:place,addr:street,email,name,...,source,start_date,wikipedia,id,timestamp,version,tags,osm_type,geometry,changeset
0,Lisboa,,,,Panteão Nacional,1100-471,,Campo de Santa Clara,geral@panteao.dgpc.pt,Panteão Nacional,...,,,pt:Panteão Nacional,9253917,1753268252,28,"{""building:colour"":""white"",""description"":""Monu...",way,"POLYGON ((-9.12496 38.71486, -9.12498 38.71486...",
1,,,,,,,,,,Estação Ferroviária de Lisboa - Santa Apolónia,...,,,pt:Estação Ferroviária de Lisboa-Santa Apolónia,9254766,1753470869,24,"{""building:colour"":""#76CFE3"",""layer"":""1"",""name...",way,"POLYGON ((-9.12215 38.7147, -9.12217 38.71471,...",
2,,,,,,,,,,Strada Outlet,...,,,,18942721,1764441393,10,,way,"POLYGON ((-9.19316 38.78315, -9.19298 38.7832,...",
3,Braga,,,,,4700-424,,Rua Dom Paio Mendes,,Sé Catedral de Santa Maria Maior,...,,,pt:Sé de Braga,22746378,1751374042,37,"{""alt_name"":""S\u00E9 Catedral de Braga"",""basil...",way,"POLYGON ((-8.42739 41.54982, -8.42705 41.54987...",
4,Paul,,,,,6215-424,,Largo Doutor Carlos Coelho,,GNR - Posto Territorial de Paul,...,,,,23240038,1633797399,9,"{""alt_name"":""Guarda Nacional Republicana - Pos...",way,"POLYGON ((-7.63669 40.20315, -7.63655 40.20315...",


In [5]:
# Count how many features there are of each geometry type
geometry_type_counts = portugal_buildings.geometry.geom_type.value_counts()
print(geometry_type_counts)

Polygon            2064550
MultiPolygon           736
MultiLineString         20
LineString               9
Name: count, dtype: int64


In [6]:
# Footprint-area bounds in m² (both exclusive); anything outside is treated
# as a likely mapping error and dropped from the analysis set.
MIN_AREA_SQM = 10
MAX_AREA_SQM = 50000

# Convert to Portugal's CRS (meters) for accurate area calculation
portugal_buildings_projected = portugal_buildings.to_crs('EPSG:3763')

# Area in square meters, rounded to 2 decimal places.
# NOTE(review): .abs() is kept as a defensive guard; ring orientation is not
# expected to produce negative areas -- confirm whether it is still needed.
# Features with zero area (the reported minimum is 0.00) are removed by the
# range filter below.
portugal_buildings['area_sqm'] = (
    portugal_buildings_projected.geometry.area.abs().round(2)
)

print(f"   Min area: {portugal_buildings['area_sqm'].min():.2f} m²")
print(f"   Max area: {portugal_buildings['area_sqm'].max():.2f} m²")
print(f"   Average area: {portugal_buildings['area_sqm'].mean():.2f} m²")

# Filter out very small or very large (likely errors)
reasonable_buildings = portugal_buildings[
    (portugal_buildings['area_sqm'] > MIN_AREA_SQM) &
    (portugal_buildings['area_sqm'] < MAX_AREA_SQM)
]

# Share of all extracted buildings kept by the filter; guarded so an empty
# extraction reports 0.0% instead of raising ZeroDivisionError.
n_all = len(portugal_buildings)
n_kept = len(reasonable_buildings)
kept_pct = n_kept / n_all * 100 if n_all else 0.0

print(f"   Buildings in range: {n_kept:,} ({kept_pct:.1f}%)")
print(f"   Average area: {reasonable_buildings['area_sqm'].mean():.2f} m²")

   Min area: 0.00 m²
   Max area: 162378.19 m²
   Average area: 226.72 m²
   Buildings in range: 2,017,059 (97.7%)
   Average area: 229.26 m²


In [7]:
# Write the area-filtered buildings to disk as GeoJSON and report the result.
# NOTE: this rebinds `output_file`, which the download cell also assigns.

output_file = '../data/portugal_buildings_processed.geojson'

# Export the filtered dataset
reasonable_buildings.to_file(output_file, driver='GeoJSON')

# Size of the written file, converted from bytes to MB
bytes_on_disk = os.path.getsize(output_file)
file_size_mb = bytes_on_disk / (1024 * 1024)

print(f"   File: {output_file}")
print(f"   Size: {file_size_mb:.1f} MB")
print(f"   Buildings: {len(reasonable_buildings):,}")

   File: ../data/portugal_buildings_processed.geojson
   Size: 2365.4 MB
   Buildings: 2,017,059


In [8]:
# DATA QUALITY ANALYSIS - Attribute Coverage
# Reports, for the area-filtered buildings, how many rows carry a non-null
# value for each attribute of interest.


def format_coverage(label, count, total):
    """Format one coverage row.

    Parameters:
        label: human-readable attribute name shown in the report.
        count: number of rows with a non-null value for the attribute.
        total: number of rows in the dataset.

    Returns:
        A string such as "   Height: 16,320 (0.8%)". The share is reported
        as 0.0% when `total` is 0 instead of raising ZeroDivisionError.
    """
    share = count / total * 100 if total else 0.0
    return f"   {label}: {count:,} ({share:.1f}%)"


print("DATA QUALITY ANALYSIS - ALL PORTUGAL")

total = len(reasonable_buildings)

print("\nDataset Overview:")
print(f"   Total buildings: {total:,}")
print(f"   Min area: {reasonable_buildings['area_sqm'].min():.2f} m²")
print(f"   Max area: {reasonable_buildings['area_sqm'].max():.2f} m²")
print(f"   Average area: {reasonable_buildings['area_sqm'].mean():.2f} m²")

print("\nAttribute Coverage:")

# Column -> report label, in the order the rows are printed
coverage_columns = {
    'building:levels': "Building levels",
    'height': "Height",
    'addr:street': "Street address",
    'addr:housenumber': "House number",
    'building': "Building type",
}

# One vectorized pass: non-null count per attribute column
non_null_counts = reasonable_buildings[list(coverage_columns)].notna().sum()

# Individual counts kept under their original names
has_levels = non_null_counts['building:levels']
has_height = non_null_counts['height']
has_street = non_null_counts['addr:street']
has_number = non_null_counts['addr:housenumber']
has_type = non_null_counts['building']

for column, label in coverage_columns.items():
    print(format_coverage(label, non_null_counts[column], total))


DATA QUALITY ANALYSIS - ALL PORTUGAL

Dataset Overview:
   Total buildings: 2,017,059
   Min area: 10.01 m²
   Max area: 49700.36 m²
   Average area: 229.26 m²

Attribute Coverage:
   Building levels: 91,819 (4.6%)
   Height: 16,320 (0.8%)
   Street address: 108,968 (5.4%)
   House number: 102,792 (5.1%)
   Building type: 2,017,059 (100.0%)
