Cell 1: Import

In [28]:
import sys
from pathlib import Path

# Put the repository root on the import path so the local `ndw` package resolves.
# Assumes this notebook runs from a folder (e.g. notebooks/) one level below the root.
project_root = Path.cwd().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

%load_ext autoreload
%autoreload 2

import pandas as pd

from ndw.traffic_speed import parse_trafficspeed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Cell 2: Load traffic speed

In [29]:
# Load the traffic-speed snapshot and report its size before any filtering.
df_speed = parse_trafficspeed()
print("Rows:", len(df_speed))

# Keep every row whose speed is not -1 (rows with a missing speed are kept too).
# NOTE(review): -1 presumably marks "no valid measurement" in the feed — confirm.
df_speed = df_speed.loc[df_speed["avg_speed_kmh"].ne(-1)]
df_speed.head()

Rows: 19889


Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour
6,PZH01_MST_0635_01_01,2025-12-01T14:59:00Z,59.0,900.0
9,PZH01_MST_0635_01_00,2025-12-01T14:59:00Z,33.0,300.0
10,PZH01_MST_0982_00,2025-12-01T14:59:00Z,86.0,1440.0
76,PZH01_MST_0981_01,2025-12-01T14:59:00Z,64.0,1440.0
89,PZH01_MST_0911_00,2025-12-01T14:59:00Z,84.0,60.0


Cell 3: Basic info

In [30]:
# Summarize the filtered speed frame: index range, column dtypes, non-null counts and memory use.
df_speed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15263 entries, 6 to 19882
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   site_id            15263 non-null  object 
 1   measurement_time   15263 non-null  object 
 2   avg_speed_kmh      15262 non-null  float64
 3   flow_veh_per_hour  15263 non-null  float64
dtypes: float64(2), object(2)
memory usage: 596.2+ KB


Cell 4: Quick sanity checks

In [31]:
# How many distinct sites are present, and which time span the snapshot covers.
# measurement_time is compared as-is (object dtype), so min/max are taken on the raw values.
times = df_speed["measurement_time"]
print("Unique sites:", df_speed["site_id"].nunique())
print("measurement_time range:", times.min(), "→", times.max())

# Descriptive statistics for the two numeric measurement columns.
for heading, column in (
    ("\nSpeed stats (km/h):", "avg_speed_kmh"),
    ("\nFlow stats (veh/h):", "flow_veh_per_hour"),
):
    print(heading)
    print(df_speed[column].describe())

Unique sites: 15263
measurement_time range: 2025-12-01T14:58:00Z → 2025-12-01T14:59:00Z

Speed stats (km/h):
count    15262.000000
mean        70.975298
std         43.944930
min          0.000000
25%         26.000000
50%         91.000000
75%        105.000000
max        176.000000
Name: avg_speed_kmh, dtype: float64

Flow stats (veh/h):
count    15263.000000
mean       766.038131
std        683.799104
min          0.000000
25%         60.000000
50%        660.000000
75%       1260.000000
max       3900.000000
Name: flow_veh_per_hour, dtype: float64


Cell 5 – Example: top slowest sites right now

In [32]:
# Ignore rows without a speed value, then take the 20 lowest speeds.
speed_known = df_speed[df_speed["avg_speed_kmh"].notna()]
slow = speed_known.sort_values(by="avg_speed_kmh", ascending=True).iloc[:20]

slow.loc[:, ["site_id", "measurement_time", "avg_speed_kmh", "flow_veh_per_hour"]]

Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour
19882,RDH01_TI152R,2025-12-01T14:59:00Z,0.0,0.0
4366,PLB02_280360_RP_PST,2025-12-01T14:59:00Z,0.0,0.0
4367,PLB02_297265_RP,2025-12-01T14:59:00Z,0.0,0.0
4368,PLB02_MSTR038_LP,2025-12-01T14:59:00Z,0.0,0.0
4369,PLB02_MSTR050_LP,2025-12-01T14:59:00Z,0.0,0.0
4370,PLB02_297150_LP,2025-12-01T14:59:00Z,0.0,0.0
4371,PLB02_WZRWG01_N,2025-12-01T14:59:00Z,0.0,0.0
4372,PLB02_276540_LP,2025-12-01T14:59:00Z,0.0,0.0
4373,PLB02_276620_RP,2025-12-01T14:59:00Z,0.0,0.0
4374,PLB02_298220_RP,2025-12-01T14:59:00Z,0.0,0.0


Measurement sites: load and parse locations

In [33]:
from ndw.measurement_sites import parse_location_raw, load_measurement_sites

# Load the measurement-site table and report how many sites it holds.
df_sites = load_measurement_sites()
print("Measurement sites:", len(df_sites))

# Work on a copy so the raw site table stays available unchanged.
df_sites_parsed = df_sites.copy()

# Spread the parsed pieces of `location_raw` over five dedicated columns.
# NOTE(review): this relies on parse_location_raw returning five labelled values
# per row (e.g. a pandas Series) — confirm in ndw.measurement_sites.
location_columns = ["lat", "lon", "carriageway_type", "carriageway", "direction_ref"]
df_sites_parsed[location_columns] = df_sites_parsed["location_raw"].apply(parse_location_raw)

# Only the last expression of a cell is displayed, so preview the parsed frame here.
df_sites_parsed.head()

Measurement sites: 101529


Unnamed: 0,site_id,version,site_name,location_raw,lat,lon,carriageway_type,carriageway,direction_ref
0,PZH01_MST_0629_00,3,,52.0263 | 4.634289 | mainCarriageway | 8 | 6.1...,52.0263,4.634289,mainCarriageway,A,positive
1,PZH01_MST_0629_01,3,,52.0262451 | 4.634219 | mainCarriageway | 8 | ...,52.026245,4.634219,mainCarriageway,A,negative
2,PZH01_MST_0634_02,3,,51.9836769 | 4.220052 | mainCarriageway | 8 | ...,51.983677,4.220052,mainCarriageway,A,negative
3,PZH01_MST_0635_01_00,4,,51.994175 | 4.259996 | mainCarriageway | 8 | 6...,51.994175,4.259996,mainCarriageway,A,positive
4,PZH01_MST_0635_01_01,3,,51.9942322 | 4.259998 | mainCarriageway | 8 | ...,51.994232,4.259998,mainCarriageway,A,negative


In [34]:
# Attach site coordinates and carriageway attributes to every speed row.
# A left join keeps all speed rows, including those whose site_id has no match.
site_columns = ["site_id", "lat", "lon", "carriageway_type", "carriageway", "direction_ref"]
df_speed_enriched = pd.merge(
    df_speed,
    df_sites_parsed[site_columns],
    how="left",
    on="site_id",
)

df_speed_enriched.head()

Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour,lat,lon,carriageway_type,carriageway,direction_ref
0,PZH01_MST_0635_01_01,2025-12-01T14:59:00Z,59.0,900.0,51.994232,4.259998,mainCarriageway,A,negative
1,PZH01_MST_0635_01_00,2025-12-01T14:59:00Z,33.0,300.0,51.994175,4.259996,mainCarriageway,A,positive
2,PZH01_MST_0982_00,2025-12-01T14:59:00Z,86.0,1440.0,52.127636,4.470652,mainCarriageway,A,positive
3,PZH01_MST_0981_01,2025-12-01T14:59:00Z,64.0,1440.0,52.01038,4.456722,mainCarriageway,A,negative
4,PZH01_MST_0911_00,2025-12-01T14:59:00Z,84.0,60.0,51.85979,4.42473,mainCarriageway,A,negative


Save data

In [35]:
# Write the enriched speed table to <project_root>/data, creating the folder if needed.
output_dir = project_root.joinpath("data")
output_dir.mkdir(exist_ok=True)

csv_path = output_dir.joinpath("ndw_trafficspeed_enriched.csv")
df_speed_enriched.to_csv(csv_path, index=False)  # index is not written to the file

print("Saved:", csv_path)

Saved: /Users/Bruno/Library/CloudStorage/OneDrive-TUEindhoven/IGNITE/data/data/ndw_trafficspeed_enriched.csv


Join with the MSI shapefile to make sites easier to locate

In [36]:
import geopandas as gpd
from shapely.geometry import Point

# Sites without both coordinates cannot be turned into points, so leave them out.
has_coords = df_sites_parsed["lat"].notna() & df_sites_parsed["lon"].notna()
df_sites_geo = df_sites_parsed[has_coords].copy()

# Build point geometries (x = longitude, y = latitude) in WGS84.
site_points = gpd.points_from_xy(df_sites_geo["lon"], df_sites_geo["lat"])
gdf_sites = gpd.GeoDataFrame(df_sites_geo, geometry=site_points, crs="EPSG:4326")

In [37]:
from ndw.ndw_shapefile_utils import load_shapefile_from_url
# Fetch the MSI shapefile layer (source and defaults are defined in ndw.ndw_shapefile_utils).
gdf_msi = load_shapefile_from_url()

# Reproject both layers to EPSG:28992 so they share one CRS for the spatial join.
gdf_sites_rd = gdf_sites.to_crs(epsg=28992)
gdf_msi_rd = gdf_msi.to_crs(epsg=28992)

In [38]:
# For every measurement site, look up the nearest MSI feature and keep its road attributes.
# The distance to the matched feature is stored in `dist_m`; unmatched sites are kept (left join).
msi_columns = ["road", "carriagew0", "lane", "km", "wegvak", "wegbeheer0", "geometry"]
gdf_join = gpd.sjoin_nearest(
    left_df=gdf_sites_rd,
    right_df=gdf_msi_rd[msi_columns],
    how="left",
    distance_col="dist_m",
)

Final version: speed data enriched with site and MSI shapefile attributes

In [39]:
# Reduce the spatial-join result to the site key plus the MSI road attributes.
msi_attrs = ["road", "carriagew0", "lane", "km", "wegvak", "wegbeheer0"]
df_site_msi = gdf_join[["site_id", *msi_attrs]].copy()

# Left-join those attributes onto the enriched speed rows.
# NOTE(review): a site can match several MSI rows (one per lane), in which case
# its measurement is repeated once per match in the result.
df_speed_mega_enriched = pd.merge(
    df_speed_enriched,
    df_site_msi,
    how="left",
    on="site_id",
)
df_speed_mega_enriched.head()

Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour,lat,lon,carriageway_type,carriageway,direction_ref,road,carriagew0,lane,km,wegvak,wegbeheer0
0,PZH01_MST_0635_01_01,2025-12-01T14:59:00Z,59.0,900.0,51.994232,4.259998,mainCarriageway,A,negative,A4,R,1,54.487,162293009,RWS West-Nederland Zuid
1,PZH01_MST_0635_01_01,2025-12-01T14:59:00Z,59.0,900.0,51.994232,4.259998,mainCarriageway,A,negative,A4,R,3,54.487,162293009,RWS West-Nederland Zuid
2,PZH01_MST_0635_01_01,2025-12-01T14:59:00Z,59.0,900.0,51.994232,4.259998,mainCarriageway,A,negative,A4,R,2,54.487,162293009,RWS West-Nederland Zuid
3,PZH01_MST_0635_01_00,2025-12-01T14:59:00Z,33.0,300.0,51.994175,4.259996,mainCarriageway,A,positive,A4,R,1,54.487,162293009,RWS West-Nederland Zuid
4,PZH01_MST_0635_01_00,2025-12-01T14:59:00Z,33.0,300.0,51.994175,4.259996,mainCarriageway,A,positive,A4,R,3,54.487,162293009,RWS West-Nederland Zuid


Sort by fastest or slowest average speed

In [40]:
# sort by avg speed (fastest or slowest)
def sort_by_speed(df=None, n=20, fastest=True, dropna=True, cols=None, unique_sites=False):
    """
    Return the top ``n`` rows of ``df`` ranked by ``avg_speed_kmh``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to rank; it must contain an ``avg_speed_kmh`` column. When
        omitted, the notebook-level ``df_speed_mega_enriched`` is looked up at
        call time, so rebuilding that frame is picked up without redefining
        this function.
    n : int, default 20
        Number of rows to return.
    fastest : bool, default True
        ``True`` puts the highest speeds first, ``False`` the lowest.
    dropna : bool, default True
        Drop rows whose ``avg_speed_kmh`` is missing before ranking. When
        ``False`` those rows are kept and sorted to the end.
    cols : list of str, optional
        Columns to return, in this order. Names that are not present in the
        frame are skipped silently. Defaults to the site, time, speed, flow
        and road-position columns.
    unique_sites : bool, default False
        When ``True``, keep only the first-ranked row per ``site_id`` so each
        site appears at most once (requires a ``site_id`` column).

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows, restricted to the requested columns.
    """
    if df is None:
        # Resolve the default here rather than in the signature: a default in
        # the signature is evaluated once, when the function is defined, and
        # would keep pointing at a stale frame after upstream cells are re-run.
        df = df_speed_mega_enriched

    ranked = df.dropna(subset=["avg_speed_kmh"]) if dropna else df
    ranked = ranked.sort_values("avg_speed_kmh", ascending=not fastest)

    if unique_sites:
        # De-duplicate after sorting so the best-ranked row of each site survives.
        ranked = ranked.drop_duplicates(subset="site_id", keep="first")

    top = ranked.head(n)

    if cols is None:
        cols = [
            "site_id",
            "measurement_time",
            "avg_speed_kmh",
            "flow_veh_per_hour",
            "road",
            "carriagew0",
            "lane",
            "km",
        ]
    # Only select columns that actually exist, so partially enriched frames work too.
    return top.loc[:, [c for c in cols if c in top.columns]]

# Examples: show the ten highest and the ten lowest average speeds.
for label, want_fastest in (
    ("Top 10 fastest sites:", True),
    ("\nTop 10 slowest sites:", False),
):
    print(label)
    display(sort_by_speed(n=10, fastest=want_fastest))

Top 10 fastest sites:


Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour,road,carriagew0,lane,km
31760,RWS01_MONIBAS_0121hrr0441ra,2025-12-01T14:58:00Z,176.0,180.0,A12,R,2,44.106
31759,RWS01_MONIBAS_0121hrr0441ra,2025-12-01T14:58:00Z,176.0,180.0,A12,R,4,44.106
31761,RWS01_MONIBAS_0121hrr0441ra,2025-12-01T14:58:00Z,176.0,180.0,A12,R,3,44.106
31758,RWS01_MONIBAS_0121hrr0441ra,2025-12-01T14:58:00Z,176.0,180.0,A12,R,1,44.106
431,PGL10_N794-01_hmp_15.50_Re_HTN2661,2025-12-01T14:59:00Z,169.0,60.0,A28,R,1,82.4
430,PGL10_N794-01_hmp_15.50_Re_HTN2661,2025-12-01T14:59:00Z,169.0,60.0,A28,R,2,82.4
7040,RWS01_MONIBAS_0281hrr0063ra,2025-12-01T14:58:00Z,158.0,60.0,A28,R,2,6.295
7039,RWS01_MONIBAS_0281hrr0063ra,2025-12-01T14:58:00Z,158.0,60.0,A28,R,3,6.295
7041,RWS01_MONIBAS_0281hrr0063ra,2025-12-01T14:58:00Z,158.0,60.0,A28,R,1,6.295
31735,RWS01_MONIBAS_0201hrl0202ra,2025-12-01T14:58:00Z,151.0,780.0,A20,L,2,20.25



Top 10 slowest sites:


Unnamed: 0,site_id,measurement_time,avg_speed_kmh,flow_veh_per_hour,road,carriagew0,lane,km
42543,RDH01_TI152R,2025-12-01T14:59:00Z,0.0,0.0,A44,L,2,21.1
5885,PLB02_SG010,2025-12-01T14:59:00Z,0.0,0.0,A2,L,2,234.2
5884,PLB02_SG010,2025-12-01T14:59:00Z,0.0,0.0,A2,L,1,234.2
5883,PLB02_SG010,2025-12-01T14:59:00Z,0.0,0.0,A2,L,3,234.2
5882,PLB02_MSTR_052_LP,2025-12-01T14:59:00Z,0.0,0.0,A2,L,2,257.089
5881,PLB02_MSTR_052_LP,2025-12-01T14:59:00Z,0.0,0.0,A2,L,1,257.089
5880,PLB02_274060_RP,2025-12-01T14:59:00Z,0.0,0.0,A76,L,2,14.33
5879,PLB02_274060_RP,2025-12-01T14:59:00Z,0.0,0.0,A76,L,3,14.33
5878,PLB02_274060_RP,2025-12-01T14:59:00Z,0.0,0.0,A76,L,1,14.33
5877,PLB02_562146_LP,2025-12-01T14:59:00Z,0.0,0.0,A73,L,1,27.717
