In [1]:
import pandas as pd 
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import LineString
import pyproj
import pandas as pd
import regex as reg
import networkx as nx
import osmnx as ox
import geopandas as gpd


In [2]:
# Load the Time-Series data from Mastra
mastra_df = pd.read_csv('mastra.csv')

# Load GraphML for Aalborg Municipality
G = ox.load_graphml('aalborg_network.graphml')

# Project the graph to UTM (osmnx will automatically select the appropriate UTM zone)
G_proj = ox.project_graph(G)
nodes_proj, edges_proj = ox.graph_to_gdfs(G_proj)

Clean Mastra

In [3]:
# Verify the column names in the DataFrame
print(mastra_df.columns)

# reg ex to replace , with . on the whole document
mastra_df = mastra_df.replace({',': '.'}, regex=True)
mastra_df = mastra_df.replace({'LÃ¸rdag': "Loerdag"}, regex=True)
mastra_df = mastra_df.replace({'SÃ¸ndag': "Soendag"}, regex=True)


mastra_df.rename(columns={'MÃ¥ned': 'Maened'}, inplace=True)
mastra_df.rename(columns={'Ãrstal': 'Aarstal'}, inplace=True)
mastra_df.rename(columns={'ï»¿Vej-Id': 'Vej-Id'}, inplace=True)

#mastra_df.drop(columns=['Unnamed: 1', 'Unnamed: 2'], inplace=True)

# Correct column names if necessary
columns_to_convert = [
    'Trafiktype',
    'Dagtype',
    'Vej-Id',
    'Retning',
    'Trafiktype'
]

# Convert the specified columns to float64
mastra_df = mastra_df.astype({col: 'string' for col in columns_to_convert})

mastra_df = mastra_df.replace({'<NA>': ""}, regex=True)

# Identify the columns that are not part of the time intervals
id_vars_columns = [
    'Vej-Id', 'Kilometer', 'Retning', 'Spor', 'Dato', 'Aarstal', 
    'Maened', 'Dag', 'Dagtype', 'Trafiktype', 'Xkoordinat', 'Ykoordinat'
]

# The time interval columns
time_columns = [
    "00:00-00:15","00:15-00:30","00:30-00:45","00:45-01:00","01:00-01:15","01:15-01:30","01:30-01:45","01:45-02:00","02:00-02:15","02:15-02:30","02:30-02:45","02:45-03:00","03:00-03:15","03:15-03:30","03:30-03:45","03:45-04:00","04:00-04:15","04:15-04:30","04:30-04:45","04:45-05:00","05:00-05:15","05:15-05:30","05:30-05:45","05:45-06:00","06:00-06:15","06:15-06:30","06:30-06:45","06:45-07:00","07:00-07:15","07:15-07:30","07:30-07:45","07:45-08:00","08:00-08:15","08:15-08:30","08:30-08:45","08:45-09:00","09:00-09:15","09:15-09:30","09:30-09:45","09:45-10:00","10:00-10:15","10:15-10:30","10:30-10:45","10:45-11:00","11:00-11:15","11:15-11:30","11:30-11:45","11:45-12:00","12:00-12:15","12:15-12:30","12:30-12:45","12:45-13:00","13:00-13:15","13:15-13:30","13:30-13:45","13:45-14:00","14:00-14:15","14:15-14:30","14:30-14:45","14:45-15:00","15:00-15:15","15:15-15:30","15:30-15:45","15:45-16:00","16:00-16:15","16:15-16:30","16:30-16:45","16:45-17:00","17:00-17:15","17:15-17:30","17:30-17:45","17:45-18:00","18:00-18:15","18:15-18:30","18:30-18:45","18:45-19:00","19:00-19:15","19:15-19:30","19:30-19:45","19:45-20:00","20:00-20:15","20:15-20:30","20:30-20:45","20:45-21:00","21:00-21:15","21:15-21:30","21:30-21:45","21:45-22:00","22:00-22:15","22:15-22:30","22:30-22:45","22:45-23:00","23:00-23:15","23:15-23:30","23:30-23:45","23:45-24:00"
]

# Melt the DataFrame to reshape it from wide format to long format
mastra_long = pd.melt(
    mastra_df,
    id_vars=id_vars_columns,  # Columns that uniquely identify each row
    value_vars=time_columns,  # Time interval columns
    var_name='time_interval',  # Name of the new column for the time intervals
    value_name='traffic_flow'  # Name for the traffic flow data
)

# Extract the start time from 'time_interval'
mastra_long['time_start'] = mastra_long['time_interval'].str[:5]  # Extracts the "00:00" part for sorting

mastra_long['datetime'] = pd.to_datetime(mastra_long['Dato'] + " " + mastra_long['time_start'], dayfirst=True)

mastra_long = mastra_long.sort_values(by=['Vej-Id', 'Dato', 'time_start'])

# Result
mastra_long.head()


Index(['ï»¿Vej-Id', 'Unnamed: 1', 'Unnamed: 2', 'Kilometer', 'Retning', 'Spor',
       'Xkoordinat', 'Ykoordinat', 'Dato', 'Ãrstal',
       ...
       '21:30-21:45', '21:45-22:00', '22:00-22:15', '22:15-22:30',
       '22:30-22:45', '22:45-23:00', '23:00-23:15', '23:15-23:30',
       '23:30-23:45', '23:45-24:00'],
      dtype='object', length=110)


Unnamed: 0,Vej-Id,Kilometer,Retning,Spor,Dato,Aarstal,Maened,Dag,Dagtype,Trafiktype,Xkoordinat,Ykoordinat,time_interval,traffic_flow,time_start,datetime
33655,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:00-00:15,10.0,00:00,2023-02-01 00:00:00
302697,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:15-00:30,10.0,00:15,2023-02-01 00:15:00
571739,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:30-00:45,7.0,00:30,2023-02-01 00:30:00
840781,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:45-01:00,3.0,00:45,2023-02-01 00:45:00
1109823,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,01:00-01:15,3.0,01:00,2023-02-01 01:00:00


Group Time Series data

In [4]:
# Create a dictionary to store the DataFrames with unique key identifiers
grouped_dataframes = {}

# Group by the specified columns
for key, group in mastra_long.groupby(["Vej-Id", "Xkoordinat", "Ykoordinat"]):
    # Use the unique key for naming or accessing each DataFrame
    sorted_group = group.sort_values(by=['Vej-Id', 'Dato', 'time_start'])
    grouped_dataframes[key] = sorted_group.reset_index(drop=True,inplace=False).set_index('datetime', inplace=False)

In [5]:
grouped_dataframes[key]

Unnamed: 0_level_0,Vej-Id,Kilometer,Retning,Spor,Dato,Aarstal,Maened,Dag,Dagtype,Trafiktype,Xkoordinat,Ykoordinat,time_interval,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01 00:00:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:00-00:15,21.0,00:00
2023-01-01 00:15:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:15-00:30,65.0,00:15
2023-01-01 00:30:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:30-00:45,65.0,00:30
2023-01-01 00:45:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:45-01:00,65.0,00:45
2023-01-01 01:00:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,01:00-01:15,69.0,01:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 22:45:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,22:45-23:00,50.0,22:45
2023-12-31 23:00:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:00-23:15,70.0,23:00
2023-12-31 23:15:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:15-23:30,75.0,23:15
2023-12-31 23:30:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:30-23:45,64.0,23:30


In [6]:
grouped_dataframes[key]['traffic_flow']

datetime
2023-01-01 00:00:00    21.0
2023-01-01 00:15:00    65.0
2023-01-01 00:30:00    65.0
2023-01-01 00:45:00    65.0
2023-01-01 01:00:00    69.0
                       ... 
2023-12-31 22:45:00    50.0
2023-12-31 23:00:00    70.0
2023-12-31 23:15:00    75.0
2023-12-31 23:30:00    64.0
2023-12-31 23:45:00    65.0
Name: traffic_flow, Length: 61344, dtype: float64

In [7]:
len(edges_proj)

31470

In [None]:
# import math
# import time
# import pandas as pd

# # Function to calculate Euclidean distance between two points
# def euclidean_distance(x1, y1, x2, y2):
#     return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# # Initialize tracking and parameters
# total_dataframes = len(grouped_dataframes)
# start_time = time.time()
# x = 5  # Print status every x DataFrames processed
# distance_threshold = 10000000  # Set higher to test distance limitation
# assigned_edges = set()  # Track which edges have been assigned data

# # Ensure 'time_series_data' column in edges_proj can store complex data
# edges_proj['time_series_data'] = pd.Series([None] * len(edges_proj), dtype="object")

# # Iterate over each grouped DataFrame in `grouped_dataframes`
# for df_index, (key, df) in enumerate(grouped_dataframes.items(), start=1):
#     vej_id, df_x, df_y = key

#     # List to store edges by distance
#     edge_distances = []

#     # Loop over all edges in `edges_proj` and calculate distances
#     for edge_index, edge_data in edges_proj.iterrows():
#         if edge_index in assigned_edges:  # Skip edges already assigned data
#             continue

#         edge_geometry = edge_data['geometry']
#         midpoint_index = len(edge_geometry.coords.xy[0]) // 2
#         edge_x = edge_geometry.coords.xy[0][midpoint_index]
#         edge_y = edge_geometry.coords.xy[1][midpoint_index]

#         # Calculate distance from edge to DataFrame's x, y
#         distance = euclidean_distance(edge_x, edge_y, df_x, df_y)

#         # Track each edge with its distance, if within threshold
#         if distance <= distance_threshold:
#             edge_distances.append((edge_index, distance))

#     # Sort edges by distance
#     edge_distances.sort(key=lambda x: x[1])

#     # Assign data to the first unassigned edge within the threshold
#     assigned = False
#     for edge_index, distance in edge_distances:
#         if edge_index not in assigned_edges:  # Ensure edge hasn't been reused
#             time_series_data = list(zip(df.index.to_pydatetime(), df['traffic_flow'].tolist()))
#             edges_proj.at[edge_index, 'time_series_data'] = time_series_data
#             assigned_edges.add(edge_index)  # Mark as assigned
#             print(f"Assigned data to edge {edge_index} for DataFrame {df_index}/{total_dataframes}")
#             assigned = True
#             break

#     # If no edge could be assigned, log the result
#     if not assigned:
#         print(f"No suitable unassigned edge found within threshold for DataFrame {df_index}/{total_dataframes}")

#     # Progress update every x dataframes
#     if df_index % x == 0:
#         avg_time_per_df = (time.time() - start_time) / df_index
#         estimated_total_time = avg_time_per_df * total_dataframes
#         remaining_time = estimated_total_time - (time.time() - start_time)
#         print(f"Processed {df_index}/{total_dataframes} DataFrames. Remaining time: {remaining_time / 60:.2f} minutes")

# # Final summary
# print("\nProcessing complete.")
# print(f"Total DataFrames processed: {total_dataframes}")
# print(f"Total edges assigned time series data: {len(assigned_edges)}")

# # Verification of assigned data
# print("\nVerification of assigned data:")
# print(edges_proj['time_series_data'].dropna().head())
# print("\nEdges with no data assigned:", edges_proj[edges_proj['time_series_data'].isna()])


In [9]:

# ox.save_graphml(G, filepath='total.graphml')

# ox.save_graphml(G_proj, filepath='total_proj.graphml')


In [10]:
# G_proj = ox.load_graphml('total_proj.graphml')

# edges_proj = ox.graph_to_gdfs(G_proj)

In [11]:
# go to cell (29972545, 29972549, 0) of the G graph edge

G_proj[29972545][29972549][0]

{'osmid': [941949214, 9249815],
 'lanes': '1',
 'highway': 'motorway_link',
 'oneway': True,
 'reversed': False,
 'length': 88.227,
 'allowed_speed': 'None',
 'surface_type': 'None',
 'is_bridge': 'False',
 'is_tunnel': 'False',
 'is_oneway': 'True',
 'access_restriction': 'None',
 'geometry': <LINESTRING (558334.637 6319964.764, 558345.969 6319972.416, 558364.02 63199...>}

In [12]:

# Inspect a few entries to confirm data assignment
print("\nVerification of assigned data:")
print(edges_proj['time_series_data'].dropna().head())
print("\nEdges with no data assigned:", edges_proj[edges_proj['time_series_data'].isna()])


Verification of assigned data:


KeyError: 'time_series_data'

In [54]:

# Verification in G_proj
missing_data_edges = 0
total_edges = len(G_proj.edges())

print("\nVerification of assigned data in `G_proj`:")
for u, v, key, edge_data in G_proj.edges(keys=True, data=True):
    if 'time_series_data' in edge_data:
        print(f"Edge ({u}, {v}, {key}) has time series data.")
    else:
        missing_data_edges += 1

print(f"\nTotal edges in G_proj: {total_edges}")
print(f"Edges with missing time series data: {missing_data_edges}")
print(f"Edges with time series data: {total_edges - missing_data_edges}")



Verification of assigned data in `G_proj`:
Edge (28390914, 1031808321, 0) has time series data.
Edge (28390914, 84863652, 0) has time series data.
Edge (1031808321, 28390914, 0) has time series data.
Edge (28390935, 323187714, 0) has time series data.
Edge (323187714, 29551197, 0) has time series data.
Edge (29551197, 1801885466, 0) has time series data.
Edge (29551197, 32993239, 0) has time series data.
Edge (1801885466, 28390935, 0) has time series data.
Edge (32993239, 1801882912, 0) has time series data.
Edge (29972525, 29977423, 0) has time series data.
Edge (29977423, 6150580690, 0) has time series data.
Edge (29972545, 3708603819, 0) has time series data.
Edge (29972545, 29972549, 0) has time series data.
Edge (29972575, 29972525, 0) has time series data.
Edge (1801885465, 1801885466, 0) has time series data.
Edge (29972655, 29972684, 0) has time series data.
Edge (29972684, 32993241, 0) has time series data.
Edge (32993241, 3465083213, 0) has time series data.
Edge (32993241, 

In [13]:
len(grouped_dataframes.keys())

485

In [52]:
import math
import time
import pandas as pd

# Function to calculate Euclidean distance between two points
def euclidean_distance(x1, y1, x2, y2):
    return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# Initialize progress tracking
total_dataframes = len(grouped_dataframes)
start_time = time.time()
x = 5  # Print status every x DataFrames processed
distance_threshold = 1000  # Only assign if edge is within this distance

# Ensure 'time_series_data' column in edges_proj can store complex data
edges_proj['time_series_data'] = pd.Series([None] * len(edges_proj), dtype="object")

# Iterate over each grouped DataFrame in `grouped_dataframes`
for df_index, (key, df) in enumerate(grouped_dataframes.items(), start=1):
    vej_id, df_x, df_y = key

    # List to store edges and distances
    edge_distances = []

    # Loop over all edges in `edges_proj` and calculate distances
    for edge_index, edge_data in edges_proj.iterrows():
        edge_geometry = edge_data['geometry']
        midpoint_index = len(edge_geometry.coords.xy[0]) // 2
        edge_x = edge_geometry.coords.xy[0][midpoint_index]
        edge_y = edge_geometry.coords.xy[1][midpoint_index]
        
        # Calculate distance from edge to DataFrame's x, y
        distance = euclidean_distance(edge_x, edge_y, df_x, df_y)

        # Track each edge with its distance, if within the threshold
        if distance <= distance_threshold:
            edge_distances.append((edge_index, distance))

    # Sort edges by distance in ascending order
    edge_distances.sort(key=lambda x: x[1])

    # Try to assign data to the closest unoccupied edge within the threshold
    assigned = False
    for edge_index, distance in edge_distances:
        # Check if the edge's `time_series_data` is empty
        time_series_content = edges_proj.at[edge_index, 'time_series_data']
        if isinstance(time_series_content, float):
            if pd.isna(time_series_content):
                time_series_data = list(zip(df.index.to_pydatetime(), df['traffic_flow'].tolist()))
                edges_proj.at[edge_index, 'time_series_data'] = time_series_data
                G_proj[edge_index[0]][edge_index[1]][0]['time_series_data'] = time_series_data
                print(f"Assigned data to edge {edge_index} for DataFrame {df_index}/{total_dataframes}")
                assigned = True
                break
        elif time_series_content is None or (isinstance(time_series_content, list) and len(time_series_content) == 0):
            time_series_data = list(zip(df.index.to_pydatetime(), df['traffic_flow'].tolist()))
            edges_proj.at[edge_index, 'time_series_data'] = time_series_data
            G_proj[edge_index[0]][edge_index[1]][0]['time_series_data'] = time_series_data
            print(f"Assigned data to edge {edge_index} for DataFrame {df_index}/{total_dataframes}")
            assigned = True
            break  # Stop once data is assigned

    # If no unoccupied edge is found within the threshold, log the result
    if not assigned:
        print(f"No suitable unoccupied edge found within threshold for DataFrame {df_index}/{total_dataframes}")

    # Progress update every x DataFrames
    if df_index % x == 0:
        avg_time_per_df = (time.time() - start_time) / df_index
        estimated_total_time = avg_time_per_df * total_dataframes
        remaining_time = estimated_total_time - (time.time() - start_time)
        print(f"Processed {df_index}/{total_dataframes} DataFrames. Remaining time: {remaining_time / 60:.2f} minutes")

# Final summary
print("\nProcessing complete.")
print(f"Total DataFrames processed: {total_dataframes}")

# Inspect a few entries to confirm data assignment
print("\nVerification of assigned data:")
print(edges_proj['time_series_data'].dropna().head())
print("\nEdges with no data assigned:", edges_proj[edges_proj['time_series_data'].isna()])
# 

Assigned data to edge (1153165839, 1153164044, 0) for DataFrame 1/485
Assigned data to edge (1126296974, 1126297109, 0) for DataFrame 2/485
Assigned data to edge (1126297109, 1126296974, 0) for DataFrame 3/485
Assigned data to edge (1153165763, 1153164240, 0) for DataFrame 4/485
Assigned data to edge (3468114003, 1153165839, 0) for DataFrame 5/485
Processed 5/485 DataFrames. Remaining time: 15.72 minutes
Assigned data to edge (3468114004, 1153165839, 0) for DataFrame 6/485
Assigned data to edge (3468114003, 3468114004, 0) for DataFrame 7/485
Assigned data to edge (911620005, 280821804, 0) for DataFrame 8/485
Assigned data to edge (280821804, 911620005, 0) for DataFrame 9/485
Assigned data to edge (128653314, 280821804, 0) for DataFrame 10/485
Processed 10/485 DataFrames. Remaining time: 15.54 minutes
Assigned data to edge (280821804, 128653314, 0) for DataFrame 11/485
Assigned data to edge (911620005, 128653314, 0) for DataFrame 12/485
Assigned data to edge (1415036542, 1667019075, 0) 

In [53]:
# Count edges with assigned time series data
edges_with_data = edges_proj['time_series_data'].notna().sum()
total_edges = len(edges_proj)

# Check and print verification results
print(f"Total edges in graph: {total_edges}")
print(f"Edges with time series data: {edges_with_data}")

# Check if any edges are missing data
missing_data_edges = edges_proj[edges_proj['time_series_data'].isna()]

if edges_with_data == total_edges:
    print("Verification complete: All edges have time series data.")
else:
    print(f"Verification complete: {total_edges - edges_with_data} edges are missing time series data.")

Total edges in graph: 31470
Edges with time series data: 484
Verification complete: 30986 edges are missing time series data.


In [55]:

ox.save_graphml(G_proj, filepath='total_proj.graphml')
