In [1]:
import pandas as pd 
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import LineString
import pyproj
import pandas as pd
import regex as reg
import networkx as nx
import osmnx as ox
import geopandas as gpd


In [2]:
# Load the Time-Series data from Mastra
mastra_df = pd.read_csv('mastra.csv')

# Load GraphML for Aalborg Municipality
G = ox.load_graphml('aalborg_network.graphml')

# Project the graph to UTM (osmnx will automatically select the appropriate UTM zone)
G_proj = ox.project_graph(G)
nodes_proj, edges_proj = ox.graph_to_gdfs(G_proj)

Clean Mastra

In [3]:
# Verify the column names in the DataFrame
print(mastra_df.columns)

# reg ex to replace , with . on the whole document
mastra_df = mastra_df.replace({',': '.'}, regex=True)
mastra_df = mastra_df.replace({'LÃ¸rdag': "Loerdag"}, regex=True)
mastra_df = mastra_df.replace({'SÃ¸ndag': "Soendag"}, regex=True)


mastra_df.rename(columns={'MÃ¥ned': 'Maened'}, inplace=True)
mastra_df.rename(columns={'Ãrstal': 'Aarstal'}, inplace=True)
mastra_df.rename(columns={'ï»¿Vej-Id': 'Vej-Id'}, inplace=True)

#mastra_df.drop(columns=['Unnamed: 1', 'Unnamed: 2'], inplace=True)

# Correct column names if necessary
columns_to_convert = [
    'Trafiktype',
    'Dagtype',
    'Vej-Id',
    'Retning',
    'Trafiktype'
]

# Convert the specified columns to float64
mastra_df = mastra_df.astype({col: 'string' for col in columns_to_convert})

mastra_df = mastra_df.replace({'<NA>': ""}, regex=True)

# Identify the columns that are not part of the time intervals
id_vars_columns = [
    'Vej-Id', 'Kilometer', 'Retning', 'Spor', 'Dato', 'Aarstal', 
    'Maened', 'Dag', 'Dagtype', 'Trafiktype', 'Xkoordinat', 'Ykoordinat'
]

# The time interval columns
time_columns = [
    "00:00-00:15","00:15-00:30","00:30-00:45","00:45-01:00","01:00-01:15","01:15-01:30","01:30-01:45","01:45-02:00","02:00-02:15","02:15-02:30","02:30-02:45","02:45-03:00","03:00-03:15","03:15-03:30","03:30-03:45","03:45-04:00","04:00-04:15","04:15-04:30","04:30-04:45","04:45-05:00","05:00-05:15","05:15-05:30","05:30-05:45","05:45-06:00","06:00-06:15","06:15-06:30","06:30-06:45","06:45-07:00","07:00-07:15","07:15-07:30","07:30-07:45","07:45-08:00","08:00-08:15","08:15-08:30","08:30-08:45","08:45-09:00","09:00-09:15","09:15-09:30","09:30-09:45","09:45-10:00","10:00-10:15","10:15-10:30","10:30-10:45","10:45-11:00","11:00-11:15","11:15-11:30","11:30-11:45","11:45-12:00","12:00-12:15","12:15-12:30","12:30-12:45","12:45-13:00","13:00-13:15","13:15-13:30","13:30-13:45","13:45-14:00","14:00-14:15","14:15-14:30","14:30-14:45","14:45-15:00","15:00-15:15","15:15-15:30","15:30-15:45","15:45-16:00","16:00-16:15","16:15-16:30","16:30-16:45","16:45-17:00","17:00-17:15","17:15-17:30","17:30-17:45","17:45-18:00","18:00-18:15","18:15-18:30","18:30-18:45","18:45-19:00","19:00-19:15","19:15-19:30","19:30-19:45","19:45-20:00","20:00-20:15","20:15-20:30","20:30-20:45","20:45-21:00","21:00-21:15","21:15-21:30","21:30-21:45","21:45-22:00","22:00-22:15","22:15-22:30","22:30-22:45","22:45-23:00","23:00-23:15","23:15-23:30","23:30-23:45","23:45-24:00"
]

# Melt the DataFrame to reshape it from wide format to long format
mastra_long = pd.melt(
    mastra_df,
    id_vars=id_vars_columns,  # Columns that uniquely identify each row
    value_vars=time_columns,  # Time interval columns
    var_name='time_interval',  # Name of the new column for the time intervals
    value_name='traffic_flow'  # Name for the traffic flow data
)

# Extract the start time from 'time_interval'
mastra_long['time_start'] = mastra_long['time_interval'].str[:5]  # Extracts the "00:00" part for sorting

mastra_long['datetime'] = pd.to_datetime(mastra_long['Dato'] + " " + mastra_long['time_start'], dayfirst=True)

mastra_long = mastra_long.sort_values(by=['Vej-Id', 'Dato', 'time_start'])

# Result
mastra_long.head()


Index(['ï»¿Vej-Id', 'Unnamed: 1', 'Unnamed: 2', 'Kilometer', 'Retning', 'Spor',
       'Xkoordinat', 'Ykoordinat', 'Dato', 'Ãrstal',
       ...
       '21:30-21:45', '21:45-22:00', '22:00-22:15', '22:15-22:30',
       '22:30-22:45', '22:45-23:00', '23:00-23:15', '23:15-23:30',
       '23:30-23:45', '23:45-24:00'],
      dtype='object', length=110)


Unnamed: 0,Vej-Id,Kilometer,Retning,Spor,Dato,Aarstal,Maened,Dag,Dagtype,Trafiktype,Xkoordinat,Ykoordinat,time_interval,traffic_flow,time_start,datetime
33655,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:00-00:15,10.0,00:00,2023-02-01 00:00:00
302697,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:15-00:30,10.0,00:15,2023-02-01 00:15:00
571739,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:30-00:45,7.0,00:30,2023-02-01 00:30:00
840781,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:45-01:00,3.0,00:45,2023-02-01 00:45:00
1109823,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,01:00-01:15,3.0,01:00,2023-02-01 01:00:00


Group Time Series data

In [4]:
# Create a dictionary to store the DataFrames with unique key identifiers
grouped_dataframes = {}

# Group by the specified columns
for key, group in mastra_long.groupby(["Vej-Id", "Xkoordinat", "Ykoordinat"]):
    # Use the unique key for naming or accessing each DataFrame
    sorted_group = group.sort_values(by=['Vej-Id', 'Dato', 'time_start'])
    grouped_dataframes[key] = sorted_group.reset_index(drop=True,inplace=False).set_index('datetime', inplace=False)

In [5]:
grouped_dataframes[key]

Unnamed: 0_level_0,Vej-Id,Kilometer,Retning,Spor,Dato,Aarstal,Maened,Dag,Dagtype,Trafiktype,Xkoordinat,Ykoordinat,time_interval,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-01-01 00:00:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:00-00:15,21.0,00:00
2023-01-01 00:15:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:15-00:30,65.0,00:15
2023-01-01 00:30:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:30-00:45,65.0,00:30
2023-01-01 00:45:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,00:45-01:00,65.0,00:45
2023-01-01 01:00:00,851 8519680-0 0/ 470 T,0.0,T,,01.01.2023,2023.0,1.0,1.0,Helligdag,BY,556617.0,6324515.0,01:00-01:15,69.0,01:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 22:45:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,22:45-23:00,50.0,22:45
2023-12-31 23:00:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:00-23:15,70.0,23:00
2023-12-31 23:15:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:15-23:30,75.0,23:15
2023-12-31 23:30:00,851 8519680-0 0/ 470 T,0.0,T,,31.12.2023,2023.0,12.0,31.0,Helligdag,BY,556617.0,6324515.0,23:30-23:45,64.0,23:30


In [6]:
grouped_dataframes[key]['traffic_flow']

datetime
2023-01-01 00:00:00    21.0
2023-01-01 00:15:00    65.0
2023-01-01 00:30:00    65.0
2023-01-01 00:45:00    65.0
2023-01-01 01:00:00    69.0
                       ... 
2023-12-31 22:45:00    50.0
2023-12-31 23:00:00    70.0
2023-12-31 23:15:00    75.0
2023-12-31 23:30:00    64.0
2023-12-31 23:45:00    65.0
Name: traffic_flow, Length: 61344, dtype: float64

In [10]:
import math
import time
import pandas as pd
import osmnx as ox

# Ensure the graph has node and edge identifiers
edges_proj = ox.graph_to_gdfs(G_proj, nodes=False, edges=True).reset_index()  # Include 'u', 'v', and 'key'

# Add a 'time_series_data' column to hold the time series information
edges_proj['time_series_data'] = pd.Series([None] * len(edges_proj), dtype="object")

# Function to calculate Euclidean distance between two points
def euclidean_distance(x1, y1, x2, y2):
    return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# Track progress and statistics
total_dataframes = len(grouped_dataframes)
start_time = time.time()
x = 5  # Status print interval
high_distance_threshold = 1000  # Threshold in meters

# Initialize distance metrics
total_distance, min_distance, max_distance = 0, float('inf'), float('-inf')
distance_count = 0

# Process each DataFrame in `grouped_dataframes`
for df_index, (key, df) in enumerate(grouped_dataframes.items(), start=1):
    vej_id, df_x, df_y = key
    closest_edge, closest_distance = None, float('inf')

    # Find the closest edge
    for _, edge_data in edges_proj.iterrows():
        # Midpoint coordinates of the edge
        edge_geometry = edge_data['geometry']
        midpoint_index = len(edge_geometry.coords.xy[0]) // 2
        edge_x = edge_geometry.coords.xy[0][midpoint_index]
        edge_y = edge_geometry.coords.xy[1][midpoint_index]

        # Calculate the distance to the current DataFrame's location
        distance = euclidean_distance(edge_x, edge_y, df_x, df_y)
        if distance < closest_distance:
            closest_distance = distance
            closest_edge = (edge_data['u'], edge_data['v'], edge_data['key'])

    # Assign time-series data if within threshold
    if closest_edge and closest_distance <= high_distance_threshold:
        time_series_data = list(zip(df.index.to_pydatetime(), df['traffic_flow'].tolist()))

        # Update both `G_proj` and `edges_proj`
        u, v, key = closest_edge
        if (u, v, key) in G_proj.edges(keys=True):
            G_proj[u][v][key]['time_series_data'] = time_series_data

        # Use pd.Series to wrap time_series_data as a single object
        edges_proj.loc[(edges_proj['u'] == u) & (edges_proj['v'] == v) & (edges_proj['key'] == key), 'time_series_data'] = pd.Series([time_series_data], dtype="object")

        # Update distance stats
        total_distance += closest_distance
        min_distance = min(min_distance, closest_distance)
        max_distance = max(max_distance, closest_distance)
        distance_count += 1
        print(f"Edge found within threshold for DataFrame {df_index}/{total_dataframes}")
    else:
        print(f"No edge found within threshold for DataFrame {df_index}/{total_dataframes}")

    # Status print every x dataframes
    if df_index % x == 0:
        avg_distance = total_distance / distance_count if distance_count > 0 else 0
        avg_time_per_df = (time.time() - start_time) / df_index
        estimated_total_time = avg_time_per_df * total_dataframes
        remaining_time = estimated_total_time - (time.time() - start_time)

        # Print progress and distance stats
        print(f"Processed {df_index}/{total_dataframes} DataFrames. Estimated remaining time: {remaining_time / 60:.2f} minutes")
        print(f"Min Distance: {min_distance:.2f} meters, Max Distance: {max_distance:.2f} meters, Avg Distance: {avg_distance:.2f} meters")

# Print final summary
print("\nProcessing complete.")
print(f"Total edges with time series data in G_proj: {sum(1 for _, _, _, data in G_proj.edges(keys=True, data=True) if 'time_series_data' in data)}")
print(f"Distance stats - Min: {min_distance:.2f}m, Max: {max_distance:.2f}m, Avg: {total_distance / distance_count if distance_count > 0 else 0:.2f}m")

# Verify that time series data is actually assigned in `edges_proj`
print("\nVerification of assigned data in `edges_proj`:")
print(edges_proj['time_series_data'].dropna().head())


Edge found within threshold for DataFrame 1/485
Edge found within threshold for DataFrame 2/485
Edge found within threshold for DataFrame 3/485
Edge found within threshold for DataFrame 4/485
Edge found within threshold for DataFrame 5/485
Processed 5/485 DataFrames. Estimated remaining time: 15.41 minutes
Min Distance: 97.58 meters, Max Distance: 97.58 meters, Avg Distance: 97.58 meters
Edge found within threshold for DataFrame 6/485
Edge found within threshold for DataFrame 7/485
Edge found within threshold for DataFrame 8/485
Edge found within threshold for DataFrame 9/485
Edge found within threshold for DataFrame 10/485
Processed 10/485 DataFrames. Estimated remaining time: 15.09 minutes
Min Distance: 97.58 meters, Max Distance: 366.85 meters, Avg Distance: 178.36 meters
Edge found within threshold for DataFrame 11/485
Edge found within threshold for DataFrame 12/485
Edge found within threshold for DataFrame 13/485
Edge found within threshold for DataFrame 14/485
Edge found within 

In [7]:
# import math
# import time
# import pandas as pd

# # Function to calculate Euclidean distance between two points
# def euclidean_distance(x1, y1, x2, y2):
#     return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# # Initialize total progress tracking
# total_dataframes = len(grouped_dataframes)
# start_time = time.time()
# x = 5  # Print status every x DataFrames processed
# initial_threshold = 10  # Start with 10 meters and increase if no match
# max_threshold = 500000000000  # Maximum distance to prevent excessive searches

# # Variables to track distances
# total_distance = 0
# min_distance = float('inf')
# max_distance = float('-inf')
# distance_count = 0  # Count of distances for averaging

# # Ensure 'time_series_data' column in edges_proj can store complex data
# edges_proj['time_series_data'] = pd.Series([None] * len(edges_proj), dtype="object")

# # Iterate over each grouped DataFrame in `grouped_dataframes`
# for df_index, (key, df) in enumerate(grouped_dataframes.items(), start=1):
#     vej_id, df_x, df_y = key

#     # Initialize variables for closest match search
#     closest_edge = None
#     closest_distance = float('inf')
#     distance_threshold = initial_threshold  # Start with initial threshold

#     # Gradually increase threshold if no match is found within the current threshold
#     while closest_edge is None and distance_threshold <= max_threshold:
#         # Check each edge in `edges_proj` to find the closest to (df_x, df_y) within the current threshold
#         for edge_index, edge_data in edges_proj.iterrows():
#             # Get the midpoint coordinates of the current edge
#             edge_geometry = edge_data['geometry']
#             midpoint_index = len(edge_geometry.coords.xy[0]) // 2
#             edge_x = edge_geometry.coords.xy[0][midpoint_index]
#             edge_y = edge_geometry.coords.xy[1][midpoint_index]

#             # Calculate distance between edge and DataFrame coordinates
#             distance = euclidean_distance(edge_x, edge_y, df_x, df_y)

#             # Track the closest edge if within the current threshold
#             if distance < closest_distance and distance <= distance_threshold:
#                 closest_distance = distance
#                 closest_edge = edge_index

#         # Increase threshold if no match found within current threshold
#         if closest_edge is None:
#             distance_threshold += 100  # Increase by 100 meters

#     # If a closest edge is found within an acceptable threshold, add time-series data
#     if closest_edge is not None and closest_distance <= max_threshold:
#         # Convert the time series data to a list of (datetime, traffic_flow) tuples
#         time_series_data = list(zip(df.index.to_pydatetime(), df['traffic_flow'].tolist()))

#         # Attach the time series data to the edge in both `edges_proj` and `G_proj`
#         edges_proj.at[closest_edge, 'time_series_data'] = time_series_data

#         # Retrieve u, v, key identifiers from `edges_proj` to match in `G_proj`
#         u, v, key = edges_proj.loc[closest_edge, ['u', 'v', 'key']]
#         if (u, v, key) in G_proj.edges(keys=True):
#             G_proj[u][v][key]['time_series_data'] = time_series_data

#         # Update distance statistics
#         total_distance += closest_distance
#         min_distance = min(min_distance, closest_distance)
#         max_distance = max(max_distance, closest_distance)
#         distance_count += 1
#         print(f"Edge found within {closest_distance:.2f} meters for DataFrame {df_index}/{total_dataframes}")
#     else:
#         print(f"No edge found within {max_threshold} meters for DataFrame {df_index}/{total_dataframes}")

#     # Print status every x DataFrames
#     if df_index % x == 0:
#         avg_distance = total_distance / distance_count if distance_count > 0 else 0
#         avg_time_per_df = (time.time() - start_time) / df_index
#         estimated_total_time = avg_time_per_df * total_dataframes
#         remaining_time = estimated_total_time - (time.time() - start_time)

#         # Print progress and distance stats
#         print(f"Processed {df_index}/{total_dataframes} DataFrames. Estimated remaining time: {remaining_time / 60:.2f} minutes")
#         print(f"Min Distance: {min_distance:.2f} meters, Max Distance: {max_distance:.2f} meters, Avg Distance: {avg_distance:.2f} meters")

# # Print final summary
# print("\nProcessing complete.")
# print(f"Total DataFrames processed: {total_dataframes}")
# print(f"Total edges with time series data: {distance_count}")
# print(f"Min Distance: {min_distance:.2f} meters, Max Distance: {max_distance:.2f} meters, Avg Distance: {total_distance / distance_count if distance_count > 0 else 0:.2f} meters")

# # Verify that time series data is actually assigned
# print("\nVerification of assigned data in `edges_proj`:")
# print(edges_proj['time_series_data'].dropna().head())


KeyError: "None of [Index(['u', 'v', 'key'], dtype='object')] are in the [columns]"

In [11]:

# Verification in G_proj
missing_data_edges = 0
total_edges = len(G_proj.edges())

print("\nVerification of assigned data in `G_proj`:")
for u, v, key, edge_data in G_proj.edges(keys=True, data=True):
    if 'time_series_data' in edge_data:
        print(f"Edge ({u}, {v}, {key}) has time series data.")
    else:
        missing_data_edges += 1

print(f"\nTotal edges in G_proj: {total_edges}")
print(f"Edges with missing time series data: {missing_data_edges}")
print(f"Edges with time series data: {total_edges - missing_data_edges}")



Verification of assigned data in `G_proj`:
Edge (28390914, 1031808321, 0) has time series data.
Edge (28390935, 323187714, 0) has time series data.
Edge (323187714, 29551197, 0) has time series data.
Edge (29551197, 32993239, 0) has time series data.
Edge (32993239, 1801882912, 0) has time series data.
Edge (29977423, 6150580690, 0) has time series data.
Edge (29972545, 3708603819, 0) has time series data.
Edge (29972545, 29972549, 0) has time series data.
Edge (29972575, 29972525, 0) has time series data.
Edge (1801885465, 1801885466, 0) has time series data.
Edge (29972655, 29972684, 0) has time series data.
Edge (32993241, 3465083213, 0) has time series data.
Edge (2720419870, 29973010, 0) has time series data.
Edge (29973010, 29972684, 0) has time series data.
Edge (6150580690, 5970912136, 0) has time series data.
Edge (324154751, 2753877799, 0) has time series data.
Edge (324154751, 310129156, 0) has time series data.
Edge (29977650, 3684837993, 0) has time series data.
Edge (299

In [13]:
len(grouped_dataframes.keys())

485

KeyError: 0

In [9]:
# Traverse each edge in the projected graph and perform actions based on 'osmid'
for u, v, key, edge_data in G_proj.edges(keys=True, data=True):
    # Chekc if they ahve time series data
    

SyntaxError: incomplete input (2279883416.py, line 4)

In [68]:
# Traverse each edge in the projected graph and perform actions based on 'osmid'
for u, v, key, edge_data in G_proj.edges(keys=True, data=True):
    print(edge_data)  # Display edge data for reference
    osmid = edge_data.get('osmid')

    for index in stuff:
        if isinstance(stuff[index]['osmid'], int):
            if osmid == stuff[index]['osmid']:
                print("Found1")
                print(stuff[index])
                # Add a custom attribute to the edge
                
                #edge_data['traffic_flow'] = None
        else:
            for osmids in stuff[index]['osmid']:
                if osmid == osmids:
                    print("Found2")
                    # Add a different custom attribute to the edge
                    #edge_data['custom_attribute'] = 'some_value_2'

    # Just breaking to limit the loop as in your example
    break


{'osmid': 26316217, 'lanes': '2', 'name': 'Kirkegade', 'highway': 'tertiary', 'maxspeed': '50', 'oneway': False, 'reversed': True, 'length': 7.838, 'allowed_speed': '50', 'surface_type': 'None', 'is_bridge': 'False', 'is_tunnel': 'False', 'is_oneway': 'False', 'access_restriction': 'None'}
Found1
{'osmid': 26316217, 'x': 555709.9852663105, 'y': 6324413.691876737}
Found1
{'osmid': 26316217, 'x': 555703.2459350697, 'y': 6324409.648959474}
Found1
{'osmid': 26316217, 'x': 555720.5637663284, 'y': 6324420.024219587}


In [35]:
grouped_dataframes[index]

KeyError: (12209399551, 12209399583, 0)