Library

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import math
import time

Load Grouped Dataframes from Mastra

In [2]:
grouped_dataframes = pd.read_pickle('grouped_dataframes.pkl')

In [3]:
def adjust_traffic_flow(df, column_name, multiplier=2):
    """
    Adjusts the traffic flow value of 1.0 in a DataFrame to 1000 if it is between
    two values that are significantly higher (default: multiple of 2).

    Args:
        df (pd.DataFrame): The input DataFrame.
        column_name (str): The name of the traffic flow column.
        multiplier (float): The factor that determines "significantly higher".

    Returns:
        pd.DataFrame: Modified DataFrame with adjusted traffic flow values.
    """
    traffic_flow = df[column_name].values
    for i in range(1, len(traffic_flow) - 1):
        # Check if the current value is 1.0 and it's between two values that are higher
        if (traffic_flow[i] == 1.0 and 
            traffic_flow[i - 1] > 900 and 
            traffic_flow[i + 1] > 900):
            traffic_flow[i] = 1000
    
    # Update the DataFrame column
    df[column_name] = traffic_flow
    return df

grouped_dataframes = {key: adjust_traffic_flow(df, 'traffic_flow') for key, df in grouped_dataframes.items()}

In [4]:
columns_to_keep = ['Vej-Id', 'Dato', 'Xkoordinat', 'Ykoordinat', 'traffic_flow', 'time_start']
cleaned_dataframes = {}

for key in grouped_dataframes.keys():
    grouped_dataframes[key] = grouped_dataframes[key][columns_to_keep]
    grouped_dataframes[key]['traffic_flow'] = grouped_dataframes[key]['traffic_flow'].apply(lambda x: x * 10**3 if not pd.isna(x) and x != int(x) else x)

    # check for index duplicates
    if grouped_dataframes[key].index.duplicated().any():
        # drop duplicates
        cleaned_dataframes[key] = grouped_dataframes[key].reset_index(drop=True)


Only use Total Dataframes

In [5]:
# Filtered and unique dataframes processing
filtered_dataframes = {}
unique_dataframes = {}
cords = []
names = list(grouped_dataframes.keys())
k = 0

for name in names:
    single_name = name[0]
    
    # Check if the name ends with "T"
    if single_name[-1] in ["T"]:
        # Add to filtered dataframes
        filtered_dataframes[name] = grouped_dataframes[name]
        
        # Extract coordinates
        x, y = grouped_dataframes[name]['Xkoordinat'].iloc[0], grouped_dataframes[name]['Ykoordinat'].iloc[0]
        
        # Check for uniqueness and add to unique_dataframes
        if (x, y) not in cords:
            unique_dataframes[name] = grouped_dataframes[name]
            cords.append((x, y))
            k += 1
    
print("Unique Dataframes", k)

Unique Dataframes 136


Look at NA Values

In [8]:
for key in filtered_dataframes.keys():
    df = filtered_dataframes[key]
    print(key, "NA", df['traffic_flow'].isna().sum(), "\n", "NotNA", df['traffic_flow'].notna().sum())

('0 450-0 7/ 300 T', 552601.0, 6329842.0) NA 1123 
 NotNA 33533
('0 450-0 9/1675 T', 549360.0, 6330576.0) NA 1917 
 NotNA 59427
('0 70-0 277/ 100 T', 552269.0, 6311346.0) NA 2070 
 NotNA 51594
('0 70-0 278/ 201 T', 552730.0, 6312348.0) NA 1931 
 NotNA 51829
('0 70-0 279/ 100 T', 553084.0, 6313171.0) NA 1117 
 NotNA 30371
('0 70-0 284/ 400 T', 555132.0, 6317792.0) NA 1911 
 NotNA 59241
('0 70-0 284/ 950 T', 555308.0, 6318304.0) NA 1911 
 NotNA 59241
('0 70-0 285/ 700 T', 555789.0, 6318873.0) NA 1173 
 NotNA 28587
('0 70-0 286/ 100 T', 556145.0, 6319061.0) NA 2038 
 NotNA 53738
('0 70-0 288/ 424 T', 558245.0, 6319907.0) NA 2118 
 NotNA 57882
('0 70-0 289/ 350 T', 558753.0, 6320685.0) NA 1352 
 NotNA 38296
('0 70-0 289/ 950 T', 559007.0, 6321227.0) NA 2179 
 NotNA 57341
('0 70-0 290/ 951 T', 559026.0, 6322227.0) NA 1912 
 NotNA 48488
('0 70-0 291/ 452 T', 558976.0, 6322726.0) NA 2212 
 NotNA 57788
('0 70-0 291/ 625 T', 558956.0, 6322898.0) NA 2034 
 NotNA 59022
('0 70-0 292/ 0 T', 558880.

Find Longest period and Interpolate NA Values

In [9]:
import pandas as pd

def find_and_apply_longest_period(filtered_dataframes, threshold=5):
    """
    Loops over all dataframes in filtered_dataframes, finds the longest period 
    without exceeding the threshold of consecutive NaN values, and filters 
    the dataframe to include only that period.

    Parameters:
    - filtered_dataframes (dict): A dictionary of dataframes to process.
    - threshold (int): Maximum allowed consecutive NaN values.

    Returns:
    - dict: Updated filtered_dataframes with filtered dataframes.
    """
    def find_longest_valid_period(series, threshold):
        """
        Finds the longest valid period in the series, allowing up to `threshold` NaN values consecutively.
        """
        mask = series.notna()
        count = 0
        start = end = 0
        max_start = max_end = 0
        nans = 0

        for i in range(len(mask)):
            if mask.iloc[i]:
                count += 1
                nans = 0
            else:
                nans += 1
                if nans > threshold:
                    count = 0
                    nans = 0
                    start = i + 1
            if count > (max_end - max_start):
                max_start, max_end = start, i + 1
        return max_start, max_end

    for name, df in filtered_dataframes.items():
        # Assuming the column to check for NaN values is named 'Value'
        if 'traffic_flow' in df.columns:
            start_idx, end_idx = find_longest_valid_period(df['traffic_flow'], threshold)
            filtered_dataframes[name] = df.iloc[start_idx:end_idx]

    return filtered_dataframes

filtered_dataframes_period = find_and_apply_longest_period(filtered_dataframes.copy(), threshold=5)

In [10]:
for name, df in filtered_dataframes_period.items():
    # Interpolate NaN values in all columns with numeric data
    filtered_dataframes_period[name]['traffic_flow'] = df['traffic_flow'].interpolate(method='linear', limit_direction='both')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataframes_period[name]['traffic_flow'] = df['traffic_flow'].interpolate(method='linear', limit_direction='both')


Verify amount of Na Values

In [None]:
# longest consecutive streak of NA in filtered_dataframes_period[name]['traffic_flow']
for name in filtered_dataframes_period:
    print(name, filtered_dataframes_period[name]['traffic_flow'].isna().astype(int).groupby(filtered_dataframes_period[name]['traffic_flow'].notna().astype(int).cumsum()).cumsum().max())

('0 450-0 7/ 300 T', 552601.0, 6329842.0) 0
('0 450-0 9/1675 T', 549360.0, 6330576.0) 0
('0 70-0 277/ 100 T', 552269.0, 6311346.0) 0
('0 70-0 278/ 201 T', 552730.0, 6312348.0) 0
('0 70-0 279/ 100 T', 553084.0, 6313171.0) 0
('0 70-0 284/ 400 T', 555132.0, 6317792.0) 0
('0 70-0 284/ 950 T', 555308.0, 6318304.0) 0
('0 70-0 285/ 700 T', 555789.0, 6318873.0) 0
('0 70-0 286/ 100 T', 556145.0, 6319061.0) 0
('0 70-0 288/ 424 T', 558245.0, 6319907.0) 0
('0 70-0 289/ 350 T', 558753.0, 6320685.0) 0
('0 70-0 289/ 950 T', 559007.0, 6321227.0) 0
('0 70-0 290/ 951 T', 559026.0, 6322227.0) 0
('0 70-0 291/ 452 T', 558976.0, 6322726.0) 0
('0 70-0 291/ 625 T', 558956.0, 6322898.0) 0
('0 70-0 292/ 0 T', 558880.0, 6323318.0) 0
('0 70-0 293/ 370 T', 557951.0, 6324241.0) 0
('0 70-0 293/ 400 T', 557929.0, 6324261.0) 0
('0 70-0 293/ 640 T', 557757.0, 6324428.0) 0
('0 70-0 293/ 960 T', 557594.0, 6324700.0) 0
('0 70-0 293/ 980 T', 557587.0, 6324719.0) 0
('0 70-0 294/ 850 T', 557734.0, 6325564.0) 0
('0 70-0 295/ 

Find the dataframes which have the most data over the longest time

In [None]:
def remove_extreme_dataframes(one_week_dfs, n, m):
    # Sort dataframes by their starting index value
    sorted_by_start = sorted(one_week_dfs.items(), key=lambda item: item[1].index[0])
    
    # Sort dataframes by their ending index value
    sorted_by_end = sorted(one_week_dfs.items(), key=lambda item: item[1].index[-1])
    
    # Remove n dataframes with the highest starting index value
    to_remove_start = sorted_by_start[-n:]
    
    # Remove m dataframes with the lowest ending index value
    to_remove_end = sorted_by_end[:m]
    
    # Create a set of keys to remove
    keys_to_remove = set(key for key, _ in to_remove_start + to_remove_end)
    
    # Remove the dataframes from one_week_dfs
    one_week_dfs = {key: df for key, df in one_week_dfs.items() if key not in keys_to_remove}
    
    return one_week_dfs

highest_duration = 0
count_removing = 200

for n in range(0, 80):
    for m in range(0, 80):
        temp_one_week_dfs = remove_extreme_dataframes(filtered_dataframes_period, n, m)
        
        if temp_one_week_dfs:  # Check if temp_one_week_dfs is not empty
            highest_starting_index = max(df.index[0] for df in temp_one_week_dfs.values())
            lowest_ending_index = min(df.index[-1] for df in temp_one_week_dfs.values())
            duration_days = ((lowest_ending_index - highest_starting_index) / 3600) / 24
            if duration_days >= 0 and duration_days > highest_duration:
                print(f"n = {n}, m = {m}")
                print("Number of dataframes after removal:", len(temp_one_week_dfs))
                print("Highest starting index value:", highest_starting_index)
                print("Lowest ending index value:", lowest_ending_index)
                print("Duration in days:", duration_days)
                if duration_days > highest_duration and count_removing > n+m:	
                    highest_duration = duration_days
                    count_removing = n + m
                    best_start = highest_starting_index
                    best_end = lowest_ending_index
                    best_n = n
                    best_m = m


print("Best start:", best_start)
print("Best end:", best_end)
print("Best n:", best_n)
print("Best m:", best_m)
print("Highest duration in days:", highest_duration)


n = 26, m = 71
Number of dataframes after removal: 54
Highest starting index value: 1704063600
Lowest ending index value: 1704191400
Duration in days: 1.4791666666666667
n = 26, m = 72
Number of dataframes after removal: 53
Highest starting index value: 1704063600
Lowest ending index value: 1705328100
Duration in days: 14.635416666666666
n = 26, m = 73
Number of dataframes after removal: 52
Highest starting index value: 1704063600
Lowest ending index value: 1705328100
Duration in days: 14.635416666666666
n = 26, m = 74
Number of dataframes after removal: 51
Highest starting index value: 1704063600
Lowest ending index value: 1705328100
Duration in days: 14.635416666666666
n = 26, m = 75
Number of dataframes after removal: 50
Highest starting index value: 1704063600
Lowest ending index value: 1705582800
Duration in days: 17.583333333333332
n = 26, m = 76
Number of dataframes after removal: 49
Highest starting index value: 1704063600
Lowest ending index value: 1705988700
Duration in days:

Drop Duplicates in index and apply the Range

In [15]:
start = 1678963500 - 3588300
end = 1701363600

final_dataframes = {}

for name, df in filtered_dataframes_period.items():
    # Ensure the index is unique and monotonic
    if not df.index.is_unique:
        print(f"Duplicate indices found in {name}. Dropping duplicates.")
        df = df[~df.index.duplicated(keep='first')]

    if not df.index.is_monotonic_increasing:
        print(f"Index of {name} is not sorted. Sorting the index.")
        df = df.sort_index()

    if start in df.index and end in df.index:
        final_dataframes[name] = df.loc[start:end]
    elif start > df.index[0] and end < df.index[-1]:
        try:
            final_dataframes[name] = df.loc[start:end]
        except KeyError:
            print("Trying something else")
            # Find the nearest index to start and end
            start_index = df.index.get_indexer([start], method='nearest')[0]
            end_index = df.index.get_indexer([end], method='nearest')[0]
            final_dataframes[name] = df.iloc[start_index:end_index+1]
    else:
        print(f"Data range does not fit for {name}. Skipping.")
print("Final Dataframes", len(final_dataframes))


Duplicate indices found in ('0 450-0 7/ 300 T', 552601.0, 6329842.0). Dropping duplicates.
Index of ('0 450-0 7/ 300 T', 552601.0, 6329842.0) is not sorted. Sorting the index.
Duplicate indices found in ('0 450-0 9/1675 T', 549360.0, 6330576.0). Dropping duplicates.
Index of ('0 450-0 9/1675 T', 549360.0, 6330576.0) is not sorted. Sorting the index.
Duplicate indices found in ('0 70-0 277/ 100 T', 552269.0, 6311346.0). Dropping duplicates.
Index of ('0 70-0 277/ 100 T', 552269.0, 6311346.0) is not sorted. Sorting the index.
Duplicate indices found in ('0 70-0 278/ 201 T', 552730.0, 6312348.0). Dropping duplicates.
Index of ('0 70-0 278/ 201 T', 552730.0, 6312348.0) is not sorted. Sorting the index.
Duplicate indices found in ('0 70-0 279/ 100 T', 553084.0, 6313171.0). Dropping duplicates.
Index of ('0 70-0 279/ 100 T', 553084.0, 6313171.0) is not sorted. Sorting the index.
Duplicate indices found in ('0 70-0 284/ 400 T', 555132.0, 6317792.0). Dropping duplicates.
Index of ('0 70-0 284/

In [16]:
# Gather all start and end index values for analysis
start_values = []
end_values = []

for name, df in final_dataframes.items():
    start_values.append(df.index[0])
    end_values.append(df.index[-1])

start_values = pd.Series(start_values)
end_values = pd.Series(end_values)

# Determine the most common start and end indices
common_start = start_values.value_counts().idxmax()
common_end = end_values.value_counts().idxmax()

# Apply the common range to all DataFrames
updated_final_dataframes = {}
for name, df in final_dataframes.items():
    # Check if the common_start and common_end exist in the DataFrame
    if common_start in df.index and common_end in df.index:
        updated_final_dataframes[name] = df.loc[common_start:common_end].copy()
    else:
        print(f"Skipping {name} - range {common_start} to {common_end} not found.")

# Replace the final_dataframes with updated ones
final_dataframes = updated_final_dataframes

# Verify results
print(f"Updated DataFrames: {len(final_dataframes)}")
#for name, df in final_dataframes.items():
    #print(f"{name}: Start {df.index[0]}, End {df.index[-1]}")


Skipping ('0 450-0 7/ 300 T', 552601.0, 6329842.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 277/ 100 T', 552269.0, 6311346.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 278/ 201 T', 552730.0, 6312348.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 279/ 100 T', 553084.0, 6313171.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 288/ 424 T', 558245.0, 6319907.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 289/ 350 T', 558753.0, 6320685.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 289/ 950 T', 559007.0, 6321227.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 290/ 951 T', 559026.0, 6322227.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 291/ 452 T', 558976.0, 6322726.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 291/ 625 T', 558956.0, 6322898.0) - range 1675375200 to 1701363600 not found.
Skipping ('0 70-0 292/ 0 T', 558880.0, 63

____________________________________________________________________

Create a Graph from Final Dataframes

Create GCN Model 