In [59]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import math
import time

# Load the Time-Series data from Mastra
mastra_df = pd.read_csv('mastra.csv')

# reg ex to replace , with . on the whole document
mastra_df = mastra_df.replace({',': '.'}, regex=True)
mastra_df = mastra_df.replace({'LÃ¸rdag': "Loerdag"}, regex=True)
mastra_df = mastra_df.replace({'SÃ¸ndag': "Soendag"}, regex=True)


mastra_df.rename(columns={'MÃ¥ned': 'Maened'}, inplace=True)
mastra_df.rename(columns={'Ãrstal': 'Aarstal'}, inplace=True)
mastra_df.rename(columns={'ï»¿Vej-Id': 'Vej-Id'}, inplace=True)

#mastra_df.drop(columns=['Unnamed: 1', 'Unnamed: 2'], inplace=True)

# Correct column names if necessary
columns_to_convert = [
    'Trafiktype',
    'Dagtype',
    'Vej-Id',
    'Retning',
    'Trafiktype'
]

# Convert the specified columns to float64
mastra_df = mastra_df.astype({col: 'string' for col in columns_to_convert})
mastra_df = mastra_df.replace({'<NA>': ""}, regex=True)

# Identify the columns that are not part of the time intervals
id_vars_columns = [
    'Vej-Id', 'Kilometer', 'Retning', 'Spor', 'Dato', 'Aarstal', 
    'Maened', 'Dag', 'Dagtype', 'Trafiktype', 'Xkoordinat', 'Ykoordinat'
]

# The time interval columns
time_columns = [
    "00:00-00:15","00:15-00:30","00:30-00:45","00:45-01:00","01:00-01:15","01:15-01:30","01:30-01:45","01:45-02:00","02:00-02:15","02:15-02:30","02:30-02:45","02:45-03:00","03:00-03:15","03:15-03:30","03:30-03:45","03:45-04:00","04:00-04:15","04:15-04:30","04:30-04:45","04:45-05:00","05:00-05:15","05:15-05:30","05:30-05:45","05:45-06:00","06:00-06:15","06:15-06:30","06:30-06:45","06:45-07:00","07:00-07:15","07:15-07:30","07:30-07:45","07:45-08:00","08:00-08:15","08:15-08:30","08:30-08:45","08:45-09:00","09:00-09:15","09:15-09:30","09:30-09:45","09:45-10:00","10:00-10:15","10:15-10:30","10:30-10:45","10:45-11:00","11:00-11:15","11:15-11:30","11:30-11:45","11:45-12:00","12:00-12:15","12:15-12:30","12:30-12:45","12:45-13:00","13:00-13:15","13:15-13:30","13:30-13:45","13:45-14:00","14:00-14:15","14:15-14:30","14:30-14:45","14:45-15:00","15:00-15:15","15:15-15:30","15:30-15:45","15:45-16:00","16:00-16:15","16:15-16:30","16:30-16:45","16:45-17:00","17:00-17:15","17:15-17:30","17:30-17:45","17:45-18:00","18:00-18:15","18:15-18:30","18:30-18:45","18:45-19:00","19:00-19:15","19:15-19:30","19:30-19:45","19:45-20:00","20:00-20:15","20:15-20:30","20:30-20:45","20:45-21:00","21:00-21:15","21:15-21:30","21:30-21:45","21:45-22:00","22:00-22:15","22:15-22:30","22:30-22:45","22:45-23:00","23:00-23:15","23:15-23:30","23:30-23:45","23:45-24:00"
]

# Melt the DataFrame to reshape it from wide format to long format
mastra_df = pd.melt(
    mastra_df,
    id_vars=id_vars_columns,  # Columns that uniquely identify each row
    value_vars=time_columns,  # Time interval columns
    var_name='time_interval',  # Name of the new column for the time intervals
    value_name='traffic_flow'  # Name for the traffic flow data
)

# Extract the start time from 'time_interval'
mastra_df['time_start'] = mastra_df['time_interval'].str[:5]  # Extracts the "00:00" part for sorting
mastra_df['datetime'] = pd.to_datetime(mastra_df['Dato'] + " " + mastra_df['time_start'], dayfirst=True)

mastra_df = mastra_df.sort_values(by=['Vej-Id', 'Dato', 'time_start'])

#mastra_df['datetime'] = mastra_df['datetime'].apply(lambda x: int(time.mktime(x.timetuple())))

# Create a dictionary to store the DataFrames with unique key identifiers
grouped_dataframes = {}

# Group by the specified columns
for key, group in mastra_df.groupby(["Vej-Id", "Xkoordinat", "Ykoordinat"]):
    # Convert the 'datetime' column to Unix timestamps in seconds
    group['datetime'] = group['datetime'].apply(lambda x: int(time.mktime(x.timetuple())))

    # Use the unique key for naming or accessing each DataFrame
    sorted_group = group.sort_values(by=['Vej-Id', 'Dato', 'time_start'])
    grouped_dataframes[key] = sorted_group.reset_index(drop=True, inplace=False).set_index('datetime', inplace=False)

In [60]:
grouped_dataframes = pd.read_pickle('grouped_dataframes.pkl')

In [61]:
names = list(grouped_dataframes.keys())
print(len(grouped_dataframes))

485


In [62]:
# Get the shape of the first dataframe
first_shape = list(grouped_dataframes.values())[0].shape

# Check if all dataframes have the same shape
all_same_size = all(df.shape == first_shape for df in grouped_dataframes.values())

print("All dataframes are the same size:", all_same_size)

All dataframes are the same size: False


In [63]:
# Count the number of dataframes that do not have the same shape as the first dataframe
not_same_size_count = sum(1 for df in grouped_dataframes.values() if df.shape != first_shape)

print("Number of dataframes that are not the same size:", not_same_size_count)

Number of dataframes that are not the same size: 478


In [64]:
grouped_dataframes[names[0]].head()

Unnamed: 0_level_0,Vej-Id,Kilometer,Retning,Spor,Dato,Aarstal,Maened,Dag,Dagtype,Trafiktype,Xkoordinat,Ykoordinat,time_interval,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1675206000,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:00-00:15,10.0,00:00
1675206900,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:15-00:30,10.0,00:15
1675207800,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:30-00:45,7.0,00:30
1675208700,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,00:45-01:00,3.0,00:45
1675209600,0 450-0 7/ 300 +,7.0,+,,01.02.2023,2023.0,2.0,1.0,Hverdag,BY,552601.0,6329842.0,01:00-01:15,3.0,01:00


In [65]:
columns_to_keep = ['Vej-Id', 'Dato', 'Xkoordinat', 'Ykoordinat', 'traffic_flow', 'time_start']

for key in grouped_dataframes.keys():
    grouped_dataframes[key] = grouped_dataframes[key][columns_to_keep]


# Verify Verify the the columns columns of 
grouped_dataframes[names[73]]


Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672527600,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,21.0,00:00
1672528500,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,24.0,00:15
1672529400,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,61.0,00:30
1672530300,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,95.0,00:45
1672531200,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,114.0,01:00
...,...,...,...,...,...,...
1704059100,0 70-0 290/ 540 +,31.12.2023,559046.0,6321816.0,40.0,22:45
1704060000,0 70-0 290/ 540 +,31.12.2023,559046.0,6321816.0,,23:00
1704060900,0 70-0 290/ 540 +,31.12.2023,559046.0,6321816.0,,23:15
1704061800,0 70-0 290/ 540 +,31.12.2023,559046.0,6321816.0,53.0,23:30


In [66]:
lowest_datetime = min(df.index.min() for df in grouped_dataframes.values())
print("Lowest 'datetime' index for all dataframes:", lowest_datetime)

Lowest 'datetime' index for all dataframes: 1672527600


In [67]:
# Convert 'Dato' column to datetime if not already
for key in grouped_dataframes.keys():
    grouped_dataframes[key]['Dato'] = pd.to_datetime(grouped_dataframes[key]['Dato'], format='%d.%m.%Y')

# Find the earliest and latest date
earliest_date = min(df['Dato'].min() for df in grouped_dataframes.values())
latest_date = max(df['Dato'].max() for df in grouped_dataframes.values())

print("Earliest date:", earliest_date)
print("Latest date:", latest_date)

Earliest date: 2023-01-01 00:00:00
Latest date: 2024-08-31 00:00:00


In [68]:
import pandas as pd
import numpy as np
import time

# Define the date range
date_range = pd.date_range(start='2023-01-01', end='2024-08-31', freq='D')

# Define the time range with 15-minute intervals
time_range = pd.date_range(start='00:00', end='23:45', freq='15T').time

# Create a MultiIndex with all combinations of dates and times
multi_index = pd.MultiIndex.from_product([date_range, time_range], names=['Dato', 'time_start'])

# Create the DataFrame
empty_df = pd.DataFrame(index=multi_index, columns=['Vej-Id', 'Xkoordinat', 'Ykoordinat', 'traffic_flow'])

# Reset the index to have 'datetime' as a column
empty_df = empty_df.reset_index()

# Format 'Dato' as mm.dd.yyyy
empty_df['Dato'] = empty_df['Dato'].dt.strftime('%d.%m.%Y')

# Combine 'Dato' and 'time_start' into a single datetime column (ensure correct formatting)
empty_df['datetime'] = pd.to_datetime(empty_df['Dato'] + ' ' + empty_df['time_start'].astype(str), format='%d.%m.%Y %H:%M:%S')

# Convert datetime to UNIX timestamp
empty_df['datetime'] = empty_df['datetime'].apply(lambda dt: int(time.mktime(dt.timetuple())))

# Set 'datetime' as the index
empty_df = empty_df.set_index('datetime')

# Ensure 'time_start' contains only hour and minutes as strings
empty_df['time_start'] = empty_df.index.map(lambda ts: pd.to_datetime(ts, unit='s').strftime('%H:%M'))

# Fill other columns with placeholder data
empty_df['Vej-Id'] = np.nan  # Placeholder for Vej-Id
empty_df['Dato'] = empty_df.index.map(lambda ts: pd.to_datetime(ts, unit='s').strftime('%d.%m.%Y'))  # Re-format Dato column

# Set NaN for other columns
empty_df['Xkoordinat'] = np.nan          # Set NaN for Xkoordinat
empty_df['Ykoordinat'] = np.nan          # Set NaN for Ykoordinat
empty_df['traffic_flow'] = np.nan        # Set NaN for traffic_flow

# Reorder columns
empty_df = empty_df[['Vej-Id', 'Dato', 'Xkoordinat', 'Ykoordinat', 'traffic_flow', 'time_start']]

# Remove the first 4 rows and adjust index
empty_df = empty_df.iloc[4:]
empty_df.index = empty_df.index - 3600  # Subtract 3600 seconds (1 hour) from each index value

# Display the first few rows of the modified DataFrame
empty_df.shape


  time_range = pd.date_range(start='00:00', end='23:45', freq='15T').time


(58460, 6)

In [69]:
# Define the additional time range for 31.08.2024
additional_time_range = pd.date_range(start='2024-08-31 22:00', end='2024-08-31 23:45', freq='15T')

# Create a DataFrame for the new rows
additional_df = pd.DataFrame({
    'datetime': additional_time_range.map(lambda dt: int(time.mktime(dt.timetuple()))),  # Convert to UNIX timestamp
    'Vej-Id': np.nan,
    'Dato': additional_time_range.strftime('%d.%m.%Y'),
    'Xkoordinat': np.nan,
    'Ykoordinat': np.nan,
    'traffic_flow': np.nan,
    'time_start': additional_time_range.strftime('%H:%M'),
})

# Set 'datetime' as the index
additional_df = additional_df.set_index('datetime')

# Append the new rows to the original DataFrame
empty_df = pd.concat([empty_df, additional_df])

# Display the last few rows to verify
empty_df.shape


  additional_time_range = pd.date_range(start='2024-08-31 22:00', end='2024-08-31 23:45', freq='15T')


(58468, 6)

In [70]:
for key in grouped_dataframes.keys():
    grouped_dataframes[key] = grouped_dataframes[key][~grouped_dataframes[key].index.duplicated(keep='first')]

In [71]:
for key in grouped_dataframes.keys():
    grouped_dataframes[key] = empty_df.combine_first(grouped_dataframes[key])

# Verify the result for one of the dataframes
#grouped_dataframes[names[73]].head()

In [72]:
for key in grouped_dataframes.keys():
    grouped_dataframes[key] = grouped_dataframes[key][~grouped_dataframes[key].index.duplicated(keep='first')]

In [73]:
grouped_dataframes[names[73]].head()

Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672527600,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,21.0,00:00
1672528500,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,24.0,00:15
1672529400,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,61.0,00:30
1672530300,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,95.0,00:45
1672531200,0 70-0 290/ 540 +,01.01.2023,559046.0,6321816.0,114.0,01:00


In [74]:
for key in grouped_dataframes.keys():
    grouped_dataframes[key]['Vej-Id'] = grouped_dataframes[key]['Vej-Id'].ffill().bfill()
    grouped_dataframes[key]['Xkoordinat'] = grouped_dataframes[key]['Xkoordinat'].ffill().bfill()
    grouped_dataframes[key]['Ykoordinat'] = grouped_dataframes[key]['Ykoordinat'].ffill().bfill()


# Verify the result for one of the dataframes
grouped_dataframes[names[0]].head()

Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672527600,0 450-0 7/ 300 +,01.01.2023,552601.0,6329842.0,,00:00
1672528500,0 450-0 7/ 300 +,01.01.2023,552601.0,6329842.0,,00:15
1672529400,0 450-0 7/ 300 +,01.01.2023,552601.0,6329842.0,,00:30
1672530300,0 450-0 7/ 300 +,01.01.2023,552601.0,6329842.0,,00:45
1672531200,0 450-0 7/ 300 +,01.01.2023,552601.0,6329842.0,,01:00


In [75]:
for key in grouped_dataframes.keys():
    grouped_dataframes[key]['Vej-Id'] = grouped_dataframes[key]['Vej-Id'].str[:-1]


In [76]:
# Create a new dictionary to store the aggregated dataframes
aggregated_dataframes = {}

# Iterate over the grouped dataframes
for key, df in grouped_dataframes.items():
    # Create a unique key based on Xkoordinat and Ykoordinat
    coord_key = (df['Xkoordinat'].iloc[0], df['Ykoordinat'].iloc[0])
    
    if coord_key not in aggregated_dataframes:
        # Initialize the dataframe in the aggregated dictionary
        aggregated_dataframes[coord_key] = df.copy()
    else:
        # Aggregate traffic_flow by summing up the values, ignoring NaNs
        aggregated_dataframes[coord_key]['traffic_flow'] += df['traffic_flow'].fillna(0)
        
        # Keep the shortest Vej-Id
        aggregated_dataframes[coord_key]['Vej-Id'] = aggregated_dataframes[coord_key]['Vej-Id'].combine(
            df['Vej-Id'], lambda x, y: x if len(x) < len(y) else y
        )

# Verify the result for one of the aggregated dataframes
aggregated_dataframes[list(aggregated_dataframes.keys())[0]].head()

Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672527600,0 450-0 7/ 300,01.01.2023,552601.0,6329842.0,,00:00
1672528500,0 450-0 7/ 300,01.01.2023,552601.0,6329842.0,,00:15
1672529400,0 450-0 7/ 300,01.01.2023,552601.0,6329842.0,,00:30
1672530300,0 450-0 7/ 300,01.01.2023,552601.0,6329842.0,,00:45
1672531200,0 450-0 7/ 300,01.01.2023,552601.0,6329842.0,,01:00


In [77]:
nan_percentage = {}

for key, df in aggregated_dataframes.items():
    nan_count = df['traffic_flow'].isna().sum()
    total_count = len(df)
    nan_percentage[key] = (nan_count / total_count) * 100

thresholds = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65]
counts = {threshold: sum(percentage > threshold for percentage in nan_percentage.values()) for threshold in thresholds}

for threshold, count in counts.items():
    print(f"Number of values above {threshold}%: {count}")

Number of values above 5%: 114
Number of values above 10%: 76
Number of values above 15%: 61
Number of values above 20%: 46
Number of values above 25%: 40
Number of values above 30%: 38
Number of values above 35%: 35
Number of values above 40%: 35
Number of values above 45%: 16
Number of values above 50%: 13
Number of values above 55%: 9
Number of values above 60%: 9
Number of values above 65%: 0


In [78]:
#aggregated_dataframes = {key: df for key, df in aggregated_dataframes.items() if nan_percentage[key] <= 45}


In [79]:

def find_longest_non_nan_period(df, threshold=None, tolerance=0):
    """
    Finds the longest continuous period in a DataFrame where there are no NaN values in most columns,
    with tolerance for short gaps.

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze.
    threshold (float): The minimum proportion of non-NaN values in each row (e.g., 0.8 for 80% of columns).
                       If None, defaults to 50% (more than half of columns).
    tolerance (int): The maximum number of consecutive rows with NaN values that are tolerated as part of the period.

    Returns:
    pd.DataFrame: Cropped DataFrame to the longest continuous period with minimal NaN values.
    """
    # Set default threshold to more than half of the columns if not specified
    if threshold is None:
        threshold = 0.5

    # Calculate the required number of non-NaN values per row
    required_non_nan = int(len(df.columns) * threshold)

    # Create a boolean mask where each row has True if it meets the non-NaN requirement
    valid_rows = df.notna().sum(axis=1) >= required_non_nan

    # Variables to track the longest valid period
    max_start = max_end = temp_start = temp_length = max_length = 0
    gap_count = 0  # Count of consecutive invalid rows within tolerance

    for i, valid in enumerate(valid_rows):
        if valid:
            if temp_length == 0:
                temp_start = i  # Start a new period
            temp_length += 1
            gap_count = 0  # Reset gap count as we are in a valid row
        else:
            if gap_count >= tolerance:
                if temp_length > max_length:
                    max_start, max_end = temp_start, temp_start + temp_length
                    max_length = temp_length
                temp_length = 0
                gap_count = 0


    # Final check in case the longest period ends at the last row
    if temp_length > max_length:
        max_start, max_end = temp_start, temp_start + temp_length

    # Crop the DataFrame to the longest continuous non-NaN period
    return df.iloc[max_start:max_end]
    # Iterate through every dataframe in aggregated_dataframes and find the longest non-NaN period

one_week_dfs1 = {}
for key in aggregated_dataframes.keys():
        one_week_dfs1[key] = find_longest_non_nan_period(aggregated_dataframes[key], threshold=1, tolerance=0)



In [80]:
one_week_dfs1[list(one_week_dfs1.keys())[0]]

Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672959600,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,66.0,00:00
1672960500,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,42.0,00:15
1672961400,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,24.0,00:30
1672962300,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,33.0,00:45
1672963200,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,12.0,01:00
...,...,...,...,...,...,...
1673037900,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,216.0,21:45
1673038800,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,297.0,22:00
1673039700,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,231.0,22:15
1673040600,0 450-0 7/ 300,06.01.2023,552601.0,6329842.0,204.0,22:30


In [81]:
one_week_dfs = {}
for key in aggregated_dataframes.keys():
        one_week_dfs[key] = find_longest_non_nan_period(aggregated_dataframes[key], threshold=1, tolerance=2)



In [82]:
one_week_dfs[list(one_week_dfs.keys())[3]]

Unnamed: 0_level_0,Vej-Id,Dato,Xkoordinat,Ykoordinat,traffic_flow,time_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1672527600,0 70-0 278/ 201,01.01.2023,552730.0,6312348.0,66.0,00:00
1672528500,0 70-0 278/ 201,01.01.2023,552730.0,6312348.0,102.0,00:15
1672529400,0 70-0 278/ 201,01.01.2023,552730.0,6312348.0,198.0,00:30
1672530300,0 70-0 278/ 201,01.01.2023,552730.0,6312348.0,336.0,00:45
1672531200,0 70-0 278/ 201,01.01.2023,552730.0,6312348.0,360.0,01:00
...,...,...,...,...,...,...
1716752700,0 70-0 278/ 201,26.05.2024,552730.0,6312348.0,624.0,20:45
1716753600,0 70-0 278/ 201,26.05.2024,552730.0,6312348.0,504.0,21:00
1716754500,0 70-0 278/ 201,26.05.2024,552730.0,6312348.0,777.0,21:15
1716755400,0 70-0 278/ 201,26.05.2024,552730.0,6312348.0,453.0,21:30


In [83]:
# Calculate the percentage of NaN values in 'traffic_flow' for each dataframe
nan_percentage_one_week = {key: (df['traffic_flow'].isna().sum() / len(df)) * 100 for key, df in one_week_dfs.items()}

# Filter out dataframes with NaN percentage above 5%
one_week_dfs = {key: df for key, df in one_week_dfs.items() if nan_percentage_one_week[key] <= 5}

# Verify the result
print(f"Number of dataframes after filtering: {len(one_week_dfs)}")

Number of dataframes after filtering: 102


[]
