In [153]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.impute import SimpleImputer, KNNImputer
import networkx as nx
import seaborn as sns
import json
import glob
import os


In [154]:
# Build the directed network graph from the exported JSON node and link lists

with open('../json/nodes.json', 'r') as nodes_file:
    nodes = json.load(nodes_file)

with open('../json/links.json', 'r') as links_file:
    links = json.load(links_file)

G = nx.DiGraph()

# Every JSON field of a record is stored as an attribute on the graph element.
# Nodes are keyed by 'id'; edges run from 'us_node_id' to 'ds_node_id'.
G.add_nodes_from((record['id'], record) for record in nodes)
G.add_edges_from(
    (record['us_node_id'], record['ds_node_id'], record) for record in links
)

# Plot layout: node id -> (x, y) pair taken from the node records
pos = {}
for record in nodes:
    pos[record["id"]] = (record["x"], record["y"])




In [155]:
# Load the data

# Directory containing all the CSV files
folder_path = '../Run_C2100_Gebeurtenis_DWA_20250422_003112'

# Prefix of the files to be loaded
prefix = 'Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_'

# multiple pickle path
multiple_pickle_path = '../pickle/multiple/depnod_volume_05.pkl'

# Collect every file in the folder whose name starts with the prefix.
# os.listdir returns names in arbitrary order, so the result is sorted to keep
# positional lookups such as matching_files[0] / matching_files[7] reproducible
# across machines and file systems.
matching_files = sorted(
    name for name in os.listdir(folder_path) if name.startswith(prefix)
)

# Print or use the list
for f in matching_files:
    print(f)


Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_depnod.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_flooddepth.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_floodvolume.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_flvol.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_qinfnod.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_qnode.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_qrain.csv
Node_Run_C2100_Gebeurtenis_DWA_20250422_003112_BRC_PDCs_6_33;1_61; bui#10_volume.csv


In [156]:

# 1. Load the node-depth (depnod) file.
#    The file is selected by name instead of by its position in matching_files,
#    because the position depends on the directory listing order.
depnod_path = os.path.join(folder_path, f"{prefix}depnod.csv")
depnod_df = (
    pd.read_csv(depnod_path, skiprows=[1])
    .drop(columns=['Time'])
    .replace(-9998.0, np.nan)  # -9998.0 is treated as a missing-value sentinel
)

# 2. Select depth columns (everything except 'Seconds') and flag negative values
depth_columns = depnod_df.columns.difference(['Seconds'])
negatives = depnod_df[depth_columns] < 0

# 3. Count total negative values and nodes with negatives
total_negatives = negatives.sum().sum()
nodes_with_negatives = negatives.any()
count_nodes_with_negatives = nodes_with_negatives.sum()
nodes_list = nodes_with_negatives[nodes_with_negatives].index.tolist()

print(f"Total number of negative depth entries: {total_negatives}")
print(f"Number of nodes with at least one negative depth: {count_nodes_with_negatives}")
print(f"Nodes with negatives: {nodes_list}")

# 4. Columns where ALL values are negative are blanked out entirely.
#    NaN compares False against 0, so a column that mixes NaN and negative
#    values is not selected here.
all_negative_mask = negatives.all()
columns_all_negative = all_negative_mask[all_negative_mask].index.tolist()
depnod_df[columns_all_negative] = np.nan

# 5. Node columns where ALL values are NaN.
#    Only depth columns are considered, so the 'Seconds' column no longer
#    inflates the denominator of the percentage.
all_nan_mask = depnod_df[depth_columns].isna().all()
nan_columns_names = all_nan_mask[all_nan_mask].index.tolist()
Percentage = len(nan_columns_names) / len(depth_columns) * 100 if len(depth_columns) else 0.0
print(f"Percentage of columns where ALL values are NaN: {Percentage:.2f}%")
print(f"Number of columns where ALL values are NaN: {len(nan_columns_names)}")
print('This could be imputed with KNN or SimpleImputer')

# depnod_df




Total number of negative depth entries: 3537
Number of nodes with at least one negative depth: 862
Nodes with negatives: ['1013', '1025', '1026', '1028', '1029', '1031', '1032', '1033', '1034', '1036', '1036A', '1037', '1038', '1039', '1040', '1041', '1053', '1054', '110060', '110061', '110061C2', '110066', '110079', '110107', '110113', '110161', '110162', '110164', '110235', '110235C2', '110236', '110237', '110237HH', '110249', '110253', '110298', '110302', '110311', '110315', '110316', '110321', '110322', '110323', '110324', '119999', '121550', '121552', '121580', '121584G', '130489', '130491', '130555', '130590', '140899', '140915', '140981', '140982', '140983', '141017', '141033', '141116', '141117', '141118', '141119', '141123', '141131', '141152', '141178', '141180', '141181', '141182', '141187', '141195', '141200', '141206', '141210', '141237', '141238', '141240', '141245', '141262', '141263', '141264', '141271', '141289', '141292', '143207', '150005', '150006', '150007', '15000

In [157]:
print(f'There are {len(nan_columns_names)} columns where all values are NaN')

# Attach each node's depth time series to the graph.
# Columns are selected by name (everything except 'Seconds') rather than by
# position, so the loop does not depend on the CSV column order.
depth_node_columns = [col for col in depnod_df.columns if col != 'Seconds']
for node in depth_node_columns:
    if node in G.nodes:
        G.nodes[node]['depth'] = depnod_df[node].tolist()
    else:
        print(f"Node {node} not found in the graph.")

# Graph nodes that have no column in the CSV would otherwise lack a 'depth'
# attribute and raise KeyError in the cells that read G.nodes[n]['depth'].
# They get an all-NaN series, i.e. the same treatment as an all-NaN column.
nodes_without_depth = [n for n in G.nodes if 'depth' not in G.nodes[n]]
for node in nodes_without_depth:
    G.nodes[node]['depth'] = [np.nan] * len(depnod_df)
if nodes_without_depth:
    print(f"{len(nodes_without_depth)} graph nodes have no depth column; depth set to all-NaN.")

# Show a small sample instead of dumping every node and its full series
print(list(G.nodes(data=True))[:5])

There are 244 columns where all values are NaN
[('1001', {'id': '1001', 'x': 461000.65616613504, 'y': 1351690.2559001052, 'depth': [1.12051, 1.18279, 1.74043, 2.90192, 2.97986, 3.03615, 3.07451, 3.09978, 3.11576, 3.12408, 3.12541, 3.1221, 3.11706]}), ('1002', {'id': '1002', 'x': 460883.5301818835, 'y': 1351714.5341153282, 'depth': [1.04101, 1.09897, 1.75588, 2.89637, 2.96546, 3.01849, 3.05703, 3.08344, 3.10009, 3.10857, 3.1098, 3.10637, 3.10133]}), ('1003', {'id': '1003', 'x': 460754.625982409, 'y': 1351744.3241415746, 'depth': [0.91088, 0.95011, 1.66943, 2.88485, 2.94737, 2.98897, 3.02149, 3.04463, 3.05973, 3.06772, 3.0692, 3.0664, 3.06206]}), ('1004', {'id': '1004', 'x': 460639.6981608872, 'y': 1351745.505243937, 'depth': [0.84101, 0.87855, 1.59096, 2.83723, 2.90732, 2.94641, 2.97352, 2.99373, 3.00766, 3.01552, 3.01765, 3.01583, 3.0123]}), ('1005', {'id': '1005', 'x': 460507.77558870916, 'y': 1351746.6207295011, 'depth': [0.75108, 0.8539, 1.51705, 2.69795, 2.81182, 2.86532, 2.89756, 

In [158]:
from collections import deque

# Function to find closest valid neighbors using BFS
count = 0
countn = 0
def find_closest_valid_neighbors(graph, start_node):
    """Return the nearest nodes reachable from ``start_node`` that hold depth data.

    A breadth-first search follows ``graph.neighbors`` outward from
    ``start_node``. A node counts as valid when at least one entry of its
    ``'depth'`` attribute is not NaN. The search stops at the first BFS level
    that contains a valid node, and every valid node on that level is
    returned. Previously the search returned *all* reachable valid nodes,
    however far away, so the tracked distance had no effect.

    Args:
        graph: Object exposing ``graph.nodes[n]['depth']`` (a sequence of
            floats) and ``graph.neighbors(n)``. On a ``networkx.DiGraph``,
            ``neighbors`` yields successors only, so the search follows edge
            direction.
        start_node: Node to search from; it is never part of the result.

    Returns:
        list: Valid nodes at the smallest hop distance, in BFS discovery
        order. Empty when no reachable node has any non-NaN depth.
    """
    valid_neighbors = []
    closest_distance = None  # hop distance of the first valid level, once found
    visited = {start_node}   # nodes are marked when enqueued, so each is queued once
    frontier = deque([(start_node, 0)])

    while frontier:
        current, distance = frontier.popleft()

        # The queue is ordered by distance: once we are past the closest
        # valid level, nothing nearer can turn up.
        if closest_distance is not None and distance > closest_distance:
            break

        # Skip the start node itself
        if current != start_node:
            depths = graph.nodes[current]['depth']
            # Check if any depth is valid (not NaN)
            if any(not np.isnan(d) for d in depths):
                valid_neighbors.append(current)
                closest_distance = distance

        # Keep expanding only while no valid level has been found; anything
        # enqueued afterwards would be farther away than the result level.
        if closest_distance is None:
            for neighbor in graph.neighbors(current):
                if neighbor not in visited:
                    visited.add(neighbor)
                    frontier.append((neighbor, distance + 1))

    return valid_neighbors

# Fill nodes whose depth series is entirely NaN with the per-time-step mean of
# the valid neighbors found by find_closest_valid_neighbors.
valid_neighbors_list = []    # nodes that could be filled
invalid_neighbors_list = []  # nodes for which no neighbor with data was found

# Counters are reset here so that re-running this cell starts from zero.
count = 0   # nodes whose depth series is all NaN
countn = 0  # of those, nodes that could be filled

# Phase 1: compute every fill from the measured data only. Writing results
# back while still iterating would let already-imputed nodes count as "valid"
# neighbors for later nodes, making the outcome depend on node order.
pending_fills = {}
for node in G.nodes:
    depths = G.nodes[node]['depth']
    if not all(np.isnan(d) for d in depths):
        continue

    count += 1
    print(f"\nFilling node {node}")

    # Find neighbors
    valid_neighbors = find_closest_valid_neighbors(G, node)
    print(f"  Found valid neighbors: {valid_neighbors}")

    if not valid_neighbors:
        print(f"  No valid data found for node {node} (remains NaN)")
        invalid_neighbors_list.append(node)
        continue

    countn += 1
    valid_neighbors_list.append(node)

    # Average the neighbors time step by time step, ignoring NaN entries;
    # a time step with no valid neighbor value stays NaN.
    neighbor_depths = [G.nodes[n]['depth'] for n in valid_neighbors]
    aggregated = []
    for time_step in zip(*neighbor_depths):
        valid_values = [v for v in time_step if not np.isnan(v)]
        aggregated.append(float(np.mean(valid_values)) if valid_values else np.nan)
    pending_fills[node] = aggregated

# Phase 2: write the imputed series back to the graph
for node, aggregated in pending_fills.items():
    G.nodes[node]['depth'] = aggregated

# Print results (only the nodes that were actually updated)
print("\nUpdated node depths:")
for node in pending_fills:
    print(f"{node}: {G.nodes[node]['depth']}")

print(f'Total number of nodes with all NaN values: {count}')
print(f'Total number of nodes with at least one valid neighbor: {countn}')
print(f'Valid neighbors: {valid_neighbors_list}')
print(f'Invalid neighbors: {invalid_neighbors_list}')
if count:
    filled_pct = countn / count * 100
    print(f"Percentage of nodes which were filled: {filled_pct:.2f}%")
    print(f"Percentage of nodes which were not filled: {100 - filled_pct:.2f}%")
else:
    # Avoids ZeroDivisionError when no node needs filling
    print("No nodes with all NaN values; nothing to fill.")


Filling node 1014U
  Found valid neighbors: []
  No valid data found for node 1014U (remains NaN)

Filling node 1030U
  Found valid neighbors: []
  No valid data found for node 1030U (remains NaN)

Filling node 110076U1
  Found valid neighbors: []
  No valid data found for node 110076U1 (remains NaN)

Filling node 110076U2
  Found valid neighbors: ['110061C2']

Filling node 110126KRAT_out
  Found valid neighbors: []
  No valid data found for node 110126KRAT_out (remains NaN)

Filling node 110413I_out
  Found valid neighbors: []
  No valid data found for node 110413I_out (remains NaN)

Filling node 110414I_out
  Found valid neighbors: []
  No valid data found for node 110414I_out (remains NaN)

Filling node 110426_out
  Found valid neighbors: []
  No valid data found for node 110426_out (remains NaN)

Filling node 110516H_OUT
  Found valid neighbors: []
  No valid data found for node 110516H_OUT (remains NaN)

Filling node 110527I_out
  Found valid neighbors: []
  No valid data found f

In [159]:
# Remove every node whose depth series is still entirely NaN
# (nodes that could not be filled from any neighbor).
# The candidates are collected first so the graph is not mutated while
# its node view is being iterated.
unfillable_nodes = [
    candidate
    for candidate in G.nodes
    if all(np.isnan(value) for value in G.nodes[candidate]['depth'])
]

count = 0
for node in unfillable_nodes:
    G.remove_node(node)
    print(f"Node {node} removed due to all NaN values in depth data.")
    count += 1
print(f'Total number of nodes removed: {count}')



Node 1014U removed due to all NaN values in depth data.
Node 1030U removed due to all NaN values in depth data.
Node 110076U1 removed due to all NaN values in depth data.
Node 110126KRAT_out removed due to all NaN values in depth data.
Node 110413I_out removed due to all NaN values in depth data.
Node 110414I_out removed due to all NaN values in depth data.
Node 110426_out removed due to all NaN values in depth data.
Node 110516H_OUT removed due to all NaN values in depth data.
Node 110527I_out removed due to all NaN values in depth data.
Node 110528I_out removed due to all NaN values in depth data.
Node 1116U removed due to all NaN values in depth data.
Node 121589-1U2 removed due to all NaN values in depth data.
Node 130527A_out removed due to all NaN values in depth data.
Node 140784_INF_OUT removed due to all NaN values in depth data.
Node 141236U removed due to all NaN values in depth data.
Node 150055U removed due to all NaN values in depth data.
Node 150161U removed due to all N

In [160]:
# Replace remaining negative depth values with NaN and count the nodes that
# had any. The count is taken from the negative values themselves (before they
# are replaced); counting NaN afterwards would also include nodes whose NaN
# entries did not come from a negative value.
count = 0
for node in G.nodes:
    depths = G.nodes[node]['depth']
    # NaN < 0 is False, so existing NaN entries are neither counted nor changed
    if any(d < 0 for d in depths):
        count += 1
    G.nodes[node]['depth'] = [np.nan if d < 0 else d for d in depths]

print(f"{count} nodes still contain negative values in depth data (now turned into NaN).")



798 nodes still contain negative values in depth data (now turned into NaN).


In [161]:
# Impute the remaining NaN values in the graph using KNN.
# The depth series are stacked into a matrix with one row per node and one
# column per time step; KNNImputer fills each missing entry from the
# n_neighbors most similar rows.
node_list = list(G.nodes)
depth_matrix = np.array([G.nodes[node]['depth'] for node in node_list], dtype=float)

# Apply KNN imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform')
imputed_matrix = imputer.fit_transform(depth_matrix)

# KNNImputer drops columns that are entirely NaN by default, which would
# silently shorten every node's series; fail loudly instead.
if imputed_matrix.shape != depth_matrix.shape:
    raise ValueError(
        f"KNN imputation changed the matrix shape from {depth_matrix.shape} "
        f"to {imputed_matrix.shape}; a time step is NaN for every node."
    )

# Update graph with imputed values
for i, node in enumerate(node_list):
    G.nodes[node]['depth'] = imputed_matrix[i].tolist()

# Check if there are still NaN values (the message previously said
# "negative values", although NaN is what is being checked)
count = sum(
    any(np.isnan(d) for d in G.nodes[node]['depth'])
    for node in G.nodes
)
print(f"{count} nodes still contain NaN values in depth data.")


0 nodes still contain negative values in depth data.


In [162]:
# # # Let's add the data to the graph
# # for node in depnod_df.columns[1:]:
# #     if node in G.nodes:
# #         G.nodes[node]['depth'] = depnod_df[node].tolist()
# #     else:
# #         print(f"Node {node} not found in the graph.")

# print(G.nodes(data=True))
# print(f'The length of the graph is {len(G.nodes())}')

# # Now let's visualize the graph with depth values
# depth_values = [G.nodes[node].get('depth', [np.nan])[0] for node in G.nodes()]
# depth_values = np.array(depth_values)

# # Normalize depth values for color mapping
# norm = plt.Normalize(depth_values.min(), depth_values.max())
# cmap = plt.cm.viridis

# # Create the figure and axis objects
# fig, ax = plt.subplots(figsize=(10, 10))

# # Draw the graph
# nx.draw(G, pos=pos, node_size=10, node_color=cmap(norm(depth_values)), with_labels=False, ax=ax)

# # Create the colorbar using a ScalarMappable object
# sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
# sm.set_array([])  # Empty array because we're not plotting images

# # Add the colorbar with the correct label
# fig.colorbar(sm, ax=ax, label='Depth')

# plt.show()

In [163]:
# Load the node-volume file.
# The file is selected by name instead of matching_files[7], because that
# index depends on the directory listing order and on how many files match.
volume_path = os.path.join(folder_path, f"{prefix}volume.csv")

# Get rid of time column
volume_df = pd.read_csv(volume_path, skiprows=[1]).drop(columns=['Time'])

# Keep only the per-node columns
# NOTE(review): the depth file maps the -9998.0 sentinel to NaN; the volume
# file is used as-is here. Confirm whether it can contain the same sentinel.
df_node_volumes = volume_df.drop(columns=['Seconds'], errors='ignore')

# Loop through the nodes in the graph
for node_id in G.nodes:
    # Check if the node ID matches a column in the DataFrame
    if node_id in df_node_volumes.columns:
        # Assign the volume series to the node as an attribute
        G.nodes[node_id]['volume'] = df_node_volumes[node_id].tolist()

# print the first few nodes to check
print(list(G.nodes(data=True))[:5])


[('1001', {'id': '1001', 'x': 461000.65616613504, 'y': 1351690.2559001052, 'depth': [1.12051, 1.18279, 1.74043, 2.90192, 2.97986, 3.03615, 3.07451, 3.09978, 3.11576, 3.12408, 3.12541, 3.1221, 3.11706], 'volume': [0.02051, 0.08279, 0.64043, 11.9017, 27.48964, 38.74707, 46.41965, 51.47404, 54.66883, 56.33362, 56.59962, 55.93713, 54.92867]}), ('1002', {'id': '1002', 'x': 460883.5301818835, 'y': 1351714.5341153282, 'depth': [1.04101, 1.09897, 1.75588, 2.89637, 2.96546, 3.01849, 3.05703, 3.08344, 3.10009, 3.10857, 3.1098, 3.10637, 3.10133], 'volume': [0.02101, 0.07897, 0.73588, 7.67718, 21.89, 37.05985, 50.48392, 60.9431, 68.09482, 71.9045, 72.46945, 70.90369, 68.6431]}), ('1003', {'id': '1003', 'x': 460754.625982409, 'y': 1351744.3241415746, 'depth': [0.91088, 0.95011, 1.66943, 2.88485, 2.94737, 2.98897, 3.02149, 3.04463, 3.05973, 3.06772, 3.0692, 3.0664, 3.06206], 'volume': [0.02088, 0.06011, 0.77943, 2.74993, 7.99661, 15.80265, 23.66128, 30.07642, 34.65122, 37.20094, 37.68631, 36.77635, 

In [164]:
# Collect the graph nodes that did not receive a 'volume' attribute
node_ids = list(G.nodes())
missing_nodes = []
for candidate in node_ids:
    if 'volume' not in G.nodes[candidate]:
        missing_nodes.append(candidate)
print(f"Missing nodes: {missing_nodes}")


Missing nodes: ['110076U2', '143254', '143255', '143307U', '170155U2', '334126U', '334148', '336315', '4131', '4134', '4310U1', '4349U-1', '4376u', '4379U', '4388U', '4514R', '6209U', '6217U', '6224U', '6257U', '6260U', '6459U', '6707', '6804', '6811U1', '6811U2', 'D6007', 'D6009']


In [165]:
# Delete nodes with missing volume data.
# G.remove_node raises NetworkXError for a node that is no longer in the
# graph, so the membership guard keeps this cell safe to re-run.
for node in missing_nodes:
    if node not in G:
        continue
    G.remove_node(node)
    print(f"Node {node} removed due to missing volume data.")

Node 110076U2 removed due to missing volume data.
Node 143254 removed due to missing volume data.
Node 143255 removed due to missing volume data.
Node 143307U removed due to missing volume data.
Node 170155U2 removed due to missing volume data.
Node 334126U removed due to missing volume data.
Node 334148 removed due to missing volume data.
Node 336315 removed due to missing volume data.
Node 4131 removed due to missing volume data.
Node 4134 removed due to missing volume data.
Node 4310U1 removed due to missing volume data.
Node 4349U-1 removed due to missing volume data.
Node 4376u removed due to missing volume data.
Node 4379U removed due to missing volume data.
Node 4388U removed due to missing volume data.
Node 4514R removed due to missing volume data.
Node 6209U removed due to missing volume data.
Node 6217U removed due to missing volume data.
Node 6224U removed due to missing volume data.
Node 6257U removed due to missing volume data.
Node 6260U removed due to missing volume data

In [166]:
# import pickle

# with open(multiple_pickle_path, "wb") as f:
#     pickle.dump(G, f)

In [167]:
# # Visualize the depth of the nodes in the graph.

# # Select the first 100 nodes from the graph (ensure the graph has at least 100 nodes)
# first_100_nodes = list(G.nodes())[:100]

# # Get the positions for only the first 100 nodes
# pos_sample = {node: pos[node] for node in first_100_nodes}

# # Get depth values for the first 100 nodes
# depth_values_sample = [G.nodes[node].get('depth', [np.nan])[6] for node in first_100_nodes]
# depth_values_sample = np.array(depth_values_sample)

# # Normalize the depth values for color mapping
# norm = plt.Normalize(depth_values_sample.min(), depth_values_sample.max())
# cmap = plt.cm.viridis

# # Create the figure and axis
# fig, ax = plt.subplots(figsize=(10, 10))

# # Draw only the first 100 nodes with the depth-based coloring
# nx.draw_networkx_nodes(G, pos=pos_sample, nodelist=first_100_nodes, node_size=50, node_color=cmap(norm(depth_values_sample)), ax=ax)

# # Create the colorbar using a ScalarMappable object
# sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)
# sm.set_array([])  # Empty array because we're not plotting images

# # Add the colorbar with the correct label
# fig.colorbar(sm, ax=ax, label='Depth')

# plt.show()

In [168]:
# Flag nodes whose mean depth lies more than 3 standard deviations from the
# mean of all node means (z-score outlier test).

# Mean depth per node, kept both as a dict and as an array in node order
node_order = list(G.nodes())
depth_values = np.array([np.nanmean(G.nodes[n]['depth']) for n in node_order])
depth_means = dict(zip(node_order, depth_values))

# Mean and standard deviation across the per-node means
overall_mean = np.nanmean(depth_values)
overall_std = np.nanstd(depth_values)

# z-score of every node mean
depth_z_scores = (depth_values - overall_mean) / overall_std

# Positions of the nodes with |z| > 3, and the matching node ids
is_outlier = np.abs(depth_z_scores) > 3
outliers = np.where(is_outlier)[0]
outlier_nodes = [node_order[idx] for idx in outliers]

# Print the results
print(f"Outlier nodes: {outlier_nodes}")
print(f"Outlier depth values: {depth_values[outliers]}")
print(f"Amount of outliers: {len(outliers)}")


Outlier nodes: ['3087', '3100', '3101', '3102', '3103', '3104', '3179', '3180', '3181', '3182', '3184', '3185', '3186', '3187', '3188', '3189', '3190', '3191', '3280', '3281', '3282', '3293', '3773']
Outlier depth values: [5.71946692 5.50469846 5.71244231 5.69155231 5.43588846 5.06078308
 4.91508692 5.13293154 5.18190692 5.19843923 4.88576    5.23084077
 5.48365462 5.61741692 5.65568923 5.70609692 5.72028538 5.72910154
 6.40630923 6.31389154 5.88614769 5.67827538 5.3236    ]
Amount of outliers: 23


In [169]:
# print('Length of the graph before removing outliers:', len(G.nodes()))
# # Remove outlier nodes from the graph
# for node in list(G.nodes):
#     if node in outlier_nodes:
#         G.remove_node(node)
#         print(f"Node {node} removed due to being an outlier.")

# print('Length of the graph after removing outliers:', len(G.nodes()))

In [170]:
# import pickle

# with open("../pickle/depnod_graph_10.pkl", "wb") as f:
#     pickle.dump(G, f)


In [171]:
# Count the regular files (sub-directories are not counted) in the run folder.
# `os` is already imported in the notebook's first cell.
folder_path = '../Run_C2100_Reeks_DWA_20250422_000959'

file_count = sum(
    1
    for entry in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry))
)

print(f"Number of files: {file_count}")

Number of files: 4016
