In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import re
import os

In [63]:
# Run configuration: these three values together select the simulation output
# folder that is read (see data_path below).
experiment_series ="20250509"
# "Regen" is German for "rain"; presumably the rainfall scenario — TODO confirm.
precipitation_setting = "Regen"
decay_setting = "decay"

# Folder containing the raw simulation result text files.
data_path = f"../simulation_results/wastewater/{experiment_series}/{decay_setting}/{precipitation_setting}"
# Folder for the tidy CSV exports. It already ends with "/", and the export
# cells append "/substances/" or "/hydraulics/", so the final paths contain "//".
result_path = f"preprocessed_data/{experiment_series}/"

## substances

In [64]:
# Simulation run identifier: interpolated into the result file names and
# stored in the `simulation_id` column of the parsed frames.
memilio_id = 99

In [65]:
def tidy_substances(data_path, memilio_id):
    """Parse one substances result text file into a tidy (long) DataFrame.

    Reads ``{data_path}/INSIDe_substances_results{memilio_id}_output_v4.txt``
    line by line and recognises three kinds of lines:

    * manhole markers beginning with
      ``INSIDe_substances_results_{memilio_id}_output_v4_manhole_<MUCnnn>.txt``,
      which set the manhole for the following data lines;
    * headers of the form ``time[min] <name>(<unit>) concentration``, which
      set the variable name for the following data lines;
    * data lines ``<integer time> <number>``, which become one record each,
      provided a manhole and a variable have already been seen.

    Blank lines, ``##`` lines and anything that matches none of the patterns
    are ignored.

    Parameters
    ----------
    data_path : str
        Directory that contains the result file.
    memilio_id : int or str
        Simulation identifier used in the file name and in the manhole marker
        lines; it is copied into the ``simulation_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_in_minutes``, ``variable``, ``value``, ``manhole``,
        ``time_in_days`` and ``simulation_id``. Rows whose variable is ``"T"``
        are removed. If no data line could be parsed the frame is empty but
        still has these columns.

    Raises
    ------
    FileNotFoundError
        If the result file does not exist.
    ValueError
        If a data line's value token is not a valid float (e.g. ``1.2.3``).
    """
    file_name = f'INSIDe_substances_results{memilio_id}_output_v4.txt'
    file_path = f"{data_path}/{file_name}"

    # Fixed schema so that an empty parse result still yields a usable frame
    # instead of failing on a missing column.
    record_columns = ["time_in_minutes", "variable", "value", "manhole"]

    records = []
    current_variable = None
    current_manhole = None

    # Escape the id so that it is always matched literally inside the regex.
    id_literal = re.escape(str(memilio_id))
    manhole_pattern = re.compile(
        fr'INSIDe_substances_results_{id_literal}_output_v4_manhole_(MUC\d+)\.txt'
    )
    header_pattern = re.compile(r'time\[min\]\s+(\w+)\([^)]+\) concentration')
    # '+' is accepted so that values written as e.g. "1.2e+03" are not
    # silently dropped.
    data_pattern = re.compile(r'^(\d+)\s+([-+\d.eE]+)$')

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()

            # Blank lines and "##" separator lines carry no information.
            if not line or line == '##':
                continue

            manhole_match = manhole_pattern.match(line)
            if manhole_match:
                current_manhole = manhole_match.group(1)
                continue

            header_match = header_pattern.match(line)
            if header_match:
                current_variable = header_match.group(1)
                continue

            # Data lines only count once their manhole and variable are known.
            data_match = data_pattern.match(line)
            if data_match and current_variable and current_manhole:
                records.append({
                    "time_in_minutes": int(data_match.group(1)),
                    "variable": current_variable,
                    "value": float(data_match.group(2)),
                    "manhole": current_manhole
                })

    df = pd.DataFrame(records, columns=record_columns)
    # "T" series are excluded from the tidy output (presumably temperature —
    # TODO confirm). Copy so the new columns are added to an independent frame.
    df = df.loc[df["variable"] != "T"].copy()
    df["time_in_days"] = df["time_in_minutes"] / (24 * 60)
    df["simulation_id"] = memilio_id
    return df

In [66]:
# Parse the substances result file of the configured run into a tidy frame.
df = tidy_substances(data_path, memilio_id)

# Make sure the export folder exists, then write the frame as CSV without the
# index column; the file name encodes decay setting, precipitation setting and run id.
os.makedirs(f"{result_path}/substances/", exist_ok=True)
df.to_csv(f"{result_path}/substances/{decay_setting}_{precipitation_setting}_{memilio_id}_output.csv", index=False)

In [67]:
# Sanity check: list the distinct manhole IDs found in the substances file.
df.manhole.unique()

array(['MUC012', 'MUC060', 'MUC112', 'MUC348', 'MUC362', 'MUC434',
       'MUC486', 'MUC494', 'MUC560', 'MUC562', 'MUC586', 'MUC596',
       'MUC600', 'MUC608', 'MUC612', 'MUC614', 'MUC616'], dtype=object)

In [68]:
# Sanity check: list the distinct variables left after "T" has been removed by tidy_substances.
df.variable.unique()

array(['COV19', 'PMMoV'], dtype=object)

In [69]:
# Peek at the first rows of the tidy substances frame.
df.head()

Unnamed: 0,time_in_minutes,variable,value,manhole,time_in_days,simulation_id
0,15,COV19,3.65069,MUC012,0.010417,99
1,30,COV19,3.64318,MUC012,0.020833,99
2,45,COV19,3.63653,MUC012,0.03125,99
3,60,COV19,3.6303,MUC012,0.041667,99
4,75,COV19,3.63477,MUC012,0.052083,99


In [70]:
# Consistency check: count how often each time stamp occurs among the COV19 rows
# and take the standard deviation of those counts. A result of 0 means every
# time stamp occurs equally often.
df.loc[df.variable=="COV19", "time_in_minutes"].value_counts().describe()["std"]

np.float64(0.0)

In [71]:
# Summary statistics of the COV19 values across all manholes and time steps.
df.loc[df.variable=="COV19", "value"].describe()

count    150161.000000
mean       1456.656135
std        2810.323583
min           0.000000
25%         162.074460
50%         746.470610
75%        1925.384750
max       68506.015510
Name: value, dtype: float64

In [72]:
# Summary statistics of the PMMoV values across all manholes and time steps.
df.loc[df.variable=="PMMoV", "value"].describe()

count    150161.000000
mean         75.677184
std          18.442026
min           0.000000
25%          81.474520
50%          83.214980
75%          83.326730
max          83.333340
Name: value, dtype: float64

## hydraulic results

In [73]:
def tidy_hydraulics(memilio_id, data_path):
    """Parse one hydraulic result text file into a tidy (long) DataFrame.

    Reads ``{data_path}/INSIDe_hydraulic_results{memilio_id}_output_v4.txt``
    line by line:

    * lines beginning with
      ``INSIDe_hydraulic_results_{memilio_id}_output_v4_pipe_<MUCnnn>`` set
      the pipe for the following data lines;
    * lines starting with ``t [min]`` are column headers and are skipped;
    * lines made of two numeric tokens ``<time> <value>`` become one record
      each, provided a pipe has already been seen. A decimal comma is
      converted to a decimal point before conversion.

    Blank lines, ``##`` lines and anything else are ignored. The file holds a
    single measured quantity per pipe (presumably the flow rate; units are not
    visible here — TODO confirm).

    Parameters
    ----------
    memilio_id : int or str
        Simulation identifier used in the file name and in the pipe marker
        lines; it is copied into the ``simulation_id`` column.
    data_path : str
        Directory that contains the result file.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_in_minutes``, ``value``, ``pipe_number``,
        ``time_in_days`` and ``simulation_id``. If no data line could be
        parsed the frame is empty but still has these columns.

    Raises
    ------
    FileNotFoundError
        If the result file does not exist.
    """
    file_name = f'INSIDe_hydraulic_results{memilio_id}_output_v4.txt'
    file_path = f"{data_path}/{file_name}"

    # Fixed schema so that an empty parse result still yields a usable frame
    # instead of failing on a missing column.
    record_columns = ["time_in_minutes", "value", "pipe_number"]

    records = []
    current_pipe = None

    # Escape the id so that it is always matched literally inside the regex.
    id_literal = re.escape(str(memilio_id))
    pipe_pattern = re.compile(
        fr'INSIDe_hydraulic_results_{id_literal}_output_v4_pipe_(MUC\d+)'
    )
    # Signs and exponents are accepted (as in the substances parser) so that
    # negative or scientific-notation numbers are not silently dropped.
    data_pattern = re.compile(r'^([-+\d.,eE]+)\s+([-+\d.,eE]+)$')

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line == "##":
                continue

            pipe_match = pipe_pattern.match(line)
            if pipe_match:
                current_pipe = pipe_match.group(1)
                continue

            # Column header lines carry no data.
            if line.startswith("t [min]"):
                continue

            data_match = data_pattern.match(line)
            if not (data_match and current_pipe):
                continue

            try:
                # Decimal commas are normalised to points before conversion.
                minutes = float(data_match.group(1).replace(',', '.'))
                value = float(data_match.group(2).replace(',', '.'))
            except ValueError:
                # Best effort: tokens that look numeric but are not valid
                # floats (e.g. "1.234.5") are skipped.
                continue

            records.append({
                "time_in_minutes": minutes,
                "value": value,
                "pipe_number": current_pipe
            })

    df = pd.DataFrame(records, columns=record_columns)
    df["time_in_days"] = df["time_in_minutes"] / (24 * 60)
    df["simulation_id"] = memilio_id
    return df

In [74]:
# Parse the hydraulic result file of the configured run. Note that this rebinds
# `df`, which held the substances frame in the cells above.
df = tidy_hydraulics(memilio_id, data_path)

# Make sure the export folder exists, then write the frame as CSV without the
# index column; the file name encodes decay setting, precipitation setting and run id.
os.makedirs(f"{result_path}/hydraulics/", exist_ok=True)
df.to_csv(f"{result_path}/hydraulics/{decay_setting}_{precipitation_setting}_{memilio_id}_output.csv", index=False)

In [75]:
# Peek at the first rows of the tidy hydraulics frame.
df.head()

Unnamed: 0,time_in_minutes,value,pipe_number,time_in_days,simulation_id
0,1.0,0.004054,MUC614,0.000694,99
1,2.0,0.00429,MUC614,0.001389,99
2,3.0,0.004489,MUC614,0.002083,99
3,4.0,0.004743,MUC614,0.002778,99
4,5.0,0.005045,MUC614,0.003472,99


In [76]:
# Sanity check: list the distinct pipe IDs found in the hydraulics file.
# NOTE(review): the recorded output contains 18 pipes including 'MUC4861', while
# the substances file yielded 17 manholes and no such ID — confirm this is
# expected and not a naming/parsing artefact.
df.pipe_number.unique()

array(['MUC614', 'MUC060', 'MUC4861', 'MUC562', 'MUC596', 'MUC494',
       'MUC612', 'MUC434', 'MUC486', 'MUC012', 'MUC112', 'MUC560',
       'MUC362', 'MUC600', 'MUC616', 'MUC608', 'MUC586', 'MUC348'],
      dtype=object)

In [77]:
# Summary statistics of the hydraulic values across all pipes and time steps.
df.value.describe()

count    1.915904e+06
mean     9.841649e-01
std      2.817039e+00
min      0.000000e+00
25%      3.064479e-02
50%      1.909400e-01
75%      1.304410e+00
max      7.448901e+01
Name: value, dtype: float64