In [28]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("../mydata/datasets_for_model/GP_rolling_window_changepoints_NO2_entire.csv")
df["Start"] = pd.to_datetime(df["Start"])
df["Start"] = df["Start"].dt.tz_localize("Europe/Dublin") 

print(len(df))

# Sort by timestamp
changepoints_sorted = df["Start"].sort_values().reset_index(drop=True)

# Set threshold for merging (you can change this to 6h, 12h, 24h etc.)
threshold = pd.Timedelta(hours=1)

clusters = []  # list to store clusters of close changepoints
current_cluster = [changepoints_sorted[0]]

# Group changepoints into clusters based on the threshold
for cp in changepoints_sorted[1:]:
    if cp - current_cluster[-1] <= threshold:
        current_cluster.append(cp)
    else:
        clusters.append(current_cluster)
        current_cluster = [cp]
clusters.append(current_cluster)  # add the last cluster

# For each cluster, choose the final timestamp
# Options: min (earliest), max (latest), or median (middle)
final_changepoints = [pd.Series(cluster).median() for cluster in clusters]

# Create final DataFrame
final_cp_df = pd.DataFrame({"Final_Changepoints": final_changepoints})

# Save to CSV if needed
final_cp_df.to_csv("final_changepoints.csv", index=False)

print(final_cp_df)
print(f"Total final changepoints: {len(final_cp_df)}")


200
           Final_Changepoints
0   2023-08-06 18:00:00+01:00
1   2023-08-11 18:00:00+01:00
2   2023-08-16 16:00:00+01:00
3   2023-08-16 18:00:00+01:00
4   2023-08-18 15:00:00+01:00
..                        ...
154 2024-06-16 00:00:00+01:00
155 2024-06-21 00:00:00+01:00
156 2024-06-26 00:00:00+01:00
157 2024-06-28 14:30:00+01:00
158 2024-06-29 00:00:00+01:00

[159 rows x 1 columns]
Total final changepoints: 159


In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,Window_Start,Window_End,Segment,Start,End,Mean,CI_low,CI_high,mean_CI_nonoverlap,MergeGroup
0,1,2023-08-06 18:00:00,2023-08-16 18:00:00,1,2023-08-06 18:00:00+01:00,2023-08-16 17:00:00,8.926203,8.404168,9.49371,,1
1,2,2023-08-11 18:00:00,2023-08-21 18:00:00,1,2023-08-11 18:00:00+01:00,2023-08-16 15:00:00,8.591655,7.871712,9.267392,,1
2,3,2023-08-11 18:00:00,2023-08-21 18:00:00,2,2023-08-16 16:00:00+01:00,2023-08-18 14:00:00,16.354886,14.702395,18.001092,True,2
3,4,2023-08-11 18:00:00,2023-08-21 18:00:00,3,2023-08-18 15:00:00+01:00,2023-08-21 17:00:00,6.985233,6.111929,7.99834,True,3
4,5,2023-08-16 18:00:00,2023-08-26 18:00:00,1,2023-08-16 18:00:00+01:00,2023-08-18 14:00:00,16.219467,14.601717,17.75373,,1


In [30]:
df = pd.read_csv("../mydata/datasets_for_model/cleaned/GP_cleaned.csv")
df.tail()

Unnamed: 0.1,Unnamed: 0,Timestamp,counter.1,counter.2,rain,temp,wetb,dewpt,vappr,rhum,...,wddir,ww,w,sun,vis,clht,clamt,NO2,date,hour
6215,6216,30-06-2024 19:00,467,1443.0,0.0,14.4,-49.0,9.1,11.6,71,...,290,60,62,0.0,40000,70,7,13.420667,30-06-2024,19
6216,6217,30-06-2024 20:00,331,1099.0,0.0,13.2,-49.0,9.9,12.2,80,...,320,60,62,0.0,35000,60,7,15.303667,30-06-2024,20
6217,6218,30-06-2024 21:00,246,737.0,0.2,12.2,-49.0,10.5,12.7,89,...,290,21,62,0.0,35000,60,7,12.910333,30-06-2024,21
6218,6219,30-06-2024 22:00,152,576.0,0.1,11.9,-49.0,10.1,12.4,89,...,300,60,62,0.0,30000,12,7,11.412,30-06-2024,22
6219,6220,30-06-2024 23:00,131,349.0,0.0,12.1,-49.0,10.1,12.4,88,...,300,21,62,0.0,30000,50,7,7.827,30-06-2024,23


In [31]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Load final changepoints and full NO2 dataset
final_cp = pd.read_csv("final_changepoints.csv")
final_cp["Final_Changepoints"] = pd.to_datetime(final_cp["Final_Changepoints"], utc=True)
final_cp["Final_Changepoints"] = final_cp["Final_Changepoints"].dt.tz_convert("Europe/Dublin") 


df = pd.read_csv("../mydata/datasets_for_model/cleaned/GP_cleaned.csv")
df["Timestamp"] = pd.to_datetime(df["Timestamp"], format='%d-%m-%Y %H:%M')
df["Timestamp"] = df["Timestamp"].dt.tz_localize("Europe/Dublin") 
df = df.sort_values("Timestamp")
# Ensure changepoints are sorted
final_cp = final_cp.sort_values("Final_Changepoints").reset_index(drop=True)

In [32]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Timestamp,counter.1,counter.2,rain,temp,wetb,dewpt,vappr,rhum,...,wddir,ww,w,sun,vis,clht,clamt,NO2,date,hour
6215,6216,2024-06-30 19:00:00+01:00,467,1443.0,0.0,14.4,-49.0,9.1,11.6,71,...,290,60,62,0.0,40000,70,7,13.420667,30-06-2024,19
6216,6217,2024-06-30 20:00:00+01:00,331,1099.0,0.0,13.2,-49.0,9.9,12.2,80,...,320,60,62,0.0,35000,60,7,15.303667,30-06-2024,20
6217,6218,2024-06-30 21:00:00+01:00,246,737.0,0.2,12.2,-49.0,10.5,12.7,89,...,290,21,62,0.0,35000,60,7,12.910333,30-06-2024,21
6218,6219,2024-06-30 22:00:00+01:00,152,576.0,0.1,11.9,-49.0,10.1,12.4,89,...,300,60,62,0.0,30000,12,7,11.412,30-06-2024,22
6219,6220,2024-06-30 23:00:00+01:00,131,349.0,0.0,12.1,-49.0,10.1,12.4,88,...,300,21,62,0.0,30000,50,7,7.827,30-06-2024,23


In [33]:
all_cps = [df["Timestamp"].min()] + final_cp["Final_Changepoints"].tolist()[1:-1] + [df["Timestamp"].max()]
all_cps

[Timestamp('2023-08-06 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-11 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-16 16:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-16 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-18 15:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-21 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-22 15:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-23 05:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-23 10:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-23 19:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-23 21:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-26 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-08-31 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-09-03 15:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-09-05 18:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-09-09 23:00:00+0100', tz='Europe/Dublin'),
 Timestamp('2023-09-10 18:00:00+0100', t

In [34]:

# Add start & end bounds
all_cps = [df["Timestamp"].min()] + final_cp["Final_Changepoints"].tolist()[1:-1] + [df["Timestamp"].max()]

segment_stats = []

for i in range(len(all_cps)-1):
    seg_data = df[(df["Timestamp"] >= all_cps[i]) & (df["Timestamp"] < all_cps[i+1])]["NO2"].dropna()

    if len(seg_data) == 0:
        continue

    # Bootstrap mean
    boot_means = []
    for _ in range(500):  # 500 resamples
        sample = resample(seg_data)
        boot_means.append(sample.mean())

    ci_low = np.percentile(boot_means, 2.5)
    ci_high = np.percentile(boot_means, 97.5)

    segment_stats.append({
        "Segment": i+1,
        "Start": all_cps[i],
        "End": all_cps[i+1],
        "Mean_NO2": seg_data.mean(),
        "CI_low": ci_low,
        "CI_high": ci_high
    })

segment_df = pd.DataFrame(segment_stats)

# Check if adjacent segments overlap
segment_df["Overlap_with_prev"] = [np.nan] + [
    not (segment_df.iloc[i]["CI_low"] > segment_df.iloc[i-1]["CI_high"] or
         segment_df.iloc[i-1]["CI_low"] > segment_df.iloc[i]["CI_high"])
    for i in range(1, len(segment_df))
]

segment_df.to_csv("final_segments_with_bootstrap.csv", index=False)
print(segment_df.head())


   Segment                     Start                       End   Mean_NO2  \
0        1 2023-08-06 18:00:00+01:00 2023-08-11 18:00:00+01:00   9.080582   
1        2 2023-08-11 18:00:00+01:00 2023-08-16 16:00:00+01:00   8.591655   
2        3 2023-08-16 16:00:00+01:00 2023-08-16 18:00:00+01:00  19.401833   
3        4 2023-08-16 18:00:00+01:00 2023-08-18 15:00:00+01:00  16.219467   
4        5 2023-08-18 15:00:00+01:00 2023-08-21 18:00:00+01:00   6.985233   

      CI_low    CI_high Overlap_with_prev  
0   8.162900   9.938886               NaN  
1   7.879322   9.288494              True  
2  17.613667  21.190000             False  
3  14.569792  17.975119              True  
4   6.257935   7.882766             False  


In [35]:
segment_df.head(10)

Unnamed: 0,Segment,Start,End,Mean_NO2,CI_low,CI_high,Overlap_with_prev
0,1,2023-08-06 18:00:00+01:00,2023-08-11 18:00:00+01:00,9.080582,8.1629,9.938886,
1,2,2023-08-11 18:00:00+01:00,2023-08-16 16:00:00+01:00,8.591655,7.879322,9.288494,True
2,3,2023-08-16 16:00:00+01:00,2023-08-16 18:00:00+01:00,19.401833,17.613667,21.19,False
3,4,2023-08-16 18:00:00+01:00,2023-08-18 15:00:00+01:00,16.219467,14.569792,17.975119,True
4,5,2023-08-18 15:00:00+01:00,2023-08-21 18:00:00+01:00,6.985233,6.257935,7.882766,False
5,6,2023-08-21 18:00:00+01:00,2023-08-22 15:00:00+01:00,7.328587,6.102139,8.74576,True
6,7,2023-08-22 15:00:00+01:00,2023-08-23 05:00:00+01:00,10.784786,9.29812,12.393558,False
7,8,2023-08-23 05:00:00+01:00,2023-08-23 10:00:00+01:00,24.9874,18.102933,29.600467,False
8,9,2023-08-23 10:00:00+01:00,2023-08-23 19:00:00+01:00,9.02537,5.268032,13.94576,False
9,10,2023-08-23 19:00:00+01:00,2023-08-23 21:00:00+01:00,9.489167,5.96,13.018333,True


In [36]:
import pandas as pd

merged_segments = []
for i, row in segment_df.iterrows():
    if i == 0:
        merged_segments.append(row.to_dict())
    else:
        prev = merged_segments[-1]

        if str(row["Overlap_with_prev"]) == "True":
            # Merge with previous segment
            new_entry = {
                "Segment": prev["Segment"],  # keep previous segment number
                "Start": prev["Start"],
                "End": row["End"],
                # Weighted mean using segment durations
                "Mean_NO2": (prev["Mean_NO2"] + row["Mean_NO2"]) / 2,
                "CI_low": min(prev["CI_low"], row["CI_low"]),
                "CI_high": max(prev["CI_high"], row["CI_high"]),
                "Overlap_with_prev": False
            }
            merged_segments[-1] = new_entry  # replace previous entry
        else:
            merged_segments.append(row.to_dict())

merged_df = pd.DataFrame(merged_segments)
merged_df.to_csv("GP/full_merged_final_segments.csv", index=False)
print(merged_df.head())


   Segment                     Start                       End   Mean_NO2  \
0        1 2023-08-06 18:00:00+01:00 2023-08-16 16:00:00+01:00   8.836119   
1        3 2023-08-16 16:00:00+01:00 2023-08-18 15:00:00+01:00  17.810650   
2        5 2023-08-18 15:00:00+01:00 2023-08-22 15:00:00+01:00   7.156910   
3        7 2023-08-22 15:00:00+01:00 2023-08-23 05:00:00+01:00  10.784786   
4        8 2023-08-23 05:00:00+01:00 2023-08-23 10:00:00+01:00  24.987400   

      CI_low    CI_high  Overlap_with_prev  
0   7.879322   9.938886              False  
1  14.569792  21.190000              False  
2   6.102139   8.745760              False  
3   9.298120  12.393558              False  
4  18.102933  29.600467              False  


In [121]:
len(merged_df)

146

In [89]:
merged_df.head()

Unnamed: 0,Segment,Start,End,Mean_NO2,CI_low,CI_high,Overlap_with_prev
0,1,2023-01-01 00:00:00+00:00,2023-01-16 00:00:00+00:00,30.578458,28.698123,32.613665,
1,2,2023-01-16 00:00:00+00:00,2023-02-12 00:00:00+00:00,42.397236,36.056039,51.510518,False
2,5,2023-02-12 00:00:00+00:00,2023-02-26 00:00:00+00:00,25.979296,23.797948,28.090957,False
3,7,2023-02-26 00:00:00+00:00,2023-03-11 23:30:00+00:00,32.114651,29.636493,35.454993,False
4,9,2023-03-11 23:30:00+00:00,2023-03-26 00:00:00+00:00,25.311875,23.764761,26.797289,False


In [35]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Timestamp,counter.1,counter.2,rain,temp,wetb,dewpt,vappr,rhum,...,wddir,ww,w,sun,vis,clht,clamt,NO2,date,hour
6215,6216,2024-06-30 19:00:00+01:00,467,1443.0,0.0,14.4,-49.0,9.1,11.6,71,...,290,60,62,0.0,40000,70,7,13.420667,30-06-2024,19
6216,6217,2024-06-30 20:00:00+01:00,331,1099.0,0.0,13.2,-49.0,9.9,12.2,80,...,320,60,62,0.0,35000,60,7,15.303667,30-06-2024,20
6217,6218,2024-06-30 21:00:00+01:00,246,737.0,0.2,12.2,-49.0,10.5,12.7,89,...,290,21,62,0.0,35000,60,7,12.910333,30-06-2024,21
6218,6219,2024-06-30 22:00:00+01:00,152,576.0,0.1,11.9,-49.0,10.1,12.4,89,...,300,60,62,0.0,30000,12,7,11.412,30-06-2024,22
6219,6220,2024-06-30 23:00:00+01:00,131,349.0,0.0,12.1,-49.0,10.1,12.4,88,...,300,21,62,0.0,30000,50,7,7.827,30-06-2024,23
