# Water Level Data

### Code Summary:

- Detrending, normalizing, and shifting data to zero mean
- Subsampled data plots
- Tapered data plots
- Interactive plotly plots (note: plotly plots work best when the notebook file size is small)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
import scipy.stats as ss
from sklearn.preprocessing import MinMaxScaler
import scipy.fft
import pywt
import plotly.graph_objects as go
import plotly.tools as tls
from plotly.subplots import make_subplots
from scipy.signal.windows import hann
from scipy import interpolate
from scipy.ndimage import uniform_filter

In [None]:
# Load the Toronto daily-mean water level record, using the observation date column as the index.
df1 = pd.read_csv("Toronto-daily-mean-water-1960-2024.csv", index_col="Obs_date")

# Missing-value check over every cell of the dataframe:
any_null = df1.isna().values.any()
print(any_null)  # Prints False when the dataframe holds no null values

***
### Plotting the Time Series
#### Notes:
- Plot of the original water level data with no changes made
- Use the horizontal slider to zoom in/out of certain date ranges

In [None]:
# Interactive plot of the unmodified time series
raw_levels = df1['SLEV(metres)']

fig = go.Figure(data=[go.Scatter(x=list(df1.index), y=list(raw_levels), mode='lines', name='Time Series')])
# Title, figure size and axis labels
fig.update_layout(
    title_text="Toronto Daily Mean Water Level (1960-2024)",
    title_x=0.5,
    width=1100,
    height=600,
    xaxis_title='Date',
    yaxis_title='Water Level (m)',
)
# Range slider below the x-axis for zooming into a date range
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# Descriptive statistics of the raw series
print(ss.describe(raw_levels))
for stat_label, stat_value in (
    ("mode:", ss.mode(raw_levels)),
    ("standard deviation:", raw_levels.std()),
    ("median:", raw_levels.median()),
):
    print(stat_label, stat_value)

***
### Normalizing, Detrending and Shifting Data
#### Notes:
- data range of time series was normalized using MinMaxScaler from sklearn
- data was linearly detrended using signal.detrend from scipy
- the mean of the data was shifted to approximately zero, as shown by the red dashed line on the plot

In [None]:
# Linearly detrend data, normalize and shift:
# 1) Remove the linear trend from the raw water levels.
df1['Detrended'] = signal.detrend(df1['SLEV(metres)'], type='linear')

# 2) Min-max scale the detrended values (the resulting range is printed as a check).
scaler = MinMaxScaler()
df1['Normalized'] = scaler.fit_transform(df1['Detrended'].values.reshape(-1, 1)).flatten()
print(f"Min-Max normalized range: [{df1['Normalized'].min():.3f}, {df1['Normalized'].max():.3f}]")

# 3) Shift Mean: subtracting 0.5 re-centres the scaled range around zero. The mean
#    that results from this shift is printed below so it can be compared with zero.
df1['Normalized_Shifted'] = (df1['Normalized'] - 0.5)
print(f"New range: [{df1['Normalized_Shifted'].min():.3f}, {df1['Normalized_Shifted'].max():.3f}]")
print(f"New mean: {df1['Normalized_Shifted'].mean():.6f}")

# Plot:
dshifted_norm_plot = go.Figure()
dshifted_norm_plot.add_trace(go.Scatter(x= list(df1.index), y= list(df1['Normalized_Shifted']), mode='lines', name='Time Series'))
dshifted_norm_plot.update_layout(title_text="Toronto Daily Mean Water Level (1960-2024) <br> (Detrended, Normalized, Shifted)",
                                 width=1100, height=600, xaxis_title='Date', yaxis_title='Normalized Water Level', title_x=0.5)
dshifted_norm_plot.update_layout(xaxis=dict(rangeslider=dict(visible=True)))

# Calculate the overall mean.
# The value is NOT rounded to an integer: np.round(mean) with no decimals would snap any
# mean in (-0.5, 0.5) to 0, so the line would show 0 regardless of the real mean.
overall_mean = float(df1['Normalized_Shifted'].mean())
# Add a horizontal line at the overall mean (label shows the mean to 4 decimals)
dshifted_norm_plot.add_shape(type="line", x0=df1.index.min(), y0=overall_mean, x1=df1.index.max(), y1=overall_mean,
                    line=dict(color="Red", width=2, dash="dash"), name=f"Overall Mean ({overall_mean:.4f})", showlegend=True)

dshifted_norm_plot.show()

# Stats for plot
print(ss.describe(df1['Normalized_Shifted']))
print("median:", df1["Normalized_Shifted"].median())
print("mode:", ss.mode(df1["Normalized_Shifted"]))
print("standard deviation:", df1["Normalized_Shifted"].std())

***
### Subsampling the Time Series
#### Notes:
- different step sizes were applied to the normalized, detrended and mean-shifted (DNS) data for subsampling
- click on the legend titles to select or hide certain time series
- use the horizontal slider to zoom in/out of certain date ranges

In [None]:
# Subsampling time series:
# Number of data points in df1['Normalized_Shifted']: 23292
n_points = len(df1['Normalized_Shifted'])

# Length-relative steps. Each is clamped to at least 1 because the integer division
# yields 0 when the series is shorter than the divisor, and 0 is not a valid slice step.
step1 = max(1, n_points // 100)
step2 = max(1, n_points // 1000)
step3 = max(1, n_points // 10000)
# Fixed steps
step4 = 1000
step5 = 100
step6 = 10

# Steps overlaid on the full series in the plot below.
# Add any of step1 ... step6 to this list to compare further subsampling intervals.
plotted_steps = [52]

# Plot:
dshifted_norm_plot = go.Figure()
dshifted_norm_plot.add_trace(go.Scatter(x= list(df1.index), y= list(df1['Normalized_Shifted']), mode='lines', name='Time Series (no step)'))
for plot_step in plotted_steps:
    dshifted_norm_plot.add_trace(go.Scatter(x= list(df1.index)[::plot_step], y= list(df1['Normalized_Shifted'][::plot_step]), mode='lines',
                                            name=f'Time Series (step={plot_step})'))
dshifted_norm_plot.update_layout(title_text="Toronto Daily Mean Water Level (1960-2024) <br> Subsampling",
                                 width=1100, height=600, xaxis_title='Date', yaxis_title='Normalized Water Level', title_x=0.5)
dshifted_norm_plot.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
dshifted_norm_plot.show()

In [None]:
# Descriptive statistics for the series subsampled at every 52nd point:
print("data step: 52")
every_52nd = df1['Normalized_Shifted'].iloc[::52]
print(ss.describe(every_52nd))
for stat_label, stat_value in (
    ("mode:", ss.mode(every_52nd)),
    ("standard deviation:", every_52nd.std()),
    ("median:", every_52nd.median()),
):
    print(stat_label, stat_value)

### Comparing Subsampled Time Series to Full Time Series
#### Notes:
- The subsampled time series of different step sizes were compared to the full detrended, normalized and shifted (DNS) time series
- A variety of comparison methods were tested that could be applied elsewhere as well (e.g. for comparing tapered vs untapered data):
    - **Comparing summary statistics** (i.e. mean, median, standard deviation) for different sampling rates against the full DNS series
    - Calculating the **mean absolute error (MAE)** between the full DNS data and reconstructions of the subsampled data
        - interpolation was used to reconstruct the time series after subsampling to align timestamps between the subsampled data and the original DNS data
        - the MAE was then calculated and expressed as a percentage of the DNS series' standard deviation
    - Defining a **'percent_check' function** to determine if the subsampled data is within 5% of the full DNS data
        - this method also uses interpolation to compare subsampled data to the full DNS data
        - the absolute error between the full DNS data and the reconstructed subsampled data is calculated and divided by the full data range
        - this result is then used to determine whether the error is within the selected threshold (i.e. 5%)

#### Comparing Summary Statistics for Different Subsampling Intervals

In [None]:
# Compare summary statistics for different steps
def _summary_stats(series):
    """Return the mean, standard deviation and median of a pandas Series as a dict."""
    return {'Mean': series.mean(), 'Std': series.std(), 'Median': series.median()}


dns_series = df1['Normalized_Shifted']
stats_original = _summary_stats(dns_series)
stats_step1 = _summary_stats(dns_series.iloc[::step1])
stats_step2 = _summary_stats(dns_series.iloc[::step2])
stats_step3 = _summary_stats(dns_series.iloc[::step3])

# Percent differences in statistics, always relative to the full (unsubsampled) series.
# NOTE: the difference is divided by the full-series statistic, so a reference value
# close to zero produces a very large percentage; an exactly-zero reference is reported
# as undefined instead of dividing by zero.
for stat in ['Mean', 'Std', 'Median']:
    reference = stats_original[stat]
    for step_label, step_stats in (('step1', stats_step1), ('step2', stats_step2), ('step3', stats_step3)):
        if reference == 0:
            print(f"{stat} % diff (original vs {step_label}): undefined (original {stat} is zero)")
            continue
        pct_diff = abs(reference - step_stats[stat]) / abs(reference) * 100
        print(f"{stat} % diff (original vs {step_label}): {pct_diff:.2f}%")

#### Comparing Subsampled DNS Data to Full DNS Data Using Mean Absolute Error (MAE)

In [None]:
# Compare each subsampled version against the original time series using the
# mean absolute error (MAE). Every subsample is first linearly interpolated back
# onto the original sample positions so both series are defined at the same points.
def _subsample_with_interpolator(step):
    """Return (positions, values, linear interpolator) for every `step`-th sample of the DNS column."""
    positions = np.arange(0, len(df1), step)
    values = df1['Normalized_Shifted'].iloc[::step].values
    interpolator = interpolate.interp1d(positions, values, kind='linear', fill_value='extrapolate')
    return positions, values, interpolator


# Original time series (sample positions and values):
x_original = np.arange(len(df1))
y_original = df1['Normalized_Shifted'].values

# Subsampled positions/values and their interpolation functions
x_step1, y_step1, f_step1 = _subsample_with_interpolator(step1)
x_step2, y_step2, f_step2 = _subsample_with_interpolator(step2)
x_step3, y_step3, f_step3 = _subsample_with_interpolator(step3)
x_step5, y_step5, f_step5 = _subsample_with_interpolator(step5)

# Reconstruct the full-length series from each subsample
# (the step5 reconstruction is built here but no MAE is reported for it below)
y_reconstructed_step1 = f_step1(x_original)
y_reconstructed_step2 = f_step2(x_original)
y_reconstructed_step3 = f_step3(x_original)
y_reconstructed_step5 = f_step5(x_original)

# Mean absolute error between the full series and each reconstruction
mae_1S, mae_2S, mae_3S = (
    np.mean(np.abs(y_original - rebuilt))
    for rebuilt in (y_reconstructed_step1, y_reconstructed_step2, y_reconstructed_step3)
)

# Express each MAE as a percentage of the full series' standard deviation
original_std = np.std(y_original)
print(f"Standard Deviation of Full Data: {original_std: .4f}")
percent_diff_1S, percent_diff_2S, percent_diff_3S = (
    (mae / original_std)*100 for mae in (mae_1S, mae_2S, mae_3S)
)

# A low percentage means the reconstruction error is small compared to the signal itself
for _number, _step, _mae, _pct in (
    (1, step1, mae_1S, percent_diff_1S),
    (2, step2, mae_2S, percent_diff_2S),
    (3, step3, mae_3S, percent_diff_3S),
):
    print(f"Step {_number} Subsample: Step = {_step}, Mean abs error: {_mae:.4f}, {_pct:.2f}% of the original data's standard deviation")

#### Comparing Subsampled DNS Data to Full DNS Data Using 'percent_check' Function

In [None]:
# Idea: define function that checks if subsampled data is within 5% of starting data
def percent_check(data, column, step, tolerance=0.05, min_pct_within=95):
    """Check how closely a subsampled column reproduces the full column.

    Every ``step``-th value of ``data[column]`` is kept, linearly interpolated
    back onto the original sample positions (extrapolating past the last kept
    sample), and the absolute reconstruction error at each position is divided
    by the range (max - min) of the full column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the series to test.
    column : label
        Column of ``data`` to subsample.
    step : int
        Keep every ``step``-th row. Must be >= 1 and leave at least two samples.
    tolerance : float, default 0.05
        Largest acceptable per-point error, as a fraction of the data range.
    min_pct_within : float, default 95
        Percentage of points that must be within ``tolerance`` for ``'passes'``
        to be true.

    Returns
    -------
    dict
        Keys: ``step``, ``tolerance``, ``passes``, ``points_within_tolerance``,
        ``num_original_points``, ``num_subsampled_points``, ``data_range``,
        ``max_normalized_error``. Percentages and the range are formatted strings.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1 or leaves fewer than two samples, since
        linear interpolation needs at least two points.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    # Get original series
    x_original = np.arange(len(data))
    y_original = data[column].values

    # Get subsampled series
    x_subsampled = np.arange(0, len(data), step)
    y_subsampled = data[column].iloc[::step].values
    if len(x_subsampled) < 2:
        raise ValueError(
            f"step={step} leaves {len(x_subsampled)} sample(s); at least 2 are needed to interpolate"
        )

    # Interpolate subsampled data back to original timestamps
    f_interp = interpolate.interp1d(x_subsampled, y_subsampled, kind='linear', fill_value='extrapolate')
    y_reconstructed = f_interp(x_original)

    # Calculate absolute errors
    abs_errors = np.abs(y_original - y_reconstructed)

    # Calculate data range (the scale of the data)
    data_range = np.max(y_original) - np.min(y_original)
    # Normalize errors by data range instead of individual values to get scale-independent percentages.
    # A constant series has zero range: dividing would give NaN (0/0), which compares as
    # "not within tolerance". Treat an exact match as zero error and any mismatch as infinite.
    if data_range == 0:
        normalized_errors = np.where(abs_errors == 0, 0.0, np.inf)
    else:
        normalized_errors = abs_errors / data_range

    # Optional diagnostic plot of the normalized errors:
    # plt.figure(figsize=(12, 4))
    # plt.plot(normalized_errors*100, alpha=0.5)
    # plt.axhline(tolerance*100, color='r', linestyle='--', label=f'{tolerance*100}% tolerance')
    # plt.ylabel('Normalized Error (%)')
    # plt.xlabel('Data Point')
    # plt.title(f'Reconstruction Error (Step={step})')
    # plt.legend()
    # plt.show()

    # Test if within tolerance
    within_tolerance = normalized_errors <= tolerance  # check each point
    pct_within = (np.sum(within_tolerance) / len(within_tolerance))*100  # then count how many pass

    # Overall pass/fail: enough points must be within tolerance
    passes = pct_within >= min_pct_within

    results = {
        'step': step,
        'tolerance': f'{tolerance*100}%',
        'passes': passes,
        'points_within_tolerance': f'{pct_within: .4f}%',
        'num_original_points': len(y_original),
        'num_subsampled_points': len(y_subsampled),
        'data_range': f'{data_range: .4f}',
        'max_normalized_error': f'{(np.max(normalized_errors)*100): .4f}%' }

    return results

# Run the 5% tolerance check for the subsampling steps of interest.
# Other candidate steps can be evaluated the same way, e.g.:
#results_step1 = percent_check(df1, 'Normalized_Shifted', step1, tolerance=0.05)
#results_step2 = percent_check(df1, 'Normalized_Shifted', step2, tolerance=0.05)
#results_step4 = percent_check(df1, 'Normalized_Shifted', step4, tolerance=0.05)
#results_step5 = percent_check(df1, 'Normalized_Shifted', step5, tolerance=0.05)
#results_step6 = percent_check(df1, 'Normalized_Shifted', step6, tolerance=0.05)
#print("results1:", results_step1)   # likewise for results 2, 4, 5 and 6

# Length-relative step3
results_step3 = percent_check(data=df1, column='Normalized_Shifted', step=step3, tolerance=0.05)
print("results3:", results_step3)

# Hand-picked trial step
results_step_test = percent_check(data=df1, column='Normalized_Shifted', step=53, tolerance=0.05)
print("results for step test", results_step_test)

***
### Tapering the Time Series
#### Notes:
- a Hann window/raised cosine was used to taper the DNS time series
- the time series was padded with zeros to create different tapers:
    - no padding
    - padding with half the length of the original time series
    - padding with the full length of the original time series
    - etc.
- click on legend titles in the plot to select or hide certain series
- use the horizontal slider to zoom in/out of certain date ranges
- the tapered series with no padding does not have discontinuities at the edges of the series; however, the other (zero-padded) series do
- the same methods of comparison that were used to compare subsampled to full data were used to compare the tapered data to the original DNS series

In [None]:
# Tapering the time series
def _pad_and_taper(series, pad_length, first_timestamp, sample_spacing):
    """Zero-pad `series` by `pad_length` samples on each side and apply a Hann window.

    Returns the padded series, the Hann window, their product, and a date index
    for the padded series that starts `pad_length` samples before `first_timestamp`.
    """
    padded = np.concatenate([np.zeros(pad_length), series, np.zeros(pad_length)])
    taper = hann(len(padded))
    dates = pd.date_range(start=first_timestamp - (sample_spacing * pad_length), periods=len(padded), freq=sample_spacing)
    return padded, taper, taper * padded, dates


L = len(df1['Normalized_Shifted'])
data = df1['Normalized_Shifted'].values
time_index = pd.to_datetime(df1.index)
# spacing between the first two samples (1 day for water level data)
time_delta = (time_index[1] - time_index[0])

# 1L - original taper (no padding)
padding1 = 0
window1 = hann(L)
tapered_data1 = window1 * data

# 2L - gentler taper (half a series length of zeros on each side)
padding2 = L // 2
data2_padded, window2, tapered_data2_full, time_index2 = _pad_and_taper(data, padding2, time_index[0], time_delta)

# 3L - very gentle taper (3L = full length of padding-data-padding)
padding3 = L
data3_padded, window3, tapered_data3_full, time_index3 = _pad_and_taper(data, padding3, time_index[0], time_delta)

# test taper -> 1.5+1+1.5= 4  (4L = full length of padding-data-padding)
padding4 = (3*L)//2
data4_padded, window4, tapered_data4_full, time_index4 = _pad_and_taper(data, padding4, time_index[0], time_delta)

# Plot: one trace per series, in legend order
tapered_plot = go.Figure()
for _trace_x, _trace_y, _trace_name in (
    (time_index, data, 'Original 1L Time Series'),
    (time_index, tapered_data1, '1L Tapered Series'),
    (time_index2, tapered_data2_full, '2L Tapered Series'),
    (time_index3, tapered_data3_full, '3L Tapered Series'),
    (time_index4, tapered_data4_full, '4L Tapered Series'),
):
    tapered_plot.add_trace(go.Scatter(x=list(_trace_x), y=list(_trace_y), mode='lines', name=_trace_name))

tapered_plot.update_layout(
    title_text="Toronto Daily Mean Water Level (1960-2024) <br> Different Taper Lengths",
    title_x=0.5,
    width=1100,
    height=600,
    xaxis_title='Date',
    yaxis_title='Normalized Water Level',
)
tapered_plot.update_layout(xaxis_rangeslider_visible=True)
tapered_plot.show()

# Difference between tapered and untapered series:
# mean absolute error (MAE) over the original (unpadded) region of each taper
mae_1L = np.abs(data - tapered_data1).mean()
mae_2L = np.abs(data - tapered_data2_full[padding2:-padding2]).mean()
mae_3L = np.abs(data - tapered_data3_full[padding3:-padding3]).mean()

# MAE as a percentage of the original data's standard deviation
std_original = data.std()
percent_diff_1L, percent_diff_2L, percent_diff_3L = (
    (mae / std_original)*100 for mae in (mae_1L, mae_2L, mae_3L)
)

for _label, _mae, _pct in (
    ('1L', mae_1L, percent_diff_1L),
    ('2L', mae_2L, percent_diff_2L),
    ('3L', mae_3L, percent_diff_3L),
):
    print(f"{_label} Taper: Mean abs error: {_mae:.4f}, {_pct:.2f}% of the original data's standard deviation")

# Descriptive statistics for each tapered series (padding included):
print("")
print("Summary Stats:")
for _position, (_heading, _series) in enumerate((
    ("1L Taper", tapered_data1),
    ("2L Taper", tapered_data2_full),
    ("3L Taper", tapered_data3_full),
    ("4L Taper", tapered_data4_full),
)):
    if _position:
        print("")
    print(_heading)
    print(ss.describe(_series))
    print("mode:", ss.mode(_series))
    print("standard deviation:", _series.std())
    print("median:", np.median(_series))

In [None]:
# Compare tapered time series to original
def percent_check_tapered(original_data, tapered_data_full, padding, tolerance=0.05, min_pct_within=95):
    """Compare a tapered (and possibly zero-padded) series with the original series.

    The ``padding`` samples at each end of ``tapered_data_full`` are dropped so
    the remainder lines up with ``original_data``. The absolute difference at
    each point is then divided by the range (max - min) of the original data.

    Parameters
    ----------
    original_data : array-like
        Original unpadded time series.
    tapered_data_full : array-like
        Tapered time series, including any padding.
    padding : int
        Number of padding samples added to each end (0 for none).
    tolerance : float, default 0.05
        Acceptable per-point error as a fraction of the data range.
    min_pct_within : float, default 95
        Percentage of points that must be within ``tolerance`` for ``'passes'``
        to be true.

    Returns
    -------
    dict
        Keys: ``padding``, ``tolerance``, ``passes``, ``points_within_tolerance``,
        ``max_normalized_error``, ``mean_abs_error``, ``mae_as_%_of_std``,
        ``num_points``, ``data_range``. Percentages, MAE and range are
        formatted strings.

    Raises
    ------
    ValueError
        If ``padding`` is negative, ``original_data`` is empty, or the unpadded
        region of ``tapered_data_full`` does not have the same shape as
        ``original_data``.
    """
    original = np.asarray(original_data, dtype=float)
    tapered_full = np.asarray(tapered_data_full, dtype=float)

    if padding < 0:
        raise ValueError(f"padding must be non-negative, got {padding!r}")
    if original.size == 0:
        raise ValueError("original_data must not be empty")

    # Extract original from tapered data (remove padding).
    # padding == 0 is handled separately because [0:-0] would be an empty slice.
    if padding > 0:
        tapered_original_region = tapered_full[padding:-padding]
    else:
        tapered_original_region = tapered_full

    # Reject mismatched lengths explicitly rather than relying on (or silently
    # getting) NumPy broadcasting in the subtraction below.
    if tapered_original_region.shape != original.shape:
        raise ValueError(
            f"tapered data without padding has shape {tapered_original_region.shape}, "
            f"expected {original.shape}"
        )

    # Direct comparison: tapered vs original
    abs_errors = np.abs(original - tapered_original_region)
    # Normalization by the data range. A constant series has zero range, which would
    # give NaN (0/0); treat an exact match as zero error and any mismatch as infinite.
    data_range = np.max(original) - np.min(original)
    if data_range == 0:
        normalized_errors = np.where(abs_errors == 0, 0.0, np.inf)
    else:
        normalized_errors = abs_errors / data_range

    # Calculate metrics
    within_tolerance = normalized_errors <= tolerance
    pct_within = (np.sum(within_tolerance) / len(within_tolerance))*100

    # MAE metrics (same zero-denominator convention as above for a zero standard deviation)
    mae = np.mean(abs_errors)
    std_original = np.std(original)
    if std_original == 0:
        mae_pct_std = 0.0 if mae == 0 else float('inf')
    else:
        mae_pct_std = (mae / std_original)*100

    passes = pct_within >= min_pct_within

    results = {
        'padding': padding,
        'tolerance': f'{tolerance*100}%',
        'passes': passes,
        'points_within_tolerance': f'{pct_within: .4f}%',
        'max_normalized_error': f'{(np.max(normalized_errors)*100): .4f}%',
        'mean_abs_error': f'{mae: .4f}',
        'mae_as_%_of_std': f'{mae_pct_std: .4f}%',
        'num_points': len(original),
        'data_range': f'{data_range: .4f}'}

    return results

# Run the 5% tolerance check for each taper (1L, 2L, 3L, 4L) against the untapered data
results_1, results_2, results_3, results_4 = (
    percent_check_tapered(data, _tapered, _pad, tolerance=0.05)
    for _tapered, _pad in (
        (tapered_data1, padding1),
        (tapered_data2_full, padding2),
        (tapered_data3_full, padding3),
        (tapered_data4_full, padding4),
    )
)

# Print the result dicts, separated by blank lines
for _number, _result in enumerate((results_1, results_2, results_3, results_4), start=1):
    if _number > 1:
        print("")
    print(f"results{_number}:", _result)

***
### Combining Subsampling and Tapering
#### Notes:
- Tapering with Hann window and using step=52 based on results from percent_check function for subsampled data
- taper applied with no padding to prevent discontinuities

In [None]:
# From previous subsampling and tapering related code, determined that step = 52 goes well with percent_check,
# and taper with no padding prevents discontinuities
sub_step = 52

L = len(df1['Normalized_Shifted'])
data = df1['Normalized_Shifted'].values
time_index = pd.to_datetime(df1.index)
# time delta (1 day for water level data)
time_delta = (time_index[1] - time_index[0])

window = hann(L)
tapered_sub = (window * data)[::sub_step]

# Plot:
sub_t = go.Figure()
sub_t.add_trace(go.Scatter(x= list(time_index), y= list(df1['Normalized_Shifted']), mode='lines', name='Full Time Series')) 
sub_t.add_trace(go.Scatter(x= list(time_index)[::sub_step], y= list(tapered_sub), mode='lines', name=f'Tapered Time Series (step={sub_step})'))
sub_t.update_layout(title_text="Tapered and Subsampled Toronto Daily Mean Water Level (1960-2024)", width=1100, height=600,
    xaxis_title='Date', yaxis_title='Normalized Water Level', title_x=0.5)
sub_t.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
sub_t.show()

# Summary Stats for tapered/subsampled series:
print("Summary Stats:")
print(ss.describe(tapered_sub))
print("mode:", ss.mode(tapered_sub)) 
print("standard deviation:", tapered_sub.std())
print("median:", np.median(tapered_sub))

In [None]:
# see next file for adding sine waves, FFT plots and scalograms