# Cluster Analysis of Trimmed Spectrogram (CATS)

# False alarm filtering via multi-station analysis:
- Association, Voting, Interpolation, and Extrapolation

In [3]:
import numpy as np
import numba as nb
import holoviews as hv

from scipy import signal, special, optimize, stats
import ssqueezepy as ssq
import timeit
import pyskbel as api
import xarray as xr

In [4]:
import cats

<hr>

In [22]:
N_datasets = 25

# i = np.random.choice(N_datasets)
i = 2  # fixed dataset index (the random choice above is disabled)
# NOTE(review): absolute, machine-specific paths; point `folder` at your own copy of the data.
folder = "C:/Users/seraf/OneDrive - ualberta.ca/Documents/WaveDatasets/GeneratedData"
# Compose the noise-free synthetic data from the dataset's metadata file.
Dclean = api.compose_from_meta(f"{folder}/MetaDataSetsNew/dataset_{i}_main.pkl",
                               noise=0.0, data_path=f'{folder}/SurfaceEvents')
# Move the last axis to position 1; later cells label the resulting axes
# as (Location, Component, Time).
Dclean = np.moveaxis(Dclean, -1, 1)
main_meta = api.utils.read_dict(f"{folder}/MetaDataSetsNew/dataset_{i}_main.pkl")
main_meta['data_path'] = 'C:\\Users\\seraf\\OneDrive - ualberta.ca\\Documents\\WaveDatasets\\GeneratedData\\SurfaceEvents'

# Spatial and temporal axes rebuilt from the metadata.
x = np.linspace(main_meta['xmin'], main_meta['xmax'], main_meta['nx'])
time = np.linspace(main_meta['Tmin'], main_meta['Tmax'], main_meta['nT'])
dt = main_meta['dt']

# Keep only the [t1, t2] window (same units as `dt`).
t1, t2 = 8, 22
# Round to the nearest sample: a bare int() truncates, so a ratio such as
# 3999.9999999 (floating-point error) would start the window one sample early.
it1, it2 = int(round(t1 / dt)), int(round(t2 / dt))
time = time[it1 : it2 + 1]
time -= time.min()  # shift the trimmed axis so that it starts at zero
Dclean = Dclean[..., it1 : it2 + 1]

Taper on axis = -2 is applied


  0%|          | 0/3 [00:00<?, ?it/s]

In [23]:
# `beta` is passed to get_noise and also selects the noise label used in printouts.
beta = 0
noise_ids = {-1 : "high_freq", 0 : "white", 1 : "low_freq"}
noise_id = noise_ids[beta]
# NOTE(review): no random seed is set in this cell; if get_noise draws random
# numbers, the SNR printed below will change from run to run — confirm.
Noise = api.utils.get_noise((0.05, beta), Dclean.shape, axis=2)
# Add a sinusoid of amplitude 0.07 with 50 cycles per unit of `time`
# (50 Hz if `time` is in seconds), broadcast over the two leading axes.
Noise += 0.07 * np.sin(time * 2 * np.pi * 50)[None, None, :]
D = Dclean + Noise

# Samples of the clean data with magnitude above `eps` are treated as signal.
eps = 0.007
true_detection_time = abs(Dclean) > eps
# RMS of the clean data over signal samples only, and RMS of the whole noise array.
Srms = np.sqrt(np.mean(Dclean[true_detection_time]**2))
Nrms = np.sqrt(np.mean(Noise**2))
# The printed SNR is the ratio of the two mean powers (squared RMS values).
print("SNR = ", Srms**2 / Nrms**2)
print("Noise color = ", noise_id)

SNR =  1.0619448706537078
Noise color =  white


In [74]:
# STFT operator with a Hann window spec, 98% overlap and the sampling step `dt`.
# NOTE(review): the 1.0 in ('hann', 1.0) is presumably the window length in seconds — confirm.
STFTop = cats.STFTOperator(('hann', 1.0), overlap=0.98, dt=dt, backend='ssqueezepy')
# Time the STFT of D[:, 2], i.e. index 2 on the second axis (labelled 'Z' in later cells).
%time x_stft = STFTop * D[:, 2]

# Time the CWT of the same slice for comparison.
# NOTE(review): ssqueezepy documents the second return value of cwt as the scales,
# so the name `f` may be misleading — confirm before using it as a frequency axis.
%time x_cwt, f = ssq.cwt(D[:, 2], fs=1/dt, nv=4)

CPU times: total: 1.42 s
Wall time: 487 ms
CPU times: total: 1.58 s
Wall time: 455 ms


In [75]:
# Image styling shared by this cell and the STFT image cell that follows it.
im_opts = {'colorbar': True, 'cmap': 'viridis', 'logz': True, 'fig_size': 350}

# CWT magnitude at index 63, drawn against the row index of the transform
# rather than a physical frequency axis.
cwt_rows = np.arange(len(f))
hv.Image((time, cwt_rows, abs(x_cwt[63]))).opts(invert_yaxis=True, aspect=4, **im_opts)

In [76]:
# STFT magnitude at index 63 on a logarithmic frequency axis.
stft_time_axis = STFTop.forward_time_axis(len(time))
stft_image = hv.Image((stft_time_axis, STFTop.f, abs(x_stft[63])))
stft_image.opts(logy=True, ylim=(0.9, np.nan), aspect=2, **im_opts)

In [5]:
# Build the CATS detector used for the synthetic data; `dt` is the sampling step
# assigned earlier in the notebook.
# NOTE(review): the meaning and units of each setting are inferred from the
# parameter names (`_sec`, `_Hz` suffixes) — confirm against the cats documentation.
catsdet = cats.CATSDetector(dt_sec=dt,
                            stft_window_sec=('hann', 0.15),
                            stft_overlap=0.5,
                            stft_nfft=256,
                            minSNR=4.5,
                            stationary_frame_sec=1.0,
                            min_dt_width_sec=0.2,
                            min_df_width_Hz=8.,
                            max_dt_gap_sec=0.3,
                            neighbor_distance=2,
                            date_Q=0.95,
                            date_detection_mode=True,
                            clustering_with_SNR=True,
                            clustering_multi_comp=True,
                            stft_backend='ssqueezepy')
# Print the detector's derived `*_len` attributes as a sanity check of the settings above.
print(f"{catsdet.stationary_frame_len = }")
print(f"{catsdet.max_dt_gap_len = }")
print(f"{catsdet.min_dt_width_len = }")
print(f"{catsdet.min_df_width_len = }")
print(f"{catsdet.neighbor_distance_len = }")

catsdet.stationary_frame_len = 256
catsdet.max_dt_gap_len = 4
catsdet.min_dt_width_len = 2
catsdet.min_df_width_len = 4
catsdet.neighbor_distance_len = 2


In [9]:
# Mean STFT magnitude of the noise per frequency bin, averaged over axes 0, 1 and 3.
# Note this is a mean of magnitudes, not of squared magnitudes.
noise_psd = abs(catsdet.STFT * Noise).mean(axis=(0, 1, 3))

f_dim = hv.Dimension('Frequency', unit='Hz')
fontsize = dict(labels=16, ticks=14)

# Log-log curve of the averaged spectrum from 1 Hz upwards.
psd_curve = hv.Curve((catsdet.stft_frequency, noise_psd), kdims=f_dim)
fig_noise_psd = psd_curve.opts(fig_size=250, aspect=2.5, fontsize=fontsize, ylabel='',
                               logx=True, logy=True, xlim=(1, np.nan))
# hv.save(fig_noise_psd, f"figures/NoisePSD_{noise_id}.png", dpi=300)
fig_noise_psd

In [6]:
# Wall-clock timing of a single detection pass over the noisy data D.
t_start = timeit.default_timer()

catsres = catsdet.detect(D)

t_delta = timeit.default_timer() - t_start
print(f"{t_delta = : .3f} seconds")
# Linear extrapolation of the measured time to 1024**4 bytes of input, in hours.
print(f"1 Terabyte = {t_delta * (1024**4 / D.nbytes ) / 3_600 : .2f} hours")

t_delta =  0.547 seconds
1 Terabyte =  7.65 hours


In [8]:
# Select the trace to display: index 50 on the first axis, chosen component on the second.
comp = 'Z'
comps = {'X' : 0, 'Y' : 1, 'Z': 2}
ind = (50, comps[comp])
# Arrange the panels returned by catsres.plot in a single column.
fig = catsres.plot(ind).cols(1)
# NaN limits leave the x-range unconstrained (presumably auto-ranged by HoloViews — confirm).
xlim = (np.nan, np.nan)
fig.opts(hv.opts.Curve(xlim=xlim, linewidth=1), 
         hv.opts.Image(xlim=xlim, logz=True, ylim=(0.9, np.nan)))
# hv.save(fig, f'../figures/synthetic_demo_{comp}_{noise_id}.png', dpi=300)
fig

In [12]:
# SNR spectrogram of the selected trace, masked by its clustered binary spectrogram.
SNRK = catsres.SNR_spectrogram[ind] * catsres.binary_spectrogram_clustered[ind]

snrk_opts = dict(aspect=2, logy=True, logz=True, fig_size=350,
                 cmap='viridis', ylim=(0.9, np.nan), colorbar=True)
hv.Image((catsres.stft_time, catsres.stft_frequency, SNRK)).opts(**snrk_opts)

In [9]:
# SNR values of the time-frequency bins retained by the clustering step.
clustered_snr = catsres.SNR_spectrogram[catsres.binary_spectrogram_clustered]
print("Mean SNR: ", clustered_snr.mean())
print("Median SNR: ", np.median(clustered_snr))

Mean SNR:  5.174900615844085
Median SNR:  3.7782787996120506


In [136]:
# Show the noise-free waveform of one trace using the layout of catsres.plot.
comp = 'Z'
comps = {'X' : 0, 'Y' : 1, 'Z': 2}
# Arrays in this notebook are laid out as (Location, Component, Time), so the
# location index comes first. The previous (comps[comp], 50) ordering put 50 on
# the three-element component axis.
ind = (50, comps[comp])
fig = catsres.plot(ind).cols(1)
# Replace the plotted amplitudes of the first panel with the clean data.
fig[0].data.Amplitude = Dclean[ind]
fig[0].opts(linewidth=1, fig_size=350)
# hv.save(fig[0], f"../figures/synthetic_demo_{comp}_clean.png", dpi=300)
fig[0]

In [10]:
# Amplitude threshold above which a clean sample counts as part of an event.
eps = 0.007
# Styling for two-class (Noise / Event) images; only the commented-out plots below use it.
binary_cmap = ['white', 'blue']
binary_cticks = [(0, 'Noise'), (1, 'Event')]
fontsize = dict(labels=16, ticks=15)
image_kwargs = dict(colorbar=True, aspect=3.5, fontsize=fontsize, 
                    cmap=binary_cmap, cbar_ticks=binary_cticks,
                    clabel='', xlabel='Time (s)', ylabel='Location (km)',
                    fig_size=400, clim=(-0.5, 1.5))

# Sample-wise ground truth on the raw time axis.
true_detection_time = abs(Dclean) > eps
stft_time = catsdet.STFT.forward_time_axis(len(time))
# Project the sample-wise mask onto the STFT time axis.
# NOTE(review): the meaning of the positional arguments 1 and 0.5 is not visible here — confirm.
true_detection = cats.core.projection.ProjectFilterIntervals(true_detection_time, time, 1, 0.5, stft_time)
# NOTE(review): the disabled plots below reduce over axis 0, whereas the evaluation
# cell reduces over axis 1 (components) — check the axis before re-enabling them.
# hv.Image((stft_time, x, true_detection.max(0))).opts(**image_kwargs)
# hv.Image((catsres.stft_time, x, catsres.detection.max(0))).opts(**image_kwargs)

In [11]:
# Number of STFT hops that fit into 0.5 s, passed to the evaluator as `fp_max`.
fp_max = int(0.5 / catsdet.stft_hop_sec)

# Collapse the component axis: a frame is active if any component is active.
true_any_comp = true_detection.max(1)
pred_any_comp = catsres.detection.max(1)
labels, R = cats.EvaluateDetection(true_any_comp, pred_any_comp, 0.9, fp_max)

# The last axis of R is read as [FN, TP, FP, TN] by the report below.
report = "True Positive = {0}\tFalse Positive = {1}\nTrue Negative = {2}\tFalse Negative = {3}"
tp, fp, tn, fn = (R[..., k].sum() for k in (1, 2, 3, 0))
print(report.format(tp, fp, tn, fn))

True Positive = 135	False Positive = 117
True Negative = 296	False Negative = 119


In [12]:
# One colour and one tick per class label produced by the evaluation (-1, 0, 1, 2).
class_cticks = [(-1, 'Missed'), (0, 'Noise'), (1, 'Detected'), (2, 'False Alarm')]
class_cmap = ['red', 'white', 'blue', 'green']
fontsize = dict(labels=16, ticks=15)
cl_image_kwargs = {
    'colorbar': True,
    'aspect': 3.5,
    'fontsize': fontsize,
    'cmap': class_cmap,
    'cbar_ticks': class_cticks,
    'clim': (-1.5, 2.5),
    'clabel': '',
    'xlabel': 'Time (s)',
    'ylabel': 'Location (km)',
    'fig_size': 400,
}

# Location-versus-time map of the class labels (built but not displayed).
label_image = hv.Image((catsres.stft_time, x, labels))
fig = label_image.opts(**cl_image_kwargs)
# fig

In [14]:
# Wrap the synthetic arrays in labelled DataArrays, keyed by dataset name,
# in the form the cats.plot_traces calls in the following cells consume.
data_meta = dict(dims=["Location", "Component", "Time"], 
                 coords={"Component" : ['X', 'Y', 'Z'],
                         "Location" : x,
                         "Time" : time})

traces = {'Synthetic' : xr.DataArray(D, **data_meta)}
traces_clean = {'Synthetic' : xr.DataArray(Dclean, **data_meta)}
#################
# Detection arrays use the same dims but live on the STFT time axis.
det_meta = dict(dims=["Location", "Component", "Time"], 
                 coords={"Component" : ['X', 'Y', 'Z'],
                         "Location" : x,
                         "Time" : catsres.stft_time})

detection_pred = {'Synthetic' : xr.DataArray(catsres.detection, **det_meta)}                          
detection_true = {'Synthetic' : xr.DataArray(true_detection, **det_meta)}

# Labels get an extra trailing "Class" axis with one boolean layer per outcome.
label_meta = dict(dims=["Location", "Component", "Time", "Class"], 
                 coords={"Component" : ['X', 'Y', 'Z'],
                         "Location" : x,
                         "Time" : catsres.stft_time, 
                         "Class" : ['TP', 'FP', 'FN']})

# Re-run the evaluation without collapsing the component axis. This rebinds
# `labels` and `R`, replacing the component-collapsed results computed earlier.
labels, R = cats.EvaluateDetection(true_detection, catsres.detection, 
                                   0.9, fp_max)
# Label codes 1, 2 and -1 fill the 'TP', 'FP' and 'FN' layers, in that order.
class_labels = np.stack([labels == 1, labels == 2, labels == -1], axis=-1)

detection_labels = {'Synthetic' : xr.DataArray(class_labels, **label_meta)}

# Totals over all locations and components.
print(report.format(R[..., 1].sum(), R[..., 2].sum(), R[..., 3].sum(), R[..., 0].sum()))

True Positive = 268	False Positive = 562
True Negative = 870	False Negative = 240


In [15]:
fname = 'Synthetic'

# Clean traces with the ground-truth detection intervals overlaid.
fontsize = dict(labels=17, ticks=15)
figsize = 450
fig = cats.plot_traces(traces_clean, detection_true, fname, comp, gain=0.75, rsp=6, alpha=0.2)
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_clean_{fname}_{comp}.png', dpi=300)
fig

In [16]:
# Noisy traces with the detector's predicted intervals overlaid.
fontsize = dict(labels=17, ticks=15)
figsize = 450
fig = cats.plot_traces(traces, detection_pred, fname, comp, gain=0.75, rsp=6, alpha=0.2)
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_pred_{fname}_{comp}_{noise_id}.png', dpi=300)
fig

In [18]:
# Noisy traces with the TP / FP / FN class layers overlaid.
fontsize = dict(labels=17, ticks=15)
figsize = 450
fig = cats.plot_traces(traces, detection_labels, fname, comp, gain=0.75, rsp=6, alpha=0.2)
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_labels_{fname}_{comp}_{noise_id}.png', dpi=300)
fig

In [18]:
# Same report as before, restricted to the currently selected component.
ic = comps[comp]
tp, fp, tn, fn = (R[..., ic, k].sum() for k in (1, 2, 3, 0))
print(report.format(tp, fp, tn, fn))

True Positive = 133	False Positive = 64
True Negative = 301	False Negative = 121


# Real borehole data

In [19]:
import glob
from pathlib import Path
from scipy.io import loadmat, savemat, wavfile
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [20]:
# NOTE(review): absolute, machine-specific path; point `data_dir` at your own copy of the data.
data_dir = Path("C:/Users/seraf/OneDrive - ualberta.ca/Documents/SampleData")
data_folder = data_dir / 'BoreholeData_reformat'
# Every pickle file under the folder (searched recursively), in sorted order.
data_paths = sorted(data_folder.rglob("*.pkl"))

In [22]:
# Load every borehole file into a labelled DataArray, keyed by file name.
filenames = []
Data = {}
Headers = {}

for fi in tqdm(data_paths):
    # File name up to the first dot is used as the dictionary key.
    filename = fi.name.split('.')[0]
    filenames.append(filename)
    Di = api.utils.read_dict(fi)
    Headers[filename] = Di['hdrc']
    # NOTE: `dt` and `time` are notebook globals and keep the values of the
    # last file read; the detector cell below reads `dt`.
    dt = Di['hdrc']['delta']
    # Build the axis from an integer sample count. np.arange with a float step
    # (0, n*dt, dt) can return n-1 or n+1 points because of rounding, which
    # would not match the data length.
    time = np.arange(Di['data'].shape[-1]) * dt
    # NaNs are replaced with zeros and the first two axes are swapped to give
    # (Location, Component, Time).
    Data[filename] = xr.DataArray(np.nan_to_num(Di['data']).swapaxes(0, 1),
                                  dims=("Location", "Component", "Time"),
                                  coords={"Location" : Di['hdrc']['stdp'],
                                          "Component": ['X', 'Y', 'Z'],
                                          "Time" : time})

  0%|          | 0/9 [00:00<?, ?it/s]

In [24]:
# Rebuild the detector for the borehole data (this rebinds `catsdet`).
# `dt` here is the value left by the loading loop, i.e. the sampling step of the
# last file read.
# NOTE(review): this assumes all files share the same sampling step — confirm.
catsdet = cats.CATSDetector(dt_sec=dt,
                            stft_window_sec=('hann', 0.1),
                            stft_overlap=0.75,
                            stft_nfft=512,
                            minSNR=4.0,
                            stationary_frame_sec=1.0,
                            min_dt_width_sec=0.08,
                            min_df_width_Hz=20.,
                            max_dt_gap_sec=0.1,
                            neighbor_distance=2,
                            date_Q=0.95,
                            date_detection_mode=True,
                            clustering_with_SNR=True,
                            clustering_multi_comp=True,
                            stft_backend='ssqueezepy', 
                            stft_kwargs=dict(padtype='zero')
                           )
# Print the sampling step and the detector's derived attributes as a sanity check.
print(f"{dt = }")
print(f"{catsdet.stft_window_len = }")
print(f"{catsdet.stft_nfft = }")
print(f"{catsdet.stationary_frame_len = }")
print(f"{catsdet.min_dt_width_len = }")
print(f"{catsdet.min_df_width_len = }")
print(f"{catsdet.neighbor_distance_len = }")
print(f"{catsdet.max_dt_gap_len = }")

dt = 0.00025
catsdet.stft_window_len = 400
catsdet.stft_nfft = 512
catsdet.stationary_frame_len = 256
catsdet.min_dt_width_len = 3
catsdet.min_df_width_len = 2
catsdet.neighbor_distance_len = 2
catsdet.max_dt_gap_len = 4


In [28]:
# Grab one loaded file for ad-hoc inspection.
# NOTE(review): `sd` is not referenced anywhere else in the visible notebook.
sd = Data['File137202']

In [26]:
# Run the detector on every loaded file and store the detections per method name.
name = "CATS"
Detection = {name: {}}

for fi, di in tqdm(Data.items(), desc=name):
    res = catsdet.detect(di.values)
    stft_time = res.stft_time
    detection = res.detection
    # Same dims and Location/Component coords as the input, on the STFT time axis.
    det_coords = {"Component" : di.coords["Component"],
                  "Location" : di.coords["Location"],
                  "Time" : stft_time}
    Detection[name][fi] = xr.DataArray(detection, dims=di.dims, coords=det_coords)

CATS:   0%|          | 0/9 [00:00<?, ?it/s]

In [27]:
# Plot one component of one borehole file with the CATS detections overlaid.
# NOTE(review): the saved output of this cell is
# "ValueError: could not convert string to float: 'Z'". The cause is not visible
# in the notebook; resolve it before relying on this cell.
fname = 'File137206'
comp = 'Z'
figure = cats.plot_traces(Data, Detection['CATS'], fname=fname, comp=comp, gain=0.5, alpha=0.2)
fontsize = dict(labels=17, ticks=15)
# Inverted y-axis for the borehole display.
figure = figure.opts(hv.opts.Curve(fontsize=fontsize), 
            hv.opts.Rectangles(fontsize=fontsize)).opts(aspect=1.3, invert_yaxis=True)
figure

ValueError: could not convert string to float: 'Z'

In [19]:
# Plot one component of one borehole file with the CATS detections overlaid.
# NOTE(review): this cell repeats the code of the preceding cell verbatim;
# consider keeping only one copy.
fname = 'File137206'
comp = 'Z'
figure = cats.plot_traces(Data, Detection['CATS'], fname=fname, comp=comp, gain=0.5, alpha=0.2)
fontsize = dict(labels=17, ticks=15)
# Inverted y-axis for the borehole display.
figure = figure.opts(hv.opts.Curve(fontsize=fontsize), 
            hv.opts.Rectangles(fontsize=fontsize)).opts(aspect=1.3, invert_yaxis=True)
figure

In [22]:
# Create the output folder first so the save does not depend on it already existing.
Path("../figures").mkdir(parents=True, exist_ok=True)
# Export the borehole figure as a 300-dpi PNG named after the file and component.
hv.save(figure, f"../figures/WellData_{fname}_{comp}.png", dpi=300)