# Cluster Analysis of Trimmed Spectrogram (CATS)

# False alarm filtering via multi-station analysis:
- Association, Voting, Interpolation, and Extrapolation

In [1]:
import numpy as np
import numba as nb
import holoviews as hv

from scipy import signal, special, optimize, stats
import ssqueezepy as ssq
import timeit
import pyskbel as api
import xarray as xr
import os

In [2]:
import cats

Main parameters:
```python
1. stft_window_sec        # window type and length of STFT window, in seconds
2. stft_overlap           # overlap rate of STFT windows, range (0, 1)
3. minSNR                 # minimum Signal-to-Noise Ratio, approximate range (3.5 - 5.5)
4. stationary_frame_sec   # length of frame where noise is stationary, in seconds
5. cluster_size_t_sec     # time duration of strongest phases in signal, in seconds
6. cluster_size_f_Hz      # frequency bandwidth of signal, in hertz
```
Minor parameters:
```python
1. stft_nfft               # zero-padding of STFT windows, recommended a power of 2 (e.g. 512)
2. min_t_duration_sec      # minimum duration time of detected events, in seconds
3. min_t_separation_sec    # minimum separation time between detected events, in seconds
4. cluster_distance_t_sec  # neighborhood distance for clustering in time, in seconds, default `cluster_size_t_sec/2`
5. cluster_distance_f_Hz   # neighborhood distance for clustering in frequency, in hertz, default `cluster_size_f_Hz/2`
6. clustering_multitrace   # whether to apply multitrace clustering, True/False, useful for regular array of receivers 
7. cluster_size_trace      # minimum number of traces for multitrace clustering, [1, 2, ...]
8. cluster_distance_trace  # neighborhood distance for multitrace clustering, [1, 2, ...]
```

<hr>

In [113]:
# Pick one synthetic data set (index `i`) and compose its noise-free wavefield.
N_datasets = 25
i = 2  # fixed choice; random alternative: i = np.random.choice(N_datasets)

folder = "C:/Users/seraf/OneDrive - ualberta.ca/Documents/WaveDatasets/GeneratedData"
meta_file = f"{folder}/MetaDataSetsNew/dataset_{i}_main.pkl"

Dclean = api.compose_from_meta(meta_file, noise=0.0, data_path=f'{folder}/SurfaceEvents')

# The same metadata file is read again as a dict to recover the axes definitions.
main_meta = api.utils.read_dict(meta_file)
main_meta['data_path'] = 'C:\\Users\\seraf\\OneDrive - ualberta.ca\\Documents\\WaveDatasets\\GeneratedData\\SurfaceEvents'

# Receiver-location and time axes, plus the sampling interval.
x = np.linspace(main_meta['xmin'], main_meta['xmax'], main_meta['nx'])
time = np.linspace(main_meta['Tmin'], main_meta['Tmax'], main_meta['nT'])
dt = main_meta['dt']

# The last axis of the composed data is moved to position `Cdim`;
# the dimension labels are built with the same insertion rule.
Cdim = 1
Dclean = np.moveaxis(Dclean, -1, Cdim)
main_dims = ["Location", "Time"]
main_dims.insert(Cdim, "Component")

# Crop to the window [t1, t2] (converted to sample indices through dt)
# and shift the time axis so that the window starts at zero.
t1, t2 = 8, 22
it1 = int(t1 / dt)
it2 = int(t2 / dt)
window = slice(it1, it2 + 1)
time = time[window]
time -= time.min()
Dclean = Dclean[..., window]

Taper on axis = -2 is applied


  0%|          | 0/3 [00:00<?, ?it/s]

In [114]:
# Noise model: output of api.utils.get_noise plus a single-frequency sinusoid.
beta = 0
noise_ids = {-1 : "high_freq", 0 : "white", 1 : "low_freq"}
noise_id = noise_ids[beta]

Noise = api.utils.get_noise((0.05, beta), Dclean.shape, axis=2)
# Sinusoid of amplitude 0.05 with 50 cycles per unit of `time`,
# broadcast over the two leading axes and added in place.
harmonic = 0.05 * np.sin(time * 2 * np.pi * 50)
Noise += harmonic[None, None, :]
D = Dclean + Noise

# Samples of the clean data whose magnitude exceeds `eps` are counted as signal.
eps = 0.007
true_detection_time = abs(Dclean) > eps
signal_samples = Dclean[true_detection_time]
Srms = np.sqrt(np.mean(signal_samples**2))
Nrms = np.sqrt(np.mean(Noise**2))

# Reported SNR is a power ratio (RMS squared).
snr_power = Srms**2 / Nrms**2
print("SNR = ", snr_power)
print("Noise color = ", noise_id)

SNR =  1.400721350282274
Noise color =  white


In [115]:
# Detector configuration for the synthetic data set.
detector_params = dict(
    dt_sec=dt,
    stft_window_sec=('hann', 0.15),
    stft_overlap=0.5,
    stft_nfft=256,
    minSNR=4.5,
    stationary_frame_sec=1.0,
    cluster_size_t_sec=0.4,
    cluster_size_f_Hz=8.0,
    cluster_distance_t_sec=None,
    cluster_distance_f_Hz=None,
    clustering_with_SNR=True,
    clustering_multitrace=True,
    cluster_size_trace=2,
    cluster_distance_trace=2,
)
catsdet = cats.CATSDetector(**detector_params)

# Echo the `*_len` attributes exposed by the detector for these settings.
for attr in ("stationary_frame_len",
             "cluster_size_t_len",
             "cluster_size_f_len",
             "cluster_distance_t_len",
             "cluster_distance_f_len"):
    print(f"catsdet.{attr} = {getattr(catsdet, attr)!r}")

catsdet.stationary_frame_len = 256
catsdet.cluster_size_t_len = 5
catsdet.cluster_size_f_len = 4
catsdet.cluster_distance_t_len = 2
catsdet.cluster_distance_f_len = 2


In [116]:
# Magnitude of `catsdet.STFT * Noise`, averaged over axes 0, 1 and 3;
# the remaining axis is plotted against catsdet.stft_frequency.
noise_stft = catsdet.STFT * Noise
noise_psd = abs(noise_stft).mean(axis=(0, 1, 3))

f_dim = hv.Dimension('Frequency', unit='Hz')
fontsize = dict(labels=16, ticks=14)

# Log-log curve; the x-axis starts at 1 and its upper limit is left open (NaN).
psd_curve = hv.Curve((catsdet.stft_frequency, noise_psd), kdims=f_dim)
fig_noise_psd = psd_curve.opts(fig_size=250, aspect=2.5, logx=True,
                               fontsize=fontsize, ylabel='',
                               logy=True, xlim=(1, np.nan))
# hv.save(fig_noise_psd, f"figures/NoisePSD_{noise_id}.png", dpi=300)
fig_noise_psd

In [117]:
# Wall-clock timing of a single detection run on the noisy data.
t_start = timeit.default_timer()
catsres = catsdet.detect(D)
t_delta = timeit.default_timer() - t_start

# Linear extrapolation of the measured run time to one tebibyte of input.
hours_per_terabyte = t_delta * (1024**4 / D.nbytes ) / 3_600

print(f"{t_delta = : .3f} seconds")
print(f"1 Terabyte = {hours_per_terabyte : .2f} hours")

t_delta =  0.506 seconds
1 Terabyte =  7.07 hours


In [118]:
# Plot the detection result for one trace (one location, one component).
comp = 'Z'
comps = {'X' : 0, 'Y' : 1, 'Z': 2}

# Build the trace index with the component at axis `Cdim`, using the same
# insertion rule that defines the data layout (main_dims.insert(Cdim, "Component")).
# The previous (component, location) order matched the layout only for Cdim = 0;
# with Cdim = 1 it worked solely because 'Z' and the location index are both 2.
# Valid for Cdim in {0, 1}.
trace_index = [2]                      # location index
trace_index.insert(Cdim, comps[comp])  # component index
ind = tuple(trace_index)

fig = catsres.plot(ind).cols(1)
xlim = (np.nan, np.nan)  # NaN limits leave the time range unrestricted
fig.opts(hv.opts.Curve(xlim=xlim, linewidth=1), 
         hv.opts.Image(xlim=xlim, logz=True, ylim=(0.9, np.nan)))
# hv.save(fig, f'../figures/synthetic_demo_{comp}_{noise_id}.png', dpi=300)
fig

In [119]:
# SNR spectrogram of the selected trace, zeroed outside the clustered binary mask.
snr_trace = catsres.SNR_spectrogram[ind]
mask_trace = catsres.binary_spectrogram_clustered[ind]
SNRK = snr_trace * mask_trace

snrk_image = hv.Image((catsres.stft_time, catsres.stft_frequency, SNRK))
snrk_image.opts(aspect=2, logy=True, logz=True, fig_size=350,
                cmap='viridis', ylim=(0.9, np.nan), colorbar=True)

In [120]:
# SNR values at the time-frequency points kept by the clustered binary mask.
clustered_snr = catsres.SNR_spectrogram[catsres.binary_spectrogram_clustered]
print("Mean SNR: ", clustered_snr.mean())
print("Median SNR: ", np.median(clustered_snr))

Mean SNR:  7.172496894060344
Median SNR:  4.516465937802641


In [136]:
# Show the noise-free waveform of one trace, reusing the first panel of the result plot.
comp = 'Z'
comps = {'X' : 0, 'Y' : 1, 'Z': 2}

# Build the trace index with the component at axis `Cdim`, using the same
# insertion rule that defines the data layout (main_dims.insert(Cdim, "Component")).
# The previous (component, 50) order puts 50 on the three-element Component axis
# when Cdim = 1, so Dclean[ind] goes out of bounds. Valid for Cdim in {0, 1}.
# NOTE(review): assumes the data set has more than 50 locations - confirm against main_meta['nx'].
trace_index = [50]                     # location index
trace_index.insert(Cdim, comps[comp])  # component index
ind = tuple(trace_index)

fig = catsres.plot(ind).cols(1)
# Swap the plotted amplitude for the clean trace.
fig[0].data.Amplitude = Dclean[ind]
fig[0].opts(linewidth=1, fig_size=350)
# hv.save(fig[0], f"../figures/synthetic_demo_{comp}_clean.png", dpi=300)
fig[0]

In [121]:
# Plot styling for two-class (Noise / Event) images.
eps = 0.007
binary_cmap = ['white', 'blue']
binary_cticks = [(0, 'Noise'), (1, 'Event')]
fontsize = {'labels': 16, 'ticks': 15}
image_kwargs = {
    'colorbar': True, 'aspect': 3.5, 'fontsize': fontsize,
    'cmap': binary_cmap, 'cbar_ticks': binary_cticks,
    'clabel': '', 'xlabel': 'Time (s)', 'ylabel': 'Location (km)',
    'fig_size': 400, 'clim': (-0.5, 1.5),
}

# Reference mask on the sample-time axis: clean amplitude above `eps`.
true_detection_time = np.abs(Dclean) > eps
# STFT time axis for a record of len(time) samples.
stft_time = catsdet.STFT.forward_time_axis(len(time))
# Reference mask carried from `time` onto `stft_time` by the project helper.
# NOTE(review): the meaning of the positional arguments 1 and 0.5 is not visible here - confirm.
true_detection = cats.core.projection.ProjectFilterIntervals(
    true_detection_time, time, 1, 0.5, stft_time)
# hv.Image((stft_time, x, true_detection.max(0))).opts(**image_kwargs)
# hv.Image((catsres.stft_time, x, catsres.detection.max(0))).opts(**image_kwargs)

In [122]:
# Evaluate detections after collapsing axis `Cdim` with a maximum.
fp_max = int(0.5 / catsdet.stft_hop_sec)  # 0.5 expressed in STFT hops
truth_merged = true_detection.max(Cdim)
pred_merged = catsres.detection.max(Cdim)
labels, R = cats.EvaluateDetection(truth_merged, pred_merged, 0.9, fp_max)

report = "True Positive = {0}\tFalse Positive = {1}\nTrue Negative = {2}\tFalse Negative = {3}"
# The last axis of R is read in the order 1, 2, 3, 0 to fill TP, FP, TN, FN.
counts = [R[..., k].sum() for k in (1, 2, 3, 0)]
print(report.format(*counts))

True Positive = 202	False Positive = 11
True Negative = 255	False Negative = 52


In [123]:
# Four-class colour coding of the evaluation labels (-1, 0, 1, 2).
class_cmap = ['red', 'white', 'blue', 'green']
class_cticks = [(-1, 'Missed'), (0, 'Noise'), (1, 'Detected'), (2, 'False Alarm')]
fontsize = {'labels': 16, 'ticks': 15}
cl_image_kwargs = {
    'colorbar': True, 'aspect': 3.5, 'fontsize': fontsize,
    'cmap': class_cmap, 'cbar_ticks': class_cticks, 'clim': (-1.5, 2.5),
    'clabel': '', 'xlabel': 'Time (s)', 'ylabel': 'Location (km)',
    'fig_size': 400,
}

label_image = hv.Image((catsres.stft_time, x, labels))
fig = label_image.opts(**cl_image_kwargs)
# fig

In [124]:
# Wrap waveforms, detections and evaluation labels as labelled xarray objects.
# Waveforms use the sample-time axis; detections use the STFT time axis.
component_names = ['X', 'Y', 'Z']

data_meta = {
    "dims": main_dims,
    "coords": {"Location": x, "Component": component_names, "Time": time},
}
traces = {'Synthetic' : xr.DataArray(D, **data_meta)}
traces_clean = {'Synthetic' : xr.DataArray(Dclean, **data_meta)}

#################
det_meta = {
    "dims": main_dims,
    "coords": {"Location": x, "Component": component_names, "Time": catsres.stft_time},
}
detection_pred = {'Synthetic' : xr.DataArray(catsres.detection, **det_meta)}
detection_true = {'Synthetic' : xr.DataArray(true_detection, **det_meta)}

label_meta = {
    "dims": main_dims + ["Class"],
    "coords": {"Location": x, "Component": component_names,
               "Time": catsres.stft_time,
               "Class": ['TP', 'FP', 'FN']},
}

# Evaluation without collapsing axis `Cdim`; this overwrites `labels` and `R`.
labels, R = cats.EvaluateDetection(true_detection, catsres.detection, 0.9, fp_max)
# One boolean plane per class, stacked on a new trailing axis: 1 -> TP, 2 -> FP, -1 -> FN.
class_planes = [labels == 1, labels == 2, labels == -1]
class_labels = np.stack(class_planes, axis=-1)

detection_labels = {'Synthetic' : xr.DataArray(class_labels, **label_meta)}

counts = [R[..., k].sum() for k in (1, 2, 3, 0)]
print(report.format(*counts))

True Positive = 404	False Positive = 705
True Negative = 713	False Negative = 104


In [125]:
# Clean traces overlaid with the reference detection intervals.
fname = 'Synthetic'
fontsize = {'labels': 17, 'ticks': 15}
figsize = 450

fig = cats.plot_traces(traces_clean, detection_true, fname, comp, gain=0.75, rsp=6, alpha=0.2)
# Element-level font sizes first, then figure-level layout options.
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_clean_{fname}_{comp}.png', dpi=300)
fig

In [126]:
# Noisy traces overlaid with the detector output.
fontsize = {'labels': 17, 'ticks': 15}
figsize = 450

fig = cats.plot_traces(traces, detection_pred, fname, comp, gain=0.75, rsp=6, alpha=0.3)
# Element-level font sizes first, then figure-level layout options.
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_pred_{fname}_{comp}_{noise_id}.png', dpi=300)
fig

In [127]:
# Noisy traces overlaid with the TP / FP / FN evaluation labels.
fontsize = {'labels': 17, 'ticks': 15}
figsize = 450

fig = cats.plot_traces(traces, detection_labels, fname, comp, gain=0.75, rsp=6, alpha=0.3)
# Element-level font sizes first, then figure-level layout options.
fig = fig.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/synthetic_labels_{fname}_{comp}_{noise_id}.png', dpi=300)
fig

In [128]:
# Same report restricted to the selected component: second-to-last axis of R,
# with the last axis read in the order 1, 2, 3, 0 (TP, FP, TN, FN).
component_counts = [R[..., comps[comp], k].sum() for k in (1, 2, 3, 0)]
print(report.format(*component_counts))

True Positive = 202	False Positive = 11
True Negative = 255	False Negative = 52


# Real borehole data

In [129]:
import glob
from pathlib import Path
from scipy.io import loadmat, savemat, wavfile
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [130]:
# Collect every pickle file below the borehole data folder, in sorted order.
data_dir = Path("C:/Users/seraf/OneDrive - ualberta.ca/Documents/SampleData")
data_folder = data_dir.joinpath('BoreholeData_reformat')
data_paths = sorted(data_folder.rglob("*.pkl"))

In [139]:
# Load every borehole file into a labelled xarray.DataArray keyed by file name.
filenames = []
Data = {}
Headers = {}

# Position of the "Component" axis in the stored arrays and its matching label.
Cdim = 1
main_dims = ["Location", "Time"]
main_dims.insert(Cdim, "Component")

for i, fi in enumerate(tqdm(data_paths)):
    filename = fi.name.split('.')[0]
    filenames.append(filename)
    Di = api.utils.read_dict(fi)
    Headers[filename] = Di['hdrc']
    dt = Di['hdrc']['delta']
    # Build the time axis from an integer sample count. np.arange(0, n * dt, dt)
    # with a float step can return n + 1 samples because of rounding, and then
    # the "Time" coordinate no longer matches the data length.
    n_samples = Di['data'].shape[-1]
    time = np.arange(n_samples) * dt
    # NaNs are replaced by zeros and axes 0 and Cdim are swapped to match main_dims.
    Data[filename] = xr.DataArray(np.nan_to_num(Di['data']).swapaxes(0, Cdim), 
                                  dims=main_dims, 
                                  coords={"Location" : Di['hdrc']['stdp'], 
                                          "Component": ['X', 'Y', 'Z'], 
                                          "Time" : time})

  0%|          | 0/9 [00:00<?, ?it/s]

In [141]:
# Detector configuration for the borehole records
# (`dt` is the sampling interval read from the last loaded header).
detector_params = dict(
    dt_sec=dt,
    stft_window_sec=('hann', 0.1),
    stft_overlap=0.75,
    stft_nfft=512,
    minSNR=4.0,
    stationary_frame_sec=10.0,
    cluster_size_t_sec=0.08,
    cluster_size_f_Hz=30.,
    cluster_distance_t_sec=0.05,
    cluster_distance_f_Hz=20.0,
    clustering_with_SNR=True,
    clustering_multitrace=True,
    cluster_size_trace=1,
    cluster_distance_trace=2,
)
catsdet = cats.CATSDetector(**detector_params)

# Echo the `*_len` attributes exposed by the detector for these settings.
for attr in ("stationary_frame_len",
             "cluster_size_t_len",
             "cluster_size_f_len",
             "cluster_distance_t_len",
             "cluster_distance_f_len"):
    print(f"catsdet.{attr} = {getattr(catsdet, attr)!r}")

catsdet.stationary_frame_len = 400
catsdet.cluster_size_t_len = 3
catsdet.cluster_size_f_len = 3
catsdet.cluster_distance_t_len = 2
catsdet.cluster_distance_f_len = 2


In [142]:
# Run the detector on every loaded file; results are stored per method name, then per file.
name = "CATS"
Detection = {name: {}}

for fi, di in tqdm(Data.items(), desc=name):
    res = catsdet.detect(di.values)
    stft_time = res.stft_time
    detection = res.detection
    # Same dims and Location/Component coordinates as the input; Time is the STFT time axis.
    det_coords = {"Component" : di.coords["Component"],
                  "Location" : di.coords["Location"],
                  "Time" : stft_time}
    Detection[name][fi] = xr.DataArray(detection, dims=di.dims, coords=det_coords)

CATS:   0%|          | 0/9 [00:00<?, ?it/s]

In [143]:
# Traces of one borehole file with the CATS detection intervals overlaid.
fname = 'File137206'
comp = 'Z'
fontsize = {'labels': 17, 'ticks': 15}

figure = cats.plot_traces(Data, Detection['CATS'], fname=fname, comp=comp, gain=0.5, alpha=0.2)
# Element-level font sizes first, then figure-level layout (y-axis inverted).
figure = figure.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
figure = figure.opts(aspect=1.3, invert_yaxis=True)
figure

In [144]:
# Traces of a second borehole file, plotted with a larger gain.
fname = 'File137203'
comp = 'Z'
fontsize = {'labels': 17, 'ticks': 15}

figure = cats.plot_traces(Data, Detection['CATS'], fname=fname, comp=comp, gain=5, alpha=0.2)
# Element-level font sizes first, then figure-level layout (y-axis inverted).
figure = figure.opts(hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
figure = figure.opts(aspect=1.3, invert_yaxis=True)
figure

In [22]:
# Write the most recently built borehole figure to disk; the file name encodes file and component.
figure_path = f"../figures/WellData_{fname}_{comp}.png"
hv.save(figure, figure_path, dpi=300)