# Cluster Analysis of Trimmed Spectrogram (CATS)

In [1]:
import numpy as np
import numba as nb
import holoviews as hv
# hv.extension('bokeh')

from scipy import signal, special, optimize, stats
import ssqueezepy as ssq
import timeit
import xarray as xr

In [2]:
import pyskbel as api
import cats

# Synthetic data

In [3]:
# Load one generated dataset (selected by `i`), build its spatial/time axes
# from the metadata, and crop everything to the [t1, t2] time window.
N_datasets = 25

# i = np.random.choice(N_datasets)
i = 2
folder = "C:/Users/seraf/OneDrive - ualberta.ca/Documents/WaveDatasets/GeneratedData"
meta_path = f"{folder}/MetaDataSetsNew/dataset_{i}_main.pkl"

# The same metadata file drives both the composed (noise=0.0) data and the axes.
Dclean = api.compose_from_meta(meta_path, noise=0.0, data_path=f'{folder}/SurfaceEvents')
main_meta = api.utils.read_dict(meta_path)
main_meta['data_path'] = 'C:\\Users\\seraf\\OneDrive - ualberta.ca\\Documents\\WaveDatasets\\GeneratedData\\SurfaceEvents'

x = np.linspace(main_meta['xmin'], main_meta['xmax'], main_meta['nx'])
time = np.linspace(main_meta['Tmin'], main_meta['Tmax'], main_meta['nT'])
dt = main_meta['dt']

# Move the last axis of the composed data to position Cdim and keep the
# dimension labels in the matching order.
Cdim = 0
Dclean = np.moveaxis(Dclean, -1, Cdim)
main_dims = ["Location", "Time"]
main_dims[Cdim:Cdim] = ["Component"]

# Crop to the window of interest (bounds inclusive) and restart time at zero.
t1, t2 = 8, 22
it1 = int(t1 / dt)
it2 = int(t2 / dt)
window = slice(it1, it2 + 1)
time = time[window]
time = time - time.min()
Dclean = Dclean[..., window]

Taper on axis = -2 is applied


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Build the noisy dataset D = Dclean + Noise and report the resulting SNR.
beta = 1
noise_ids = {-1: "high_freq", 0: "white", 1: "low_freq"}  # label used in figure file names
noise_id = noise_ids[beta]

# Random noise from the project helper plus a sinusoid with 50 cycles per unit
# of `time` (50 Hz if `time` is in seconds), broadcast over the two leading axes.
Noise = api.utils.get_noise((0.05, beta), Dclean.shape, axis=2)
harmonic = 0.07 * np.sin(time * 2 * np.pi * 50)
Noise += harmonic[None, None, :]
D = Dclean + Noise

# Signal RMS is measured only where the clean amplitude exceeds `eps`;
# noise RMS is measured over the whole array.
eps = 0.007
true_detection_time = np.abs(Dclean) > eps
signal_samples = Dclean[true_detection_time]
Srms = np.sqrt(np.mean(signal_samples**2))
Nrms = np.sqrt(np.mean(Noise**2))
snr = Srms**2 / Nrms**2
print("SNR = ", snr)
print("Noise color = ", noise_id)

SNR =  1.0364589266593776
Noise color =  low_freq


In [15]:
# Denoiser settings for the synthetic multi-trace dataset, kept in one mapping
# so the configuration can be inspected separately from the constructor call.
synthetic_denoiser_params = {
    "dt_sec": dt,
    "stft_window_sec": ('hann', 0.15),
    "stft_overlap": 0.5,
    "stft_nfft": 512,
    "minSNR": 4.5,
    "stationary_frame_sec": 1.0,
    "cluster_size_t_sec": 0.3,
    "cluster_size_f_Hz": 4.,
    "cluster_distance_t_sec": None,
    "cluster_distance_f_Hz": None,
    "min_neighbors": True,
    "clustering_with_SNR": True,
    "clustering_multitrace": True,
    "cluster_size_trace": 2,
    "cluster_distance_trace": 2,
    "stft_backend": 'ssqueezepy',
}
catsden = cats.CATSDenoiser(**synthetic_denoiser_params)

# Show the cluster distances the denoiser ended up with (both were passed as None).
for attr in ("cluster_distance_t_len", "cluster_distance_f_len"):
    print(f"catsden.{attr} = {getattr(catsden, attr)!r}")

catsden.cluster_distance_t_len = 2
catsden.cluster_distance_f_len = 2


In [16]:
# Denoise the noisy synthetic array D with the `catsden` configured above and
# report the run time. The result object `denres` is reused by later cells
# (denres.plot, denres.denoised_signal).
%time denres = catsden.denoise(D)

CPU times: total: 4.45 s
Wall time: 3.95 s


In [9]:
# Plot the denoising result for one component of one trace and save it to disk.
comp = 'Z'
trace = 63
comps = dict(X=0, Y=1, Z=2)

# Index into the data: the trace number with the component index placed at axis Cdim.
ind = [trace]
ind[Cdim:Cdim] = [comps[comp]]
ind = tuple(ind)

fig = denres.plot(ind).cols(1)

# NaN limits leave the x-range unset; only the lower y-limit of the images is fixed.
xlim = (np.nan, np.nan)
curve_style = hv.opts.Curve(xlim=xlim, linewidth=1)
image_style = hv.opts.Image(xlim=xlim, logz=True, ylim=(0.9, np.nan))
fig.opts(curve_style, image_style)

hv.save(fig, f'../figures/denoise_synthetic_demo_{comp}_{noise_id}.png', dpi=300)
fig

In [10]:
# Overlay the noise-free trace (thick black line) with the last panel of `fig`
# (dashed red line), zoomed to x in (2, 4).
# NOTE(review): `fig` must still be the figure built from `denres.plot(ind).cols(1)`;
# several later cells rebind `fig`, so this cell depends on execution order.
# `fig[-1]` is presumably the denoised trace - confirm against denres.plot.
fig_new = denres.plot(ind)
# Replace the amplitudes shown in the first panel with the clean signal at the same index.
fig_new[0].data.Amplitude = Dclean[ind]
fig_new[0].opts(xlim=(2, 4), xlabel='Time (s)', color='k',
                linewidth=8, fig_size=350) * fig[-1].opts(linewidth=3, color='red', 
                                                          linestyle='--')
# hv.save(fig[0], f"figures/CleanZ.png", dpi=300)

In [17]:
# Wrap the noisy, clean and denoised arrays as labelled DataArrays that share
# the same dimensions and coordinates; each goes into a dict keyed by 'Synthetic'.
data_meta = {
    "dims": main_dims,
    "coords": {
        "Component": ['X', 'Y', 'Z'],
        "Location": x,
        "Time": time,
    },
}

traces, traces_clean, traces_denoised = (
    {'Synthetic': xr.DataArray(values, **data_meta)}
    for values in (D, Dclean, denres.denoised_signal)
)

In [12]:
# Gather plot of the clean synthetic traces for the component selected in `comp`.
fname = 'Synthetic'
fontsize = {"labels": 17, "ticks": 15}
figsize = 450

fig = cats.plot_traces(traces_clean, None, fname, comp, gain=0.75, rsp=6, alpha=0.2)

# Element-level font sizes first, then figure-level layout and axis labels.
element_styles = (hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(*element_styles)
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/denoise_synthetic_clean_{fname}_{comp}.png', dpi=300)
fig

In [13]:
# Gather plot of the noisy synthetic traces, styled the same way as the clean plot.
fontsize = {"labels": 17, "ticks": 15}
figsize = 450

fig = cats.plot_traces(traces, None, fname, comp, gain=0.75, rsp=6, alpha=0.2)

# Element-level font sizes first, then figure-level layout and axis labels.
element_styles = (hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(*element_styles)
fig = fig.opts(aspect=2, fig_size=figsize, invert_axes=False,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/denoise_synthetic_noisy_{fname}_{comp}_{noise_id}.png', dpi=300)
fig

In [18]:
# Gather plot of the denoised synthetic traces (no invert_axes option is passed here).
fontsize = {"labels": 17, "ticks": 15}
figsize = 450

fig = cats.plot_traces(traces_denoised, None, fname, comp, gain=0.75, rsp=6, alpha=0.2)

# Element-level font sizes first, then figure-level layout and axis labels.
element_styles = (hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
fig = fig.opts(*element_styles)
fig = fig.opts(aspect=2, fig_size=figsize,
               ylabel='Location (km)', xlabel='Time (s)')
# hv.save(fig, f'../figures/denoise_synthetic_denoised_{fname}_{comp}_{noise_id}_SNR_mtrace_noFill.png', dpi=300)
fig

# Downhole data

In [18]:
import glob
from pathlib import Path
from scipy.io import loadmat, savemat, wavfile
from datetime import datetime, timedelta
from tqdm.notebook import tqdm

In [19]:
# Collect every .pkl file under the reformatted borehole data folder
# (searched recursively), in sorted order.
data_dir = Path("C:/Users/seraf/OneDrive - ualberta.ca/Documents/SampleData")
data_folder = data_dir.joinpath('BoreholeData_reformat')
data_paths = sorted(data_folder.rglob("*.pkl"))

In [20]:
# Read every downhole record into a labelled DataArray keyed by file name.
#   filenames : file names (text before the first '.') in load order
#   Data      : file name -> DataArray with dims `main_dims`
#   Headers   : file name -> the record's 'hdrc' header
filenames = []
Data = {}
Headers = {}

Cdim = 0
main_dims = ["Location", "Time"]
main_dims.insert(Cdim, "Component")

# No enumerate() here: the index was unused and only overwrote the global `i`.
for fi in tqdm(data_paths):
    filename = fi.name.split('.')[0]
    filenames.append(filename)
    # NOTE(review): read_dict presumably unpickles the file - only load trusted data.
    Di = api.utils.read_dict(fi)
    Headers[filename] = Di['hdrc']
    # `dt` is overwritten per file; the value from the last file is what later cells see.
    dt = Di['hdrc']['delta']
    # Build the time axis from an integer sample index. np.arange(0, n * dt, dt)
    # with a float step can return n + 1 points because of rounding, which would
    # make the "Time" coordinate length disagree with the data.
    n_samples = Di['data'].shape[-1]
    time = np.arange(n_samples) * dt
    Data[filename] = xr.DataArray(np.nan_to_num(Di['data']).swapaxes(0, Cdim), 
                                  dims=main_dims, 
                                  coords={"Component": ['X', 'Y', 'Z'], 
                                          "Location" : Di['hdrc']['stdp'], 
                                          "Time" : time})

  0%|          | 0/9 [00:00<?, ?it/s]

In [29]:
# Denoiser settings for the downhole records (single-trace clustering), kept in
# one mapping so the configuration can be inspected separately from the call.
downhole_denoiser_params = {
    "dt_sec": dt,
    "stft_window_sec": ('hann', 0.1),
    "stft_overlap": 0.75,
    "stft_nfft": 512,
    "minSNR": 4.5,
    "stationary_frame_sec": 1.0,
    "cluster_size_t_sec": 0.1,
    "cluster_size_f_Hz": 35.,
    "cluster_distance_t_sec": None,
    "cluster_distance_f_Hz": None,
    "min_neighbors": None,
    "clustering_with_SNR": True,
    "clustering_multitrace": False,
    "cluster_size_trace": 2,
    "cluster_distance_trace": 2,
    "stft_backend": 'ssqueezepy',
}
catsden = cats.CATSDenoiser(**downhole_denoiser_params)

# Report the sampling step and the sample-domain lengths exposed by the denoiser.
print(f"{dt = }")
reported_attrs = (
    "stft_window_len",
    "stft_nfft",
    "stationary_frame_len",
    "cluster_size_t_len",
    "cluster_size_f_len",
    "cluster_distance_t_len",
    "cluster_distance_f_len",
)
for attr in reported_attrs:
    print(f"catsden.{attr} = {getattr(catsden, attr)!r}")

dt = 0.00025
catsden.stft_window_len = 400
catsden.stft_nfft = 512
catsden.stationary_frame_len = 256
catsden.cluster_size_t_len = 4
catsden.cluster_size_f_len = 4
catsden.cluster_distance_t_len = 2
catsden.cluster_distance_f_len = 2


In [30]:
# Denoise every loaded record and re-wrap the output with the dims and
# coordinates of the corresponding input DataArray.
Denoised = {}
for fi, di in tqdm(Data.items()):
    result = catsden.denoise(di.values)
    shared_coords = {name: di.coords[name] for name in ("Component", "Location", "Time")}
    Denoised[fi] = xr.DataArray(result.denoised_signal, dims=di.dims, coords=shared_coords)

  0%|          | 0/9 [00:00<?, ?it/s]

In [27]:
# Plot the raw downhole record for one file/component and save it; the y-axis is inverted.
fname = 'File137206'
comp = 'Z'
fontsize = {"labels": 17, "ticks": 15}

figure = cats.plot_traces(Data, None, fname=fname, comp=comp, gain=1, alpha=0.2)

# Element-level font sizes first, then figure-level layout.
element_styles = (hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
figure = figure.opts(*element_styles)
figure = figure.opts(aspect=1.3, invert_yaxis=True)

hv.save(figure, f'../figures/denoise_downhole_noisy_{fname}_{comp}.png', dpi=300)
figure

In [31]:
# Plot the denoised downhole record with the same file, component and font
# sizes (`fname`, `comp`, `fontsize`) that the raw-data plot cell defined.
figure = cats.plot_traces(Denoised, None, fname=fname, comp=comp, gain=1, alpha=0.2)

# Element-level font sizes first, then figure-level layout.
element_styles = (hv.opts.Curve(fontsize=fontsize), hv.opts.Rectangles(fontsize=fontsize))
figure = figure.opts(*element_styles)
figure = figure.opts(aspect=1.3, invert_yaxis=True)

hv.save(figure, f'../figures/denoise_downhole_denoised_{fname}_{comp}.png', dpi=300)
figure

# NOIZEUS voice data

In [3]:
from scipy.io import wavfile

In [4]:
# Read the selected NOIZEUS recordings and stack their samples into one array
# (one row per file). The sampling step is hard-coded as 1 / 8000.
folder = "C:/Users/seraf/OneDrive - ualberta.ca/Documents/NOIZEUS"
files = ["train/5dB/sp01_train_sn5", "restaurant/10dB/sp01_restaurant_sn10"]
dt = 1 / 8000

wav_paths = [f"{folder}/{fi}.wav" for fi in files]
# wavfile.read returns (rate, samples); only the samples are kept.
voices = np.array([wavfile.read(wav_path)[1] for wav_path in wav_paths])

In [30]:
# Denoiser configuration for the NOIZEUS recordings (dt = 1 / 8000 comes from
# the loading cell).
# NOTE(review): the keyword names used here (min_dt_width_sec, min_df_width_Hz,
# neighbor_distance, date_Q, date_detection_mode, clusteringWithSNR, wiener)
# differ from the ones passed to cats.CATSDenoiser in the synthetic and
# downhole sections (cluster_size_t_sec, cluster_size_f_Hz, cluster_distance_*,
# clustering_with_SNR, ...). Presumably this cell was written against an older
# signature - confirm against the installed `cats` version before re-running.
catsden = cats.CATSDenoiser(dt_sec=dt,
                            stft_window_sec=('hann', 0.05),
                            stft_overlap=0.75,
                            stft_nfft=1024,
                            minSNR=4.0,
                            stationary_frame_sec=3.0,
                            min_dt_width_sec=0.1,
                            min_df_width_Hz=50.,
                            neighbor_distance=2,
                            min_neighbors=None,
                            date_Q=0.95,
                            date_detection_mode=False,
                            clusteringWithSNR=True,
                            wiener=False,
                            stft_backend='ssqueezepy')

In [31]:
# Denoise the stacked voice recordings and report the run time.
# NOTE(review): this calls `denoise_stepwise`, whereas the synthetic and
# downhole sections call `catsden.denoise` - confirm which method the installed
# `cats` version provides.
%time denres = catsden.denoise_stepwise(voices)

CPU times: total: 62.5 ms
Wall time: 62.5 ms


In [32]:
# Show the denoising result for the recording at index 1 using wide (aspect=4) panels.
ind = 1
fig = denres.plot(ind)

wide_curves = hv.opts.Curve(aspect=4, linewidth=1)
wide_images = hv.opts.Image(aspect=4, logz=True, logy=False)
fig.opts(wide_curves, wide_images)

In [33]:
# Write each denoised recording next to its source file as 16-bit PCM.

# Round instead of truncating: 1 / dt can land a hair below the intended
# integer, and int() alone would then give a sample rate that is off by one.
sample_rate = int(round(1 / dt))
pcm_range = np.iinfo(np.int16)

for i, fi in enumerate(files):
    # Clip before the cast so samples outside the int16 range saturate
    # instead of wrapping around.
    samples = np.clip(denres.denoised_signal[i], pcm_range.min, pcm_range.max)
    wavfile.write(f"{folder}/{fi}_denoised.wav", sample_rate, samples.astype(np.int16))