# example_batch_processing.ipynb
:auth: Nathan T. Stevens  
:email: ntsteven (at) uw.edu  
:org: Pacific Northwest Seismic Network  
:license: MIT (2023)  


:purpose:   
This notebook documents a brief example of batch machine learning processing on one hour of data (02:18–03:18 UTC on 9 October 2023) from the entire PNSN network surrounding the M4.3 earthquake in October 2023 near Port Townsend, WA.  

Note: this takes a while to run as a Jupyter Notebook and is likely faster to run as a `.py` script.  
          

In [1]:
# Import "standard" modules
import os
import sys
import pandas as pd
from obspy import read, Stream
from tqdm import tqdm
# Import repo-specific modules
# The parent directory is appended to the module search path first;
# presumably that is where the `core` package lives relative to this notebook.
sys.path.append('..')
import core.preprocessing as prep
import core.prediction as pred
import core.postprocessing as post

In [2]:
# Read the wfdisc.csv waveform index for test_dataset_2.
# The first CSV column becomes the index and the two time columns are parsed
# as datetimes.
_dataset_dir = os.path.join('..', 'data', 'test_dataset_2')
wfdisc_file = os.path.join(_dataset_dir, 'wfdisc.csv')
df_wf = pd.read_csv(
    wfdisc_file,
    index_col=[0],
    parse_dates=['time', 'endtime'],
)
display(df_wf)

Unnamed: 0,sta,bandinst,time,endtime,samprate,dir,dfile,lddate
0,ABER,EN,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/ABER,UW.ABER..EN?.2023.282.mseed,2023-11-07 13:31:48.891578
1,ABER,HH,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/ABER,UW.ABER..HH?.2023.282.mseed,2023-11-07 13:31:50.372572
2,AGNW,EN,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/AGNW,UW.AGNW..EN?.2023.282.mseed,2023-11-07 13:31:51.262202
3,AGNW,HH,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/AGNW,UW.AGNW..HH?.2023.282.mseed,2023-11-07 13:31:52.570386
4,ALCT,EN,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/ALCT,UW.ALCT..EN?.2023.282.mseed,2023-11-07 13:31:53.891929
...,...,...,...,...,...,...,...,...
625,YELM,EN,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/YELM,UW.YELM..EN?.2023.282.mseed,2023-11-07 13:48:03.218380
626,YELM,HH,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/YELM,UW.YELM..HH?.2023.282.mseed,2023-11-07 13:48:05.223467
627,YPT,EH,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/YPT,UW.YPT..EH?.2023.282.mseed,2023-11-07 13:48:06.518367
628,YPT,EN,2023-10-09T02:18:00,2023-10-09T03:18:00,100.0,../data/test_dataset_2/UW/YPT,UW.YPT..EN?.2023.282.mseed,2023-11-07 13:48:08.369481


In [8]:
# Load all data
# Every row of the wfdisc table supplies a directory (`dir`) and a file name
# (`dfile`); the waveforms read from each are appended to one Stream.
stream = Stream()
for _S in tqdm(df_wf.itertuples(index=False), total=len(df_wf)):
    # obspy.read() selects the reader via the `format` keyword. The previous
    # `fmt=` keyword is not part of its signature, so the format was being
    # auto-detected instead of explicitly set.
    stream += read(os.path.join(_S.dir, _S.dfile), format='MSEED')


100%|██████████| 630/630 [00:02<00:00, 214.24it/s]


In [9]:
# Load ML model.
# Prefer the Metal Performance Shaders (mps) backend for Apple M1/M2 silicon,
# then CUDA, and finally fall back to the CPU so the notebook also runs on
# machines without an mps device.
_torch = pred.torch
if _torch.backends.mps.is_available():
    _device_name = 'mps'
elif _torch.cuda.is_available():
    _device_name = 'cuda'
else:
    _device_name = 'cpu'
model, device = prep.initialize_EQT_model(device=_torch.device(_device_name))

In [11]:
# Run preprocessing
# Split the stream by unique Net.Sta.Loc.BandInst codes; `merge_kwargs` is
# handed through to the helper unchanged.
_merge_options = {'method': 1}
NSLBI_dict = prep.stream_to_NSLBI_dict(
    stream,
    merge_kwargs=_merge_options,
    tqdm_disable=False,
)
display(NSLBI_dict)


100%|██████████| 630/630 [02:52<00:00,  3.66it/s]


{'UW.ABER..EN?': 3 Trace(s) in Stream:
UW.ABER..ENE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..ENN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..ENZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples,
 'UW.ABER..HH?': 3 Trace(s) in Stream:
UW.ABER..HHE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..HHN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..HHZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples,
 'UW.AGNW..EN?': 3 Trace(s) in Stream:
UW.AGNW..ENE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.AGNW..ENN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.AGNW..ENZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 

In [12]:
# Resample & pad the per-NSLBI streams, keeping the progress bar enabled
_homogenize_options = dict(trim_bound='max', tqdm_disable=False)
NSLBI_dict_h = prep.homogenize_NSLBI_dict(NSLBI_dict, **_homogenize_options)
display(NSLBI_dict_h)

100%|██████████| 630/630 [00:26<00:00, 24.22it/s]


{'UW.ABER..EN?': 3 Trace(s) in Stream:
UW.ABER..ENE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..ENN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..ENZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples,
 'UW.ABER..HH?': 3 Trace(s) in Stream:
UW.ABER..HHE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..HHN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.ABER..HHZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples,
 'UW.AGNW..EN?': 3 Trace(s) in Stream:
UW.AGNW..ENE | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.AGNW..ENN | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 Hz, 360001 samples
UW.AGNW..ENZ | 2023-10-09T02:18:00.000000Z - 2023-10-09T03:18:00.000000Z | 100.0 

In [15]:
# Form the data windows and the station-window-index from the homogenized
# streams
_windowing_result = prep.NSLBI_dict_to_windows(
    NSLBI_dict_h,
    model,
    tqdm_disable=False,
)
windows, swindex = _windowing_result

  windows[_s, :, :] /= np.max(np.abs(windows[_s, :, :]), axis=-1, keepdims=True)
100%|██████████| 630/630 [07:38<00:00,  1.37it/s]


In [16]:
# Report the dimensions of the windowed data array
print('The shape of windows is {}'.format(windows.shape))

The shape of windows is (122965, 3, 6000)


In [17]:
# Set batch size based on (#threads - 1)*2, as reported by torch.
# Clamp to at least 1: with a single thread the formula evaluates to 0, which
# is not a usable batch size.
batch_size = max(1, (pred.torch.get_num_threads() - 1) * 2)
# Run prediction
preds = pred.run_batched_prediction(windows, model, device, batch_size=batch_size)

100%|██████████| 8782/8782 [08:54<00:00, 16.43it/s]


In [24]:
# Reassemble predictions into streams
# NOTE(review): this passes the un-homogenized NSLBI_dict rather than
# NSLBI_dict_h -- confirm that is the intended input.
pred_stream = post.reassemble_multistation_preds(
    preds,
    swindex,
    model,
    NSLBI_dict,
    tqdm_disable=False,
)

  axis=0)
100%|██████████| 630/630 [00:06<00:00, 102.00it/s]


In [26]:
# Print the extended string form of the prediction stream
_pred_summary = pred_stream.__str__(extended=True)
print(_pred_summary)

1742 Trace(s) in Stream:
UW.ABER.EW.END  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.ABER.EW.ENP  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.ABER.EW.ENS  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.ABER.EW.HHD  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.ABER.EW.HHP  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.ABER.EW.HHS  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.AGNW.EW.END  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.AGNW.EW.ENP  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.AGNW.EW.ENS  | 2023-10-09T02:18:05.000000Z - 2023-10-09T03:17:47.990000Z | 100.0 Hz, 358300 samples
UW.AGNW.EW.HHD  | 2023-10-09T02:18:05.000000Z - 

In [31]:
# Write prediction traces to disk: one MiniSEED file per key of NSLBI_dict,
# stored under <write_root>/<network>/<station>/
write_root = os.path.join('..','data','test_dataset_2','')
for _k in tqdm(NSLBI_dict.keys()):
    # Keys have the form Net.Sta.Loc.BandInst
    _n, _s, _l, _bi = _k.split('.')
    # Select first, then copy: copying before selecting duplicated the whole
    # prediction stream on every iteration.
    _st = pred_stream.select(network=_n, station=_s, channel=_bi).copy()
    if len(_st) == 0:
        # Without traces there is nothing to write, and `_st[0]` below would
        # raise IndexError. Report the skip instead of aborting the loop.
        tqdm.write(f'No prediction traces found for {_k}; skipping')
        continue
    _stats = _st[0].stats
    write_fpath = os.path.join(write_root, _n, _s)
    # Make sure the destination directory exists before writing
    os.makedirs(write_fpath, exist_ok=True)
    save_name = f'{_n}.{_s}.{_stats.location}.{_bi}.{_stats.starttime.year:d}.{_stats.starttime.julday:03d}.mseed'
    _save_fp = os.path.join(write_fpath, save_name)
    # Stream.write() takes the output format via `format`; the previous
    # `fmt=` keyword is not part of its signature.
    _st.write(_save_fp, format='MSEED')


A suitable encoding will be chosen.
100%|██████████| 630/630 [02:05<00:00,  5.00it/s]
