# Getting started with Quakeflow

https://github.com/wayneweiqiang/QuakeFlow


An extension of the QuakeFlow demo notebooks, looking at the Ridgecrest earthquake.

https://github.com/wayneweiqiang/QuakeFlow#data-process 
https://wayneweiqiang.github.io/QuakeFlow/workflow/


https://earthquake.usgs.gov/storymap/index-ridgecrest.html 

## Configure software environment

We've created a lock file to ensure a reproducible environment is created with locked package versions.

```
conda env create -f environment.yml
conda activate quakeflow
conda list --explicit > conda-linux-64.lock
```

**Run the following cell (it creates the environment only if it does not already exist), then Kernel --> Change Kernel --> quakeflow. NOTE: you may need to log out and log back in for the new kernel to appear.**

In [None]:
import os
# Build the 'quakeflow' conda environment from the lock file, but only when the
# environment directory is not already present.
if not os.path.exists('/home/studio-lab-user/.conda/envs/quakeflow'):
    %conda create -y -n quakeflow --file conda-linux-64.lock

In [None]:
import json
import logging
import time
from collections import defaultdict

import numpy as np
import obspy
import pandas as pd
import requests
from obspy.clients.fdsn import Client

# from google.colab import data_table
# data_table.enable_dataframe_formatter()

In [None]:
# Additional useful libraries
import geopandas as gpd

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Plotting configuration

%matplotlib inline
%config InlineBackend.figure_format='retina'

## Get Data from API

client = "SCEDC" # http://service.scedc.caltech.edu "Southern California Earthquake Data Center"

CI: Southern California Seismic Network https://www.fdsn.org/networks/detail/CI/

In [None]:
# --- Study area, time window and data source for the demo ---
region_name = "Ridgecrest_demo"
center = (-117.504, 35.705)  # (longitude, latitude) of the box centre
horizontal_degree = 1.0  # full width of the box in degrees of longitude
vertical_degree = 1.0  # full height of the box in degrees of latitude
starttime = obspy.UTCDateTime("2019-07-04T17")
endtime = obspy.UTCDateTime("2019-07-04T18")
# http://service.scedc.caltech.edu "Southern California Earthquake Data Center"
client = "SCEDC"
network_list = ["CI"]
# Alternative channel selection that also includes HN*: "HH*,BH*,EH*,HN*"
channel_list = "HH*,BH*,EH*"

# Everything downstream reads its settings from this single dictionary.
config = {
    "region": region_name,
    "center": center,
    "xlim_degree": [center[0] - horizontal_degree / 2, center[0] + horizontal_degree / 2],
    "ylim_degree": [center[1] - vertical_degree / 2, center[1] + vertical_degree / 2],
    "starttime": starttime.datetime.isoformat(),
    "endtime": endtime.datetime.isoformat(),
    "networks": network_list,
    "channels": channel_list,
    "client": client,
}

### Events

In [None]:
# Query the IRIS FDSN event service for the configured bounding box and time window
lon_min, lon_max = config["xlim_degree"]
lat_min, lat_max = config["ylim_degree"]
events = Client("iris").get_events(
    starttime=config["starttime"],
    endtime=config["endtime"],
    minlongitude=lon_min,
    maxlongitude=lon_max,
    minlatitude=lat_min,
    maxlatitude=lat_max,
)

In [None]:
# returns copy of plot as variable
p = events.plot()

In [None]:
print(f"Number of events: {len(events)}")

####### Save catalog ########
# Fixed column schema: the catalog keeps these columns even when no event
# qualifies, so the CSV export (which selects them by name) cannot fail on
# missing columns.
catalog_columns = ["time", "magnitude", "longitude", "latitude", "depth(m)"]

catalog = defaultdict(list)
for event in events:
    # The first magnitude and first origin are indexed below; skip events
    # that lack either one instead of raising IndexError.
    if not event.magnitudes or not event.origins:
        continue
    origin = event.origins[0]
    catalog["time"].append(origin.time.datetime)
    catalog["magnitude"].append(event.magnitudes[0].mag)
    catalog["longitude"].append(origin.longitude)
    catalog["latitude"].append(origin.latitude)
    catalog["depth(m)"].append(origin.depth)

catalog = pd.DataFrame(dict(catalog), columns=catalog_columns)#.sort_values(["time"]).reset_index(drop=True)


catalog.head()

In [None]:
# Save a CSV
catalog.to_csv(
    'events.csv',
    sep="\t",
    index=False,
    float_format="%.3f",
    date_format='%Y-%m-%dT%H:%M:%S.%f',
    columns=["time", "magnitude", "longitude", "latitude", "depth(m)"],
)

In [None]:
# can also save as GeoJSON
# Build the GeoDataFrame on a copy so that the geometry column and any later
# edits to `gf` never modify `catalog` itself.
gf = gpd.GeoDataFrame(
    catalog.copy(),
    geometry=gpd.points_from_xy(catalog.longitude, catalog.latitude),
    crs=4326,
)
gf.to_file('events.json', driver='GeoJSON')

In [None]:
# Workaround for an issue with pandas timestamps in gf.explore():
# https://github.com/geopandas/geopandas/issues/1906
# The time column is cast to plain strings before building the interactive map.
gf['time'] = gf.time.astype('str')
# NOTE(review): the 'Stamen Terrain' tile set may not be available in newer
# folium/xyzservices releases - confirm against the installed versions.
gf.explore(column='magnitude', cmap='viridis', tiles='Stamen Terrain')

### Stations

In [None]:
# Request the station inventory (down to response level) for the configured
# networks, channels, bounding box and time window.
west, east = config["xlim_degree"]
south, north = config["ylim_degree"]
stations = Client(config["client"]).get_stations(
    network=",".join(config["networks"]),
    station="*",
    starttime=config["starttime"],
    endtime=config["endtime"],
    minlongitude=west,
    maxlongitude=east,
    minlatitude=south,
    maxlatitude=north,
    channel=config["channels"],
    level="response",
)

In [None]:
s = stations.plot()

In [None]:
# Collapse the channel inventory into one entry per
# "network.station.location.band" id. The component letters and sensitivities
# of the channels sharing an id are accumulated as comma-separated strings.
station_locs = {}
for net in stations:
    for sta in net:
        for channel in sta:
            orientation = channel.code[-1]
            sensitivity = channel.response.instrument_sensitivity
            gain = f"{sensitivity.value:.2f}"
            sid = f"{net.code}.{sta.code}.{channel.location_code}.{channel.code[:-1]}"
            if sid not in station_locs:
                # First channel seen for this id: record coordinates and input unit
                station_locs[sid] = {
                    "longitude": channel.longitude,
                    "latitude": channel.latitude,
                    "elevation(m)": channel.elevation,
                    "component": orientation,
                    "response": gain,
                    "unit": sensitivity.input_units.lower(),
                }
                continue
            station_locs[sid]["component"] += f",{orientation}"
            station_locs[sid]["response"] += f",{gain}"

station_locs = pd.DataFrame.from_dict(station_locs, orient='index')
station_locs["id"] = station_locs.index

In [None]:
station_locs.head()

In [None]:
station_locs.to_csv('stations.csv')

In [None]:
# Interactive visualization with geopandas geodataframe
gf = gpd.GeoDataFrame(station_locs.copy(), 
                      geometry=gpd.points_from_xy(station_locs.longitude, station_locs.latitude),
                      crs=4326,
                     )

gf.to_file('stations.json', driver='GeoJSON')

gf.explore()

### Waveforms

In [None]:
client = Client(config["client"])
interval = 30 #s
# interval = 3600 #s

# for event in events:
def download(event, stations):
    '''
    Download waveforms for one event from every station of an inventory.

    The request window starts at the time of the event's first origin and
    lasts `interval` seconds. Requests are sent through the module-level
    `client` for the channels in `config["channels"]`. Traces that are not
    sampled at 100 Hz are linearly interpolated to 100 Hz.

    A failed request is retried up to 10 times with a 5 second pause between
    attempts, except when the service answers "No data available for
    request.", in which case the station is skipped immediately. Failures are
    reported with print() and never raised.

    Parameters:
        event: obspy event; only event["origins"][0].time is read
        stations: obspy inventory, iterated as networks containing stations

    Output: obspy stream holding the traces of all stations that returned data
    '''
    starttime = event["origins"][0].time
    endtime = starttime + interval

    max_retry = 10
    no_data_message = "No data available for request."
    stream = obspy.Stream()
    for network in stations:
        for station in network:
            print(f"********{network.code}.{station.code}********")
            for attempt in range(1, max_retry + 1):
                try:
                    tmp = client.get_waveforms(
                        network.code, station.code, "*", config["channels"], starttime, endtime
                    )
                    for trace in tmp:
                        if trace.stats.sampling_rate != 100:
                            # interpolate() resamples the trace in place
                            trace.interpolate(100, method="linear")
                    stream += tmp
                    break
                except Exception as err:
                    print("Error {}.{}: {}".format(network.code, station.code, err))
                    # Retrying cannot help when the data simply does not exist
                    if str(err).startswith(no_data_message):
                        break
                    # No point in waiting after the final attempt
                    if attempt < max_retry:
                        time.sleep(5)
            else:
                # Reached only when every attempt failed (no break happened)
                print(f"MAX {max_retry} retries reached : {network.code}.{station.code}")

    # stream.attach_response(stations)
    # stream = stream.remove_sensitivity()
    return stream

In [None]:
mseed = download(events[0], stations)

In [None]:
type(mseed)

In [None]:
#print(mseed.__str__(extended=True))
mseed

In [None]:
# https://docs.obspy.org/tutorial/code_snippets/waveform_plotting_tutorial.html

# mseed.plot() #all channels all stations!
# mseed.plot(type='section')

t = mseed[0:3].plot()

#### Convert to numpy arrays

In [None]:
sampling_rate = 100
n_channel = 3
dtype = "float32"
amplitude = True
remove_resp = True

def convert_mseed(mseed, station_locs):
    """Convert an obspy stream into one fixed-size numpy array per station.

    The stream is detrended (spline, falling back to demean), merged, trimmed
    and padded to a common time window, and resampled to the module-level
    `sampling_rate`. For every row of `station_locs` the traces of its
    components are copied into an array of shape (nt, n_channel). Traces whose
    unit is "m/s**2" are integrated and high-pass filtered at 1 Hz first. When
    the module-level `remove_resp` is true, each channel is divided by the
    sensitivity stored in the "response" column.

    Parameters:
        mseed: obspy stream with the downloaded traces (modified in place)
        station_locs: DataFrame indexed by "network.station.location.band"
            ids, with comma-separated "component" and "response" columns and
            a "unit" column

    Returns:
        dict with keys "data" (array stacked over the stations that have at
        least one trace), "t0" (start time string per station), "station_id"
        and "fname" (both the list of station ids).

    Raises:
        ValueError: if `mseed` contains no traces.
    """
    if len(mseed) == 0:
        raise ValueError("convert_mseed: the input stream contains no traces")

    try:
        mseed = mseed.detrend("spline", order=2, dspline=5 * mseed[0].stats.sampling_rate)
    except Exception:
        logging.error("Error: spline detrend failed, falling back to demean")
        mseed = mseed.detrend("demean")
    mseed = mseed.merge(fill_value=0)
    starttime = min(trace.stats.starttime for trace in mseed)
    endtime = max(trace.stats.endtime for trace in mseed)
    mseed = mseed.trim(starttime, endtime, pad=True, fill_value=0)

    for i, trace in enumerate(mseed):
        if trace.stats.sampling_rate != sampling_rate:
            logging.warning(
                f"Resampling {trace.id} from {trace.stats.sampling_rate} to {sampling_rate} Hz"
            )
            mseed[i] = trace.interpolate(sampling_rate, method="linear")

    # Sort rank of the component letters, and the fixed output column used
    # for stations that do not have exactly three components.
    sort_rank = {key: i for i, key in enumerate(['3', '2', '1', 'E', 'N', 'Z'])}
    comp2idx = {"3": 0, "2": 1, "1": 2, "E": 0, "N": 1, "Z": 2}

    nt = max(len(trace.data) for trace in mseed)
    t0_str = starttime.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
    data = []
    station_id = []
    t0 = []
    for sta, row in station_locs.iterrows():
        trace_data = np.zeros([nt, n_channel], dtype=dtype)
        empty_station = True
        unit = row["unit"]
        comp = row["component"].split(",")
        if remove_resp:
            resp = [float(value) for value in row["response"].split(",")]
        else:
            # Sensitivities are not used in this mode
            resp = [1.0] * len(comp)

        # Pair every component with its own sensitivity BEFORE sorting, so the
        # two cannot get out of step when the stored order differs from the
        # sorted order.
        pairs = sorted(zip(comp, resp), key=lambda pair: sort_rank[pair[0][-1]])

        for j, (c, resp_j) in enumerate(pairs):
            if len(comp) != 3:  ## less than 3 component
                j = comp2idx[c]

            selected = mseed.select(id=sta + c)
            if len(selected) == 0:
                print(f"Empty trace: {sta+c} {starttime}")
                continue
            empty_station = False

            trace = selected[0]
            if unit == "m/s**2":
                # Acceleration: integrate, then high-pass at 1 Hz
                trace = trace.integrate()
                trace = trace.filter("highpass", freq=1.0)
            elif unit != "m/s":
                # Unknown unit: report it and keep the raw samples
                print(f"Error in {sta}\n{unit} should be m/s**2 or m/s!")

            samples = trace.data.astype(dtype)
            trace_data[: len(samples), j] = samples[:nt]

            if remove_resp:
                trace_data[:, j] /= resp_j

        if not empty_station:
            data.append(trace_data)
            station_id.append(sta)
            t0.append(t0_str)

    if data:
        data = np.stack(data)
    else:
        # No station had any trace: return an empty array rather than letting
        # np.stack fail on an empty list.
        data = np.zeros([0, nt, n_channel], dtype=dtype)

    meta = {"data": data, "t0": t0, "station_id": station_id, "fname": station_id}

    return meta

In [None]:
meta = convert_mseed(mseed, station_locs)

In [None]:
meta.keys()

## P/S Picks Phasenet

The code below calls an API, posting a dictionary containing 'id', 'timestamp', and 'vec' (the numpy array converted to a Python list), and returns JSON with model-generated P and S picks (time, probability, type).

I think this operates per trace and does not consider the fact that channels are from the same station / traces are for the same event.

In [None]:
%%time

PHASENET_API_URL = "http://phasenet.quakeflow.com"

# Single-request alternative (all stations at once):
# req = {"id": meta["station_id"],
#        "timestamp": meta["t0"],
#        "vec": meta["data"].tolist()}
# resp = requests.post(f'{PHASENET_API_URL}/predict', json=req)
# phasenet_picks = resp.json()

# Send the stations to the API a few at a time and collect all returned picks
batch = 4
phasenet_picks = []
for j in range(0, len(meta["station_id"]), batch):
    req = {
        "id": meta["station_id"][j:j+batch],
        "timestamp": meta["t0"][j:j+batch],
        "vec": meta["data"][j:j+batch].tolist(),
    }

    resp = requests.post(f'{PHASENET_API_URL}/predict', json=req)
    # Stop on an HTTP error instead of mixing an error payload into the picks
    resp.raise_for_status()
    phasenet_picks.extend(resp.json())

print('PhaseNet picks')
df = pd.DataFrame(phasenet_picks)
df.head()

In [None]:
# Save for later so you don't have to call API again
df.to_csv('phasenet.csv')

In [None]:
# TODO: plot pick timestamps with waveforms

## GaMMA

Takes a set of P- and S-wave arrival picks from a network of stations and associates them into individual earthquake events


NOTE: x,y,z domain bounds are set as parameters. Again, data is passed in JSON format as lists of phasenet_picks and station locations

In [None]:
GAMMA_API_URL = "http://gamma.quakeflow.com"

# stations_json = json.loads(station_locs.to_json(orient="records"))
stations_json = station_locs.to_dict(orient="records")
# Association domain: horizontal bounds reuse the data-request box
config_gamma = {
    'xlim_degree': config["xlim_degree"],
    'ylim_degree': config["ylim_degree"],
    'z(km)': [0, 41],
}

result = requests.post(
    f'{GAMMA_API_URL}/predict',
    json={"picks": phasenet_picks, "stations": stations_json, "config": config_gamma},
)
# Fail loudly on an HTTP error rather than with a KeyError on the payload below
result.raise_for_status()

result = result.json()
catalog_gamma = result["catalog"]
picks_gamma = result["picks"]
print("GaMMA catalog:")
# Passing the columns to the constructor keeps the table well-formed even
# when the service returns an empty catalog.
dfC = pd.DataFrame(
    catalog_gamma,
    columns=["time", "latitude", "longitude", "depth(m)", "magnitude", "covariance"],
)
display(dfC)
print("GaMMA association:")
dfA = pd.DataFrame(picks_gamma)
display(dfA)

In [None]:
# NOTE: optionally, you can run both PhaseNet and GaMMA with a single API call

# PHASENET_API_URL = "http://phasenet.quakeflow.com"

# req = {"id": meta["station_id"], 
#        "timestamp": meta["t0"],
#        "vec": meta["data"].squeeze().tolist(),
#        "stations": stations_json,
#        "config": config_gamma}

# resp = requests.post(f'{PHASENET_API_URL}/predict_phasenet2gamma2ui', json=req)
# print(resp.json())
# result = resp.json()
# catalog_gamma = result["catalog"]
# picks_gamma = result["picks"]
# print("Catalog:")
# display(pd.DataFrame(catalog_gamma)[["time", "latitude", "longitude", "depth(m)", "magnitude", "covariance"]])
# print("Association:")
# display(pd.DataFrame(picks_gamma))

# Compare! 

In [None]:
event = events[0]
print(event.origins[0])
print(event.magnitudes[0])

In [None]:
catalog.iloc[[0]]

In [None]:
dfC

In [None]:
# NOTE: depth is very different! 