# Data Availability Wrangler

In [9]:
import pandas as pd
import pathlib
from aurora.test_utils.earthscope.helpers import DATA_AVAILABILITY_CSV
from aurora.test_utils.earthscope.helpers import PUBLIC_DATA_AVAILABILITY_PATH
DATA_AVAILABILITY_PATH = PUBLIC_DATA_AVAILABILITY_PATH

In [10]:
availability_files = DATA_AVAILABILITY_PATH.glob("mt_availability_*.txt")

In [11]:
availability_files = list(availability_files)

In [12]:
availability_files

[PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_XC.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_8P.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_US.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_EM.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_4P.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_1H.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_SF.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_IU.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_ZU.txt'),
 PosixPath('/home/kkappler/.cache/earthscope/data_availability/public/mt_availability_N4.txt'),
 PosixPath('/home/kkappler/.cache/earths

In [13]:
df_list = []
for af in availability_files:
    df = pd.read_csv(af, parse_dates=["Earliest","Latest","Span"])
    #print(df)
    df_list.append(df)

merged_df = pd.concat(df_list)

    

In [14]:
print(len(merged_df))

164966


In [15]:
merged_df.columns

Index(['#Network', 'Station', 'Location', 'Channel', 'Quality', 'SampleRate',
       'Earliest', 'Latest', 'Span'],
      dtype='object')

In [16]:
for col in merged_df.columns:
    unique_values = merged_df[col].unique()
    print(f"{col}: {len(unique_values)} unique values: \n {unique_values}")

#Network: 19 unique values: 
 ['XC' '8P' 'US' 'EM' '4P' '1H' 'SF' 'IU' 'ZU' 'N4' 'Z7' '8J' 'AV' '7I'
 'SN' 'BK' 'NV' 'YB' 'II']
Station: 2329 unique values: 
 ['FL001' 'FL002' 'FL003' ... 'MOJA' 'BFO' 'PFO']
Location: 14 unique values: 
 ['--' 40 9 '00' '40' 2 '01' '04' 'B1' 'W1' 'W2' 'W3' 0 20]
Channel: 34 unique values: 
 ['MFE' 'MFN' 'MFZ' 'MQE' 'MQN' 'LFE' 'LFN' 'LFZ' 'LQE' 'LQN' 'LF1' 'LF2'
 'VFE' 'VFN' 'VFZ' 'VQE' 'VQN' 'CF1' 'GF1' 'GF2' 'UFZ' 'VF1' 'VF2' 'CFZ'
 'UF1' 'UF2' 'BQ2' 'MF1' 'MF2' 'BF1' 'BF2' 'BFE' 'BFN' 'BFZ']
Quality: 1 unique values: 
 ['M']
SampleRate: 13 unique values: 
 [8.0000e+00 1.0000e+00 2.0000e-01 3.3333e-02 4.0000e+00 2.5000e+02
 4.0000e+03 1.0000e-02 1.0000e-01 5.0000e+02 2.0000e+02 4.0000e+01
 5.0000e+00]
Earliest: 59139 unique values: 
 <DatetimeArray>
[       '2015-01-08 19:49:15+00:00',        '2015-01-19 16:16:31+00:00',
        '2015-01-09 17:24:25+00:00',        '2015-01-19 19:33:06+00:00',
        '2015-01-10 15:52:11+00:00',        '2015-01-11 22

Looks like 34 different channel codes encountered

In [17]:
unique_channel_codes = merged_df.Channel.unique()
unique_channel_codes

array(['MFE', 'MFN', 'MFZ', 'MQE', 'MQN', 'LFE', 'LFN', 'LFZ', 'LQE',
       'LQN', 'LF1', 'LF2', 'VFE', 'VFN', 'VFZ', 'VQE', 'VQN', 'CF1',
       'GF1', 'GF2', 'UFZ', 'VF1', 'VF2', 'CFZ', 'UF1', 'UF2', 'BQ2',
       'MF1', 'MF2', 'BF1', 'BF2', 'BFE', 'BFN', 'BFZ'], dtype=object)

In [18]:
sample_rate_codes = [x[0] for x in unique_channel_codes]
sample_rate_codes = list(set(sample_rate_codes))
print(sample_rate_codes)

['B', 'U', 'L', 'G', 'V', 'M', 'C']


In [19]:
for src in sample_rate_codes:
    print(src, [x for x in unique_channel_codes if x[0]==src])

B ['BQ2', 'BF1', 'BF2', 'BFE', 'BFN', 'BFZ']
U ['UFZ', 'UF1', 'UF2']
L ['LFE', 'LFN', 'LFZ', 'LQE', 'LQN', 'LF1', 'LF2']
G ['GF1', 'GF2']
V ['VFE', 'VFN', 'VFZ', 'VQE', 'VQN', 'VF1', 'VF2']
M ['MFE', 'MFN', 'MFZ', 'MQE', 'MQN', 'MF1', 'MF2']
C ['CF1', 'CFZ']


Options for doing MT are {V, M, B, L}

OK, so FDSN does not support the concept of run, so we cannot groupby run (yet) ... 

Ideally, we would iterate over this df, calling each channel, and it would land in an appropriate spot in an h5 ...  that would be nice 
@Jared .. to discuss this concept.

But if it doesn't work, we can start by grouping the rows of the df into blocks by Net-Sta-Loc (all channels together getting called)


So lets, try grouping these into Runs and see in anything untoward happens ... 



In [20]:
locations = merged_df["Location"].unique()
merged_df.Location.value_counts()
# for loc_code in locations:
#     print(len(loc_code))

--    82583
9     44463
40    27019
20     4950
40     2356
0      1969
00      761
W1      273
W2      271
W3      266
B1       21
2        14
01       10
04       10
Name: Location, dtype: int64

In [21]:
net_sta_grouper = merged_df.groupby(["#Network", "Station"])

In [22]:
len(net_sta_grouper)

3401

Interesting, there are 3401 Net-Stas, but only 2329 Station codes ... which suggests that station names are used more than once.

In [23]:
net_sta_loc_grouper = merged_df.groupby(["#Network", "Station", "Location"])
print(len(net_sta_loc_grouper))

3407


And look, even more when we add Loc, suggesting that a Net-Sta was moved to another spot ... gadzooks - what have we gotten ourselves into ?

In [24]:
# for net_sta_loc, nsl_df in net_sta_grouper:
#     print(net_sta_loc, len(nsl_df))

### Look for "Normal" MT Stations, these have 5-channels

These net-sta-loc combinations will be appropriate for aurora MT processing.


In [25]:
five_channel_net_sta_locs = []
i_not_weird = 0
not_weird_networks = [] 
for net_sta_loc, nsl_df in net_sta_loc_grouper:
    network = net_sta_loc[0]
    n_ch = len(nsl_df)
    if n_ch == 5:  # this is "normal"
        i_not_weird+=1
        if network not in not_weird_networks:
            not_weird_networks.append(network)
            not_weird_networks = list(set(not_weird_networks))
        five_channel_net_sta_locs.append(net_sta_loc)
print(f"There are {i_not_weird} net_sta_loc combos with 5 channels")
print(f"These are found in the following networks: \n {not_weird_networks}")


There are 251 net_sta_loc combos with 5 channels
These are found in the following networks: 
 ['EM', 'ZU', 'XC', '1H', '8P', '4P', 'Z7']


### Check if location code is ever non degenerate for the cases of interest:

In [26]:
locs = [x[2] for x in five_channel_net_sta_locs]
unique_locs = list(set(locs))
print(f"Found {len(unique_locs)} unique location codes: \n {unique_locs}")
      

Found 1 unique location codes: 
 ['--']


OK, location is moot, and we can use as an iterator:

five_channel_net_sta_locs

So before launching, let's create a dataframe of the list of net-stas so that there 
is a shareable csv

In [27]:
n_sources = len(five_channel_net_sta_locs)
networks = n_sources * [""]
stations = n_sources * [""]
i_source=0
for net_sta_loc in five_channel_net_sta_locs:
    network = net_sta_loc[0]
    station = net_sta_loc[1]
    #print(station)
    networks[i_source] = network
    stations[i_source] = station
    i_source+= 1
    
df = pd.DataFrame(data={"network":networks, "station":stations})
print(df)
df.to_csv(DATA_AVAILABILITY_CSV, index=False)

    network station
0        1H   MB013
1        1H   MC011
2        1H   MC014
3        1H   MC016
4        1H   MD010
..      ...     ...
246      XC   FL020
247      XC   FL022
248      Z7   KAN04
249      Z7   KAN05
250      ZU   KSQ29

[251 rows x 2 columns]


In [28]:
print(len(merged_df))
reduced_df = merged_df[merged_df.Station.isin(stations)]
reduced_df = reduced_df[reduced_df["#Network"].isin(networks)]
print(len(reduced_df))

164966
1365


In [29]:
reduced_df.Channel.unique()

array(['MFE', 'MFN', 'MFZ', 'MQE', 'MQN', 'LFE', 'LFN', 'LFZ', 'LQE',
       'LQN', 'VFE', 'VFN', 'VFZ', 'VQE', 'VQN'], dtype=object)