# Data Availability Wrangler

In [6]:
import pandas as pd
from aurora.test_utils.earthscope.helpers import DATA_AVAILABILITY_PATH

In [3]:
availability_files = DATA_AVAILABILITY_PATH.glob("mt_availability_*.txt")

In [4]:
availability_files = list(availability_files)

In [5]:
availability_files

[PosixPath('data_availability/public/mt_availability_1H.txt'),
 PosixPath('data_availability/public/mt_availability_7I.txt'),
 PosixPath('data_availability/public/mt_availability_NV.txt'),
 PosixPath('data_availability/public/mt_availability_EM.txt'),
 PosixPath('data_availability/public/mt_availability_II.txt'),
 PosixPath('data_availability/public/mt_availability_XC.txt'),
 PosixPath('data_availability/public/mt_availability_US.txt'),
 PosixPath('data_availability/public/mt_availability_BK.txt'),
 PosixPath('data_availability/public/mt_availability_YB.txt'),
 PosixPath('data_availability/public/mt_availability_4P.txt'),
 PosixPath('data_availability/public/mt_availability_8J.txt'),
 PosixPath('data_availability/public/mt_availability_AV.txt'),
 PosixPath('data_availability/public/mt_availability_Z7.txt'),
 PosixPath('data_availability/public/mt_availability_8P.txt'),
 PosixPath('data_availability/public/mt_availability_IU.txt'),
 PosixPath('data_availability/public/mt_availability_SF

In [8]:
df_list = []
for af in availability_files:
    df = pd.read_csv(af, parse_dates=["Earliest","Latest","Span"])
    #print(df)
    df_list.append(df)

merged_df = pd.concat(df_list)

    

In [9]:
print(len(merged_df))

164598


In [10]:
merged_df.columns

Index(['#Network', 'Station', 'Location', 'Channel', 'Quality', 'SampleRate',
       'Earliest', 'Latest', 'Span'],
      dtype='object')

In [14]:
for col in merged_df.columns:
    unique_values = merged_df[col].unique()
    print(f"{col}: {len(unique_values)} unique values: \n {unique_values}")

#Network: 19 unique values: 
 ['1H' '7I' 'NV' 'EM' 'II' 'XC' 'US' 'BK' 'YB' '4P' '8J' 'AV' 'Z7' '8P'
 'IU' 'SF' 'ZU' 'N4' 'SN']
Station: 2329 unique values: 
 ['MA011' 'MB012' 'MB013' ... 'EM002' 'EM030' 'EM060']
Location: 14 unique values: 
 ['--' 2 'B1' 'W1' 'W2' 'W3' 0 20 40 '00' '40' 9 '01' '04']
Channel: 34 unique values: 
 ['LFE' 'LFN' 'LFZ' 'LQE' 'LQN' 'VF1' 'VF2' 'MF1' 'MF2' 'MFZ' 'MFE' 'MFN'
 'VFE' 'VFN' 'VFZ' 'VQE' 'VQN' 'MQE' 'MQN' 'BF1' 'BF2' 'BFE' 'BFN' 'BFZ'
 'LF1' 'LF2' 'UF1' 'UF2' 'BQ2' 'UFZ' 'CF1' 'GF1' 'GF2' 'CFZ']
Quality: 1 unique values: 
 ['M']
SampleRate: 13 unique values: 
 [1.0000e+00 2.0000e-01 5.0000e+00 8.0000e+00 3.3333e-02 4.0000e+00
 4.0000e+01 1.0000e-02 1.0000e-01 2.5000e+02 4.0000e+03 5.0000e+02
 2.0000e+02]
Earliest: 58898 unique values: 
 <DatetimeArray>
[       '2018-09-09 21:52:25+00:00',        '2018-09-09 22:40:58+00:00',
        '2018-09-10 21:49:12+00:00',        '2018-09-19 02:17:40+00:00',
        '2018-09-19 03:28:29+00:00',        '2018-09-

OK, so FDSN does not support the concept of run, so we cannot groupby run (yet) ... But that is maybe OK ...

Ideally, we would iterate over this df, calling each channel, and it would land in an appropriate spot in an h5 ...  that would be nice 
@Jared .. to discuss this concept.

But if it doesn't work, we can start by grouping the rows of the df into blocks by Net-Sta-Loc (all channels together getting called)


So lets, try grouping these into Runs and see in anything untoward happens ... 



In [15]:
net_sta_grouper = merged_df.groupby(["#Network", "Station"])

In [16]:
len(net_sta_grouper)

3401

Interesting, there are 3401 Net-Stas, but only 2329 Station codes ... 

In [17]:
net_sta_loc_grouper = merged_df.groupby(["#Network", "Station", "Location"])
print(len(net_sta_loc_grouper))

3407


And look, even more when we add Loc, suggesting that a Net-Sta was moved to another spot ... gadzooks - what have we gotten ourselves into ?

In [19]:
for net_sta_loc, nsl_df in net_sta_grouper:
    print(net_sta_loc, len(nsl_df))

('1H', 'MA011') 15
('1H', 'MB012') 10
('1H', 'MB013') 5
('1H', 'MB014') 10
('1H', 'MB015') 10
('1H', 'MC010') 15
('1H', 'MC011') 5
('1H', 'MC012') 20
('1H', 'MC013') 10
('1H', 'MC014') 5
('1H', 'MC015') 15
('1H', 'MC016') 5
('1H', 'MD010') 5
('1H', 'MD011') 10
('1H', 'MD012') 10
('1H', 'MD013') 5
('1H', 'ME010') 10
('1H', 'ME011') 5
('1H', 'ME012') 5
('1H', 'MF010') 10
('1H', 'MF011') 10
('1H', 'MF012') 5
('1H', 'MF013') 10
('1H', 'MF014') 120
('1H', 'MG010') 10
('1H', 'MG011') 20
('1H', 'MG012') 5
('1H', 'MG013') 10
('1H', 'MG014') 5
('1H', 'MH010') 5
('1H', 'MH011') 20
('1H', 'MH013') 25
('1H', 'MH014') 10
('1H', 'MH015') 15
('1H', 'MI010') 5
('1H', 'MI011') 5
('1H', 'MI012') 25
('1H', 'MI013') 10
('1H', 'MJ010') 10
('1H', 'MJ011') 5
('1H', 'MJ012') 5
('1H', 'MJ013') 10
('1H', 'MJ014') 15
('1H', 'MJ015') 10
('1H', 'MK010') 10
('1H', 'MK011') 10
('1H', 'MK012') 10
('1H', 'ML010') 10
('1H', 'ML011') 5
('1H', 'ML012') 10
('1H', 'ML014') 10
('1H', 'ML015') 15
('1H', 'MM011') 10
('1H', 'M

In [24]:
i_not_weird = 0
not_weird_networks = []
for net_sta_loc, nsl_df in net_sta_loc_grouper:
    network = net_sta_loc[0]
    n_ch = len(nsl_df)
    if n_ch == 5:  # this is "normal"
        i_not_weird+=1
        if network not in not_weird_networks:
            not_weird_networks.append(network)
            not_weird_networks = list(set(not_weird_networks))
        
        print(net_sta_loc, len(nsl_df))
print(i_not_weird)
print(not_weird_networks)

('1H', 'MB013', '--') 5
('1H', 'MC011', '--') 5
('1H', 'MC014', '--') 5
('1H', 'MC016', '--') 5
('1H', 'MD010', '--') 5
('1H', 'MD013', '--') 5
('1H', 'ME011', '--') 5
('1H', 'ME012', '--') 5
('1H', 'MF012', '--') 5
('1H', 'MG012', '--') 5
('1H', 'MG014', '--') 5
('1H', 'MH010', '--') 5
('1H', 'MI010', '--') 5
('1H', 'MI011', '--') 5
('1H', 'MJ011', '--') 5
('1H', 'MJ012', '--') 5
('1H', 'ML011', '--') 5
('1H', 'MM012', '--') 5
('1H', 'MM013', '--') 5
('1H', 'MM015', '--') 5
('1H', 'MN011', '--') 5
('1H', 'MN014', '--') 5
('1H', 'MP012', '--') 5
('1H', 'MQ014', '--') 5
('4P', 'CAM02', '--') 5
('4P', 'CAM06', '--') 5
('4P', 'CAN06', '--') 5
('4P', 'GAA53', '--') 5
('4P', 'MD060', '--') 5
('4P', 'MTD19', '--') 5
('4P', 'NDB32', '--') 5
('4P', 'NFD30', '--') 5
('4P', 'NVM08', '--') 5
('4P', 'NVM09', '--') 5
('4P', 'NVM10', '--') 5
('4P', 'NVN08', '--') 5
('4P', 'NVN10', '--') 5
('4P', 'NVN12', '--') 5
('4P', 'NVO07', '--') 5
('4P', 'NVO10', '--') 5
('4P', 'NVP09', '--') 5
('4P', 'NVP10', 