In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import xarray as xr

In [None]:
# - Set data path
# Root directory of the input data. The trailing slash is required because
# paths are built from it by plain string concatenation
# (e.g. dpath + 'JASADCP/ncfiles').
dpath = '/opt/acoustic-variability/data/'

In [None]:
# - Set saved out figure path
# NOTE(review): not referenced in this part of the notebook; presumably used
# as a string prefix when figures are saved later -- confirm before changing
# the trailing slash.
figpath = '/opt/acoustic-variability/python/figures/'

# Compile JASADCP metadata

## Read JASADCP metadata into df

In [None]:
# List every entry of the JASADCP netCDF directory in sorted order so that the
# per-file metadata lists built below line up with a stable file order.
ncdir = dpath + 'JASADCP/ncfiles'
fnames = os.listdir(ncdir)
fnames.sort()

In [None]:
# Pre-allocate one slot per netCDF file for every metadata field; a slot stays
# None when the corresponding field cannot be found in that file.
nc_counter = len(fnames)
(hardware_model,
 serial_numbers,
 transmit_frequency,
 phased_array,
 cruise_beg_date,
 blanking_interval,
 bin_length,
 transducer_beam_angle,
 transmit_pulse_length,
 comments,
 biomass_dtmn,
 geo_region) = ([None]*nc_counter for _ in range(12))  # 12 independent lists

In [None]:
# Parse the free-text 'cruise_sonar_summary' attribute of every JASADCP netCDF
# file and store each extracted field at that file's index in the
# pre-allocated metadata lists. A slot is set to None when a field is absent.

def _summary_field(label, text):
    """Return all values recorded for ``label`` in a cruise sonar summary.

    A field looks like ``LABEL : value`` where the value is one or more
    single-space-separated words; matching stops at the first double space or
    line break.

    Parameters
    ----------
    label : str
        Field label exactly as written in the summary (e.g. 'BIN LENGTH').
    text : str
        Full summary text to search.

    Returns
    -------
    list of str
        One entry per occurrence of the field; empty if it never occurs.
    """
    # Raw string: '\S' must reach the regex engine untouched (in a normal
    # string literal it is an invalid escape sequence).
    return re.findall(label + r" *: *((?:\S+ )*\S+)", text)


def _first_or_none(matches):
    """Return the first element of ``matches``, or None if it is empty."""
    return matches[0] if matches else None


def _joined_or_none(matches, sep='///'):
    """Join all ``matches`` with ``sep``, or return None if there are none."""
    return sep.join(matches) if matches else None


for ifile, fname in enumerate(fnames):
    ncfile = dpath + 'JASADCP/ncfiles/' + fname
    # The context manager closes the file once the needed values are read, so
    # handles do not accumulate across the whole file list.
    with xr.open_dataset(ncfile) as ncnow:
        strnow = ncnow.attrs['cruise_sonar_summary']
        # 5.) cruise_beg_date (first time stamp; read before the file closes)
        cruise_beg_date[ifile] = ncnow['time'][0].values
    # 1.) hardware_model (fall back to MANUFACTURER when no model is given)
    hardware_modelnow = _summary_field("HARDWARE MODEL", strnow)
    if not hardware_modelnow:
        hardware_modelnow = _summary_field("MANUFACTURER", strnow)
    hardware_model[ifile] = _first_or_none(hardware_modelnow)
    # 2.) serial_numbers
    serial_numbers[ifile] = _first_or_none(_summary_field("SERIAL NUMBERS", strnow))
    # 3.) transmit_frequency
    transmit_frequency[ifile] = _first_or_none(_summary_field("TRANSMIT FREQUENCY", strnow))
    # 4.) phased_array (every mention anywhere in the summary, '///'-joined)
    phased_array[ifile] = _joined_or_none(re.findall("phased.array", strnow, re.IGNORECASE))
    # 6.) blanking_interval
    blanking_interval[ifile] = _first_or_none(_summary_field("BLANKING INTERVAL", strnow))
    # 7.) bin_length
    bin_length[ifile] = _first_or_none(_summary_field("BIN LENGTH", strnow))
    # 8.) transducer_beam_angle
    transducer_beam_angle[ifile] = _first_or_none(_summary_field("TRANSDUCER BEAM ANGLE", strnow))
    # 9.) transmit_pulse_length
    transmit_pulse_length[ifile] = _first_or_none(_summary_field("TRANSMIT PULSE LENGTH", strnow))
    # 10.) comments (all COMMENTS entries, '///'-joined)
    comments[ifile] = _joined_or_none(_summary_field("COMMENTS", strnow))
    # 11.) biomass_dtmn
    biomass_dtmn[ifile] = _first_or_none(_summary_field("BIOMASS DETERMINATION", strnow))
    # 12.) geo_region
    geo_region[ifile] = _first_or_none(_summary_field("GEOGRAPHIC_REGION", strnow))

In [None]:
# Assemble the per-file metadata lists into one DataFrame: each list becomes a
# named column, and rows follow the order of fnames.
named_columns = [
    ('fname', fnames),
    ('hardware_model', hardware_model),
    ('serial_numbers', serial_numbers),
    ('transmit_frequency', transmit_frequency),
    ('phased_array', phased_array),
    ('cruise_beg_date', cruise_beg_date),
    ('blanking_interval', blanking_interval),
    ('bin_length', bin_length),
    ('transducer_beam_angle', transducer_beam_angle),
    ('transmit_pulse_length', transmit_pulse_length),
    ('comments', comments),
    ('biomass_dtmn', biomass_dtmn),
    ('geo_region', geo_region),
]
df = pd.concat(
    [pd.Series(values, name=colname) for colname, values in named_columns],
    axis=1)

In [None]:
# - Define fxns to describe bandwidth from comments, hardware_model, and transmit_frequency 

# --> Checks for hardware_model names containing NB = narrowband
#dfnow = df.dropna(subset=['hardware_model'])
#dfnow[dfnow['hardware_model'].str.contains('NB')]['hardware_model'].unique()
# --> Results are:
# array(['NB 150 (VM-150-18HP)', 'NB 150', 'VM-150 (NB)'], dtype=object)

def set_bandwidth_from_comments(row):
    """Classify the bandwidth mentioned in a row's 'comments' text.

    The search is case-insensitive. "broadband" (not just "broad") is used
    because some cruise comments say "broad-scale"; "narro" also catches the
    misspelling "narroband".

    Parameters
    ----------
    row : mapping with a 'comments' entry (str or None)

    Returns
    -------
    'broadband', 'narrowband', 'both broad and narrowband?' when both terms
    occur, or None when the comments are empty/missing or mention neither.
    """
    text = row['comments']
    if not text:
        return None
    mentions_broad = re.search('broadband', text, re.IGNORECASE) is not None
    mentions_narrow = re.search('narro', text, re.IGNORECASE) is not None
    if mentions_broad and mentions_narrow:
        return 'both broad and narrowband?'
    if mentions_broad:
        return 'broadband'
    if mentions_narrow:
        return 'narrowband'
    return None

def set_bandwidth_from_hardware_model(row):
    """Classify the bandwidth implied by a row's 'hardware_model' text.

    Matching is case-insensitive. "broad" is checked first and wins; otherwise
    "narro" or the abbreviation "nb" marks the model as narrowband.

    Parameters
    ----------
    row : mapping with a 'hardware_model' entry (str or None)

    Returns
    -------
    'broadband', 'narrowband', or None when the model is empty/missing or
    contains none of the keywords.
    """
    model = row['hardware_model']
    if not model:
        return None
    if re.search('broad', model, re.IGNORECASE):
        return 'broadband'
    narrow_keywords = ('narro', 'nb')
    if any(re.search(keyword, model, re.IGNORECASE) for keyword in narrow_keywords):
        return 'narrowband'
    return None

def set_bandwidth_from_transmit_frequency(row):
    """Classify the bandwidth mentioned in a row's 'transmit_frequency' text.

    Matching is case-insensitive; "broadband" is checked before "narro".

    Parameters
    ----------
    row : mapping with a 'transmit_frequency' entry (str or None)

    Returns
    -------
    'broadband', 'narrowband', or None when the text is empty/missing or
    contains neither keyword.
    """
    freq_text = row['transmit_frequency']
    if not freq_text:
        return None
    # (search keyword, returned label), in priority order
    for keyword, label in (('broadband', 'broadband'), ('narro', 'narrowband')):
        if re.search(keyword, freq_text, re.IGNORECASE):
            return label
    return None
    
def set_final_bandwidth(row):
    """Combine the three per-source bandwidth guesses into one final value.

    Priority rules:
    * hardware_model and transmit_frequency have equal priority: when both are
      set they must agree, otherwise the row is flagged 'CHECK NC FILE'.
    * Either of them overrides the guess taken from the comments.
    * The comments guess is used only when it is the sole source available.

    Parameters
    ----------
    row : mapping with 'bw_from_comments', 'bw_from_hardware_model' and
        'bw_from_transmit_frequency' entries (each a str or None)

    Returns
    -------
    The chosen bandwidth string, 'CHECK NC FILE' on a conflict, or None when
    no source provides a value.
    """
    from_comments = row['bw_from_comments']
    from_model = row['bw_from_hardware_model']
    from_frequency = row['bw_from_transmit_frequency']

    if from_model is not None and from_frequency is not None:
        # Both equal-priority sources are present: require agreement.
        return from_frequency if from_model == from_frequency else 'CHECK NC FILE'
    if from_frequency is not None:
        return from_frequency
    if from_model is not None:
        return from_model
    # Only the comments guess (possibly None) is left.
    return from_comments

In [None]:
# Derive one bandwidth guess per source, then combine them. Order matters:
# 'bandwidth' reads the three bw_from_* columns created just before it.
bandwidth_steps = (
    ('bw_from_comments', set_bandwidth_from_comments),
    ('bw_from_hardware_model', set_bandwidth_from_hardware_model),
    ('bw_from_transmit_frequency', set_bandwidth_from_transmit_frequency),
    ('bandwidth', set_final_bandwidth),
)
for new_col, row_fxn in bandwidth_steps:
    df[new_col] = df.apply(row_fxn, axis=1)
# - Testing
#pd.set_option('max_rows', 500)
#pd.set_option('display.max_colwidth', -1)
#df[df['bw_from_comments']=='narrowband'][['hardware_model','transmit_frequency','bw_from_comments','bw_from_hardware_model','bw_from_transmit_frequency','bandwidth','comments']]
# also looked at: ['bw_from_comments']=='broadband', ['bw_from_hardware_model']=='narrowband',
# ['bw_from_hardware_model']=='broadband', ['bw_from_transmit_frequency']=='narrowband',
# ['bw_from_hardware_model']=='broadband', ['bandwidth']=='CHECK NC FILE'
# --> I CHECKED ALL PRINTED OUT ROWS AND ALL GOT THE RIGHT ANSWER IN THE 
# FINAL COLUMN NAMED 'bandwidth'; ALSO no files triggered the 'CHECK NC FILE'
# designation in the 'bandwidth' column, yay!

In [None]:
#pd.set_option('max_rows', 1000)
#pd.set_option('display.max_colwidth', -1)
#df.head()

## Uniformly rename hardware_model names

In [None]:
# - Save out all unique hardware_model names + # of occurrences to look at in separate window
# --> use this output to create the legend below of how to translate different instrument names to a uniform list of names
#df['hardware_model'].value_counts().to_csv('jasadcp_unique_instruments.csv', header=['hardware_model count'])

#### **List of uniform instrument names:**

**All names to become 'OS-38'**:\
'Ocean Surveyer 38', 'Ocean Surveyor 38'

**All names to become 'OS-75'**:\
'Ocean Surveyor 75', 'OS75 narrowband', 'Ocean Surveyer 75', 'Ocean Surveyor 75 narrowband', 'OS75 (Ocean Surveyor)', 'Ocean Surveyor 75 broadband', 'Ocean Surveyor OS75', 'OS75', 'Ocean Surveryor 75', 'Ocean Surveyor 75 narroband', 'Ocean Surveyor 75 Broadband', 'Ocean Surveyor 75 Narrowband', '75KHz Ocean Surveyor narrowband', 'Ocean Surveyor 75 kHz', 'RDI 75KHz Ocean Surveyor', 'Ocean Surveyor 75 kHz Phased Array', '75KHz Ocean Surveyor'

**All names to become 'OS-150'**:\
'Ocean Surveyer 150', 'Ocean Surveryor 150', 'Ocean Surveyor 150 narroband', 'Ocean Surveyor 150 narrowband', 'Ocean Surveyor 150 broadband'

**All names to become 'OS-II-38'**:\
'Ocean Surveyor II (OS-II 38)'

**All names to become 'OS-II-75'**:\
'OSII75S phased-array'

**All names to become 'VM-75'**:\
'VM75 narrowband'

**All names to become 'VM-150'**:\
'VM-150', 'RD-VM150', 'VM-150 Narrowband', 'RD-VM150 Narrow band', 'VM-150 (NB)', 'VM-150 narrowband', 'RDI VM150 narrowband', 'VM150', 'RD-VM150 narrowband', 'RD-VM0150'

**All names to become 'VM-300'**:\
'VM-300', 'RD-VM300'

**All names to become 'VM-150-18HP'**:\
'NB 150 (VM-150-18HP)'

**All names to become 'Workhorse-300'**:\
'WorkHorse 300', 'Workhorse 300', 'Workhorse 300; 300 kHz' 

**All names to become 'Workhorse-1200'**:\
'Workhorse 1200'

**All names to become 'Workhorse-Mariner-300'**:\
'Workhorse Mariner (300 kHz)'

**All names to become 'Workhorse-Mariner-600'**:\
'WorkHorse Mariner 600'

**All names to become 'DCP4400A'**:\
'DCP4400A'

**All names to become 'UNCLEAR'**:\
'Narrowband 150', 'NB 150', '150', 'narrowband 75 kHz', 'RDI', '150 narrowband', '150 kHz Narrowband', 'Broadband 150', 'Broad Band 150', '150 kHz hull mounted ADCP', 'Narrowband', 'Narrowband 300', '150 kHz', 'Direct-Read 150 kHz Narrowband' (HAS ALL SERIAL NUMBERS), 'Vessel-mounted 150 kHz Narrowband' (HAS ALL SERIAL NUMBERS), 'Narrow Band 150Khz', 'Vessel-mount 150 kHz Narrowband', '1) Narrow Band 150 kHz', '153.6 kHz hull mounted ADCP', '300 narrow band', '150 kHz broadband', '150 kHz narrow band', '"150 broad band, concave"', 'Vessel-Mount 150 kHz Narrowband' (HAS ALL SERIAL NUMBERS)

**Special cases**:\
1.) 'RD-VM' (HAS ALL SERIAL NUMBERS, READ TRANSMIT FREQUENCY, TOO - IF 150 THEN VM-150, ETC.)  
2.) 'VM-150 and VM-300' (CHECKED NC FILES - CAN'T DETERMINE WHETHER IT'S 150 OR 300 --> DISCARD)

In [None]:
# - Define fxn to uniformly rename different original hardware_model names

# Uniform instrument name -> every original hardware_model spelling (including
# the typos found in the files) that should be renamed to it.
_UNIFORM_NAME_GROUPS = {
    'DCP4400A': ['DCP4400A'],
    'OS-38': ['Ocean Surveyer 38', 'Ocean Surveyor 38'],
    'OS-75': ['Ocean Surveyor 75', 'OS75 narrowband',
              'Ocean Surveyer 75', 'Ocean Surveyor 75 narrowband',
              'OS75 (Ocean Surveyor)', 'Ocean Surveyor 75 broadband',
              'Ocean Surveyor OS75', 'OS75', 'Ocean Surveryor 75',
              'Ocean Surveyor 75 narroband', 'Ocean Surveyor 75 Broadband',
              'Ocean Surveyor 75 Narrowband', '75KHz Ocean Surveyor narrowband',
              'Ocean Surveyor 75 kHz', 'RDI 75KHz Ocean Surveyor',
              'Ocean Surveyor 75 kHz Phased Array', '75KHz Ocean Surveyor'],
    'OS-150': ['Ocean Surveyer 150', 'Ocean Surveryor 150',
               'Ocean Surveyor 150 narroband', 'Ocean Surveyor 150 narrowband',
               'Ocean Surveyor 150 broadband'],
    'OS-II-38': ['Ocean Surveyor II (OS-II 38)'],
    'OS-II-75': ['OSII75S phased-array'],
    'VM-75': ['VM75 narrowband'],
    'VM-150': ['VM-150', 'RD-VM150', 'VM-150 Narrowband', 'RD-VM150 Narrow band',
               'VM-150 (NB)', 'VM-150 narrowband', 'RDI VM150 narrowband',
               'VM150', 'RD-VM150 narrowband', 'RD-VM0150'],
    'VM-300': ['VM-300', 'RD-VM300'],
    'VM-150-18HP': ['NB 150 (VM-150-18HP)'],
    'Workhorse-300': ['WorkHorse 300', 'Workhorse 300', 'Workhorse 300; 300 kHz'],
    'Workhorse-1200': ['Workhorse 1200'],
    'Workhorse-Mariner-300': ['Workhorse Mariner (300 kHz)'],
    'Workhorse-Mariner-600': ['WorkHorse Mariner 600'],
}

# Inverted view used for the actual lookup: original spelling -> uniform name.
_UNIFORM_NAME_BY_ORIG = {
    orig_name: uniform_name
    for uniform_name, orig_names in _UNIFORM_NAME_GROUPS.items()
    for orig_name in orig_names
}

# 'RD-VM' does not say which VM model it is; the transmit frequency decides.
# Checked in this order; the first frequency found in the text wins.
_RD_VM_NAME_BY_FREQUENCY = (('150', 'VM-150'), ('300', 'VM-300'), ('75', 'VM-75'))


def set_uniform_name_orig_hardware_model(row):
    """Translate a row's original hardware_model into a uniform instrument name.

    Parameters
    ----------
    row : mapping with 'hardware_model' and 'transmit_frequency' entries
        (each a str or None). 'transmit_frequency' is only read for the
        special case hardware_model == 'RD-VM'.

    Returns
    -------
    str or None
        * None when hardware_model is empty/missing.
        * The uniform name when the original spelling is a known one.
        * For 'RD-VM': 'VM-150', 'VM-300' or 'VM-75' according to the
          transmit frequency, or 'ZUNCLEAR: RD-VM' when the frequency is
          missing or names none of these.
        * 'ZUNCLEAR: ' + hardware_model for every other spelling (the leading
          'Z' is kept exactly as in the original labels).
    """
    orig_name = row['hardware_model']
    if not orig_name:
        return None
    if orig_name == 'RD-VM':
        freq_text = row['transmit_frequency']
        # Guard against a missing frequency (re.search would raise on None).
        if freq_text:
            for freq, uniform_name in _RD_VM_NAME_BY_FREQUENCY:
                if re.search(freq, freq_text):
                    return uniform_name
        # Flag explicitly rather than falling through to an implicit None.
        return 'ZUNCLEAR: ' + orig_name
    return _UNIFORM_NAME_BY_ORIG.get(orig_name, 'ZUNCLEAR: ' + orig_name)

In [None]:
# - Create df w/ uniform instrument names
df['instrument_name'] = df.apply(set_uniform_name_orig_hardware_model, axis=1)
# Move instrument_name to the 3rd position (right after hardware_model).
# The order is built from all *other* columns, so re-running this cell neither
# duplicates instrument_name nor drops the last column.
cols = [col for col in df.columns if col != 'instrument_name']
cols.insert(2, 'instrument_name')
df = df[cols]

In [None]:
# - Keep only rows with a usable instrument_name: drop rows where it is NULL
# (there's only 1 and it has no other info) and rows labelled
# 'ZUNCLEAR: VM-150 and VM-300'
has_instrument_name = df['instrument_name'].notnull()
is_vm150_and_vm300 = df['instrument_name'] == 'ZUNCLEAR: VM-150 and VM-300'
df = df[has_instrument_name & ~is_vm150_and_vm300]

In [None]:
# - Get rid of columns not needed anymore
# Reassign instead of mutating in place: df is a filtered frame at this point,
# and errors='ignore' lets the cell be re-run after the columns are gone.
df = df.drop(columns=['bw_from_comments', 'bw_from_hardware_model',
                      'bw_from_transmit_frequency'], errors='ignore')