In [3]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import xarray as xr

In [4]:
# - Set data path
# Root directory of the input data. The trailing slash matters: sub-paths are
# appended by plain string concatenation (e.g. dpath + 'JASADCP/ncfiles').
dpath = '/opt/acoustic-variability/data/'

In [5]:
# - Set saved out figure path
# Directory for saved figures. No use of figpath is visible in this part of
# the notebook -- presumably it is concatenated with a file name like dpath is.
figpath = '/opt/acoustic-variability/python/figures/'

# Compile JASADCP metadata

## Read JASADCP metadata into df

In [6]:
# Names of everything in the JASADCP netCDF directory, in sorted order
fnames = os.listdir(dpath + 'JASADCP/ncfiles')
fnames.sort()

In [7]:
# One slot per netCDF file; every metadata column starts out as all-None and
# is filled in (by file position) while the files are read.
nc_counter = len(fnames)

# Instrument identification
hardware_model = [None for _ in range(nc_counter)]
serial_numbers = [None for _ in range(nc_counter)]
transmit_frequency = [None for _ in range(nc_counter)]
phased_array = [None for _ in range(nc_counter)]
# Cruise timing and sampling setup
cruise_beg_date = [None for _ in range(nc_counter)]
blanking_interval = [None for _ in range(nc_counter)]
bin_length = [None for _ in range(nc_counter)]
transducer_beam_angle = [None for _ in range(nc_counter)]
transmit_pulse_length = [None for _ in range(nc_counter)]
# Free-text and cruise description fields
comments = [None for _ in range(nc_counter)]
biomass_dtmn = [None for _ in range(nc_counter)]
geo_region = [None for _ in range(nc_counter)]
chief_scientist = [None for _ in range(nc_counter)]
personnel_charge = [None for _ in range(nc_counter)]
manufacturer = [None for _ in range(nc_counter)]
transducer_config = [None for _ in range(nc_counter)]
depth_range = [None for _ in range(nc_counter)]
num_bins = [None for _ in range(nc_counter)]
ens_avg_intvl = [None for _ in range(nc_counter)]
ss_calc = [None for _ in range(nc_counter)]
project = [None for _ in range(nc_counter)]
# Keyword hits for specific instrument tags
nb150 = [None for _ in range(nc_counter)]
nb300 = [None for _ in range(nc_counter)]
bb150 = [None for _ in range(nc_counter)]
bb300 = [None for _ in range(nc_counter)]
# Remaining labelled fields and the full summary text
beam_width = [None for _ in range(nc_counter)]
cruise_name = [None for _ in range(nc_counter)]
platform_name = [None for _ in range(nc_counter)]
comment = [None for _ in range(nc_counter)]
all_text = [None for _ in range(nc_counter)]

In [8]:
# Regex tail shared by every "LABEL : value" entry of cruise_sonar_summary:
# optional spaces, a colon, optional spaces, then the value, captured as
# whitespace-free words separated by single spaces.
# (Raw string: "\S" in a normal string literal is an invalid escape sequence.)
_SUMMARY_VALUE = r" *: *((?:\S+ )*\S+)"


def _first_field(label, text):
    """Return the value of the first 'LABEL : value' entry in text, or None if absent."""
    found = re.findall(label + _SUMMARY_VALUE, text)
    return found[0] if found else None


def _joined_fields(label, text, sep='///'):
    """Return the values of all 'LABEL : value' entries in text joined by sep, or None."""
    found = re.findall(label + _SUMMARY_VALUE, text)
    return sep.join(found) if found else None


def _joined_keyword_hits(pattern, text, sep='///'):
    """Return every case-insensitive match of pattern in text joined by sep, or None."""
    found = re.findall(pattern, text, re.IGNORECASE)
    return sep.join(found) if found else None


# Fill the pre-allocated metadata lists, one slot per netCDF file.
for ifile, fname in enumerate(fnames):
    ncfile = dpath + 'JASADCP/ncfiles/' + fname
    # The context manager closes each dataset once its metadata has been read,
    # so file handles do not pile up over the whole directory.
    with xr.open_dataset(ncfile) as ncnow:
        strnow = ncnow.attrs['cruise_sonar_summary']
        # cruise_beg_date = first time stamp in the file (read while the file is open)
        cruise_beg_date[ifile] = ncnow['time'][0].values

    # --- Instrument identification
    manufacturer[ifile] = _first_field("MANUFACTURER", strnow)
    hardware_model[ifile] = _first_field("HARDWARE MODEL", strnow)
    if hardware_model[ifile] is None:
        # no HARDWARE MODEL entry --> fall back on the MANUFACTURER entry
        hardware_model[ifile] = manufacturer[ifile]
    serial_numbers[ifile] = _first_field("SERIAL NUMBERS", strnow)
    transmit_frequency[ifile] = _first_field("TRANSMIT FREQUENCY", strnow)
    phased_array[ifile] = _joined_keyword_hits("phased.array", strnow)
    transducer_config[ifile] = _first_field("TRANSDUCER CONFIGURATION", strnow)
    transducer_beam_angle[ifile] = _first_field("TRANSDUCER BEAM ANGLE", strnow)
    beam_width[ifile] = _first_field("ACOUSTIC BEAM WIDTH", strnow)

    # --- Sampling setup
    blanking_interval[ifile] = _first_field("BLANKING INTERVAL", strnow)
    bin_length[ifile] = _first_field("BIN LENGTH", strnow)
    transmit_pulse_length[ifile] = _first_field("TRANSMIT PULSE LENGTH", strnow)
    depth_range[ifile] = _first_field("DEPTH RANGE", strnow)
    num_bins[ifile] = _first_field("NUMBER OF BINS", strnow)
    ens_avg_intvl[ifile] = _first_field("ENSEMBLE AVERAGING INTERVAL", strnow)
    ss_calc[ifile] = _first_field("SOUND SPEED CALCULATION", strnow)

    # --- Cruise description
    biomass_dtmn[ifile] = _first_field("BIOMASS DETERMINATION", strnow)
    geo_region[ifile] = _first_field("GEOGRAPHIC_REGION", strnow)
    chief_scientist[ifile] = _first_field("CHIEF SCIENTIST ON SHIP", strnow)
    personnel_charge[ifile] = _first_field("PERSONNEL IN CHARGE", strnow)
    project[ifile] = _first_field("PROJECT", strnow)
    cruise_name[ifile] = _first_field("CRUISE_NAME", strnow)
    platform_name[ifile] = _first_field("PLATFORM_NAME", strnow)

    # --- Free text: every COMMENTS / COMMENT entry is kept, joined by '///'
    comments[ifile] = _joined_fields("COMMENTS", strnow)
    comment[ifile] = _joined_fields("COMMENT", strnow)

    # --- Instrument keywords found anywhere in the summary (case-insensitive)
    nb150[ifile] = _joined_keyword_hits("nb150", strnow)
    nb300[ifile] = _joined_keyword_hits("nb300", strnow)
    bb150[ifile] = _joined_keyword_hits("bb150", strnow)
    bb300[ifile] = _joined_keyword_hits("bb300", strnow)

    # --- Full summary text, kept for later keyword searches
    all_text[ifile] = strnow

In [9]:
# (column name, per-file values) pairs, in the column order of the final table
column_sources = [
    ('fname', fnames),
    ('hardware_model', hardware_model),
    ('serial_numbers', serial_numbers),
    ('transmit_frequency', transmit_frequency),
    ('phased_array', phased_array),
    ('cruise_beg_date', cruise_beg_date),
    ('blanking_interval', blanking_interval),
    ('bin_length', bin_length),
    ('transducer_beam_angle', transducer_beam_angle),
    ('transmit_pulse_length', transmit_pulse_length),
    ('comments', comments),
    ('biomass_dtmn', biomass_dtmn),
    ('geo_region', geo_region),
    ('manufacturer', manufacturer),
    ('transducer_config', transducer_config),
    ('depth_range', depth_range),
    ('num_bins', num_bins),
    ('ens_avg_intvl', ens_avg_intvl),
    ('ss_calc', ss_calc),
    ('project', project),
    ('beam_width', beam_width),
    ('cruise_name', cruise_name),
    ('platform_name', platform_name),
    ('comment', comment),
    ('all_text', all_text),
]
# One named Series per column, glued side by side into the metadata table
df = pd.concat(
    [pd.Series(values, name=label) for label, values in column_sources],
    axis=1)

In [10]:
# - Define fxns to describe bandwidth from all_text, comment, comments,
# cruise_beg_date, cruise_name, beam_width, hardware_model, and transmit_frequency

def set_bandwidth_from_all_text(row):
    """Classify bandwidth from keywords anywhere in row['all_text'].

    Returns 'narrowband' if any narrowband marker occurs (case-insensitive),
    otherwise 'broadband' if 'pingtype = bb' occurs, otherwise None.
    An empty or missing all_text gives None.
    """
    # "broadband" is deliberately not searched for as a bare word here:
    # some cruise comments say "broad-scale"
    text = row['all_text']
    if not text:
        return None
    narrow_markers = ('narrowband', 'narrow band', 'nb75', 'nb150', 'nb300',
                      'nb600', 'nb1200', 'pingtype = nb')
    if any(re.search(marker, text, re.IGNORECASE) for marker in narrow_markers):
        return 'narrowband'
    if re.search('pingtype = bb', text, re.IGNORECASE):
        return 'broadband'
    return None

def set_bandwidth_from_comment(row):
    """Return 'narrowband' if row['comment'] mentions it (case-insensitive), else None."""
    note = row['comment']
    if note and re.search('narrowband', note, re.IGNORECASE):
        return 'narrowband'
    return None
    
def set_bandwidth_from_comments(row):
    """Classify bandwidth from row['comments'] (case-insensitive).

    Returns 'both broad and narrowband?' when both kinds of keyword occur,
    'broadband' or 'narrowband' when only one does, and None otherwise
    (including when comments is empty or missing).
    """
    # "broadband" rather than "broad": some cruise comments say "broad-scale"
    notes = row['comments']
    if not notes:
        return None
    mentions_broad = re.search('broadband', notes, re.IGNORECASE) is not None
    mentions_narrow = re.search('narro', notes, re.IGNORECASE) is not None
    if mentions_broad and mentions_narrow:
        return 'both broad and narrowband?'
    if mentions_broad:
        return 'broadband'
    if mentions_narrow:
        return 'narrowband'
    return None

def set_bandwidth_from_cruise_beg_date(row):
    """Return 'narrowband' for cruises that began before 1991-12-01, else None."""
    start = row['cruise_beg_date']
    if start and start < pd.Timestamp(1991,12,1):
        return 'narrowband'
    return None
    
def set_bandwidth_from_cruise_name(row):
    """Classify bandwidth from instrument tags embedded in row['cruise_name'].

    Broadband tags are checked first, then narrowband tags; matching is
    case-insensitive. Returns 'broadband', 'narrowband' or None.
    """
    name = row['cruise_name']
    if not name:
        return None
    broad_tags = ('os38bb', 'os75bb', 'os150bb')
    narrow_tags = ('os38nb', 'os75nb', 'os150nb', 'nb150', 'nb300')
    if any(re.search(tag, name, re.IGNORECASE) for tag in broad_tags):
        return 'broadband'
    if any(re.search(tag, name, re.IGNORECASE) for tag in narrow_tags):
        return 'narrowband'
    return None
    
def set_bandwidth_from_beam_width(row):
    """Return 'narrowband' if row['beam_width'] contains 'narrow band' (any case), else None."""
    width_text = row['beam_width']
    if width_text and re.search('narrow band', width_text, re.IGNORECASE):
        return 'narrowband'
    return None
    
def set_bandwidth_from_hardware_model(row):
    """Classify bandwidth from row['hardware_model'] (case-insensitive).

    'broad' anywhere gives 'broadband'; otherwise 'narro' or 'nb' anywhere
    gives 'narrowband'; otherwise (or when the model is missing) None.
    """
    model = row['hardware_model']
    if not model:
        return None
    if re.search('broad', model, re.IGNORECASE):
        return 'broadband'
    if any(re.search(key, model, re.IGNORECASE) for key in ('narro', 'nb')):
        return 'narrowband'
    return None

def set_bandwidth_from_transmit_frequency(row):
    """Classify bandwidth from row['transmit_frequency'] (case-insensitive).

    'broadband' is checked before 'narro'; returns None when neither keyword
    occurs or the field is missing.
    """
    freq_text = row['transmit_frequency']
    if not freq_text:
        return None
    for keyword, verdict in (('broadband', 'broadband'), ('narro', 'narrowband')):
        if re.search(keyword, freq_text, re.IGNORECASE):
            return verdict
    return None
    
def set_final_bandwidth(row):
    """Return the first non-None per-criterion bandwidth, in priority order.

    Priority: transmit_frequency, hardware_model, beam_width, cruise_name,
    cruise_beg_date, comments, comment, all_text. Returns None when every
    criterion is None.
    """
    priority_columns = ('bw_from_transmit_frequency', 'bw_from_hardware_model',
                        'bw_from_beam_width', 'bw_from_cruise_name',
                        'bw_from_cruise_beg_date', 'bw_from_comments',
                        'bw_from_comment', 'bw_from_all_text')
    # read every criterion up front, then take the highest-priority verdict
    candidates = [row[column] for column in priority_columns]
    for verdict in candidates:
        if verdict is not None:
            return verdict
    return None

In [11]:
# One bw_from_* column per criterion (added in this order), then the combined verdict
bandwidth_classifiers = [
    ('bw_from_transmit_frequency', set_bandwidth_from_transmit_frequency),
    ('bw_from_hardware_model', set_bandwidth_from_hardware_model),
    ('bw_from_beam_width', set_bandwidth_from_beam_width),
    ('bw_from_cruise_name', set_bandwidth_from_cruise_name),
    ('bw_from_cruise_beg_date', set_bandwidth_from_cruise_beg_date),
    ('bw_from_comments', set_bandwidth_from_comments),
    ('bw_from_comment', set_bandwidth_from_comment),
    ('bw_from_all_text', set_bandwidth_from_all_text),
]
for bw_column, classifier in bandwidth_classifiers:
    df[bw_column] = df.apply(classifier, axis=1)
# set_final_bandwidth reads all eight bw_from_* columns, so it must run last
df['bandwidth'] = df.apply(set_final_bandwidth, axis=1)

## Uniformly rename hardware_model names

In [12]:
# - Save out all unique hardware_model names + # of occurrences to look at in separate window
# --> use this output to create a legend of how to translate different instrument names to a uniform list of names
#df['hardware_model'].value_counts().to_csv('jasadcp_unique_instruments.csv', header=['hardware_model count'])
# --> see hardware_model_renaming_instructions.ipynb for this legend,
# as well as the code cell/fxn immediately below

In [13]:
# - Define fxn to uniformly rename different original hardware_model names
def set_uniform_name_orig_hardware_model(row):
    """Translate a raw hardware_model string into a uniform instrument name.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row)
        'hardware_model' is always read. Depending on the branch, 'fname',
        'transmit_frequency', 'platform_name', 'transducer_config',
        'cruise_beg_date' and 'bandwidth' are read as well.

    Returns
    -------
    (flag_instrument_name, instrument_name) : tuple
        Always a 2-tuple, so the caller can split the results with zip(*...).
        flag_instrument_name is a note describing any assumption made (or None).
        instrument_name is the uniform name, 'ZUNCLEAR: <hardware_model>' for
        names that are not recognized, or None when a recognized name cannot
        be resolved from the other metadata.
    """
    model = row['hardware_model']

    if model is None:
        if row['fname']=='01897_short.nc':
            # they somehow just deleted the instrument name from the metadata
            return None, 'WH-300'
        return None, None

    if model in ['Ocean Surveyer 38', 'Ocean Surveyor 38']:
        return None, 'OS-38'
    elif model in ['Ocean Surveyor 75', 'OS75 narrowband',
                   'Ocean Surveyer 75', 'Ocean Surveyor 75 narrowband',
                   'OS75 (Ocean Surveyor)', 'Ocean Surveyor 75 broadband',
                   'Ocean Surveyor OS75', 'OS75', 'Ocean Surveryor 75',
                   'Ocean Surveyor 75 narroband', 'Ocean Surveyor 75 Broadband',
                   'Ocean Surveyor 75 Narrowband', '75KHz Ocean Surveyor narrowband',
                   'Ocean Surveyor 75 kHz', 'RDI 75KHz Ocean Surveyor',
                   'Ocean Surveyor 75 kHz Phased Array', '75KHz Ocean Surveyor']:
        return None, 'OS-75'
    elif model in ['Ocean Surveyer 150', 'Ocean Surveryor 150',
                   'Ocean Surveyor 150 narroband', 'Ocean Surveyor 150 narrowband',
                   'Ocean Surveyor 150 broadband']:
        return None, 'OS-150'
    elif model in ['Ocean Surveyor II (OS-II 38)']:
        return None, 'OS-II-38'
    elif model in ['OSII75S phased-array']:
        return None, 'OS-II-75'
    elif model in ['WorkHorse 300', 'Workhorse 300', 'Workhorse 300; 300 kHz']:
        return None, 'WH-300'
    elif model in ['Workhorse 1200']:
        return None, 'WH-1200'
    elif model in ['Workhorse Mariner (300 kHz)']:
        return None, 'WH-Mariner-300'
    elif model in ['WorkHorse Mariner 600']:
        return None, 'WH-Mariner-600'
    elif model=='DCP4400A':
        return None, 'DCP4400A'
    elif model in ['VM75 narrowband']:
        return None, 'NB-VM-75'
    elif model in ['narrowband 75 kHz']:
        flagnow = 'hardware_model = ' + model + ', assumed VM'
        return flagnow, 'NB-VM-75'
    elif model in ['Narrowband 150', 'NB 150', '150 narrowband',
                   '150 kHz Narrowband', 'Narrow Band 150Khz',
                   '1) Narrow Band 150 kHz', '150 kHz narrow band']:
        flagnow = 'hardware_model = ' + model + ', assumed VM'
        return flagnow, 'NB-VM-150'
    elif model in ['VM-150 Narrowband', 'RD-VM150 Narrow band',
                   'VM-150 (NB)', 'VM-150 narrowband', 'RDI VM150 narrowband',
                   'RD-VM150 narrowband']:
        return None, 'NB-VM-150'
    elif model in ['NB 150 (VM-150-18HP)']:
        flagnow = 'hardware_model = ' + model + \
            ', assumed same as NB-VM-150'
        # the flag used to be built and then dropped; report the assumption
        return flagnow, 'NB-VM-150'
    elif model in ['RD-VM']:
        # either field may be missing (None); treat that as "no match"
        freqnow = row['transmit_frequency'] or ''
        platformnow = row['platform_name'] or ''
        if '150' in freqnow and 'David Starr Jordan' in platformnow:
            # http://tryfan.ucsd.edu/calcofi/calcofi.htm --> NB ADCP on R/V David Starr Jordan!
            return None, 'NB-VM-150'
    elif model in ['Vessel-mounted 150 kHz Narrowband',
                   'Vessel-Mount 150 kHz Narrowband',
                   'Vessel-mount 150 kHz Narrowband']:
        flagnow = 'hardware_model = ' + model + \
            ', assumed VM but may be DR b/c of convex transducer_config' + \
            ' - see metadata in 00400-00408_short.nc'
        return flagnow, 'NB-VM-150'
    elif model in ['150']:
        if row['transducer_config']=='JANUS CONCAVE':
            platformnow = row['platform_name'] or ''
            if 'R/V Meteor' in platformnow:
                # The paper "Surveying the Upper Ocean with the Ocean Surveyor: A New
                # Phased Array Doppler Current Profiler" says R/V Meteor has NB ADCP
                return None, 'NB-VM-150'
            elif 'R/V Poseidon' in platformnow:
                flagnow = 'hardware_model = ' + model + \
                    ', transducer_config = JANUS CONCAVE --> VM, assumed NB'
                return flagnow, 'NB-VM-150'
    elif model in ['RDI']:
        if (row['transmit_frequency']=='153 KHz'
                and row['transducer_config']=='JANUS CONCAVE'
                and row['cruise_beg_date']<pd.Timestamp(1991,12,1)):
            return None, 'NB-VM-150'
        else:
            return None, None
    elif model in ['150 kHz hull mounted ADCP']:
        if row['cruise_beg_date']<pd.Timestamp(1991,12,1):
            # 4 files, all same ship 1991-04 - 1991-09
            return None, 'NB-VM-150'
    elif (model in ['Narrowband']
          and row['transmit_frequency']=='153.6 kHz'
          and row['transducer_config']=='JANUS CONCAVE'):
        # a plain 'Narrowband' that fails these checks ends up in the ZUNCLEAR branch
        return None, 'NB-VM-150'
    elif model in ['150 kHz']:
        flagnow = 'hardware_model = ' + model + ', assumed VM and NB'
        return flagnow, 'NB-VM-150'
    elif model in ['153.6 kHz hull mounted ADCP']:
        flagnow = 'hardware_model = ' + model + ', assumed VM and NB'
        return flagnow, 'NB-VM-150'
    elif model in ['Narrowband 300', '300 narrow band']:
        flagnow = 'hardware_model = ' + model + ', assumed VM'
        # the flag used to be built and then dropped; report the assumption
        return flagnow, 'NB-VM-300'
    elif model in ['Direct-Read 150 kHz Narrowband']:
        return None, 'NB-DR-150'
    elif model in ['Broadband 150', 'Broad Band 150',
                   '150 kHz broadband', '150 broad band, concave']:
        flagnow = 'hardware_model = ' + model + ', assumed VM'
        # the flag used to be built and then dropped; report the assumption
        return flagnow, 'BB-VM-150'
    elif model in ['VM-150', 'RD-VM150', 'VM150', 'RD-VM0150']:
        if row['bandwidth'] is None:
            # assumes NB if no bandwidth
            flagnow = 'hardware_model = ' + model + ', assumed NB'
            return flagnow, 'NB-VM-150'
        elif (row['bandwidth']=='narrowband'
              or row['cruise_beg_date']<pd.Timestamp(1991,12,1)):
            return None, 'NB-VM-150'
        elif row['bandwidth']=='broadband':
            return None, 'BB-VM-150'
    elif model in ['VM-300', 'RD-VM300']:
        if row['bandwidth'] is None:
            # assumes NB if no bandwidth
            flagnow = 'hardware_model = ' + model + ', assumed NB'
            return flagnow, 'NB-VM-300'
        elif (row['bandwidth']=='narrowband'
              or row['cruise_beg_date']<pd.Timestamp(1991,12,1)):
            return None, 'NB-VM-300'
        elif row['bandwidth']=='broadband':
            return None, 'BB-VM-300'
    else:
        return None, 'ZUNCLEAR: ' + model

    # A recognized hardware_model whose extra checks did not settle the name
    # (e.g. 'RD-VM' on another ship, 'VM-150' with an ambiguous bandwidth).
    # Return an explicit pair, matching the 'RDI' branch, instead of a bare
    # None that would break the zip(*...) unpacking of the results.
    return None, None

In [14]:
# Each row yields a (flag, name) pair; zip(*...) splits the pairs into two columns
# (df.apply(..., result_type='expand') would be an alternative way to do this)
renamed_pairs = df.apply(set_uniform_name_orig_hardware_model, axis=1)
df['flag_instrument_name'], df['instrument_name'] = zip(*renamed_pairs)
# Move instrument_name from the last position to right after the first two
# columns; every other column keeps its relative order
cols = df.columns.tolist()
cols = cols[:2] + ['instrument_name'] + cols[2:-1]
df = df[cols]

## Cleanup

In [15]:
# - Get rid of NULL instrument_name rows (there are only 3)
# and 'ZUNCLEAR: VM-150 and VM-300' rows
keep_row = (df['instrument_name'].notnull()
            & (df['instrument_name'] != 'ZUNCLEAR: VM-150 and VM-300'))
# Explicit copy: later cells assign into df (df.loc[...] = ..., new columns,
# in-place drop), which should act on an independent frame rather than on a
# filtered slice (avoids SettingWithCopyWarning).
df = df[keep_row].copy()

## Manually correct bandwidth and make bandwidth assumptions based on instrument_name

In [16]:
%%script false --no-raise-error
# include the line above if you don't want to run this cell

# - How many files does each criteria additionally classify as NB or BB?
# Diagnostic only. As written, the mask selects rows where the seven
# higher-priority bw_from_* columns are all null and bw_from_all_text is not,
# i.e. files that only the all_text criterion classifies.
# NOTE(review): the trailing numbers look like per-criterion counts recorded by
# the author (presumably obtained by switching isnull/notnull one criterion at
# a time) -- confirm before relying on them.
dfnow = df[(df['bw_from_transmit_frequency'].isnull()) & # 253
           (df['bw_from_hardware_model'].isnull()) & # 464
           (df['bw_from_beam_width'].isnull()) & # 7
           (df['bw_from_cruise_name'].isnull()) & # 598
           (df['bw_from_cruise_beg_date'].isnull()) & # 56 
           (df['bw_from_comments'].isnull()) & # 56 
           (df['bw_from_comment'].isnull()) & # 0
           (df['bw_from_all_text'].notnull()) # 133
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
# Number of rows matching the mask above
print(len(dfnow))
#dfnow

In [17]:
%%script false --no-raise-error
# include the line above if you don't want to run this cell

# - How many files are still left unclassified as NB/BB that will need assumptions?
# --> 685
# - How many of those 685 are in the Pacific?
# --> 322 

# Rows for which none of the eight criteria produced a bandwidth
bw_criteria_columns = ['bw_from_transmit_frequency', 'bw_from_hardware_model',
                       'bw_from_beam_width', 'bw_from_cruise_name',
                       'bw_from_cruise_beg_date', 'bw_from_comments',
                       'bw_from_comment', 'bw_from_all_text']
inspect_columns = ['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
                   'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
                   'bw_from_transmit_frequency','bandwidth','comments','geo_region']
no_criterion_hit = df[bw_criteria_columns].isnull().all(axis=1)
dfnow = df.loc[no_criterion_hit, inspect_columns]
# count of unclassified files, cross-checked against the combined bandwidth column
print(len(dfnow))
print(int(df['bandwidth'].isnull().sum()))
# of those, how many have a geo_region containing 'acific'
dfnow = dfnow.dropna(subset = ['geo_region'])
print(int(dfnow['geo_region'].str.contains('acific').sum()))
#dfnow

In [18]:
# - Get rid of columns not needed anymore
# Rebinding instead of inplace=True avoids mutating a filtered frame in place;
# errors='ignore' lets this cell be re-run after the columns are already gone.
df = df.drop(columns=['bw_from_all_text','bw_from_comment','bw_from_comments',
                      'bw_from_cruise_beg_date','bw_from_cruise_name','bw_from_beam_width',
                      'bw_from_hardware_model','bw_from_transmit_frequency'],
             errors='ignore')

In [19]:
# - Manually set bw to bb for 02095_short.nc and 02125_short.nc
# (I think they had mistakenly written nb in comments)
df.loc[df['fname'].isin(['02095_short.nc', '02125_short.nc']), ['bandwidth']] = 'broadband'

In [20]:
# - Set all WH instruments' bandwidth to broadband
# (https://currents.soest.hawaii.edu/uhdas_adcp/year2011.html and
# https://www.go-ship.org/Manual/Firing_SADCP.pdf say that WH-300 is only broadband)

#print(len(df[(df['instrument_name']=='WH-300') & (df['geo_region'].str.contains('acific'))]))
#print(len(df[(df['instrument_name']=='WH-300')]))
#dfnow = df[(df['instrument_name'].isin(
#    ['WH-300','WH-Mariner-300','WH-Mariner-600','WH-1200'])
#             ) & (df['bandwidth'].isnull())]
#print(len(dfnow))
# Workhorse rows that still have no bandwidth are set to broadband
is_workhorse = df['instrument_name'].isin(
    ['WH-300','WH-Mariner-300','WH-Mariner-600','WH-1200'])
df.loc[is_workhorse & df['bandwidth'].isnull(), ['bandwidth']] = 'broadband'

In [21]:
# - Set bandwidth for those hardware_model names that were assumed to be BB or NB
# in hardware_model --> instrument_name renaming step

#dfnow = df[(df['instrument_name'].str.contains('NB')) & (df['bandwidth'].isnull())]
#print(len(dfnow))
#dfnow
# Fill still-missing bandwidths from the NB / BB tag in instrument_name
# (NB first, then BB; rows that already have a bandwidth are left alone).
# To inspect the affected rows beforehand:
#   df[(df['instrument_name'].str.contains(tag)) & (df['bandwidth'].isnull())]
for name_tag, assumed_bw in (('NB', 'narrowband'), ('BB', 'broadband')):
    needs_bw = df['instrument_name'].str.contains(name_tag) & df['bandwidth'].isnull()
    df.loc[needs_bw, ['bandwidth']] = assumed_bw
#print(len(df[df['bandwidth'].isnull()]))

In [22]:
# - How many more files are w/o bandwidth where bw can't be assumed?
# --> 57

# Count of rows whose bandwidth is still null after all assumptions
n_missing_bw = int(df['bandwidth'].isnull().sum())
print('# of files w/o bandwidth:', n_missing_bw)
#df[df['bandwidth'].isnull()][['fname','hardware_model','instrument_name',
#                              'cruise_beg_date','depth_range','project','platform_name']]

# of files w/o bandwidth: 57


## Create rounded frequency column

In [23]:
# - Define fxn to uniformly name rounded transmit frequency
# --> returns rounded frequency in kHz
def round_freq(row):
    """Return the rounded transmit frequency (kHz) found in row['instrument_name'].

    The name is searched for '38', '75', '150', '300', '600', '1200' in that
    order and the first hit is returned as an int. Returns None when the name
    is None or contains none of them.
    """
    name = row['instrument_name']
    if name is None:
        return None
    known_freqs = (('38', 38), ('75', 75), ('150', 150),
                   ('300', 300), ('600', 600), ('1200', 1200))
    for digits, khz in known_freqs:
        if re.search(digits, name, re.IGNORECASE):
            return khz
    return None
        
# Row-wise: rounded frequency in kHz taken from instrument_name
# (round_freq returns None when no known frequency appears in the name)
df['freq_round_kHz']=df.apply(round_freq, axis=1)

## Create theta column (number) from transducer_beam_angle column (string)

In [24]:
# - Standardize transducer_beam_angle to a number
# AND figure out if transducer beam angle is standardized by instrument
# to fill in missing thetas based on instrument_name 
# --> Seems a good bet to fill in 30 degrees when there is no info
# --> Need to grab only first 2 chars to get a # for theta
#print(df['transducer_beam_angle'].value_counts())
#dfgroupnow = df.groupby(['instrument_name','transducer_beam_angle'])
#print(dfgroupnow['fname'].nunique())

# - Define fxn to uniformly number transducer_beam_angle (theta)
# --> returns theta in degrees
def return_theta_degrees(row):
    """Return (flag, theta) with theta the transducer beam angle in degrees.

    theta is the integer value of the first two characters of
    row['transducer_beam_angle']; the flag is then None. When the angle is
    missing, 30 degrees is assumed and the flag says so.
    """
    angle_text = row['transducer_beam_angle']
    if angle_text is None:
        # if transducer_beam_angle is missing, assume it's 30 degrees
        return 'theta was missing; assumed 30 degrees', 30
    return None, int(angle_text[:2])
        
# return_theta_degrees yields one (flag, theta) pair per row; zip(*...) turns
# the pairs into the two columns flag_theta and theta
df['flag_theta'], df['theta'] = zip(*df.apply(return_theta_degrees, axis=1))

## Create transmit_pulse_length_num (number) from transmit_pulse_length column (string)

In [223]:
# - Standardize transmit_pulse_length to a number
# First look at the raw strings: overall counts, then number of files per
# (instrument_name, transmit_pulse_length) combination
pd.set_option('display.max_rows', 999)
tpl_value_counts = df['transmit_pulse_length'].value_counts()
print(tpl_value_counts)
dfgroupnow = df.groupby(['instrument_name','transmit_pulse_length'])
files_per_group = dfgroupnow['fname'].nunique()
print(files_per_group)

8 m                                      633
16 m                                     415
24 m                                     277
8m                                       177
13 m                                     165
4 m                                      122
2 m                                      102
24m                                       36
4 - 16 m                                  26
13m                                       15
16m                                       13
4 - 6 m                                   12
8                                         12
4m                                        11
8.33 m                                    10
16                                        10
16 m (4 m at times on shelf)              10
12m                                        9
8 m (32 m during bottom tracking)          8
8.08 m                                     6
9.32 m                                     5
12 m                                       5
??        

In [263]:
# - Define fxn to uniformly number transmit_pulse_length
# --> returns transmit_pulse_length in m
def return_tpl_m(row):
    """Convert row['transmit_pulse_length'] (text) to (flag, length in m).

    Returns
    -------
    (flag, tpl) : tuple
        flag is None, 'non-constant' (several lengths were given; tpl is then
        their mean, with any parenthesized remark ignored) or '? in metadata'
        (the text carried a question mark). tpl is None when the text is
        missing or is exactly '??'.

    Raises
    ------
    ValueError
        If what remains after stripping 'm', spaces, '?' and parenthesized
        remarks cannot be read as number(s).
    """
    raw = row['transmit_pulse_length']
    if raw is None:
        return None, None
    if 'vary' in raw:
        # applies to 01006_short.nc
        # NOTE(review): 6 is a hand-picked value; its derivation is not shown here
        return 'non-constant', 6
    if 'varies' in raw:
        # applies to 00249, 00250, 00251_short.nc
        return 'non-constant', (4+8+16)/3
    if raw=='??':
        # applies to 00342, 00343, 00344, 00345_short.nc
        return None, None

    flagnow = None
    text = raw
    if '(' in text and ')' in text:
        # drop the parenthesized remark, e.g. '8 m (32 m during bottom tracking)'
        # (raw string: '\(' in a normal literal is an invalid escape sequence)
        text = re.sub(r"[\(].*[\)]", "", text)
        flagnow = 'non-constant'
    if '?' in text:
        text = text.replace('?','')
        flagnow = '? in metadata'
    # strip the unit and all spaces: '4 - 16 m' --> '4-16'
    text = text.replace('m','').replace(' ','')

    # several values may be separated by ',', 'or' or '-'; average them
    if ',' in text:
        values = text.replace('or','').split(',')
        flagnow = 'non-constant'
    elif 'or' in text:
        values = text.split('or')
        flagnow = 'non-constant'
    elif '-' in text:
        values = text.split('-')
        flagnow = 'non-constant'
    else:
        values = text
    return flagnow, np.mean(np.array(values, dtype='float'))
    
# return_tpl_m yields one (flag, length) pair per row; zip(*...) splits the
# pairs into the two columns flag_tpl and tpl
df['flag_tpl'], df['tpl'] = zip(*df.apply(return_tpl_m, axis=1))
# Cast to float so that missing lengths (None) become NaN, which is what the
# np.isnan test in fill_in_tpl_m looks for
df['tpl'] = df['tpl'].astype(float)

# - Make sure that all transmit_pulse_length values
# became the correct tpl values
# --> yes, they did!
#test = df['transmit_pulse_length'].value_counts()
#test = pd.DataFrame(test)
#test = test.reset_index().drop(columns=['transmit_pulse_length']).rename(columns={'index':'transmit_pulse_length'})
#dftest = test
#dftest['flag_tpl'], dftest['tpl'] = zip(*test.apply(return_tpl_m, axis=1))
#dftest

# Mean parsed pulse length (m) per instrument_name, indexed by instrument_name.
# 'tpl' is selected before aggregating: averaging the whole frame would also
# try to average the text columns (TypeError on pandas >= 2.0).
tpl_instrument_name_means = \
    df.groupby(['instrument_name'])['tpl'].mean()
def fill_in_tpl_m(row):
    """Return (flag_tpl, tpl) for a row, filling a missing tpl.

    Rows with a non-NaN tpl are passed through unchanged. Rows with a NaN tpl
    get the mean tpl of their instrument_name (looked up in
    tpl_instrument_name_means) and a flag recording that it was filled in.
    """
    if not np.isnan(row['tpl']):
        return row['flag_tpl'], row['tpl']
    flagnow = 'assumed/filled in from mean tpl of instrument_name'
    return flagnow, tpl_instrument_name_means[row['instrument_name']]
    
# Overwrite flag_tpl / tpl with the filled-in values: rows that had a tpl keep
# their flag and value, rows with NaN get the mean for their instrument_name
df['flag_tpl'], df['tpl'] = zip(*df.apply(fill_in_tpl_m, axis=1))

In [270]:
# Spot check on the first row only: is its tpl NaN?
np.isnan(df['tpl'].iloc[0])

False

In [265]:
# Raise the display limit so the raw text can be compared with the parsed
# flag / value row by row
pd.set_option('display.max_rows', 3000)
df.loc[:, ['transmit_pulse_length','flag_tpl','tpl']]

Unnamed: 0,transmit_pulse_length,flag_tpl,tpl
0,8 m,,8.0
1,8 m,,8.0
2,16 m,,16.0
3,16 m,,16.0
4,8m,,8.0
5,8 m,,8.0
6,8 m,,8.0
7,16 m,,16.0
8,16 m,,16.0
9,16 m,,16.0
