In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import xarray as xr

In [2]:
# - Set data path: root data directory (note the trailing slash); other cells
#   concatenate it with 'JASADCP/ncfiles' to locate the input netCDF files
dpath = '/opt/acoustic-variability/data/'

In [3]:
# - Set saved out figure path (directory string with a trailing slash)
# NOTE(review): figpath is not referenced by the cells reviewed here -- confirm
# that a later cell still uses it
figpath = '/opt/acoustic-variability/python/figures/'

# Compile JASADCP metadata

## Read JASADCP metadata into df

In [4]:
# Collect the names of everything in the JASADCP netCDF directory, sorted
ncdir = dpath + 'JASADCP/ncfiles'
fnames = os.listdir(ncdir)
fnames.sort()

In [5]:
# One slot per netCDF file for every metadata field; slots stay None until
# the parsing loop fills them in.
nc_counter = len(fnames)


def _blank_column():
    """Return a fresh list holding one None per netCDF file."""
    return [None] * nc_counter


hardware_model = _blank_column()
serial_numbers = _blank_column()
transmit_frequency = _blank_column()
phased_array = _blank_column()
cruise_beg_date = _blank_column()
blanking_interval = _blank_column()
bin_length = _blank_column()
transducer_beam_angle = _blank_column()
transmit_pulse_length = _blank_column()
comments = _blank_column()
biomass_dtmn = _blank_column()
geo_region = _blank_column()
chief_scientist = _blank_column()
personnel_charge = _blank_column()
manufacturer = _blank_column()
transducer_config = _blank_column()
depth_range = _blank_column()
num_bins = _blank_column()
ens_avg_intvl = _blank_column()
ss_calc = _blank_column()
project = _blank_column()
nb150 = _blank_column()
nb300 = _blank_column()
bb150 = _blank_column()
bb300 = _blank_column()
beam_width = _blank_column()
cruise_name = _blank_column()
platform_name = _blank_column()
comment = _blank_column()
all_text = _blank_column()

In [6]:
# Every "LABEL : value" line of cruise_sonar_summary is parsed with the same
# regex tail: optional spaces, a colon, optional spaces, then the value captured
# as non-space tokens separated by single spaces. Raw string so that \S is a
# regex class rather than an invalid Python escape sequence.
_VALUE_TAIL = r" *: *((?:\S+ )*\S+)"


def _first_value(label, text):
    """Return the value of the first 'label : value' entry in text, or None if absent."""
    found = re.findall(label + _VALUE_TAIL, text)
    return found[0] if found else None


def _all_values(label, text, sep='///'):
    """Return all 'label : value' values in text joined by sep, or None if absent."""
    found = re.findall(label + _VALUE_TAIL, text)
    return sep.join(found) if found else None


def _all_mentions(pattern, text, sep='///'):
    """Return all case-insensitive matches of pattern in text joined by sep, or None."""
    found = re.findall(pattern, text, re.IGNORECASE)
    return sep.join(found) if found else None


# (target list, metadata label): fields that keep only the first matching value
_first_value_fields = [
    (serial_numbers, "SERIAL NUMBERS"),
    (transmit_frequency, "TRANSMIT FREQUENCY"),
    (blanking_interval, "BLANKING INTERVAL"),
    (bin_length, "BIN LENGTH"),
    (transducer_beam_angle, "TRANSDUCER BEAM ANGLE"),
    (transmit_pulse_length, "TRANSMIT PULSE LENGTH"),
    (biomass_dtmn, "BIOMASS DETERMINATION"),
    (geo_region, "GEOGRAPHIC_REGION"),
    (chief_scientist, "CHIEF SCIENTIST ON SHIP"),
    (personnel_charge, "PERSONNEL IN CHARGE"),
    (manufacturer, "MANUFACTURER"),
    (transducer_config, "TRANSDUCER CONFIGURATION"),
    (depth_range, "DEPTH RANGE"),
    (num_bins, "NUMBER OF BINS"),
    (ens_avg_intvl, "ENSEMBLE AVERAGING INTERVAL"),
    (ss_calc, "SOUND SPEED CALCULATION"),
    (project, "PROJECT"),
    (beam_width, "ACOUSTIC BEAM WIDTH"),
    (cruise_name, "CRUISE_NAME"),
    (platform_name, "PLATFORM_NAME"),
]

# (target list, metadata label): fields that keep every match, joined by '///'
_joined_value_fields = [
    (comments, "COMMENTS"),
    (comment, "COMMENT"),
]

# (target list, regex): free-text keywords searched case-insensitively anywhere
# in the summary; every hit is kept, joined by '///'
_keyword_fields = [
    (phased_array, "phased.array"),
    (nb150, "nb150"),
    (nb300, "nb300"),
    (bb150, "bb150"),
    (bb300, "bb300"),
]

for ifile, fname in enumerate(fnames):
    ncfile = dpath + 'JASADCP/ncfiles/' + fname
    # The context manager closes the netCDF file once its metadata has been
    # read, so file handles do not accumulate across the whole directory.
    with xr.open_dataset(ncfile) as ncnow:
        strnow = ncnow.attrs['cruise_sonar_summary']
        # cruise_beg_date is the first time stamp stored in the file
        cruise_beg_date[ifile] = ncnow['time'][0].values

    # hardware_model falls back to the MANUFACTURER entry when the summary
    # has no HARDWARE MODEL entry
    hardware_model[ifile] = (_first_value("HARDWARE MODEL", strnow)
                             or _first_value("MANUFACTURER", strnow))

    for target, label in _first_value_fields:
        target[ifile] = _first_value(label, strnow)
    for target, label in _joined_value_fields:
        target[ifile] = _all_values(label, strnow)
    for target, keyword in _keyword_fields:
        target[ifile] = _all_mentions(keyword, strnow)

    # Keep the full summary text for later free-text searches
    all_text[ifile] = strnow

In [7]:
# Assemble the metadata table: one named Series per parsed list, placed side
# by side in the order given here.
column_sources = [
    ('fname', fnames),
    ('hardware_model', hardware_model),
    ('serial_numbers', serial_numbers),
    ('transmit_frequency', transmit_frequency),
    ('phased_array', phased_array),
    ('cruise_beg_date', cruise_beg_date),
    ('blanking_interval', blanking_interval),
    ('bin_length', bin_length),
    ('transducer_beam_angle', transducer_beam_angle),
    ('transmit_pulse_length', transmit_pulse_length),
    ('comments', comments),
    ('biomass_dtmn', biomass_dtmn),
    ('geo_region', geo_region),
    ('chief_scientist', chief_scientist),
    ('personnel_charge', personnel_charge),
    ('manufacturer', manufacturer),
    ('transducer_config', transducer_config),
    ('depth_range', depth_range),
    ('num_bins', num_bins),
    ('ens_avg_intvl', ens_avg_intvl),
    ('ss_calc', ss_calc),
    ('project', project),
    ('nb150', nb150),
    ('nb300', nb300),
    ('bb150', bb150),
    ('bb300', bb300),
    ('beam_width', beam_width),
    ('cruise_name', cruise_name),
    ('platform_name', platform_name),
    ('comment', comment),
    ('all_text', all_text),
]
df = pd.concat(
    [pd.Series(values, name=label) for label, values in column_sources],
    axis=1)

In [11]:
# - Define fxns to describe bandwidth from all_text, beam_width, cruise_name, comments, hardware_model, and transmit_frequency 

def set_bandwidth_from_all_text(row):
    """Classify bandwidth from the full summary text in row['all_text'].

    Returns 'narrowband' if the text contains any narrowband marker
    (narrowband, narrow band, nb75/nb150/nb300/nb600/nb1200, 'pingtype = nb'),
    otherwise 'broadband' if it contains 'pingtype = bb', otherwise None.
    Matching is case-insensitive; empty or missing text gives None.
    """
    text = row['all_text']
    if not text:
        return None
    narrow_markers = ('narrowband', 'narrow band', 'nb75', 'nb150', 'nb300',
                      'nb600', 'nb1200', 'pingtype = nb')
    if any(re.search(marker, text, re.IGNORECASE) for marker in narrow_markers):
        return 'narrowband'
    # Only an explicit ping type counts as broadband here; a bare "broad" is
    # not searched for b/c some cruise comments say "broad-scale"
    if re.search('pingtype = bb', text, re.IGNORECASE):
        return 'broadband'
    return None

def set_bandwidth_from_comment(row):
    """Return 'narrowband' if row['comment'] mentions "narrowband" (any case), else None."""
    note = row['comment']
    mentions_nb = bool(note) and re.search('narrowband', note, re.IGNORECASE)
    return 'narrowband' if mentions_nb else None
    
def set_bandwidth_from_comments(row):
    """Classify bandwidth from the COMMENTS text in row['comments'].

    Searches case-insensitively for "broadband" and for "narro". Returns
    'both broad and narrowband?' when both occur, 'broadband' or 'narrowband'
    when only one occurs, and None when neither occurs or the text is empty.
    """
    # "broadband" is used instead of "broad" b/c some cruise comments say "broad-scale"
    text = row['comments']
    if not text:
        return None
    has_broad = re.search('broadband', text, re.IGNORECASE) is not None
    has_narrow = re.search('narro', text, re.IGNORECASE) is not None
    outcome = {
        (True, True): 'both broad and narrowband?',
        (True, False): 'broadband',
        (False, True): 'narrowband',
    }
    return outcome.get((has_broad, has_narrow))

def set_bandwidth_from_cruise_beg_date(row):
    """Return 'narrowband' when row['cruise_beg_date'] is before 1991-06-01, else None.

    A falsy (missing) date also gives None.
    """
    start = row['cruise_beg_date']
    if start and start < pd.Timestamp(1991,6,1):
        return 'narrowband'
    return None
    
def set_bandwidth_from_cruise_name(row):
    """Classify bandwidth from instrument tags embedded in row['cruise_name'].

    Broadband tags (os38bb, os75bb, os150bb) are checked first, then
    narrowband tags (os38nb, os75nb, os150nb, nb150, nb300). Matching is
    case-insensitive. Returns None when no tag matches or the name is empty.
    """
    name = row['cruise_name']
    if not name:
        return None
    tag_groups = (
        ('broadband', ('os38bb', 'os75bb', 'os150bb')),
        ('narrowband', ('os38nb', 'os75nb', 'os150nb', 'nb150', 'nb300')),
    )
    for label, tags in tag_groups:
        if any(re.search(tag, name, re.IGNORECASE) for tag in tags):
            return label
    return None
    
def set_bandwidth_from_beam_width(row):
    """Return 'narrowband' if row['beam_width'] contains "narrow band" (any case), else None."""
    width_text = row['beam_width']
    if not width_text:
        return None
    hit = re.search('narrow band', width_text, re.IGNORECASE)
    return 'narrowband' if hit else None
    
def set_bandwidth_from_hardware_model(row):
    """Classify bandwidth from row['hardware_model'].

    Returns 'broadband' if the model contains "broad"; otherwise 'narrowband'
    if it contains "narro" or "nb"; otherwise None. Matching is
    case-insensitive. An empty or missing model gives None.
    """
    model = row['hardware_model']
    if not model:
        return None
    marker_groups = (
        ('broadband', ('broad',)),
        ('narrowband', ('narro', 'nb')),
    )
    for label, markers in marker_groups:
        if any(re.search(marker, model, re.IGNORECASE) for marker in markers):
            return label
    return None

def set_bandwidth_from_transmit_frequency(row):
    """Classify bandwidth from row['transmit_frequency'].

    Returns 'broadband' if the text contains "broadband"; otherwise
    'narrowband' if it contains "narro"; otherwise None. Matching is
    case-insensitive. An empty or missing value gives None.
    """
    freq_text = row['transmit_frequency']
    if not freq_text:
        return None
    for label, marker in (('broadband', 'broadband'), ('narrowband', 'narro')):
        if re.search(marker, freq_text, re.IGNORECASE):
            return label
    return None
    
def set_final_bandwidth(row):
    """Choose the final bandwidth label for a row from its bw_from_* columns.

    The per-source labels are consulted in priority order (transmit frequency,
    hardware model, beam width, cruise name, cruise begin date, comments,
    comment, all text) and the first non-null one is returned unchanged.

    Returns None when no source classified the row.
    """
    # Only the derived bw_from_* columns are consulted, never the raw
    # cruise_name / cruise_beg_date values, which are not bandwidth labels.
    priority = ('bw_from_transmit_frequency', 'bw_from_hardware_model',
                'bw_from_beam_width', 'bw_from_cruise_name',
                'bw_from_cruise_beg_date', 'bw_from_comments',
                'bw_from_comment', 'bw_from_all_text')
    for column in priority:
        candidate = row[column]
        # pd.notna treats both None and NaN as "no classification"
        if pd.notna(candidate):
            return candidate
    return None

In [12]:
# Run every single-source classifier row-wise and store its label in the
# matching bw_from_* column.
source_classifiers = {
    'bw_from_transmit_frequency': set_bandwidth_from_transmit_frequency,
    'bw_from_hardware_model': set_bandwidth_from_hardware_model,
    'bw_from_beam_width': set_bandwidth_from_beam_width,
    'bw_from_cruise_name': set_bandwidth_from_cruise_name,
    'bw_from_cruise_beg_date': set_bandwidth_from_cruise_beg_date,
    'bw_from_comments': set_bandwidth_from_comments,
    'bw_from_comment': set_bandwidth_from_comment,
    'bw_from_all_text': set_bandwidth_from_all_text,
}
for bw_column, classifier in source_classifiers.items():
    df[bw_column] = df.apply(classifier, axis=1)

# The combined label is computed last because it reads the bw_from_* columns
df['bandwidth'] = df.apply(set_final_bandwidth, axis=1)

In [None]:
# - How many files does each criterion classify as NB or BB?
# Select the rows that ONLY the all_text criterion classifies: every other
# bw_from_* column is null while bw_from_all_text is set. All conditions must
# be chained with `&`; two parenthesised conditions placed back to back would
# be parsed as a call on a Series.
# NOTE(review): the trailing numbers look like the per-criterion file counts
# recorded by the author -- confirm before relying on them.
dfnow = df[(df['bw_from_transmit_frequency'].isnull()) & # 253
           (df['bw_from_hardware_model'].isnull()) & # 464
           (df['bw_from_beam_width'].isnull()) & # 7
           (df['bw_from_cruise_name'].isnull()) & # 598
           (df['bw_from_cruise_beg_date'].isnull()) & # 40 
           (df['bw_from_comments'].isnull()) & # 56 
           (df['bw_from_comment'].isnull()) & # 0
           (df['bw_from_all_text'].notnull()) # 132
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
print(len(dfnow))
#dfnow

In [34]:
# - How many files are still left unclassified as NB/BB that will need assumptions?
# --> 698

# A file is unclassified when every per-source bandwidth column is null
bw_source_columns = ['bw_from_transmit_frequency', 'bw_from_hardware_model',
                     'bw_from_beam_width', 'bw_from_cruise_name',
                     'bw_from_cruise_beg_date', 'bw_from_comments',
                     'bw_from_comment', 'bw_from_all_text']
report_columns = ['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
                  'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
                  'bw_from_transmit_frequency','bandwidth','comments']
unclassified = df[bw_source_columns].isnull().all(axis=1)
dfnow = df.loc[unclassified, report_columns]
print(len(dfnow))
#dfnow

698


## Uniformly rename hardware_model names

In [15]:
# - Save out all unique hardware_model names + # of occurrences to look at in separate window
# --> use this output to create the legend below of how to translate different instrument names to a uniform list of names
#df['hardware_model'].value_counts().to_csv('jasadcp_unique_instruments.csv', header=['hardware_model count'])

In [16]:
# - Define fxn to uniformly rename different original hardware_model names
def set_uniform_name_orig_hardware_model(row):
    """Translate a row's free-text hardware_model into a uniform instrument name.

    Reads row['hardware_model'] plus, for ambiguous names, row['fname'],
    row['transmit_frequency'], row['platform_name'], row['transducer_config'],
    row['cruise_beg_date'] and row['bandwidth'].

    Returns a 2-tuple (flag, instrument_name) in every case:
      * flag is None, or a string describing the assumption that was made
        to reach instrument_name;
      * instrument_name is the uniform name, 'ZUNCLEAR: <hardware_model>'
        when the name is unrecognised or its secondary checks fail, or None
        when hardware_model is empty.
    """
    if row['hardware_model']:
        if row['hardware_model'] in ['Ocean Surveyer 38', 'Ocean Surveyor 38']:
            return None, 'OS-38'

        elif row['hardware_model'] in ['Ocean Surveyor 75', 'OS75 narrowband',
                                       'Ocean Surveyer 75', 'Ocean Surveyor 75 narrowband',
                                       'OS75 (Ocean Surveyor)', 'Ocean Surveyor 75 broadband',
                                       'Ocean Surveyor OS75', 'OS75', 'Ocean Surveryor 75',
                                       'Ocean Surveyor 75 narroband', 'Ocean Surveyor 75 Broadband',
                                       'Ocean Surveyor 75 Narrowband', '75KHz Ocean Surveyor narrowband',
                                       'Ocean Surveyor 75 kHz', 'RDI 75KHz Ocean Surveyor',
                                       'Ocean Surveyor 75 kHz Phased Array', '75KHz Ocean Surveyor']:
            return None, 'OS-75'

        elif row['hardware_model'] in ['Ocean Surveyer 150', 'Ocean Surveryor 150',
                                       'Ocean Surveyor 150 narroband', 'Ocean Surveyor 150 narrowband',
                                       'Ocean Surveyor 150 broadband']:
            return None, 'OS-150'

        elif row['hardware_model'] in ['Ocean Surveyor II (OS-II 38)']:
            return None, 'OS-II-38'

        elif row['hardware_model'] in ['OSII75S phased-array']:
            return None, 'OS-II-75'

        elif row['hardware_model'] in ['WorkHorse 300', 'Workhorse 300', 'Workhorse 300; 300 kHz']:
            return None, 'WH-300'

        elif row['fname']=='01897_short.nc':
            # they somehow just deleted the instrument name from the metadata
            # NOTE(review): this branch is only reached when hardware_model is
            # non-empty -- confirm that holds for this file
            return None, 'WH-300'

        elif row['hardware_model'] in ['Workhorse 1200']:
            return None, 'WH-1200'

        elif row['hardware_model'] in ['Workhorse Mariner (300 kHz)']:
            return None, 'WH-Mariner-300'

        elif row['hardware_model'] in ['WorkHorse Mariner 600']:
            return None, 'WH-Mariner-600'

        elif row['hardware_model']=='DCP4400A':
            return None, 'DCP4400A'

        elif row['hardware_model'] in ['VM75 narrowband']:
            return None, 'NB-VM-75'

        elif row['hardware_model'] in ['narrowband 75 kHz']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM'
            return flagnow, 'NB-VM-75'

        elif row['hardware_model'] in ['Narrowband 150', 'NB 150', '150 narrowband',
                                       '150 kHz Narrowband', 'Narrow Band 150Khz',
                                       '1) Narrow Band 150 kHz', '150 kHz narrow band']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM'
            return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['VM-150 Narrowband', 'RD-VM150 Narrow band',
                                       'VM-150 (NB)', 'VM-150 narrowband', 'RDI VM150 narrowband',
                                       'RD-VM150 narrowband']:
            return None, 'NB-VM-150'

        elif row['hardware_model'] in ['NB 150 (VM-150-18HP)']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + \
                ', assumed same as NB-VM-150'
            # return the flag that was just built so the assumption is recorded
            return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['RD-VM']:
            if ('150' in _as_text(row['transmit_frequency'])
                    and 'David Starr Jordan' in _as_text(row['platform_name'])):
                # http://tryfan.ucsd.edu/calcofi/calcofi.htm --> NB ADCP on R/V David Starr Jordan!
                return None, 'NB-VM-150'

        elif row['hardware_model'] in ['Vessel-mounted 150 kHz Narrowband',
                                       'Vessel-Mount 150 kHz Narrowband',
                                       'Vessel-mount 150 kHz Narrowband']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + \
                ', assumed VM but may be DR b/c of convex transducer_config' + \
                ' - see 00400-00408_short.nc'
            return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['150']:
            if row['transducer_config']=='JANUS CONCAVE':
                if 'R/V Meteor' in _as_text(row['platform_name']):
                    # The paper "Surveying the Upper Ocean with the Ocean Surveyor: A New
                    # Phased Array Doppler Current Profiler" says R/V Meteor has NB ADCP
                    return None, 'NB-VM-150'
                elif 'R/V Poseidon' in _as_text(row['platform_name']):
                    flagnow = 'hardware_model = ' + row['hardware_model'] + \
                        ', transducer_config = JANUS CONCAVE, assumed NB'
                    return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['RDI']:
            if (row['transmit_frequency']=='153 KHz'
                    and row['transducer_config']=='JANUS CONCAVE'
                    and row['cruise_beg_date']<pd.Timestamp(1991,6,1)):
                return None, 'NB-VM-150'

        elif row['hardware_model'] in ['150 kHz hull mounted ADCP']:
            if row['cruise_beg_date']<pd.Timestamp(1991,6,1):
                return None, 'NB-VM-150'

        elif (row['hardware_model'] in ['Narrowband']
                and row['transmit_frequency']=='153.6 kHz'
                and row['transducer_config']=='JANUS CONCAVE'):
            return None, 'NB-VM-150'

        elif row['hardware_model'] in ['150 kHz']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM and NB'
            return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['153.6 kHz hull mounted ADCP']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM and NB'
            return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['Narrowband 300', '300 narrow band']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM'
            # return the flag that was just built so the assumption is recorded
            return flagnow, 'NB-VM-300'

        elif row['hardware_model'] in ['Direct-Read 150 kHz Narrowband']:
            return None, 'NB-DR-150'

        elif row['hardware_model'] in ['Broadband 150', 'Broad Band 150',
                                       '150 kHz broadband', '150 broad band, concave']:
            flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed VM'
            # return the flag that was just built so the assumption is recorded
            return flagnow, 'BB-VM-150'

        elif row['hardware_model'] in ['VM-150', 'RD-VM150', 'VM150', 'RD-VM0150']:
            if (row['bandwidth']=='narrowband'
                    or row['cruise_beg_date']<pd.Timestamp(1991,6,1)):
                return None, 'NB-VM-150'
            elif row['bandwidth']=='broadband':
                return None, 'BB-VM-150'
            elif row['bandwidth'] is None:
                # assumes NB if no bandwidth
                flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed NB'
                return flagnow, 'NB-VM-150'

        elif row['hardware_model'] in ['VM-300', 'RD-VM300']:
            if (row['bandwidth']=='narrowband'
                    or row['cruise_beg_date']<pd.Timestamp(1991,6,1)):
                return None, 'NB-VM-300'
            elif row['bandwidth']=='broadband':
                return None, 'BB-VM-300'
            elif row['bandwidth'] is None:
                # assumes NB if no bandwidth
                flagnow = 'hardware_model = ' + row['hardware_model'] + ', assumed NB'
                return flagnow, 'NB-VM-300'

        # Reached for unrecognised names AND for recognised names whose
        # secondary checks (frequency, platform, config, date, bandwidth) did
        # not resolve; always hand back a 2-tuple so that
        # df.apply(..., result_type='expand') gets a consistent shape.
        return None, 'ZUNCLEAR: ' + row['hardware_model']

    else:
        return None, None


def _as_text(value):
    """Return value when it is a string, else '' so substring tests never hit None."""
    return value if isinstance(value, str) else ''

In [17]:
# - Create df w/ uniform instrument names
# https://stackoverflow.com/questions/23586510/return-multiple-columns-from-apply-pandas
# Each row yields (flag, instrument_name); expand the pair into two new columns
renamed = df.apply(set_uniform_name_orig_hardware_model, axis=1, result_type='expand')
df[['flag','instrument_name']] = renamed
# Reorder: the first two columns, then instrument_name, then everything else
# (the trailing instrument_name column is left out of the tail slice)
cols = df.columns.values.tolist()
leading, remainder = cols[:2], cols[2:-1]
cols = leading + ['instrument_name'] + remainder
df = df[cols]

In [18]:
# - Get rid of NULL instrument_name rows (there's only 1 and it has no other info)
# and 'ZUNCLEAR: VM-150 and VM-300' rows
has_name = df['instrument_name'].notnull()
is_mixed_vm = df['instrument_name']=='ZUNCLEAR: VM-150 and VM-300'
# Keep rows that have an instrument name and are not the mixed VM-150/VM-300 case
df = df[has_name & ~is_mixed_vm]

In [None]:
%%script false --no-raise-error
# include the line above if you don't want to run this cell

# - Get rid of columns not needed anymore
obsolete_bw_columns = ['bw_from_comments','bw_from_hardware_model',
                       'bw_from_transmit_frequency']
df.drop(columns=obsolete_bw_columns, inplace=True)

## Make assumptions based on instrument_name

In [None]:
# THINGS TO RUN AFTER RENAMING:
# - https://currents.soest.hawaii.edu/uhdas_adcp/year2011.html and https://www.go-ship.org/Manual/Firing_SADCP.pdf
# say that WH-300 is only broadband --> add in broadband after instrument_name has been assigned
# - Manually set bw to bb for 02095_short.nc and 02125_short.nc (I think they had mistakenly written nb in comments)
# - After all this, ask TRDI for help + the remaining files w/ unclassified bw will be assumed from the majority of bw classifications of the corresponding instrument.

# -------------------------------------------

# Important TRDI ADCP history
- "This is the second edition of Acoustic Doppler Current Profiler Principles of Operation: A Practical Primer. The first edition addressed narrowband Acoustic Doppler Current Profilers (ADCPs). Since then, Teledyne RD Instruments has introduced the BroadBand ADCP, and more recently the Workhorse, which uses BroadBand technology. This edition has been revised to reflect changes introduced with BroadBand technology."
- "In 1982, TRDI produced its first ADCP, a self-contained instrument designed for use in long-term, battery-powered deployments (Pettigrew, Beardsley and Irish, 1986). In 1983, TRDI produced its first vessel-mounted ADCP. By 1986, TRDI had five different frequencies (75-1200 kHz) and three different ADCP models (self-contained, vessel-mounted, and direct-reading)." 
- "In 1991, TRDI began shipping its first production prototype BroadBand ADCPs. The BroadBand method (patents 5,208,785 and 5,343,443) enables ADCPs to take advantage of the full signal bandwidth available for measuring velocity. Greater bandwidth gives a BroadBand ADCP far more information with which to estimate velocity. With typically 100 times as much bandwidth, BroadBand ADCPs reduce variance nearly 100 times when compared with narrowband ADCPs."
- "Self-Contained and Direct-Reading ADCPs use convex transducers (Figure 26) to allow the ADCP to be mounted in an in-line mooring cage. Vessel-Mounted ADCP transducers are concave to allow them to be mounted inside the smallest possible sea chest in the ship's hull."

In [None]:
# - Does using a 1991 year cutoff help define NB vs. BB on any more cruises?
# (before 1991 = NB only)
# --> YES!

# Use the fully qualified option name: the bare pattern 'max_rows' matches
# more than one pandas option on current versions and raises OptionError.
pd.set_option('display.max_rows', 1000)

# Boolean masks shared by the counts below
before_cutoff = df['cruise_beg_date']< '1991-06-01'
no_bandwidth = df['bandwidth'].isnull()
no_all_text_bw = df['bw_from_all_text'].isnull()
is_vm150 = df['instrument_name']=='VM-150'

# Cruises before the cutoff that still have no bandwidth
print(len(df[before_cutoff & no_bandwidth]))
# ... restricted to instrument_name 'VM-150'
print(len(df[is_vm150 & before_cutoff & no_bandwidth]))
# ... restricted to rows that all_text could not classify either
print(len(df[before_cutoff & no_bandwidth & no_all_text_bw]))
# ... restricted to both of the above
print(len(df[is_vm150 & no_all_text_bw & before_cutoff & no_bandwidth]))
# All 'VM-150' rows without a bandwidth, regardless of date
print(len(df[is_vm150 & no_bandwidth]))
#df[(df['cruise_beg_date']< '1991-06-01') & (df['bandwidth'].isnull())][
#    ['fname','hardware_model','instrument_name','serial_numbers','cruise_beg_date','bw_from_all_text']]

In [None]:
# - Checking the assumption that most/all files that don't explicitly state their mounting are VM, not DR
# (transducer_config concave = VM, convex = DR)
# --> Should be good to assume that config is VM
# --> 00400-00408_short.nc should prob all be direct-read

# Count hardware_model values that mention "(D/d)irect"
direct_read = df['hardware_model'].str.contains('irect')
print(len(df[direct_read]))
#df[df['hardware_model'].str.contains('irect')]
dfnow = df[df['transducer_config'].notnull()]
# Count convex, then concave, transducer configurations (case-insensitive)
for shape in ('convex', 'concave'):
    print(len(dfnow[dfnow['transducer_config'].str.contains(shape, case=False)]))
#dfnow[dfnow['transducer_config'].str.contains('convex', case=False)][
#    ['fname','hardware_model','instrument_name','serial_numbers','cruise_beg_date',
#     'transducer_config']]
# --> 00400-00408_short.nc should prob all be direct-read
#df['transducer_config'].unique()
#dfnow[dfnow['transducer_config'].str.contains('beam', case=False)][
#    ['fname','hardware_model','instrument_name','serial_numbers','cruise_beg_date',
#     'transducer_config']]
# --> Besides the WH-Mariner-300, I think only the DR are convex

In [None]:
# - How many instrument names that say "(assume VM)" can be classified for sure as VM
# using concave vs. convex transducer_config? (concave = VM, convex = DR)
# --> can classify "JANUS CONCAVE" transducer_config values as VM
# for a total of 11+8=19 files

hwmnow = '150 broad band, concave'
dfnow = df[df['transducer_config'].notnull()]
# Rows for this hardware_model that have a transducer_config vs. all rows for it
with_config = dfnow[dfnow['hardware_model']==hwmnow]
print(len(with_config))
print(len(df[df['hardware_model']==hwmnow]))
with_config[['fname','hardware_model','instrument_name','serial_numbers',
             'cruise_beg_date','transducer_config']]
# 'narrowband 75 kHz' --> classifies 0 notnull transducer_config out of 12
# 'Narrowband 150' --> classifies 11 "JANUS CONCAVE" out of 68
# 'NB 150' --> 0/29
# '150 narrowband' --> "JANUS CONCAVE" 8/8
# '150 kHz Narrowband' --> 0/8
# 'Narrow Band 150Khz' --> 0/2
# '1) Narrow Band 150 kHz' --> 0/2
# '150 kHz narrow band' --> 0/1
# 'Narrowband 300' --> 0/3
# '300 narrow band' --> 0/1
# 'Broadband 150' --> 0/6
# 'Broad Band 150' --> 0/5
# '150 kHz broadband' --> weird "< JANUS CONVEX > < JANUS CONCAVE >" 1/1
# '150 broad band, concave' --> 0/1

In [None]:
# - For defining unknown VM-150 bandwidths, are most defined VM-150
# bandwidths NB or BB?
# --> They are all NB, so assume NB for unknown

vm150 = df[df['instrument_name']=='VM-150']
# VM-150 rows without, then with, a bandwidth
print(len(vm150[vm150['bandwidth'].isnull()]))
vm150_known = vm150[vm150['bandwidth'].notnull()]
print(len(vm150_known))
# How many of the known ones are narrowband, and which labels occur at all
print(len(vm150_known[vm150_known['bandwidth']=='narrowband']))
print(vm150_known['bandwidth'].unique())
# --> all non-null VM-150 bandwidths are narrowband
# BUT CHANGE THE RENAMING FIRST, THEN RETURN TO THIS 6:47PM

# Testing from when:
- df bandwidth-related columns were bw_from_comments, bw_from_hardware_model, bw_from_transmit_frequency, bandwidth (derived from all 3 bw columns)
- df instrument_names were the same as in create_JASADCP_instrument_spreadsheet.ipynb

### *Purpose 1) Trying to clarify mildly annoying hardware_model names*

In [None]:
# - fraction of cruises w/ weird names
#pd.set_option('max_rows', 1000)
#pd.set_option('display.max_colwidth', -1)
weird_names_vm = ['VM-150', 'RD-VM150', 'VM150', 'RD-VM0150', 'VM-300', 'RD-VM300']
weird_names = weird_names_vm + ['150', 'RDI', '150 kHz hull mounted ADCP', 'Narrowband',
                                '150 kHz', '153.6 kHz hull mounted ADCP']

def frac_of_cruises(mask):
    """Return the share of all rows in df that the boolean mask selects."""
    return len(df[mask])/len(df)

has_weird_name = df['hardware_model'].isin(weird_names)
has_weird_name_vm = df['hardware_model'].isin(weird_names_vm)
print(frac_of_cruises(has_weird_name)) # --> 0.3580301685891748
print(frac_of_cruises(has_weird_name_vm)) # --> 0.34072759538598046
# One fraction per VM-style name, printed in weird_names_vm order:
# 'VM-150'    --> 0.25643300798580304
# 'RD-VM150'  --> 0.07364685004436557
# 'VM150'     --> 0.0008873114463176575
# 'RD-VM0150' --> 0.00044365572315882877
# 'VM-300'    --> 0.005767524401064774
# 'RD-VM300'  --> 0.00354924578527063
for namenow in weird_names_vm:
    print(frac_of_cruises(df['hardware_model'] == namenow))

# - fraction of cruises w/ weird names in Pacific
# ('acific'/'alifornia' are matched as substrings of geo_region)
in_pacific = (df['geo_region'].str.contains('acific') |
              df['geo_region'].str.contains('alifornia'))
print(frac_of_cruises(has_weird_name & in_pacific)) # --> 0.17701863354037267
print(frac_of_cruises(has_weird_name_vm & in_pacific)) # --> 0.17213842058562556

# - fraction of cruises in Pacific
print(frac_of_cruises(in_pacific)) # --> 0.6313220940550133

# - earliest and latest cruises using OS
os_dates = df[df['instrument_name'].str.contains('OS-')]['cruise_beg_date']
print(os_dates.min()) # --> '2000-08-31 00:01:35'
print(os_dates.max()) # --> '2018-11-15 17:22:09'

# - earliest and latest cruises using NB-VM
nb_vm_dates = df[(df['instrument_name'].str.contains('VM')) &
                 (df['bandwidth']=='narrowband')]['cruise_beg_date']
print(nb_vm_dates.min()) # --> '1989-08-15 10:20:00'
print(nb_vm_dates.max()) # --> '2014-03-20 10:06:28'

# - earliest and latest cruises using BB
# Rows whose hardware_model contains 'road' and whose instrument_name does not
# contain 'OS'; the date column is selected once so both prints share it.
bb_dates = df[(df['hardware_model'].str.contains('road')) &
              ~(df['instrument_name'].str.contains('OS'))]['cruise_beg_date']
print(bb_dates.min()) # --> '1996-07-02 04:42:00'
print(bb_dates.max()) # --> '2014-08-01 12:11:31'
# --> It could be that cruises that ONLY say VM and not BB or NB are NB and happened BEFORE BB existed

# - justification that VM usually means narrowband (though doesn't really work,
# given that 25.6% of all cruises are VM-150 w/ unknown BB or NB)
is_bb = df['bandwidth']=='broadband'
is_nb = df['bandwidth']=='narrowband'
# 'VM-' looked up in instrument_name, 'VM' looked up in hardware_model
vm_in_instrument = df['instrument_name'].str.contains('VM-')
vm_in_hardware = df['hardware_model'].str.contains('VM')
print(len(df[vm_in_instrument & is_bb]['bandwidth'])) # --> 0
print(len(df[vm_in_instrument & is_nb]['bandwidth'])) # --> 178
print(len(df[vm_in_hardware & is_bb]['bandwidth'])) # --> 0
print(len(df[vm_in_hardware & is_nb]['bandwidth'])) # --> 178

**All names to become 'VM-150'**:\
'VM-150', 'RD-VM150', 'VM150', 'RD-VM0150'
--> need to figure out if these are NB or BB

**All names to become 'VM-300'**:\
'VM-300', 'RD-VM300'
--> need to figure out if these are NB or BB

**All names to become 'UNCLEAR'**:\
--> ( ) = added info for better characterization
'150' (1990-1998; assume RDI + VM + NB or BB),  
'RDI' (1985-1993; some give you transmit frequency, w/ those just assume VM + either NB or BB),  
'150 kHz hull mounted ADCP' (1991; assume RDI + VM + NB or BB),  
'Narrowband' (1994-1996; all 153.6kHz transmit frequency; assume RDI + VM + NB),  
'150 kHz' (1994; assume RDI + VM + NB or BB),  
'153.6 kHz hull mounted ADCP' (2002; assume RDI + maybe OS?)

### *Purpose 2) Trying to clarify NB vs. BB when it's not immediately obvious*

In [None]:
# - Define fxns to describe bandwidth from comments, hardware_model, and transmit_frequency 
def set_bandwidth_from_comments(row):
    """Classify bandwidth from the free-text 'comments' field of one row.

    Parameters
    ----------
    row : mapping with a 'comments' key (e.g. a row handed over by
        df.apply(..., axis=1)).

    Returns
    -------
    'both broad and narrowband?' if both keywords occur, 'broadband',
    'narrowband', or None when the field is missing or has no keyword.
    """
    textnow = row['comments']
    # Missing values can be None, '' or NaN; NaN is a truthy float, so a plain
    # truthiness check would pass it on to re.search and raise TypeError.
    if not isinstance(textnow, str) or not textnow:
        return None
    # I use "broadband" instead of "broad" here b/c we have some cruise comments that say "broad-scale"
    has_bb = re.search('broadband', textnow, re.IGNORECASE) is not None
    has_nb = re.search('narro', textnow, re.IGNORECASE) is not None
    if has_bb and has_nb:
        return 'both broad and narrowband?'
    if has_bb:
        return 'broadband'
    if has_nb:
        return 'narrowband'
    return None
    
def set_bandwidth_from_hardware_model(row):
    """Classify bandwidth from the 'hardware_model' field of one row.

    Parameters
    ----------
    row : mapping with a 'hardware_model' key (e.g. a row handed over by
        df.apply(..., axis=1)).

    Returns
    -------
    'broadband' if the field contains 'broad' (checked first), 'narrowband'
    if it contains 'narro' or 'nb', otherwise None. Matching ignores case.
    """
    modelnow = row['hardware_model']
    # Missing values can be None, '' or NaN; NaN is a truthy float, so a plain
    # truthiness check would pass it on to re.search and raise TypeError.
    if not isinstance(modelnow, str) or not modelnow:
        return None
    if re.search('broad', modelnow, re.IGNORECASE):
        return 'broadband'
    if re.search('narro|nb', modelnow, re.IGNORECASE):
        return 'narrowband'
    return None

def set_bandwidth_from_transmit_frequency(row):
    """Classify bandwidth from the 'transmit_frequency' field of one row.

    Parameters
    ----------
    row : mapping with a 'transmit_frequency' key (e.g. a row handed over by
        df.apply(..., axis=1)).

    Returns
    -------
    'broadband' if the field contains 'broadband' (checked first),
    'narrowband' if it contains 'narro', otherwise None. Matching ignores case.
    """
    freqnow = row['transmit_frequency']
    # Missing values can be None, '' or NaN; NaN is a truthy float, so a plain
    # truthiness check would pass it on to re.search and raise TypeError.
    if not isinstance(freqnow, str) or not freqnow:
        return None
    if re.search('broadband', freqnow, re.IGNORECASE):
        return 'broadband'
    if re.search('narro', freqnow, re.IGNORECASE):
        return 'narrowband'
    return None
    
def set_final_bandwidth(row):
    """Merge the three per-field bandwidth guesses of one row into one value.

    Reads row['bw_from_comments'], row['bw_from_hardware_model'] and
    row['bw_from_transmit_frequency'].

    Priority:
    - hardware_model and transmit_frequency rank equally: if both are set
      they must agree, otherwise 'CHECK NC FILE' is returned.
    - Either of them alone outranks comments.
    - comments is used only when the other two are both None.

    Returns the chosen guess, 'CHECK NC FILE', or None if nothing is set.
    """
    from_comments = row['bw_from_comments']
    from_hardware = row['bw_from_hardware_model']
    from_frequency = row['bw_from_transmit_frequency']

    if from_hardware is not None and from_frequency is not None:
        # equal priority for bw_from_transmit_frequency and bw_from_hardware_model
        if from_hardware != from_frequency:
            return 'CHECK NC FILE'
        return from_frequency
    if from_hardware is not None:
        # prioritize bw_from_hardware_model over bw_from_comments
        return from_hardware
    if from_frequency is not None:
        # prioritize bw_from_transmit_frequency over bw_from_comments
        return from_frequency
    # Only comments can still hold a value here (it is None when nothing is set)
    return from_comments

# Fill the per-field guesses first; 'bandwidth' goes last because
# set_final_bandwidth reads the three bw_from_* columns.
for colnow, fxnnow in [('bw_from_comments', set_bandwidth_from_comments),
                       ('bw_from_hardware_model', set_bandwidth_from_hardware_model),
                       ('bw_from_transmit_frequency', set_bandwidth_from_transmit_frequency),
                       ('bandwidth', set_final_bandwidth)]:
    df[colnow] = df.apply(fxnnow, axis=1)

In [None]:
# - Testing whether this classification of bandwidth is correct/good (applicable before 1/28/20)
# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 500)
# None (rather than -1) is the supported way to switch off column-width truncation.
pd.set_option('display.max_colwidth', None)
df[df['bw_from_comments']=='narrowband'][['hardware_model','transmit_frequency','bw_from_comments','bw_from_hardware_model','bw_from_transmit_frequency','bandwidth','comments']]
# also looked at: ['bw_from_comments']=='broadband', ['bw_from_hardware_model']=='narrowband',
# ['bw_from_hardware_model']=='broadband', ['bw_from_transmit_frequency']=='narrowband',
# ['bw_from_hardware_model']=='broadband', ['bandwidth']=='CHECK NC FILE'
# --> I CHECKED ALL PRINTED OUT ROWS AND ALL GOT THE RIGHT ANSWER IN THE 
# FINAL COLUMN NAMED 'bandwidth'; ALSO no files triggered the 'CHECK NC FILE'
# designation in the 'bandwidth' column, yay!
# --> BUT THIS STILL LEAVES TOO MANY FILES UNCLASSIFIED INTO NB OR BB...
# SEE NEXT TESTING SECTION BELOW FOR FURTHER REFINEMENT.

# Testing from when:
- df bandwidth-related columns were bw_from_comments, bw_from_hardware_model, bw_from_transmit_frequency, bw_from_all_text, bandwidth (derived from 1st 3 bw columns)  
- df instrument_names were the same as in create_JASADCP_instrument_spreadsheet.ipynb

### *Purpose 1) Trying to clarify NB vs. BB when it's not immediately obvious*

#### a.) See if you can tell the difference btwn NB and BB just by looking at specific params like bin_length, depth_range, etc.  
**CONCLUSION:** can't really tell the diff btwn NB and BB from diffs in params alone

In [None]:
# - 1.) Look at broadband instruments
#df[df['bandwidth']=='broadband'] # --> too hard/many to look at

# - 2.) Look at only specific OS-75 cruises where they separated broadband and narrowband measurements
#dfnow = df.dropna(subset=['comments'])
#dfnow[dfnow['comments'].str.contains('separate set')]
# --> looks like broadband/narrowband always has bin_length, transmit_pulse_length, depth_range, num_bins, ens_avg_intvl
# as 8m/16m, 8m/16m, 25-657m/32-1136m, 80/70, 300s/300s

# - 3.) Look at instruments w/ broadband in hardware_model
#df[(df['hardware_model'].str.contains('road'))]
#df[(df['hardware_model'].str.contains('road')) & (df.index>1367)]
# --> ignore OS for now
# --> BB-150: blank = 4, bin = 8, range = 17-413/16-408, numbins = 50, ens = 300/60/180
# --> BB-150 concave: blank = 5, bin = 8, range = 30-332, numbins = 40, ens = 60

# - 4.) Look at narrowband instruments
#df[(df['instrument_name'].str.contains('VM')) & (df['bandwidth']=='narrowband')]
# --> VM-150 (NB): blank = 4, bin = 8, range = 400, numbins = 60, ens = 3min
# --> lots more, but doesn't seem any different from BB :(

#### b.) See if searching file's entire metadata for "nb150", "nb300", "bb300", "bb150" helps w/ figuring out NB vs. BB
**CONCLUSION:** Seems like looking for "nb150" in entire metadata may help

In [None]:
#df[(df['nb150'].notnull()) & ~(df['bandwidth'].notnull())
#  ][['hardware_model','nb150','nb300','bb150','bb300','bw_from_all_text','bandwidth']]
# --> lots of them

#df[(df['nb300'].notnull()) & ~(df['bandwidth'].notnull())
#  ][['hardware_model','nb150','nb300','bb150','bb300','bw_from_all_text','bandwidth']]
# --> none

#df[(df['bb300'].notnull()) & ~(df['bandwidth'].notnull())
#  ][['hardware_model','nb150','nb300','bb150','bb300','bw_from_all_text','bandwidth']]
# --> none

#df[(df['bb150'].notnull()) & ~(df['bandwidth'].notnull())
#  ][['hardware_model','nb150','nb300','bb150','bb300','bw_from_all_text','bandwidth']]
# --> none

#### c.) See if reading ACOUSTIC BEAM WIDTH from metadata helps
**CONCLUSION:** ACOUSTIC BEAM WIDTH = "unconfirmed (narrow band)" tells us NB! that's only a few files though

In [None]:
# - What does beam_width tell us about NB vs. BB?
#pd.set_option('display.max_colwidth', 200)
#print(df[df['beam_width'].notnull()]['beam_width'].unique())
#df[df['beam_width'].notnull()][['fname','beam_width','bandwidth']]
# --> "unconfirmed (narrow band)" tells us NB! that's only a few files though

# - What does beam_width = "30 degrees" tell us?
#print(len(df[df['beam_width']=='30 degrees']))
#df[df['beam_width']=='30 degrees'].drop('all_text', axis=1)
#df[df['beam_width']=='30 degrees'][['fname','comments','bandwidth']]
# --> discovered an error in these files
# --> 30 degrees should most prob be under TRANSDUCER BEAM ANGLE, NOT ACOUSTIC BEAM WIDTH

#### d.) See if using the set_bandwidth_from_all_text fxn below helps
**CONCLUSION:**
- Don't look for broadband from all_text (only 1 file comes up b/c "broadband seismograph")
- Require "narrowband" or "narrow band" in all_text (b/c get files w/ "narrow time period" or some place Narrows)
- Add ACOUSTIC BEAM WIDTH bw defn, looking for "narrow band" w/ priority above comments (only helps for 00111-00117_short.nc though)
- Add CRUISE_NAME bw defn, looking for "os38bb","os75bb","os150bb","nb150","nb300","os38nb","os75nb","os150nb" w/ priority above comments
- Add COMMENT bw defn, looking for "narrowband" (sometimes ppl erased the S from COMMENTS) w/ priority below comments
- Add all_text bw defn, looking for "300kHz broadband "Workhorse" ADCP" w/ priority below comment (no s)
- Add all_text bw defn, looking for "pingtype = bb" and "pingtype = nb" (perhaps others) w/ priority below comment (no s)
- Manually set bw to bb for 02095_short.nc and 02125_short.nc (I think they had mistakenly written nb in comments)
- After all this, ask TRDI for help; any files whose bw is still unclassified will be assigned the majority bw classification of the corresponding instrument.

In [None]:
# fxns set_bandwidth_from_comments, set_bandwidth_from_hardware_model,
# set_bandwidth_from_transmit_frequency, set_final_bandwidth
# all same as in testing section above

def set_bandwidth_from_all_text(row):
    """Classify bandwidth by keyword search over the 'all_text' field of one row.

    Parameters
    ----------
    row : mapping with an 'all_text' key (e.g. a row handed over by
        df.apply(..., axis=1)).

    Returns
    -------
    'both broad and narrowband?' if both keyword groups occur, 'broadband',
    'narrowband', or None when the field is missing or has no keyword.
    Matching ignores case.
    """
    textnow = row['all_text']
    # Missing values can be None, '' or NaN; NaN is a truthy float, so a plain
    # truthiness check would pass it on to re.search and raise TypeError.
    if not isinstance(textnow, str) or not textnow:
        return None
    # I use "broadband" instead of "broad" here b/c we have some cruise comments that say "broad-scale"
    # 'broad ?band' covers both "broadband" and "broad band"
    has_bb = re.search('broad ?band', textnow, re.IGNORECASE) is not None
    # narrowband keywords: 'narro' or one of nb75, nb150, nb300, nb600, nb1200
    has_nb = re.search('narro|nb(?:75|150|300|600|1200)', textnow, re.IGNORECASE) is not None
    if has_bb and has_nb:
        return 'both broad and narrowband?'
    if has_bb:
        return 'broadband'
    if has_nb:
        return 'narrowband'
    return None

# Fill the per-field guesses first; 'bandwidth' goes last because
# set_final_bandwidth reads the bw_from_* columns.
for colnow, fxnnow in [('bw_from_all_text', set_bandwidth_from_all_text),
                       ('bw_from_comments', set_bandwidth_from_comments),
                       ('bw_from_hardware_model', set_bandwidth_from_hardware_model),
                       ('bw_from_transmit_frequency', set_bandwidth_from_transmit_frequency),
                       ('bandwidth', set_final_bandwidth)]:
    df[colnow] = df.apply(fxnnow, axis=1)

In [None]:
# ------ Looking for "broadband" in bw_from_all_text column
# ***** = first mention of this technique

#--> 1 total
# The following info from manual reading of nc file metadata:
#--> 00595_short.nc came up only b/c of "broadband seismograph". Not real :(
#    ...*****don't look for broadband from all_text

# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_colwidth', 20)
# Rows that only the all_text search classified (as broadband): the three
# field-specific bw columns are all null.
dfnow = df[(df['bw_from_all_text']=='broadband') &
           (df['bw_from_hardware_model'].isnull()) &
           (df['bw_from_transmit_frequency'].isnull()) &
           (df['bw_from_comments'].isnull())
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
print(len(dfnow))
dfnow

In [None]:
# ------ Looking for "narrowband" in bw_from_all_text column
# ***** = first mention of this technique

# --> 174 total
# The following info from manual reading of nc file metadata:
# --> 00014_short.nc not narrowband, just had the phrase narrow time period
#     ...*****require "narrowband" or "narrow band" in all_text
# --> 00090-00095_short not narrowband, just had some place Narrows 
#     ...require "narrowband" or "narrow band" in all_text
# --> 00111-00117_short IS narrowband, VM150, "narrow band" from ACOUSTIC BEAM WIDTH so added beam_width.notnull()
#     ...the above is not in this set anymore then (now will be <174 total)
#     ...*****add beam_width as criteria looking for "narrow band" w/ priority above comments and all_text (only matters for 00111-00117_short.nc)
# --> 00544-00545_short not narrowband, narrow channel/narrowly-spaced
#     ...require "narrowband" or "narrow band" in all_text
# --> 00803,5_short IS narrowband, NB150 in next-line comments, nb150 from CRUISE_NAME
#     ...*****use CRUISE_NAME looking for "nb150"
# --> 00900-00923_short not narrowband, just had some place Narrows and maybe other phrases
#     ...require "narrowband" or "narrow band" in all_text
# --> 01285,7,9_short IS narrowband, NB150 in next-line comments, nb150 from CRUISE_NAME
#     ...use CRUISE_NAME looking for "nb150"
# --> 01340-01435(skipping every other ish)_short IS narrowband, NB150 in next-line comments, nb150 from CRUISE_NAME
#     ...use CRUISE_NAME looking for "nb150"
# --> 01527-8_short IS narrowband, nb150 from CRUISE_NAME
#     ...use CRUISE_NAME looking for "nb150"
# --> 01567_short IS narrowband, nb150 from CRUISE_NAME
#     ...use CRUISE_NAME looking for "nb150"
# --> 01574-01760(skipping every other ish)_short IS narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 01814-6_short IS narrowband, "narrowband mode" from COMMENT (no S! UGH)
#     ...*****use COMMENT like COMMENTS looking for "narrowband"
# --> 01987-02159(skipping every other ish)_short IS narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 02161_short IS narrowband, "75 kHz narrowband" from COMMENT (no S! UGH) 
#     ...use COMMENT like COMMENTS looking for "narrowband"
# --> 02172-02253(skipping every other ish)_short IS narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 02355-68_short not narrowband, just had some place Narrows 
#     ...require "narrowband" or "narrow band" in all_text
# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_colwidth', 20)
# Rows that only the all_text search classified (as narrowband): the three
# field-specific bw columns are all null.
dfnow = df[(df['bw_from_all_text']=='narrowband') &
           (df['bw_from_hardware_model'].isnull()) &
           (df['bw_from_transmit_frequency'].isnull()) &
           (df['bw_from_comments'].isnull()) #&
           #(df['beam_width'].str.contains('narrow band'))
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
print(len(dfnow))
dfnow

In [None]:
# ------ Looking for "both broad and narrowband?" in bw_from_all_text column
# ***** = first mention of this technique

# --> 143 total
# The following info from manual reading of nc file metadata:
# --> 00925 = implies broadband?? or maybe there was only one bandwidth avail then? HOT, WH300, 2005
#     ..."Broadband gives greater resolution but less depth penetration and narrowband visa versa. The Workhorse 300 allows the highest resolution."
#     ...also b/c it says "ADCP data from the RDI OS38 Broadband are found in SAC ID 00924."
# --> 00927 = narrowband?? or maybe there was only one bandwidth avail then? HOT, WH300, 2005
#     ...b/c it says "ADCP data from the RDI OS38 narrowband are found in SAC ID 00926."
#     ...but also says the same 1st quote as above
# --> 00930 = broadband?? HOT, WH300, 2005
#     ...it says "ADCP data from the RDI OS38 broadband (SAC ID 00928) and narrowband (SAC ID 00929) are also available for this cruise."
#     ...but also says the same 1st quote as above
# --> 00945 = broadband?? WH300, 2005
#     ...it says "Narrowband mode (SAC ID 00944) and the broadband (SAC 00943) mode are inter-leaved from the OS38 instrument.  This cruise also includes ADCP data from the Workhorse 300, which is in SAC ID 00945."
#     ...but also says the same 1st quote as above
# --> 00948,82,85 = broadband?? same as above
# --> 01087-96(skipping every other ish) = narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 01137-91(skipping every other ish) = broadband?? HOT, WH300, 2006-7
# --> 01247 = broadband?? not HOT, but same as all above, WH300, 2005
# --> 01250-61(skipping every other ish) = broadband?? HOT, WH300, 2005
# --> 01264 = broadband?? not HOT but in Pac, same as all above, WH300, 2006
# --> 01267-70 = broadband?? not HOT but in Pac, same as all above, WH300, 2007
# --> 01273-6 = broadband?? not HOT but in Pac, same as all above, WH300, 2008
# --> 01291 = narrowband
#     ...use CRUISE_NAME looking for "nb150"
# --> 01325-01430(skipping every other ish) = narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 01447 = broadband! NPSG, WH300, 2008, "This cruise has two instruments, 38kHz phased array ("Ocean Surveyor") capable of pinging in broadband or narrowband (or interleaved) modes, and a 300kHz broadband "Workhorse" ADCP." in comments. "The Workhorse 300 has the highest time- and depth- resolution, but rarely reaches deeper than 80m." in comments.
#     ...*****needs manual specification! Or searching for "300kHz broadband "Workhorse" ADCP"
# --> 01452-01664,01721,01780-02098,02122,02348(skipping every other ish) = broadband! same as above, HOT and others, WH300, 2008-14
#     ...needs manual specification! Or searching for "300kHz broadband "Workhorse" ADCP"
# --> 01688,01737,02106,02180 = narrowband, nb150 from CRUISE_NAME, NB150 in next-line comments and ATTACHMENT TO THE HULL
#     ...use CRUISE_NAME looking for "nb150"
# --> 02127 = narrowband! "These datasets NB150 and OS75 narrowband have NODC SAC IDs 02126 and 02127, respectively."
#     ...use COMMENT like COMMENTS looking for "narrowband"
#     ...*****OR use CRUISE_NAME looking for "os75nb"
# --> 02384,7 = broadband! WH300, 2018, 1 comment said: "was a Sentinel model?" lol
#     ...*****search for "pingtype = bb"
# --> IN SUM, ALL WH300 WITH "both broad and narrowband?" STATUS IN bw_from_all_text CAN BE FILLED IN TO BE BROADBAND
# (After 01447, this is for sure. Before 01447, is highly likely to be a good assumption.)
# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_colwidth', 20)
# Rows that only the all_text search classified (as "both"): the three
# field-specific bw columns are all null.
# NOTE(review): the df.index>2261 term restricts the output to rows past index
# 2261, so the printed length is not the full total quoted in the notes above;
# presumably left over from paging through the results -- confirm.
dfnow = df[(df['bw_from_all_text']=='both broad and narrowband?') &
           (df['bw_from_hardware_model'].isnull()) &
           (df['bw_from_transmit_frequency'].isnull()) &
           (df['bw_from_comments'].isnull()) &
           (df.index>2261)
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
print(len(dfnow))
dfnow

In [None]:
# ------ Looking for None in bw_from_all_text column

# --> 703 total
# --> THESE ARE A PROBLEM AND WILL REQUIRE ASSUMPTIONS
# BASED ON WHAT'S MOST LIKELY, BUT LET'S SEE HOW MANY
# OF THESE CAN BE TAKEN CARE OF BY ADDING IN THE NEW
# METHODS OF CLASSIFICATION BASED ON all_text SUGGESTED
# IN THE 3 CELLS ABOVE
# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_colwidth', 20)
# Rows that none of the four classifiers could label (all bw_from_* columns null)
dfnow = df[(df['bw_from_all_text'].isnull()) &
           (df['bw_from_hardware_model'].isnull()) &
           (df['bw_from_transmit_frequency'].isnull()) &
           (df['bw_from_comments'].isnull())
  ][['hardware_model','transmit_frequency','fname','cruise_beg_date','beam_width',
     'bw_from_all_text','bw_from_comments','bw_from_hardware_model',
     'bw_from_transmit_frequency','bandwidth','comments']]
# How many are left, which hardware models they carry, and their date span
print(len(dfnow))
print(dfnow['hardware_model'].unique())
print(dfnow['cruise_beg_date'].min(),dfnow['cruise_beg_date'].max())

In [None]:
# - What values of cruise_name should be searched for?
# --> 'os38bb','os75bb','os150bb','nb150','nb300','os38nb','os75nb','os150nb'

# Fully qualified option name: the bare 'max_rows' pattern is ambiguous in newer
# pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 2200)
cn = pd.Series(df['cruise_name'].unique())
# na=False: a missing cruise_name would otherwise yield NaN in the mask, which
# cannot be used for boolean indexing.
cn_bb = cn[cn.str.contains('bb', na=False)]
print(len(cn_bb))
# Only a cell's last expression is rendered automatically, so show this one explicitly
display(cn_bb)
# --> look for os38bb, os75bb, os150bb 
cn_nb = cn[cn.str.contains('nb', na=False)]
print(len(cn_nb))
cn_nb
# --> look for nb150, os75nb, os38nb, nb300, os150nb

#### e.) See if all classified WH-300 are broadband (therefore, any WH-300 w/ bandwidth missing can probably be assumed to be broadband)
**CONCLUSION:** WH-300 w/ defined bw are all pretty much bb. MANUALLY OVERWRITE BANDWIDTH FOR THE 2 WEIRD FILES (or just make all WH-300 bb, we'll see).

In [None]:
# - Testing the idea that WH-300 is ALL broadband
is_wh300 = df['instrument_name']=='Workhorse-300'
bw_cols = ['fname','bw_from_all_text','bw_from_comments','bw_from_hardware_model','bw_from_transmit_frequency','bandwidth']
#df[is_wh300][bw_cols]
#df[is_wh300 & (df['bandwidth'].notnull())][bw_cols]
# Number of WH-300 rows that already have a bandwidth
print(len(df[is_wh300 & (df['bandwidth'].notnull())][bw_cols]))
# --> only 2 out of 42 w/ bandwidth defined are narrowband (there are of course lots of None bandwidth though),
# BUT they are problematic nc files: (02095_short.nc, 02125_short.nc) say in one comments
# section that WH-300 is narrowband and in another that it's broadband, but I think they made a mistake
# saying it's narrowband; I also checked for pingtype = bb in the post cruise processing notes; these 2 files had that!
# I think that means they are both broadband and/or that was at least assumed in the post cruise processing
# --> *****MANUALLY OVERWRITE BANDWIDTH FOR THESE 2 FILES (or just make all WH-300 bb, we'll see)
# --> OR use the search for 'pingtype = bb' in all_text method w/ priority above comments-derived bw
bw_missing = df['bandwidth'].isnull()
all_text_bw_missing = df['bw_from_all_text'].isnull()
# WH-300 rows with neither a final bandwidth nor an all_text-derived one
print(len(df[is_wh300 & bw_missing & all_text_bw_missing]))
#df[is_wh300 & bw_missing & all_text_bw_missing][bw_cols]
mentions_pingtype = df['all_text'].str.contains('pingtype') # SAME RESULTS IF: ...contains('pingtype = bb')
# ...of those, the ones whose all_text mentions pingtype
print(len(df[is_wh300 & bw_missing & all_text_bw_missing & mentions_pingtype]
          [['fname','instrument_name','bandwidth']]))
# WH-300 rows w/o a final bandwidth whose all_text mentions pingtype
print(len(df[is_wh300 & bw_missing & mentions_pingtype]
          [['fname','instrument_name','bandwidth']]))
# --> pingtype may help define 4 out of 18 more undefined bandwidth WH-300s 
# --> pingtype may help define bandwidths for 95 WH-300, whether bw is already defined or not 

# - How many undefined bw files can pingtype potentially help define, WH-300 or not?
print(len(df[bw_missing & all_text_bw_missing & mentions_pingtype]))
#df[bw_missing & all_text_bw_missing & mentions_pingtype]
# --> 4 are WH-300 as we saw above, w/ the 1 new WH-Marine-600

# OLD

In [None]:
#df[(df['hardware_model']=='Narrowband') & (df['transmit_frequency']=='153.6 kHz')]
#df[(df['hardware_model']=='RD-VM')]
#df.iloc[292]['cruise_beg_date']>pd.Timestamp(1991,6,1)
#df[df['fname']=='00400_short.nc']
#df[357:367][['fname','hardware_model','instrument_name',
#             'serial_numbers','transducer_config','cruise_beg_date',
#             'platform_name']]
#print(len(df[(df['hardware_model']=='150') & (df['transducer_config']=='JANUS CONCAVE')][['fname','platform_name','transducer_config','cruise_beg_date']]))
#df[(df['instrument_name'].str.contains('OS')) & (df['transducer_config'].notnull())][['instrument_name','fname','platform_name','transducer_config','cruise_beg_date']]
#df[(df['hardware_model']=='153.6 kHz hull mounted ADCP')][['fname','platform_name','transmit_frequency','transducer_config','cruise_beg_date']]
#df[(df['hardware_model']=='RDI')]
#df[(df['platform_name']=='R/V Charles Darwin')][['fname','platform_name','transmit_frequency','transducer_config','cruise_beg_date']]
# assume vm from metadata for 00015