In [6]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import xarray as xr

In [7]:
# - Set data path
# Root directory of the input data. The trailing slash matters: later cells build
# file paths by plain string concatenation (e.g. dpath + 'JASADCP/ncfiles').
dpath = '/opt/acoustic-variability/data/'

In [8]:
# - Set saved out figure path
# Absolute directory (with trailing slash) meant for figures written by this notebook.
figpath = '/opt/acoustic-variability/python/figures/'

# Compile JASADCP metadata

### Read JASADCP metadata into df

In [9]:
# List the JASADCP netCDF files in a reproducible (sorted) order; positions in
# this list are reused later as row indices of the metadata table.
# Only *.nc entries are kept so that stray directory entries (hidden files,
# logs, ...) are never handed to xr.open_dataset.
fnames = sorted(
    entry for entry in os.listdir(dpath + 'JASADCP/ncfiles')
    if entry.endswith('.nc')
)

In [10]:
# One slot per netCDF file for every metadata field. Each of the eleven lists is
# an independent object, and a slot stays None until a value is stored for that file.
nc_counter = len(fnames)
(hardware_model, serial_numbers, transmit_frequency, phased_array,
 cruise_beg_date, blanking_interval, bin_length, transducer_beam_angle,
 transmit_pulse_length, comments, biomass_dtmn) = (
    [None] * nc_counter for _ in range(11)
)

In [11]:
# Parse the free-text 'cruise_sonar_summary' attribute of every JASADCP file and
# fill the per-file metadata lists. Entries for which nothing is found keep the
# value they already had (None after initialisation).

# Labelled fields are matched as "LABEL : value", where the captured value is a
# run of non-space tokens separated by single spaces. Raw string, so that \S is
# passed to the regex engine instead of being read as a (invalid) string escape.
FIELD_VALUE_PATTERN = r" *: *((?:\S+ )*\S+)"


def find_field_values(label, text):
    """Return every value recorded for `label` in `text` (empty list if none)."""
    return re.findall(label + FIELD_VALUE_PATTERN, text)


# Fields for which only the first occurrence is kept, paired with the list to fill.
first_match_fields = [
    ('SERIAL NUMBERS', serial_numbers),                 # 2.)
    ('TRANSMIT FREQUENCY', transmit_frequency),         # 3.)
    ('BLANKING INTERVAL', blanking_interval),           # 6.)
    ('BIN LENGTH', bin_length),                         # 7.)
    ('TRANSDUCER BEAM ANGLE', transducer_beam_angle),   # 8.)
    ('TRANSMIT PULSE LENGTH', transmit_pulse_length),   # 9.)
    ('BIOMASS DETERMINATION', biomass_dtmn),            # 11.)
]
sep = '///'  # joins multiple occurrences of a field within one summary

for ifile, fname in enumerate(fnames):
    ncfile = dpath + 'JASADCP/ncfiles/' + fname
    # The context manager closes each file once its values have been read, so
    # the loop does not keep one open handle per file.
    with xr.open_dataset(ncfile) as ncnow:
        strnow = ncnow.attrs['cruise_sonar_summary']
        # 5.) cruise_beg_date: first time stamp, read while the file is open
        cruise_beg_date[ifile] = ncnow['time'][0].values
    # 1.) hardware_model, falling back to MANUFACTURER if HARDWARE MODEL has no match
    hardware_modelnow = (find_field_values('HARDWARE MODEL', strnow)
                         or find_field_values('MANUFACTURER', strnow))
    if hardware_modelnow:
        hardware_model[ifile] = hardware_modelnow[0]
    # 2.), 3.), 6.)-9.), 11.) first occurrence of each labelled field
    for label, target in first_match_fields:
        valuesnow = find_field_values(label, strnow)
        if valuesnow:
            target[ifile] = valuesnow[0]
    # 4.) phased_array: every case-insensitive 'phased.array' hit, joined by sep
    phased_arraynow = re.findall("phased.array", strnow, re.IGNORECASE)
    if phased_arraynow:
        phased_array[ifile] = sep.join(phased_arraynow)
    # 10.) comments: every COMMENTS entry, joined by sep
    commentsnow = find_field_values('COMMENTS', strnow)
    if commentsnow:
        comments[ifile] = sep.join(commentsnow)

In [12]:
# Assemble the per-file lists into one table: one named column per metadata
# field, one row per entry of the lists.
# NOTE(review): biomass_dtmn is collected by the parsing loop but is not part of
# this table -- confirm whether that is intentional.
column_values = [
    ('hardware_model', hardware_model),
    ('serial_numbers', serial_numbers),
    ('transmit_frequency', transmit_frequency),
    ('phased_array', phased_array),
    ('cruise_beg_date', cruise_beg_date),
    ('blanking_interval', blanking_interval),
    ('bin_length', bin_length),
    ('transducer_beam_angle', transducer_beam_angle),
    ('transmit_pulse_length', transmit_pulse_length),
    ('comments', comments),
]
df = pd.concat(
    [pd.Series(values, name=column) for column, values in column_values],
    axis=1)

In [13]:
# - Define functions that infer the bandwidth from comments and hardware_model

# --> Checks for hardware_model names containing NB = narrowband
#dfnow = df.dropna(subset=['hardware_model'])
#dfnow[dfnow['hardware_model'].str.contains('NB')]['hardware_model'].unique()
# --> Results are:
# array(['NB 150 (VM-150-18HP)', 'NB 150', 'VM-150 (NB)'], dtype=object)

def set_bandwidth_from_comments(row):
    """Classify a row's bandwidth from keywords in its 'comments' entry.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a 'comments' entry.

    Returns
    -------
    str
        'both broad and narrow?'       -- 'broad' and 'narro' both occur
        'both broad and phased.array?' -- 'broad' and 'phased.array' both occur
        'broad'                        -- 'broad' occurs without the others
        'narrow'                       -- 'narro' or 'phased.array' occurs
        'unknown'                      -- no keyword found, or no comment text

    All keyword searches ignore case; 'narro' is a prefix search, so it also
    matches spellings such as 'narroband'.
    """
    text = row['comments']
    # Missing entries may be None, NaN or pd.NA depending on the column dtype.
    # NaN is truthy (it would reach re.search and raise TypeError) and pd.NA
    # cannot be truth-tested at all, so only non-empty strings are searched.
    if not isinstance(text, str) or not text:
        return 'unknown'

    def mentions(keyword):
        return re.search(keyword, text, re.IGNORECASE) is not None

    has_broad = mentions('broad')
    has_narrow = mentions('narro')
    has_phased = mentions('phased.array')

    if has_broad and has_narrow:
        return 'both broad and narrow?'
    if has_broad and has_phased:
        return 'both broad and phased.array?'
    if has_broad:
        return 'broad'
    if has_narrow or has_phased:
        return 'narrow'
    return 'unknown'

def set_bandwidth_from_hardware_model(row):
    """Classify a row's bandwidth from keywords in its 'hardware_model' entry.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a 'hardware_model' entry.

    Returns
    -------
    str
        'broad'   -- 'broad' occurs in the name
        'narrow'  -- otherwise, if 'narro', 'nb' or 'phased.array' occurs
        'unknown' -- no keyword found, or no model name

    All keyword searches ignore case and match anywhere in the name.
    """
    model = row['hardware_model']
    # Missing entries may be None, NaN or pd.NA depending on the column dtype.
    # NaN is truthy (it would reach re.search and raise TypeError) and pd.NA
    # cannot be truth-tested at all, so only non-empty strings are searched.
    if not isinstance(model, str) or not model:
        return 'unknown'

    # 'broad' takes precedence over every narrowband keyword.
    if re.search('broad', model, re.IGNORECASE):
        return 'broad'
    narrow_keywords = ('narro', 'nb', 'phased.array')
    if any(re.search(keyword, model, re.IGNORECASE) for keyword in narrow_keywords):
        return 'narrow'
    return 'unknown'

In [14]:
# Add one bandwidth-label column per classifier, evaluated row by row.
for new_column, classifier in (
        ('bw_from_comments', set_bandwidth_from_comments),
        ('bw_from_hardware_model', set_bandwidth_from_hardware_model)):
    df[new_column] = df.apply(classifier, axis=1)

In [15]:
# Optional display tweaks, left disabled (more rows / untruncated column text):
#pd.set_option('max_rows', 100)
#pd.set_option('display.max_colwidth', -1)
# Preview the first rows of the metadata table, including the two bandwidth columns.
df.head()

Unnamed: 0,hardware_model,serial_numbers,transmit_frequency,phased_array,cruise_beg_date,blanking_interval,bin_length,transducer_beam_angle,transmit_pulse_length,comments,bw_from_comments,bw_from_hardware_model
0,RD-VM150,,150 kHz except,,1992-11-08 02:21:12.000000000,4 m,8 m,30 deg,8 m,very little bottom tracking///At several point...,unknown,unknown
1,RD-VM150,,,,1992-12-12 02:18:41.000000000,4 m,8 m,,8 m,,unknown,unknown
2,RD-VM150,,,,1993-01-23 09:47:09.000000000,4 m,8 m,,16 m,,unknown,unknown
3,RD-VM150,,153 kHz,,1992-10-27 23:36:48.999999996,4 m,8 m,30 deg.,16 m,data quality good,unknown,unknown
4,RD-VM0150,,153 kHz,,1992-12-02 07:00:53.999999996,4m,8m,30 deg.,8m,,unknown,unknown


### Uniformly rename hardware_model names

In [16]:
# Save out all unique hardware_model names + # of occurrences
df['hardware_model'].value_counts().to_csv('jasadcp_unique_instruments.csv', header=['hardware_model count'])

**All names to become 'Ocean Surveyor 38'**:  
'Ocean Surveyer 38', 'Ocean Surveyor 38'

**All names to become 'Ocean Surveyor 75'**:  
'Ocean Surveyor 75', 'OS75 narrowband', 'Ocean Surveyer 75', 'Ocean Surveyor 75 narrowband', 'OS75 (Ocean Surveyor)', 'Ocean Surveyor 75 broadband', 'Ocean Surveyor OS75', 'OS75', 'Ocean Surveryor 75', 'Ocean Surveyor 75 narroband', 'Ocean Surveyor 75 Broadband', 'Ocean Surveyor 75 Narrowband', '75KHz Ocean Surveyor narrowband', 'Ocean Surveyor 75 kHz', 'RDI 75KHz Ocean Surveyor', 'Ocean Surveyor 75 kHz Phased Array', '75KHz Ocean Surveyor'

**All names to become 'Ocean Surveyor 150'**:  
'Ocean Surveyer 150', 'Ocean Surveryor 150', 'Ocean Surveyor 150 narroband', 'Ocean Surveyor 150 narrowband', 'Ocean Surveyor 150 broadband'

**All names to become 'Ocean Surveyor II 38'**:  
'Ocean Surveyor II (OS-II 38)'

**All names to become 'Ocean Surveyor II 75'**:  
'OSII75S phased-array'

**All names to become 'VM-75'**:  
'VM75 narrowband'

**All names to become 'VM-150'**:  
'VM-150', 'RD-VM150', 'VM-150 Narrowband', 'RD-VM150 Narrow band', 'VM-150 (NB)', 'VM-150 narrowband', 'RDI VM150 narrowband', 'VM150', 'RD-VM150 narrowband', 'RD-VM0150'

**All names to become 'VM-300'**:  
'VM-300', 'RD-VM300'

**All names to become 'VM-150-18HP'**:  
'NB 150 (VM-150-18HP)'

**All names to become 'Workhorse 300'**:  
'WorkHorse 300', 'Workhorse 300', 'Workhorse 300; 300 kHz' 

**All names to become 'Workhorse 1200'**:  
'Workhorse 1200'

**All names to become 'Workhorse Mariner 300'**:  
'Workhorse Mariner (300 kHz)'

**All names to become 'Workhorse Mariner 600'**:  
'WorkHorse Mariner 600'

**All names to become 'DCP4400A'**:  
'DCP4400A'

**All unknown**:  
'Narrowband 150', 'NB 150', 'VM-150 and VM-300', '150', 'narrowband 75 kHz', 'RDI', '150 narrowband', '150 kHz Narrowband', 'Broadband 150', 'Broad Band 150', 'RD-VM', '150 kHz hull mounted ADCP', 'Narrowband', 'Narrowband 300', '150 kHz', 'Direct-Read 150 kHz Narrowband', 'Vessel-mounted 150 kHz Narrowband', 'Narrow Band 150Khz', 'Vessel-mount 150 kHz Narrowband', '1) Narrow Band 150 kHz', '153.6 kHz hull mounted ADCP', '300 narrow band', '150 kHz broadband', '150 kHz narrow band', '"150 broad band, concave"', 'Vessel-Mount 150 kHz Narrowband'

### Dealing with the unknown hardware_model names

In [28]:
# Find the files behind one ambiguous hardware_model name so that their metadata
# can be inspected by hand (print fnamesnow, then run `ncdump -h` on tern).
# --> Looking at a number of these files did not reveal any further instrument info :(
is_ambiguous = df['hardware_model'] == 'VM-150 and VM-300'
idxsnow = df.index[is_ambiguous].tolist()
fnamesnow = [fnames[row_idx] for row_idx in idxsnow]
fnamesnow

#### Let's just try to make another spreadsheet for RDI that has some extra info. Maybe they can figure out which instrument it is from this extra info.

**Info to add when the hardware model is unknown:**  
'CHIEF SCIENTIST ON SHIP', 'PERSONNEL IN CHARGE',
'MANUFACTURER', 'HARDWARE MODEL' (already in df), 'SERIAL NUMBERS' (already in df),  
'TRANSMIT FREQUENCY' (already in df), 'TRANSDUCER CONFIGURATION',  
'DEPTH RANGE', 'BIN LENGTH' (already in df), 'NUMBER OF BINS',  
'TRANSMIT PULSE LENGTH' (already in df), 'BLANKING INTERVAL' (already in df), 'ENSEMBLE AVERAGING INTERVAL'

In [None]:
# Write the metadata table to CSV (relative path, i.e. the current working directory).
# NOTE(review): the extra fields listed in the markdown above (chief scientist,
# transducer configuration, ...) are not added to df in the cells shown here.
df.to_csv('jasadcp_metadata_for_TRDI.csv')

In [None]:
# Inspect the column dtypes of the metadata table.
df.dtypes # datetime64 doesn't go to csv right (author's note -- TODO confirm what exactly goes wrong)

# Compile TAO mooring metadata

In [None]:
# List the entries of the TAO/NDBC netCDF directory (unsorted, as returned by the OS).
os.listdir(dpath + 'TAO_NDBC/ncfiles')

In [None]:
# Open one TAO ADCP file for inspection. The dataset is left open on purpose:
# the following cells read variables and attributes from nct.
ncfile = dpath + 'TAO_NDBC/ncfiles/TAO_T0N170W_KA019-20151201_D_ADCP.nc'
nct = xr.open_dataset(ncfile)

In [None]:
# Display the INTENSITY variable (dimensions, coordinates, attributes).
nct['INTENSITY']

In [None]:
# Show every attribute of ADCP_CONFIG; the commented line picks out a single
# attribute instead.
#nct['ADCP_CONFIG'].attrs['model_name']
nct['ADCP_CONFIG'].attrs

In [None]:
# Quick-look plot of INTENSITY using xarray's default plot type.
nct['INTENSITY'].plot()

# TESTING/OLD

In [None]:
# Scratch cell: try the metadata regexes on a single file.
#ncfile = dpath + 'JASADCP/ncfiles/01305_short.nc'
#ncfile = dpath + 'JASADCP/ncfiles/00200_short.nc'
ncfile = dpath + 'JASADCP/ncfiles/02000_short.nc'
ncj = xr.open_dataset(ncfile)
string = ncj.attrs['cruise_sonar_summary']

# USE THIS ONE!!!
# hardware_model, serial_numbers, transmit_frequency, phased_array,
# cruise_beg_date, blanking_interval, bin_length, transducer_beam_angle,
# transmit_pulse_length, ...C (from Mullison 2017 Table 2),
# transmit_power (from Mullison 2017 Table 2)??
#
# Results go into *_test names so that running this cell does not overwrite the
# per-file lists hardware_model / blanking_interval / phased_array /
# cruise_beg_date, which other cells index by file position.
# Raw strings keep \w from being read as a (invalid) string escape.
hardware_model_test = re.findall(r"HARDWARE MODEL *: *((?:\w+ )*\w+)", string)
blanking_interval_test = re.findall(r"BLANKING INTERVAL *: *((?:\w+ )*\w+)", string)
# Not every summary mentions a phased array; fall back to None instead of
# raising IndexError on an empty match list.
phased_array_matches = re.findall("(phased array)", string, re.IGNORECASE)
phased_array_test = phased_array_matches[0] if phased_array_matches else None
cruise_beg_date_test = ncj.time[0]
# \w = [A-Za-z0-9_] (for str patterns it also matches other Unicode word characters)

# Show the summary line by line (as the last expression, so it is displayed).
string.split('\n')

In [None]:
# Print hardware_model for the rows whose phased_array entry equals 'phased array'.
# NOTE(review): phased_array stores case-insensitive 'phased.array' matches joined
# by '///', so other spellings/cases (e.g. 'Phased Array', 'phased-array') and
# repeated matches are not selected by this exact comparison -- confirm intended.
print(df[df['phased_array']=='phased array']['hardware_model'].to_string())

In [None]:
# Rebuild the list of netCDF file names, keeping only *.nc entries.
# The result is sorted because os.listdir returns entries in arbitrary order,
# while positions in fnames are used elsewhere in this notebook as row indices
# into df (which was built from a sorted listing).
fnames = sorted(
    entry for entry in os.listdir(dpath + 'JASADCP/ncfiles/')
    if entry.endswith('.nc')
)

In [None]:
# Look up the file name stored at one list position.
fnames[2236]

In [None]:
# Positions whose hardware_model entry is empty or None.
[position for position, model in enumerate(hardware_model) if not model]

In [None]:
# hardware_model troubleshooting
# 5, 10, 11
# 00573_short.nc - NB 150 (VM-150-18HP)
# 00139_short.nc - VM-150 
# 01305_short.nc - RD-VM150 Narrow band 
# 43
# 00726_short.nc - blank with name under MANUFACTURER 

In [None]:
# Open a single JASADCP file and display its cruise_sonar_summary line by line.
ncfile = dpath + 'JASADCP/ncfiles/01872_short.nc'
ncj = xr.open_dataset(ncfile)
ncj.attrs['cruise_sonar_summary'].split('\n')