# Using EcoFOCIpy to process raw field data

## WE1008

## Nutrient Data with WOCE columns for historic datasets.

This is a streamlined version of generation routines to merge bottle data and Mordy Nut. Lab Nutrient Data for long term archive.

Outputs will be CF-convention NetCDF files for ERDDAP, but the inputs may need to be read from btl/EPIC NetCDF files in the historic ecoraid archive.

<div class="warning" style='background-color:#E9D8FD; color: #69337A; border-left: solid #805AD5 4px; border-radius: 4px; padding:0.7em;'>
<span>
<p style='margin-top:1em; text-align:center'>
<b>A template for Nutrient Lab ASCII files to NETCDF</b></p>
<p style='margin-left:1em;'>
Populate the necessary paths in the following cells.</p>
<p style='margin-bottom:1em; margin-right:1em; text-align:right; font-family:Georgia'> <b>- Shaun Bell</b>
</p></span>
</div>


In [1]:
import yaml
import glob
import pandas as pd
import os
import xarray as xa

import EcoFOCIpy.io.sbe_ctd_parser as sbe_ctd_parser #<- instrument specific
import EcoFOCIpy.io.ncCFsave as ncCFsave
import EcoFOCIpy.metaconfig.load_config as load_config

In [2]:
# Base locations: the cruise's CTD cast directory and the local EcoFOCIpy checkout.
# Later cells concatenate relative file locations onto these two prefixes,
# so both must end with a trailing slash.
sample_data_dir = "/Users/bell/ecoraid/2010/CTDcasts/we1008/"  # root path to cruise directory
ecofocipy_dir = "/Users/bell/Programs/EcoFOCIpy/"

In [3]:
###############################################################
# edit to point to {cruise specific} raw datafiles
# Directory holding the per-cast bottle NetCDF files; every '*.nc' file in it is processed.
# Built from sample_data_dir so that moving the cruise directory only needs one edit.
# (alternative location, not used for this cruise: sample_data_dir+'rawconverted/')
datafile = sample_data_dir+'final_data/btl/' #<- point to cruise and process all files within
nutdatafile = sample_data_dir+'working/DiscreteNutrients/WE1008 Nutrient Data.txt' #<- tab-delimited nutrient lab file for this cruise
cruise_name = 'we1008' #no hyphens
cruise_meta_file = sample_data_dir+'logs/WE1008.yaml'
inst_meta_file = sample_data_dir+'logs/FOCI_standard_CTDpNutsWOCE.yaml' #<- copy to each deployment for simplicity?
group_meta_file = ecofocipy_dir+'staticdata/institutional_meta_example.yaml'
###############################################################
#init and load data
# sorted() makes the processing order deterministic (glob order is filesystem dependent)
filename_list = sorted(glob.glob(datafile + '*.nc'))

In [4]:
# Load every bottle file into memory, keyed by a cast label such as 'ctd001.btl'.
cruise_data = {}

for btl_path in filename_list:
    # Build the label from the file name only ('w1008bc001_btl.nc' -> 'ctd001.btl').
    # Parsing the full path would break as soon as a directory name contains a '.' or
    # the archive layout changes.
    btl_name = os.path.basename(btl_path)
    cast = 'ctd' + btl_name.split('.')[0].split('c')[-1].replace('_', '.')
    # glob already returned the complete path, so load it directly instead of
    # re-assembling it from `datafile`. decode_times=False leaves the time
    # variables undecoded (as stored in the file).
    cruise_data[cast] = xa.load_dataset(btl_path, decode_times=False)



In [5]:
# Display the sorted list of bottle files that the glob found, as a sanity check.
filename_list

['/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc001_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc002_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc003_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc004_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc005_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc006_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc007_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc008_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc009_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc010_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc011_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008bc012_btl.nc',
 '/Users/bell/ecoraid/2010/CTDcasts/we1008/final_data/btl/w1008b

In [6]:
# Spot-check one cast: flatten its dataset to a table to see the available columns.
cruise_data['ctd053.btl'].to_dataframe().reset_index()

Unnamed: 0,time,dep,lat,lon,time2,BTL_103,S_41,S_42,O_65,OST_62,...,PAR_916,Chl_933,PAR_905,ATTN_55,Tr_904,PO4_186,SI_188,NO3_182,NO2_184,NH4_189
0,2455436,1.536,59.107334,170.247498,62760000,7.0,30.719299,30.711901,290.802002,96.676697,...,2.8225,0.2504,38.862,0.4903,88.462799,0.605,3.2,0.1,0.01,0.03
1,2455436,10.029,59.107334,170.247498,62760000,6.0,30.749001,30.746401,289.934998,96.286613,...,2.1521,0.2486,8.247,0.4725,88.857498,0.589,3.3,0.0,0.01,0.05
2,2455436,19.809,59.107334,170.247498,62760000,5.0,30.775499,30.7964,292.237,95.444641,...,1.6193,0.424,2.3752,0.467,88.981903,0.705,6.3,0.6,0.03,0.41
3,2455436,30.320999,59.107334,170.247498,62760000,4.0,31.278099,31.264799,295.25,80.649689,...,1.049,0.0566,0.59584,0.6737,84.499199,1.361,24.700001,5.6,0.06,3.48
4,2455436,40.334999,59.107334,170.247498,62760000,3.0,31.316401,31.304701,293.024994,79.593582,...,0.4534,0.076,0.10709,0.6874,84.211304,1.426,26.5,6.4,0.05,3.73
5,2455436,49.658001,59.107334,170.247498,62760000,2.0,31.3354,31.323299,289.981995,78.701401,...,0.0,0.0274,1e-12,0.6997,83.951599,1.47,27.5,7.1,0.05,3.85
6,2455436,61.965,59.107334,170.247498,62760000,1.0,31.425501,31.4133,285.484985,77.570381,...,0.0,0.0226,1e-12,1.0164,77.562103,1.531,28.6,7.9,0.05,4.09


## Load Nutrient File (tab-delimited text)

In [7]:
# Read the tab-separated nutrient file and discard any column that is entirely empty.
nut_data = (
    pd.read_csv(nutdatafile, sep='\t')
    .dropna(axis='columns', how='all')
)
nut_data

Unnamed: 0,Cast,Niskin,PO4 (uM),PO4_Flag,Sil (uM),Sil_Flag,NO3 (uM),NO3_Flag,NO2 (uM),NO2_Flag,NH4 (uM),NH4_Flag
0,2,12,0.610000,2,7.7,2,1.6,2,0.05,2,0.37,2
1,2,11,0.789000,2,4.7,2,4.5,2,0.10,2,0.95,2
2,2,10,0.925000,2,6.7,2,5.6,2,0.12,2,1.64,2
3,2,9,1.333000,2,19.1,2,10.6,2,0.15,2,2.62,2
4,2,8,1.816000,2,34.5,2,18.1,2,0.20,2,2.95,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1473,185,4,1.276098,2,20.0,2,11.0,2,0.27,2,1.86,2
1474,185,5,1.040571,2,15.7,2,6.9,2,0.22,2,2.01,2
1475,185,6,0.688856,2,10.1,2,3.1,2,0.12,2,1.10,2
1476,185,7,0.378639,2,5.9,2,0.3,2,0.01,2,0.34,2


In [8]:
## nw1201: relabel niskins (left commented out, so it is not applied in this notebook)
# nut_data.loc[nut_data['Niskin']> 4,'Niskin'] = nut_data.loc[nut_data['Niskin']> 4,'Niskin']+2
# nut_data

## Merge Bottle and Nutrient Data but drop non nutrient vars?

<div class="warning" style='background-color:#ffcccb; color: #FF0000; border-left: solid #805AD5 4px; border-radius: 4px; padding:0.7em;'>
<span>
<p style='margin-top:1em; text-align:center'>
<b>WARNING</b></p>
<p style='margin-left:1em;'>bottle/niskin and rosette position should be the same but can be different (for example, bottles are labeled sequentially but a rosette position is skipped due to balancing or other instruments).</p>
</div>

In [9]:
# Coerce the cast and bottle identifiers to integers for the cast match and bottle merge
# that follow; entries recorded as 'Bucket' are mapped to bottle 0.
nut_data = nut_data.assign(
    Cast=nut_data['Cast'].astype(int),
    Niskin=nut_data['Niskin'].replace('Bucket', 0).astype(int),
)

In [10]:
# Columns kept from each bottle file: the bottle number (merge key) and the depth.
keep_param = ['BTL_103','dep'] #sometimes prsm

# Replace each cast's bottle dataset with a table of its nutrient samples joined to
# the bottle depth, indexed by bottle number.
for cast, cdata in cruise_data.items():
    if isinstance(cdata, pd.DataFrame):
        # Already merged by an earlier run of this cell; skipping keeps the cell re-runnable.
        continue
    try:
        # 'ctd053.btl' -> 53, the cast number as written in the nutrient file
        matchcast = int(cast.split('.')[0].lower().split('ctd')[-1])
        cast_nutrients = nut_data[nut_data['Cast'] == matchcast]
        cast_bottles = cdata.to_dataframe().reset_index()[keep_param]
        # Default (inner) merge: only bottles present in both sources are kept.
        cruise_data[cast] = (
            pd.merge(cast_nutrients, cast_bottles, left_on='Niskin', right_on='BTL_103')
            .set_index('BTL_103')
            .drop(columns=['Cast'])
        )
    except Exception as err:
        # Catch Exception rather than using a bare except so Ctrl-C still interrupts,
        # and report the cause so a failed cast can be diagnosed. The cast keeps its
        # original dataset in cruise_data.
        print(f'{cast} : something is wrong ({type(err).__name__}: {err})')

## Add Deployment meta information

In [11]:
#just a dictionary of dictionaries - simple
# The cruise metadata file parses to a simple dictionary of dictionaries.
# NOTE(review): full_load is acceptable for trusted local YAML; prefer yaml.safe_load
# if these files could ever come from an untrusted source.
with open(cruise_meta_file, 'r') as cruise_meta_fh:
    cruise_config = yaml.full_load(cruise_meta_fh)

## Add Instrument meta information

Time, depth, lat, lon should be added regardless (always our coordinates), but for a mooring site it's going to be a (1,1,1,t) dataset.
The variables of interest should be read from the data file and matched to a key for naming.  That key is in the inst_config file seen below and should represent common conversion names in the raw data

In [12]:
# Parse the instrument metadata YAML that keys the variables of interest.
with open(inst_meta_file, 'r') as inst_meta_fh:
    inst_config = yaml.full_load(inst_meta_fh)

## Add institutional meta-information


In [13]:
# Parse the institutional metadata YAML; it is passed to institution_meta_add when saving.
with open(group_meta_file, 'r') as group_meta_fh:
    group_config = yaml.full_load(group_meta_fh)

## Save CF Netcdf files

Currently we stick to netcdf3 classic, but migrating to netcdf4 (the default) should pose no problems for most modern purposes.  It's easy enough to pass the `format` kwarg through to the netcdf API of xarray.

In [14]:
#loop over all casts and perform tasks shown above

#loop over all casts and perform tasks shown above

# Nutrient-file column headers -> short variable names. Keys that are not present as
# columns are ignored by DataFrame.rename.
nut_column_names = {
    'Sil (uM)':'SI',
    'PO4 (uM)':'PO4',
    'NO2 (uM)':'NO2',
    'NO3 (uM)':'NO3',
    'NH4 (uM)':'NH4',
    'BTL_103':'BTLID',
    # 'prdm':'pressure',
    'dep':'depth',
    'empty':'empty', #this will be ignored
    'flag':'flag'}

for cast in cruise_data.keys():
    # A cast whose merge failed is still the xarray dataset loaded from disk, not a
    # DataFrame. Calling .rename(columns=...) on it raises an error that neither handler
    # below catches, which would abort the whole loop - so skip it explicitly.
    if not isinstance(cruise_data[cast], pd.DataFrame):
        print(f'Skipping {cast}')
        continue

    # Output file, e.g. 'ctd001.btl' -> 'we1008c001_nut.nc'. Built once so the save and
    # the cleanup in the RuntimeError handler always refer to the same file.
    nut_filename = cruise_name+'c'+cast.lower().split('d')[-1].split('.')[0].zfill(3)+'_nut.nc'

    try:
        cruise_data[cast] = cruise_data[cast].rename(columns=nut_column_names)

        cruise_data[cast].index = cruise_data[cast].index.rename('bottle')
        
        cruise_data_nc = ncCFsave.EcoFOCI_CFnc(df=cruise_data[cast], 
                                    instrument_yaml=inst_config, 
                                    operation_yaml=cruise_config,
                                    operation_type='ctd')

        cruise_data_nc.expand_dimensions(dim_names=['latitude','longitude','time'],geophys_sort=False)

        cruise_data_nc.variable_meta_data(variable_keys=list(cruise_data[cast].columns.values),drop_missing=False)
        #adding dimension meta needs to come after updating the dimension values... BUG?
        cruise_data_nc.dimension_meta_data(variable_keys=['time','latitude','longitude'])
        cruise_data_nc.temporal_geospatioal_meta_data_ctd(positiveE=False,conscastno=cast.split('.')[0])

        #add global attributes
        cruise_data_nc.deployment_meta_add(conscastno=cast.split('.')[0].upper())

        #add institutional global attributes
        cruise_data_nc.institution_meta_add(group_config)

        #add creation date/time - provenance data
        cruise_data_nc.provinance_meta_add()

        #provide initial qc status field
        cruise_data_nc.qc_status(qc_status='excellent') #<- options are unknown, excellent, probably good, mixed, unqcd

        cruise_data_nc.xarray2netcdf_save(xdf = cruise_data_nc.get_xdf(),
                                   filename=nut_filename,format="NETCDF3_CLASSIC")
    except KeyError:
        print(f'Skipping {cast}')
    except RuntimeError:
        print(f'Skipping & Removing {cast}')
        # Only remove the output if it exists: the failure may have happened before
        # anything was written, and os.remove would then raise FileNotFoundError.
        if os.path.exists(nut_filename):
            os.remove(nut_filename)

  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days since 1900-01-01'}})
  xdf.to_netcdf(filename,format=kwargs['format'],encoding={'time':{'units':'days s

In [15]:
# Show the loop variable left over from the save loop, i.e. the last cast it reached.
cast

'ctd185.btl'

## Next Steps

QC of data (plot parameters with other instruments)
- be sure to update the qc_status and the history

In [16]:
# Inspect the dataset held by the last cruise_data_nc object created in the save loop.
cruise_data_nc.get_xdf()