Look for variables that are missing across the entire file and remove them.
- Mostly for timeseries data (CTD data may have missing columns if an instrument worked during the cruise but failed during a cast).

In [1]:
import xarray as xa
import os
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def drop_leading_na_dataset(ds, dim):
    """
    Drops leading slices from a Dataset where all variables are NaN.

    This function finds the first index along a dimension where at least one
    data variable has a non-missing value (at any position of the other
    dimensions) and slices the dataset from that point onwards.

    If no slice along ``dim`` contains a valid value (including the case
    where ``dim`` has length zero), every slice counts as "leading" and an
    empty selection along ``dim`` is returned.

    Args:
        ds (xr.Dataset): The input xarray Dataset.
        dim (str): The dimension along which to operate (e.g., 'time').

    Returns:
        xr.Dataset: A new Dataset with the leading all-NaN slices removed.
    """
    # `notnull()` returns a boolean Dataset (True where not-NaN)
    # `to_array()` stacks variables into a new dimension called 'variable'
    # `any('variable')` checks if any variable is True (not-NaN) for each coordinate
    is_valid = ds.notnull().to_array().any('variable')

    # Collapse every dimension other than `dim` so the mask is 1-D along
    # `dim`; otherwise `argmax(dim=dim)` yields one index per remaining
    # coordinate and `.item()` fails for non-singleton extra dimensions.
    other_dims = [name for name in is_valid.dims if name != dim]
    if other_dims:
        is_valid = is_valid.any(dim=other_dims)

    # `argmax()` returns 0 both when the first slice is valid and when no
    # slice is valid at all, so the "nothing valid" case is handled first.
    if not bool(is_valid.any()):
        return ds.isel({dim: slice(0, 0)})

    # `argmax()` on a boolean array finds the index of the first `True` value
    first_valid_index = int(is_valid.argmax(dim=dim).item())

    # Slice the dataset from that first valid index to the end
    return ds.isel({dim: slice(first_valid_index, None)})

In [3]:
# Root directory of the 2018 mooring data; the processing loop further down
# rebinds `path` for each year it iterates over.
path = '/Users/bell/ecoraid/2018/Moorings/'

In [8]:
# For every requested year, walk the mooring deployment directories, and for
# each NetCDF file of the 'ckp1a' deployment:
#   1. report the mean sampling interval,
#   2. patch variable/global attributes,
#   3. report variables whose median marks them as entirely missing,
#   4. trim leading records with no valid data (keyed on T_20),
#   5. write the result back over the original file.
for year in range(2018, 2019, 1):
    path = f'/Users/bell/ecoraid/{year}/Moorings/'
    for tfile in os.listdir(path):
        tfile = tfile+'/erddap/final_data/' #EPIC
        # tfile = tfile+'/final_data_cf/'

        data_dir = path + tfile
        # Only existing data directories of the ckp1a deployment are processed.
        if not os.path.isdir(data_dir) or 'ckp1a' not in data_dir:
            continue

        for file in os.listdir(data_dir):
            if not file.endswith('.nc'):
                continue
            print(file)
            nc_path = f'{path}{tfile}{file}'

            try:
                xdf = xa.load_dataset(nc_path)
                deltat = xdf['time'].diff('time').dt.total_seconds().mean().values
                print(f'deltat: {deltat}')
            except Exception as err:
                # A file that cannot be read (or has no usable time axis) is
                # skipped on its own; the remaining files are still processed.
                print(f'skipping {file}: {type(err).__name__}: {err}')
                continue

            try:
                # specific unit update
                try:
                    attrs = xdf['P_1'].attrs
                except KeyError:
                    print('no P_1 var to modify')
                else:
                    attrs['units'] = 'dbar'
                    print('P_1 var to modified')

                # 'history' is removed when present; absence is not an error.
                xdf = xdf.drop_vars('history', errors='ignore')

                # NOTE(review): `xdf.var()` computes a variance Dataset only to
                # enumerate variable names. It is kept because that reduction
                # also decides which variables are visited (it appears to skip
                # non-numeric ones) -- confirm before replacing with
                # `xdf.data_vars`.
                for var in xdf.var():
                    # all variable update
                    # NOTE(review): the value is stored as the string '-9999',
                    # not a number -- confirm this is intended.
                    attrs = xdf[var].attrs
                    attrs['missing_value'] = '-9999'

                    # Report variables that look entirely missing. The median
                    # is computed once and reused for every check.
                    median = xdf[var].median()
                    print(f'median {var}: {median.values}')
                    # The actual `xdf = xdf.drop_vars(var)` is intentionally
                    # left disabled: these branches only report candidates.
                    if np.isinf(median):
                        print(f"trimming {var}")
                    elif median > 1e34:
                        print(f"dropping {var}")
                    elif median == -9999.0:
                        print(f"dropping {var}")

                # trim start end missing variables?
                try:
                    temperature = xdf['T_20']
                    # temperature = xdf['temperature']
                except KeyError:
                    print('No Temp var')
                else:
                    # Records where T_20 equals the 1e35 fill value are masked
                    # to NaN before the leading all-NaN records are dropped.
                    xdf = drop_leading_na_dataset(xdf.where(temperature != 1e35), dim='time')
                    updated_deltat = xdf['time'].diff('time').dt.total_seconds().mean().values
                    print(f'updated deltat: {updated_deltat}')

                xdf = xdf.assign_attrs({'PROGRAM':'NOAA PMEL EcoFOCI'})
                # Written back to the same path the dataset was loaded from.
                xdf.to_netcdf(nc_path)

                xdf = None
            except Exception as err:
                print("something is wonky")
                # Surface the cause so failures can be diagnosed.
                print(f"  {type(err).__name__}: {err}")

18ckp1a_ecf_0040m.cf.nc
deltat: 3600.0
no P_1 var to modify
median fluor_3031: 57.958003997802734
median Fch_906: 0.1621398627758026
median fluorstd_2031: 0.9203158617019653
median Trb_2980: 2.493685245513916
No Temp var
18ckp1a_wcp_ein.cf.nc
deltat: 3600.0
no P_1 var to modify
median AGC3_1223: 89.0
median AGC4_1224: 87.0
median AGC1_1221: 85.0
median AGC2_1222: 94.0
No Temp var
18ckp1a_sc_0039m.cf.nc
deltat: 3600.0
no P_1 var to modify
median T_20: -1.401104211807251
median C_50: 26.586286544799805
median S_41: 32.24674987792969
median PAR_908: 0.19605976343154907
updated deltat: 3600.0
18ckp1a_wcp_scal.cf.nc
deltat: 3600.0
no P_1 var to modify
median T_20: -1.4600000381469727
median Roll_1217: 0.1899999976158142
median HSD_1218: 1.0
median RSD_1220: 0.0
median Ptch_1216: -0.1599999964237213
median Hdg_1215: 155.4499969482422
median PSD_1219: 0.0
updated deltat: 3600.0
18ckp1a_wcp_vel.cf.nc
deltat: 3600.0
no P_1 var to modify
median v_1206: 0.06622868031263351
median u_1205: 13.44864