# Debugging incorrect file names in xarray 

In [23]:
import numpy as np
import os
import gcsfs
import fsspec
import json
import xarray as xr
import sys
sys.path.append("../../xapres/")
import ApRESDefs
import zarr
from tqdm.notebook import trange, tqdm
from dask.distributed import performance_report
import matplotlib.pyplot as plt

First, we illustrate the issue. We load the data as we did before and see that the filename is displayed in full.

In [2]:
# Loading site data
def reload(site):
    """Open the single-zarr store for ``site`` from the GCS bucket.

    Parameters
    ----------
    site : str
        Name appended to the ``single_zarrs_noencode`` bucket path,
        e.g. ``"A103"``.

    Returns
    -------
    xarray.Dataset
        Whatever ``xr.open_dataset`` returns for that store.
    """
    zarr_url = f'gs://ldeo-glaciology/apres/greenland/2022/single_zarrs_noencode/{site}'
    # chunks={} asks xarray for dask-backed arrays using the engine's preferred chunking.
    return xr.open_dataset(zarr_url, engine='zarr', chunks={})

# NOTE(review): the variable is named ds_104 although the site requested is "A103" — confirm which is intended.
ds_104 = reload("A103")


In [5]:
# Print only the stored filenames whose text contains 'Survey'.
survey_filenames = [name for name in ds_104.filename.values if 'Survey' in name]
for name in survey_filenames:
    print(name)

Now we do the same with our newly saved zarr data. If the file is not in the survey folder, we see that the filename is cut off.

In [65]:
def reload(site):
    """Open the ``{site}_winter22_23`` single-zarr store from the GCS bucket.

    Note that this redefinition replaces the earlier ``reload`` in this
    notebook, which opens the store without the ``_winter22_23`` suffix.

    Parameters
    ----------
    site : str
        Site name used as the prefix of the store name, e.g. ``"A103"``.

    Returns
    -------
    xarray.Dataset
        Whatever ``xr.open_dataset`` returns for that store.
    """
    store_url = f'gs://ldeo-glaciology/apres/greenland/2022/single_zarrs_noencode/{site}_winter22_23'
    # Same options as a direct call: zarr engine, consolidated metadata,
    # and chunks={} for dask-backed arrays.
    open_options = {'engine': 'zarr', 'consolidated': True, 'chunks': {}}
    return xr.open_dataset(store_url, **open_options)

# NOTE(review): the variable is named ds_104 although the site requested is "A103" — confirm which is intended.
ds_104 = reload("A103")
# Display the stored filenames as the cell output.
ds_104.filename.values

array(['ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2022-09-25-1209/DATA2022-09-25-1209.DAT',
       'ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2022-09-25-1215/DATA2022-09-25-1215.DAT',
       'ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2022-09-25-1215/DATA2022-09-25-1215.DAT',
       ...,
       'ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2023-04-16-0101/DATA2023-05-19-0802.DAT',
       'ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2023-04-16-0101/DATA2023-05-19-0802.DAT',
       'ldeo-glaciology/GL_apres_2022/A103/winter22_23/DIR2023-04-16-0101/DATA2023-05-19-0802.DAT'],
      dtype='<U89')

Let's go through the different points at which the processing could have gone wrong. First, checking the Google bucket, I can confirm that the raw .DAT files were correctly updated.

In [8]:
# List the files stored under the A104/winter22_23 directory of the bucket.
xa = ApRESDefs.xapres(max_range=1400)
dat_file_list = xa.list_files(
    directory=f'gs://ldeo-glaciology/GL_apres_2022/A104/winter22_23',
    remote_load=True,
)
dat_file_list

['ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1348/DATA2022-09-25-1348.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-25-1354.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-26-1324.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-27-1254.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-28-1224.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-29-1154.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-09-30-1124.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-10-01-1054.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-10-02-1024.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-25-1354/DATA2022-10-03-0954.DAT',
 'ldeo-glaciology/GL_apres_2022/A104/winter22_23/DIR2022-09-

Now let's look at the individual zarrs. The filenames there are fine, so something in the conversion to the single zarr must be what messes them up.

In [59]:
# Open every individual zarr store for site A101 as one combined dataset.
ds = xr.open_mfdataset(
    'gs://ldeo-glaciology/apres/greenland/2022/A101/individual_zarrs_prechunked_winter22_23/dat_*',
    chunks={},
    engine='zarr',
    consolidated=False,
    parallel=True,
)
#ds['attenuator'] = ds.attenuator[100]
#ds['AFGain'] = ds.AFGain[100]

# Drop the chunk layout inherited from the individual stores (presumably so it
# cannot conflict with the rechunking below). `pop` with a default tolerates a
# variable that carries no 'chunks' entry, where `del` would raise KeyError.
# NOTE(review): iterating a Dataset visits data variables only, so coordinate
# variables keep their inherited encoding (chunks, dtype, ...) — confirm that
# this is intended.
for var in ds:
    ds[var].encoding.pop('chunks', None)

# Average the profiles over the chirp_num dimension and store the result
# alongside the original variables.
profile_stacked = ds.profile.mean(dim='chirp_num')
ds_stacked = ds.assign({'profile_stacked': profile_stacked})

# Rechunk to 20 entries per chunk along time.
ds_stacked_rechunked = ds_stacked.chunk({'time': 20})

#encoding = {i: {"dtype": "float64"} for i in ds_stacked_rechunked.data_vars}
# Not used in this cell; presumably passed to the zarr write in a later cell.
encoding = {
    'time': {
        'units': 'seconds since 1970-01-01'
    }
}

# Build a writable mapper for the destination store.
# NOTE(review): the credentials file path is hard-coded relative to the
# notebook's working directory — consider making it configurable.
filename = 'gs://ldeo-glaciology/apres/greenland/2022/single_zarrs_noencode/A101_winter22_23'
with open('../../secrets/ldeo-glaciology-bc97b12df06b.json') as token_file:
    token = json.load(token_file)
mapper = fsspec.get_mapper(filename, mode='w', token=token)

In [63]:
# Inspect the encoding that `filename` carries into the write step.
# NOTE(review): the fixed-width 'dtype' entry shown in the output below looks
# like a candidate cause of the truncated filenames — confirm.
ds_stacked_rechunked.filename.encoding

{'chunks': (1,),
 'preferred_chunks': {'time': 1},
 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
 'filters': None,
 'dtype': dtype('<U89')}