In [None]:
# default_exp convert_to_zarr

# Notebook which converts per-region netCDF files to Zarr stores so that they can be indexed more efficiently

### uses pangeo_small environment

In [None]:
# IPython setup: (re)load the autoreload extension and have it reload edited
# modules before each cell runs, then render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#export
import xarray as xr
import zarr
from dask.distributed import Client
from joblib import Parallel, delayed
import pandas as pd
import os

## Set the parameters here
### Ensure every state, region and season you want to transform is specified here.

In [None]:
# Root folder of the data set. Set the OAPML_DATA_ROOT environment variable to
# point at your own copy instead of editing the notebook; the original path is
# kept as the default so existing setups behave exactly as before.
data_root = os.environ.get('OAPML_DATA_ROOT', '/media/scottcha/E1/Data/OAPMLData/')

# Interpolation factor; intended to match the `interpolate` argument of
# ConvertToZarr, which uses it to build the input/output folder names.
interpolation = 1

#currently only have Washington regions and one season specified for the tutorial
#uncomment regions and seasons if doing a larger transform
regions = {#'Utah': ['Abajos', 'Logan', 'Moab', 'Ogden', 'Provo', 
           #'Salt Lake', 'Skyline', 'Uintas'],  
           #'Colorado': ['Grand Mesa Zone', 'Sangre de Cristo Range', 'Steamboat Zone', 'Front Range Zone',
           #'Vail Summit Zone', 'Sawatch Zone', 'Aspen Zone', 
           #'North San Juan Mountains', 'South San Juan Mountains', 'Gunnison Zone'],
           'Washington': ['Mt Hood', 'Olympics', 'Snoqualmie Pass', 'Stevens Pass',
           'WA Cascades East, Central', 'WA Cascades East, North', 'WA Cascades East, South',
           'WA Cascades West, Central', 'WA Cascades West, Mt Baker', 'WA Cascades West, South'
           ]
           }
# Seasons are 'YY-YY' strings (winter start year - end year)
seasons = ['15-16']#, '16-17', '17-18', '18-19']

In [None]:
#export
class ConvertToZarr:
    """
    Class which encapsulates the logic to convert a set of filtered netCDF files to Zarr
    """
    
    def __init__(self, seasons, regions, data_root, interpolate=1):
        """
        Initialize the class
        
        Keyword Arguments
        seasons: list of season values to process
        regions: dictonary of Key: State and Value: List of Regions to process for that state
        data_root: the root path of the data folders which contains the 3.GFSFiltered1xInterpolation
        interpolate: interpolation factor used in the input/output folder names (default 1)
        """
        self.processed_path = data_root + '/3.GFSFiltered'+ str(interpolate) + 'xInterpolation/'
        self.zarr_base_path = data_root + '/4.GFSFiltered'+ str(interpolate) + 'xInterpolationZarr/'
        
        self.seasons = seasons
        self.regions = regions
        self.data_root = data_root
        
        # exist_ok avoids the check-then-create race when several processes start at once
        os.makedirs(self.zarr_base_path, exist_ok=True)
    
    @staticmethod
    def _season_dates(season):
        """
        Return the daily timestamps from Nov 1 through Apr 30 for a 'YY-YY' season string
        
        The start year is taken from the first two digits (assumed to be 20YY); the end
        date is Apr 30 of the following year, so leap years are handled by the calendar
        instead of a hard coded list of seasons.
        
        Raises ValueError if the season string can't be parsed.
        """
        try:
            start_year = 2000 + int(str(season).split('-')[0])
        except ValueError:
            raise ValueError("season must look like 'YY-YY' (e.g. '15-16'), got: " + repr(season))
        return pd.date_range('{}-11-01'.format(start_year),
                             '{}-04-30'.format(start_year + 1),
                             freq="D")
    
    def compute_region(self, region_name, season, state):
        """
        Calculates the zarr conversion for a specific region, season and state and indexes it for efficient lookup 
        
        Reads one netCDF file per day of the season and appends it along the time
        dimension of the region's zarr store. If the store already exists only the
        days which are not yet in it are processed; days whose input file is missing
        are reported and skipped.
        
        Keyword Arguments
        region_name: name of the region to process
        season: season to process
        state: state to process (region must be a part of the state)
        """
        first = True
        #input files are stored per season (no state folder), output stores per season and state
        base_path = self.processed_path + season + '/' + '/Region_' + region_name 
        zarr_path = self.zarr_base_path + season + '/' + state + '/Region_' + region_name + '.zarr'
        
        date_values_pd = self._season_dates(season)
        
        existing_times = None
        try:
            with xr.open_zarr(zarr_path) as z:
                #read the time coordinate once; the context manager closes the store
                existing_times = z.time.values
        except (ValueError, FileNotFoundError):
            #ignore as it doesn't exist yet
            #(older xarray/zarr raise a ValueError subclass, newer xarray raises FileNotFoundError)
            pass
        
        if existing_times is not None:
            if len(existing_times) > 0 and existing_times[-1] == date_values_pd[-1]:
                print(' already exists: ' + region_name + ' ' + season + ' ' + state)
                return
            #already exists but incomplete
            date_values_pd = [pd.Timestamp(v) for v in date_values_pd.values.astype('datetime64[ns]') if v not in existing_times]
            print(' some exist but have to complete ' + str(len(date_values_pd)))
            first = False

        for d in date_values_pd:

            path =  base_path + '_' + d.strftime('%Y%m%d') + '.nc'
            print('On ' + str(path.split('/')[-1]))

            try:
                ds = xr.open_dataset(path, chunks={'latitude':1, 'longitude':1})
            except OSError as err:
                print(' missing file: ' + path)
                continue

            #close the netCDF file once the day has been written (or on failure)
            with ds:
                #stack all variables into a single array so one chunk holds every variable of a grid point and time
                day = ds.to_array(name='vars').chunk({'time':1, 'latitude':1, 'longitude':1, 'variable':-1}).to_dataset()

                try:
                    if first:
                        day.to_zarr(zarr_path, consolidated=True)
                        first = False
                    else:
                        day.to_zarr(zarr_path, consolidated=True, append_dim='time')
                except ValueError as err:
                    #stop processing this region but report why the write failed
                    print('Value Error on ' + zarr_path + ': ' + str(err))
                    return

    def process_tuple(self, t): 
        """
        Entry method to call compute_region with a tuple
        Basically a helper for executing with joblib parallel
        
        Keyword Arguments
        t: the tuple containing the region, season and state
        """
        self.compute_region(t[0], t[1], t[2])
    
    def make_list(self):
        """
        Helper method to make the list of values to process
        
        Returns a list of (region, season, state) tuples, ordered by season, then state, then region
        """
        return [(region, season, state)
                for season in self.seasons
                for state, state_regions in self.regions.items()
                for region in state_regions]
    
    def convert_local(self, jobs=15):
        """
        Converts every configured region/season/state on the local machine using joblib
        
        Keyword Arguments
        jobs: number of parallel worker processes to use
        """
        to_process = self.make_list()
    
        #one state & season takes about 6 hours with 15 cores on my machine
        Parallel(n_jobs=jobs, backend="multiprocessing")(map(delayed(self.process_tuple), to_process))

In [None]:
# Pass the configured interpolation factor explicitly so that changing
# `interpolation` in the parameters cell actually changes the folders used
ctz = ConvertToZarr(seasons, regions, data_root, interpolate=interpolation)

In [None]:
# Run the conversion for every configured region/season with 15 worker processes
ctz.convert_local(jobs=15)

On Region_Snoqualmie Pass_20151101.ncOn Region_Mt Hood_20151101.ncOn Region_Olympics_20151101.ncOn Region_Stevens Pass_20151101.nc



On Region_WA Cascades East, Central_20151101.ncOn Region_WA Cascades East, North_20151101.ncOn Region_WA Cascades East, South_20151101.nc
On Region_WA Cascades West, Mt Baker_20151101.ncOn Region_WA Cascades West, Central_20151101.nc



On Region_WA Cascades West, South_20151101.nc
On Region_Stevens Pass_20151102.nc
On Region_Mt Hood_20151102.nc
On Region_Snoqualmie Pass_20151102.nc
On Region_Stevens Pass_20151103.nc
On Region_Mt Hood_20151103.nc
On Region_Stevens Pass_20151104.nc
On Region_Mt Hood_20151104.nc
On Region_Snoqualmie Pass_20151103.nc
On Region_Stevens Pass_20151105.nc
On Region_Mt Hood_20151105.nc
On Region_Olympics_20151102.nc
On Region_WA Cascades East, South_20151102.nc
On Region_Stevens Pass_20151106.nc
On Region_WA Cascades West, Mt Baker_20151102.nc
On Region_Snoqualmie Pass_20151104.nc
On Region_Mt Hood_20151106.nc
On Region_Stevens 

On Region_Olympics_20151115.nc
On Region_Mt Hood_20151227.nc
On Region_Stevens Pass_20151228.nc
On Region_Snoqualmie Pass_20151207.nc
On Region_Mt Hood_20151228.nc
On Region_Stevens Pass_20151229.nc
On Region_WA Cascades West, Central_20151110.nc
On Region_WA Cascades East, South_20151113.nc
On Region_Snoqualmie Pass_20151208.nc
On Region_WA Cascades West, Mt Baker_20151113.nc
On Region_Mt Hood_20151229.nc
On Region_Stevens Pass_20151230.nc
On Region_Mt Hood_20151230.nc
On Region_Stevens Pass_20151231.nc
On Region_Olympics_20151116.nc
On Region_Snoqualmie Pass_20151209.nc
On Region_Mt Hood_20151231.nc
On Region_Stevens Pass_20160101.nc
On Region_WA Cascades East, Central_20151109.nc
On Region_WA Cascades West, South_20151109.nc
On Region_Snoqualmie Pass_20151210.nc
On Region_Mt Hood_20160101.nc
On Region_Stevens Pass_20160102.nc
On Region_Stevens Pass_20160103.nc
On Region_Mt Hood_20160102.nc
On Region_Snoqualmie Pass_20151211.nc
On Region_WA Cascades East, South_20151114.nc
On Region_

On Region_Snoqualmie Pass_20160112.nc
On Region_Stevens Pass_20160223.nc
On Region_Mt Hood_20160222.nc
On Region_Stevens Pass_20160224.nc
On Region_Olympics_20151130.nc
On Region_Mt Hood_20160223.nc
On Region_Snoqualmie Pass_20160113.nc
On Region_WA Cascades West, Central_20151119.nc
On Region_Stevens Pass_20160225.nc
On Region_WA Cascades East, South_20151125.nc
On Region_WA Cascades West, Mt Baker_20151125.nc
On Region_Mt Hood_20160224.nc
On Region_Stevens Pass_20160226.nc
On Region_Snoqualmie Pass_20160114.nc
On Region_Mt Hood_20160225.nc
On Region_Stevens Pass_20160227.nc
On Region_Mt Hood_20160226.nc
On Region_Snoqualmie Pass_20160115.nc
On Region_WA Cascades East, North_20151116.nc
On Region_Stevens Pass_20160228.nc
On Region_Olympics_20151201.nc
On Region_Mt Hood_20160227.nc
On Region_Stevens Pass_20160229.nc
On Region_Snoqualmie Pass_20160116.nc
On Region_Mt Hood_20160228.nc
On Region_WA Cascades East, South_20151126.nc
On Region_Stevens Pass_20160301.nc
On Region_WA Cascades W

 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160418.nc
On Region_Stevens Pass_20160419.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160419.nc
On Region_Stevens Pass_20160420.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160420.nc
On Region_Stevens Pass_20160421.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160421.nc
On Region_Stevens Pass_20160422.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160422.nc
On Region_Stevens Pass_20160423.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Stevens Pass_20160423.nc
On Region_Stevens Pass_20160424.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-

On Region_WA Cascades East, South_20151203.nc
On Region_Olympics_20151210.nc
On Region_Snoqualmie Pass_20160207.nc
On Region_Snoqualmie Pass_20160208.nc
On Region_WA Cascades East, North_20151121.nc
On Region_WA Cascades West, South_20151122.nc
On Region_Snoqualmie Pass_20160209.nc
On Region_Olympics_20151211.nc
On Region_WA Cascades East, Central_20151122.nc
On Region_WA Cascades West, Mt Baker_20151204.nc
On Region_WA Cascades East, South_20151204.nc
On Region_WA Cascades West, Central_20151126.nc
On Region_Snoqualmie Pass_20160210.nc
On Region_Snoqualmie Pass_20160211.nc
On Region_Olympics_20151212.nc
On Region_Snoqualmie Pass_20160212.nc
On Region_WA Cascades West, Mt Baker_20151205.nc
On Region_WA Cascades East, South_20151205.nc
On Region_Snoqualmie Pass_20160213.nc
On Region_WA Cascades West, South_20151123.nc
On Region_WA Cascades East, North_20151122.nc
On Region_WA Cascades East, Central_20151123.nc
On Region_WA Cascades West, Central_20151127.nc
On Region_Snoqualmie Pass_201

 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160408.nc
On Region_Snoqualmie Pass_20160409.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160409.nc
On Region_Snoqualmie Pass_20160410.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160410.nc
On Region_Snoqualmie Pass_20160411.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160411.nc
On Region_Snoqualmie Pass_20160412.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160412.nc
On Region_Snoqualmie Pass_20160413.nc
 missing file: /media/scottcha/E1/Data/OAPMLData//3.GFSFiltered1xInterpolation/15-16//Region_Snoqualmie Pass_20160413.nc
On Region_Snoqualmie Pass_20160414.nc
 missing file: /media/scottcha/E1/Data/OAPMLDa

In [None]:

    
# make_list is a method of ConvertToZarr, so it has to be called on the instance
l = ctz.make_list()
l

[('Abajos', '15-16', 'Utah'),
 ('Logan', '15-16', 'Utah'),
 ('Moab', '15-16', 'Utah'),
 ('Ogden', '15-16', 'Utah'),
 ('Provo', '15-16', 'Utah'),
 ('Salt Lake', '15-16', 'Utah'),
 ('Skyline', '15-16', 'Utah'),
 ('Uintas', '15-16', 'Utah'),
 ('Grand Mesa Zone', '15-16', 'Colorado'),
 ('Sangre de Cristo Range', '15-16', 'Colorado'),
 ('Steamboat Zone', '15-16', 'Colorado'),
 ('Front Range Zone', '15-16', 'Colorado'),
 ('Vail Summit Zone', '15-16', 'Colorado'),
 ('Sawatch Zone', '15-16', 'Colorado'),
 ('Aspen Zone', '15-16', 'Colorado'),
 ('North San Juan Mountains', '15-16', 'Colorado'),
 ('South San Juan Mountains', '15-16', 'Colorado'),
 ('Gunnison Zone', '15-16', 'Colorado')]

In [None]:
#one state & season takes about 6 hours with 15 cores on my machine
#process_tuple and make_list are methods of ConvertToZarr; this is the same work ctz.convert_local(jobs=15) performs
Parallel(n_jobs=15, backend="multiprocessing")(map(delayed(ctz.process_tuple), ctz.make_list()))









On Region_Moab_20151101.nc
On Region_Logan_20151101.ncOn Region_Grand Mesa Zone_20151101.nc
On Region_Ogden_20151101.ncOn Region_Abajos_20151101.nc
On Region_Provo_20151101.nc

On Region_Uintas_20151101.ncOn Region_Salt Lake_20151101.nc
On Region_Steamboat Zone_20151101.nc



On Region_Sawatch Zone_20151101.nc



On Region_Aspen Zone_20151101.ncOn Region_Front Range Zone_20151101.nc


On Region_Skyline_20151101.ncOn Region_Sangre de Cristo Range_20151101.nc


On Region_Vail Summit Zone_20151101.nc

On Region_Abajos_20151102.nc
On Region_Salt Lake_20151102.nc
On Region_Moab_20151102.nc
On Region_Abajos_20151103.nc
On Region_Salt Lake_20151103.nc
On Region_Moab_20151103.nc
On Region_Provo_20151102.nc
On Region_Ogden_20151102.nc
On Region_Abajos_20151104.nc
On Region_Salt Lake_20151104.nc
On Region_Moab_20151104.nc
On Region_Logan_20151102.nc
On Region_Grand Mesa Zone_20151102.nc
On Region_Aspen Zone_20151102.ncOn Region_Sawatch Zone_20151102.nc

On Region_Abajos_20151105.nc
On Re

On Region_Moab_20160131.ncOn Region_Sawatch Zone_20151125.nc

On Region_Grand Mesa Zone_20151125.nc
On Region_Salt Lake_20160131.nc
On Region_Aspen Zone_20151125.nc
On Region_Abajos_20160201.nc
On Region_Moab_20160201.nc
On Region_Salt Lake_20160201.nc
On Region_Skyline_20151114.nc
On Region_Provo_20151214.nc
On Region_Uintas_20151123.nc
On Region_Vail Summit Zone_20151123.nc
On Region_Abajos_20160202.nc
On Region_Ogden_20151214.nc
On Region_Moab_20160202.nc
On Region_Salt Lake_20160202.nc
On Region_Abajos_20160203.nc
On Region_Moab_20160203.nc
On Region_Salt Lake_20160203.nc
On Region_Logan_20151126.nc
On Region_Provo_20151215.nc
On Region_Sawatch Zone_20151126.nc
On Region_Abajos_20160204.nc
On Region_Moab_20160204.nc
On Region_Grand Mesa Zone_20151126.nc
On Region_Aspen Zone_20151126.nc
On Region_Ogden_20151215.nc
On Region_Salt Lake_20160204.nc
On Region_Abajos_20160205.nc
On Region_Moab_20160205.nc
On Region_Salt Lake_20160205.nc
On Region_Abajos_20160206.nc
On Region_Provo_201512

On Region_Provo_20160126.nc
On Region_Abajos_20160430.nc
On Region_Salt Lake_20160429.nc
On Region_Moab_20160430.nc

On Region_North San Juan Mountains_20151101.nc
On Region_Salt Lake_20160430.nc
On Region_Front Range Zone_20151110.nc
On Region_Ogden_20160127.nc
On Region_Logan_20151221.nc
On Region_Vail Summit Zone_20151216.nc

On Region_South San Juan Mountains_20151101.nc
On Region_Sawatch Zone_20151221.nc
On Region_Provo_20160127.nc

On Region_Gunnison Zone_20151101.nc
On Region_Grand Mesa Zone_20151221.nc
On Region_Aspen Zone_20151221.nc
On Region_Uintas_20151216.nc
On Region_Ogden_20160128.nc
On Region_Skyline_20151128.nc
On Region_Provo_20160128.nc
On Region_Logan_20151222.nc
On Region_Ogden_20160129.nc
On Region_Sawatch Zone_20151222.nc
On Region_Vail Summit Zone_20151217.nc
On Region_Aspen Zone_20151222.nc
On Region_Grand Mesa Zone_20151222.nc
On Region_Provo_20160129.nc
On Region_Sangre de Cristo Range_20151114.nc
On Region_Uintas_20151217.nc
On Region_Steamboat Zone_20151112

On Region_Provo_20160408.nc
On Region_Sawatch Zone_20160201.nc
On Region_Ogden_20160408.nc
On Region_Aspen Zone_20160201.nc
On Region_Logan_20160201.nc
On Region_Uintas_20160123.nc
On Region_South San Juan Mountains_20151113.nc
On Region_Provo_20160409.nc
On Region_Ogden_20160409.nc
On Region_Grand Mesa Zone_20160202.nc
On Region_Vail Summit Zone_20160124.nc
On Region_North San Juan Mountains_20151114.nc
On Region_Sawatch Zone_20160202.nc
On Region_Steamboat Zone_20151121.nc
On Region_Aspen Zone_20160202.nc
On Region_Provo_20160410.nc
On Region_Logan_20160202.nc
On Region_Skyline_20151221.nc
On Region_Ogden_20160410.nc
On Region_Uintas_20160124.nc
On Region_Front Range Zone_20151118.nc
On Region_Gunnison Zone_20151126.nc
On Region_Provo_20160411.nc
On Region_Ogden_20160411.nc
On Region_Grand Mesa Zone_20160203.nc
On Region_Vail Summit Zone_20160125.nc
On Region_Sawatch Zone_20160203.nc
On Region_Aspen Zone_20160203.nc
On Region_Logan_20160203.nc
On Region_Provo_20160412.nc
On Region_Og

On Region_Uintas_20160310.nc
On Region_Grand Mesa Zone_20160325.nc
On Region_Vail Summit Zone_20160311.nc
On Region_Sangre de Cristo Range_20151208.nc
On Region_South San Juan Mountains_20151128.nc
On Region_Logan_20160325.nc
On Region_Skyline_20160118.nc
On Region_Aspen Zone_20160325.nc
On Region_Sawatch Zone_20160325.nc
On Region_Uintas_20160311.nc
On Region_Grand Mesa Zone_20160326.nc
On Region_Gunnison Zone_20151226.nc
On Region_Vail Summit Zone_20160312.nc
On Region_North San Juan Mountains_20151130.nc
On Region_Logan_20160326.nc
On Region_Aspen Zone_20160326.nc
On Region_Sawatch Zone_20160326.nc
On Region_Grand Mesa Zone_20160327.nc
On Region_Uintas_20160312.nc
On Region_Vail Summit Zone_20160313.nc
On Region_Skyline_20160119.nc
On Region_Logan_20160327.nc
On Region_Gunnison Zone_20151227.nc
On Region_Aspen Zone_20160327.nc
On Region_Sawatch Zone_20160327.nc
On Region_Grand Mesa Zone_20160328.nc
On Region_Uintas_20160313.nc
On Region_Front Range Zone_20151128.nc
On Region_Vail Su

On Region_South San Juan Mountains_20151222.nc
On Region_Steamboat Zone_20151221.nc
On Region_Gunnison Zone_20160214.nc
On Region_Skyline_20160305.nc
On Region_North San Juan Mountains_20151226.nc
On Region_Gunnison Zone_20160215.nc
On Region_Sangre de Cristo Range_20151231.nc
On Region_Skyline_20160306.nc
On Region_South San Juan Mountains_20151223.nc
On Region_Gunnison Zone_20160216.nc
On Region_Front Range Zone_20151214.nc
On Region_Steamboat Zone_20151222.nc
On Region_Skyline_20160307.nc
On Region_North San Juan Mountains_20151227.nc
On Region_Gunnison Zone_20160217.nc
On Region_Sangre de Cristo Range_20160101.nc
On Region_Skyline_20160308.nc
On Region_South San Juan Mountains_20151224.nc
On Region_Gunnison Zone_20160218.nc
On Region_Skyline_20160309.nc
On Region_North San Juan Mountains_20151228.nc
On Region_Gunnison Zone_20160219.nc
On Region_Front Range Zone_20151215.nc
On Region_Steamboat Zone_20151223.nc
On Region_Sangre de Cristo Range_20160102.nc
On Region_South San Juan Mou

On Region_Sangre de Cristo Range_20160225.nc
On Region_South San Juan Mountains_20160222.nc
On Region_Steamboat Zone_20160207.nc
On Region_Front Range Zone_20160123.nc
On Region_North San Juan Mountains_20160301.nc
On Region_South San Juan Mountains_20160223.nc
On Region_Sangre de Cristo Range_20160226.nc
On Region_North San Juan Mountains_20160302.nc
On Region_Steamboat Zone_20160208.nc
On Region_South San Juan Mountains_20160224.nc
On Region_Sangre de Cristo Range_20160227.nc
On Region_Front Range Zone_20160124.nc
On Region_North San Juan Mountains_20160303.nc
On Region_South San Juan Mountains_20160225.nc
On Region_Steamboat Zone_20160209.nc
On Region_Sangre de Cristo Range_20160228.nc
On Region_North San Juan Mountains_20160304.nc
On Region_Front Range Zone_20160125.nc
On Region_South San Juan Mountains_20160226.nc
On Region_Sangre de Cristo Range_20160229.nc
On Region_Steamboat Zone_20160210.nc
On Region_North San Juan Mountains_20160305.nc
On Region_South San Juan Mountains_20160

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]