# Working with many files

In `1-singlefile.ipynb` we learned how to extract subsets and reproject a single image using a variety of tools (GDAL, rasterio, xarray, rioxarray, and HoloViz). Often you want to work with a whole stack of imagery; as an example, let's create a time series of backscatter over [Jakobshavn Glacier](https://en.wikipedia.org/wiki/Jakobshavn_Glacier).

In [None]:
# Bounding box of interest 
# draw on here: http://geojson.io/
import geopandas as gpd
import geoviews as gv
import hvplot.pandas
# Area of interest: a polygon stored next to this notebook as GeoJSON.
gf = gpd.read_file('jakobshavn.geojson')

# Extent of the first feature; GeoDataFrame.bounds columns are
# (minx, miny, maxx, maxy).
lonmin, latmin, lonmax, latmax = gf.bounds.to_numpy()[0]
# Printed as upper-left corner (lonmin, latmax) then lower-right corner
# (lonmax, latmin), which is NOT the order the values were unpacked in.
print('bounding box=', lonmin, latmax, lonmax, latmin)

# Semi-transparent polygon drawn over a terrain basemap.
tiles = gv.tile_sources.EsriTerrain
bbox = gf.hvplot.polygons(alpha=0.2, geo=True)

tiles * bbox

In [None]:
# get urls from CMR
import cmr
def get_cmr_urls(short_name='NSIDC-0723',
                 version='3',
                 time_start='2010-01-01T00:00:00Z',
                 time_end='2022-10-05T15:43:33Z',
                 bounding_box=None,
                 polygon=None,
                 filename_filter='*gamma0*'):
    """Search CMR through ``cmr.get_urls`` and keep only the GeoTIFF URLs.

    All arguments are forwarded unchanged, in the order listed, to
    ``cmr.get_urls``. Calling the function with no arguments runs the same
    query this notebook has always used.

    Parameters
    ----------
    short_name : str
        Data set identifier to search for.
    version : str
        Data set version.
    time_start, time_end : str or None
        Limits of the search window, written like '2010-01-01T00:00:00Z'.
        NOTE(review): the default ``time_end`` is a fixed timestamp, not
        "now" -- presumably newer granules are excluded; move it forward
        (or pass None, if ``cmr.get_urls`` accepts that) to include them.
    bounding_box : str or None
        Optional spatial filter, e.g. '-54.85,69.31,-52.18,70.26'
        (presumably lonmin,latmin,lonmax,latmax -- confirm against
        ``cmr.get_urls``).
    polygon : str or None
        Optional polygon filter; format is defined by ``cmr.get_urls``.
    filename_filter : str or None
        Optional file-name pattern; the default keeps the gamma0 products.

    Returns
    -------
    list of str
        Those URLs from the search result that end in 'tif'.
    """
    urls = cmr.get_urls(short_name, version, time_start, time_end,
                        bounding_box, polygon, filename_filter)
    # The search result can contain other files; only the tifs are rasters
    # we can open further down.
    return [url for url in urls if url.endswith('tif')]
    
# The cells below read the URL list as `asset_list`; binding it only as
# `assets` makes them fail with NameError on a fresh kernel.
asset_list = get_cmr_urls()
assets = asset_list  # original name, kept as an alias

In [None]:
# NOTE: reading from NSIDC SERVER REQUIRES you have a ~/.netrc file 
# behind the scenes we're using GDAL to make requests, and we set some Env vars for performance
#GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies

import os
# GDAL configuration options, exported as environment variables for this
# process so that the GDAL-based readers used below pick them up.
env = {
    # do not list the remote directory when opening a file
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    # read and write login cookies in the same file
    'GDAL_HTTP_COOKIEFILE': '.urs_cookies',
    'GDAL_HTTP_COOKIEJAR': '.urs_cookies',
    # cache / buffer sizes (values are strings, as environment variables must be)
    'GDAL_MAX_RAW_BLOCK_CACHE_SIZE': '200000000',
    'GDAL_SWATH_SIZE': '200000000',
    'VSI_CURL_CACHE_SIZE': '200000000',
}
os.environ.update(env)

In [None]:
# Using COG overviews is a great way to get a quick low-resolution view of the data
import xarray as xr
import rioxarray
import hvplot.xarray
# convert our bounding box to EPSG:3413 (NSIDC Sea Ice Polar Stereographic North)
# NOTE(review): assumes the rasters are in EPSG:3413 too, otherwise the
# overlay below would not line up -- confirm.
gf3413 = gf.to_crs(3413)

# Both layers share the same sizing so they can be overlaid.
plot_size = dict(aspect='equal', frame_width=500)

# Read a reduced-resolution overview of the last scene instead of the full
# raster, and drop the length-1 'band' dimension to get a 2-D image.
da = rioxarray.open_rasterio(asset_list[-1], overview_level=4, masked=True)
da = da.squeeze('band')

img = da.hvplot.image(cmap='gray', **plot_size)
aoi = gf3413.hvplot.polygons(alpha=0.2, color='red', **plot_size)

img * aoi

In [None]:
# In many cases we're only interested in a small subset of big data (like that bounding box above!)
#da.rio.clip(gf3413.geometry).hvplot.image(cmap='gray') # loads full raster
#da.rio.clip_box(**gf3413.bounds).hvplot.image(cmap='gray') # more memory-efficient

# new option in the rioxarray 0.2 release: https://nbviewer.jupyter.org/github/corteva/rioxarray/blob/master/docs/examples/clip_geom.ipynb#Clipping-larger-rasters
#da = rioxarray.open_rasterio(url, masked=True).rio.clip(geometries, from_disk=True)     

In [None]:
%time

#open subset directly
da = rioxarray.open_rasterio(asset_list[0], masked=True).squeeze('band')
subset = da.rio.clip_box(**gf3413.bounds)

In [None]:
%%time

# We will do this clipping taking advantage of multiple CPUs. In serial it takes ~1.2 min
# NOTE: it should take the same time to load full datasets or to pre-clip with rio.clip (maybe not if using rio.clip with from_disk=True)
# NOTE: using masked=True promotes float32 to float64 data


import dask
import pandas as pd

@dask.delayed
def lazy_open(href, masked=True):
    """Open one raster URL as a one-step time series (deferred by dask).

    Because of ``@dask.delayed`` a call only records the work; the file is
    opened when the result is computed.

    Parameters
    ----------
    href : str
        URL of the raster. Its last '/'-separated piece is stored as the
        file name, and the piece before that is parsed as the acquisition
        date -- assumes the URL layout is ``.../<date>/<filename>``.
    masked : bool, default True
        Passed straight to ``rioxarray.open_rasterio``.

    Returns
    -------
    xarray.DataArray
        The raster with its 'band' dimension renamed to 'time', a 'time'
        coordinate holding the parsed date and a scalar 'filename'
        coordinate.
    """
    url_parts = href.split('/')
    filename = url_parts[-1]
    date = url_parts[-2]

    # chunks: one band per chunk, automatic chunking along the second axis,
    # the third axis kept whole.
    scene = rioxarray.open_rasterio(href, chunks=(1, "auto", -1), masked=masked)
    scene = scene.rename(band='time')
    return scene.assign_coords(time=[pd.to_datetime(date)], filename=filename)

# The single-machine scheduler seems to use threads by default (ThreadPool);
# processes (ProcessPool) can be used instead:
#with dask.config.set(scheduler='processes'): 
# The worker count is capped because NSIDC raises HTTP 503 for threads>15 it seems...
with dask.config.set({'scheduler':'threads', 'num_workers':12}):
    # One delayed open per URL, all computed together inside this config.
    delayed_scenes = [lazy_open(href, masked=False) for href in asset_list]
    dataArrays = dask.compute(*delayed_scenes)

In [None]:
%%time

# NOTE: this is fast with dask arrays, can run out of memory with numpy arrays
# Stack all scenes along 'time', then crop the stack to the area of interest.
# join='override' reuses the first scene's x/y indexes instead of aligning,
# so every scene is assumed to sit on the same grid.
# total_bounds gives scalar (minx, miny, maxx, maxy); `**gf3413.bounds`
# would hand clip_box one pandas Series per argument.
DA = (
    xr.concat(dataArrays, dim='time', join='override', combine_attrs='drop')
    .rio.clip_box(*gf3413.total_bounds)
)
DA

In [None]:
#%%time 

# Drop scenes that are all nans in the bbox

#test = DA.dropna('time', how='all') # da
#test

In [None]:
#finally lets use a video scrubber widget
import panel as pn

# note: delay 1000ms between scenes
#player = pn.widgets.Player(name='time', loop_policy='once', interval=500)
#pn.Row(video, widgets={'time':player})

# Image of the stack with a scrubber widget below the plot; hvplot adds the
# widget for the remaining (non x/y) dimension of DA.
image_opts = dict(
    x='x', y='y',
    rasterize=True,
    cmap='gray', clim=(-25, 5),      # fixed colour limits for every frame
    aspect='equal', frame_width=800,
    widget_type='scrubber', widget_location='bottom',
)
panel = DA.hvplot.image(**image_opts)

#widget = panel[1][1][0] 
#widget.interval = 500ms default
panel

In [None]:
%%time 

# Wrap the stack in a Dataset with a single variable, 'gamma0', and load it
# into memory.
subset = DA.to_dataset(name='gamma0')
subset = subset.compute()

# save with compression: zlib at the highest level (9), values stored as float32
data_settings = dict(zlib=True, dtype='float32', complevel=9)
encoding_dict = {'gamma0': data_settings}
subset.to_netcdf('mysubset.nc', encoding=encoding_dict)