In [1]:
import dask
import xarray as xr
import rioxarray as rxr
import numpy as np
import fsspec

In [2]:
import os
# GDAL / AWS settings for reading a large GeoTIFF from a public object store.
os.environ.update({
    # KEY setting: without it GDAL sends a bunch of HTTP GET requests
    # to test for common sidecar metadata files.
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    # Public bucket, so no authentication is needed.
    'AWS_NO_SIGN_REQUEST': 'YES',
    # 200 MB: should exceed the size of the uncompressed raster to overcome a
    # 10 MB limit in the GeoTIFF driver for range request merging.
    'GDAL_MAX_RAW_BLOCK_CACHE_SIZE': '200000000',
    # Raise these two together with GDAL_MAX_RAW_BLOCK_CACHE_SIZE.
    'GDAL_SWATH_SIZE': '200000000',
    'VSI_CURL_CACHE_SIZE': '200000000',
})
# Others?

In [3]:
fs = fsspec.filesystem('s3', anon=True, client_kwargs=dict(endpoint_url='https://mghp.osn.xsede.org'))

In [4]:
fs.glob('s3://rsignellbucket1/testing/*.tif')

['rsignellbucket1/testing/1998_cog.tif',
 'rsignellbucket1/testing/ORTO_DIM_PER1_20161124154557_SEN_P_008258_COG.tif',
 'rsignellbucket1/testing/p01_r02_leafon_composite_2018_final.tif',
 'rsignellbucket1/testing/red.tif']

In [5]:
url = 's3://rsignellbucket1/testing/1998_cog.tif'

In [6]:
fs.size(url)/1e9  # GB

9.546264722

In [7]:
url = 's3://rsignellbucket1/testing/p01_r02_leafon_composite_2018_final.tif'

In [8]:
fs.size(url)/1e9  # GB

11.550736318

In [9]:
url = 'https://mghp.osn.xsede.org/rsignellbucket1/testing/p01_r02_leafon_composite_2018_final.tif'

In [10]:
da = rxr.open_rasterio(url, chunks={'x':30900, 'y':1024, 'band':1})

In [11]:
%%time
# rioxarray returns dimensions in (band, y, x) order, so positional indexing
# as [x, y, band] selects the wrong data. Select by dimension name instead:
# first band, first 1024 rows, all 30900 columns -- i.e. exactly one dask chunk.
_ = da.isel(band=0, y=slice(0, 1024), x=slice(0, 30900)).load()

CPU times: user 675 ms, sys: 281 ms, total: 956 ms
Wall time: 3.57 s


In [13]:
da.nbytes/1e9

17.2422

In [None]:
da

In [12]:
# Promote to float BEFORE the arithmetic. da.nbytes works out to 2 bytes per
# sample, i.e. a 16-bit integer raster, and integer subtraction/addition can
# wrap around (always for negative differences if the dtype is unsigned).
# NOTE(review): assumes band 4 = NIR and band 3 = red, as the NDVI formula implies.
nir = da.sel(band=4).astype('float64')
red = da.sel(band=3).astype('float64')
ndvi = (nir - red) / (nir + red)

In [14]:
def configure_cluster(machine):
    ''' Helper function to configure a Dask cluster for a named machine.

    Parameters
    ----------
    machine : str
        One of 'denali', 'tallgrass', 'local' or 'esip-qhub-gateway-v0.4'.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the cluster that was started or attached to.

    Raises
    ------
    ValueError
        If ``machine`` is not one of the supported names.
    '''
    if machine == 'denali':
        from dask.distributed import LocalCluster, Client
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif machine == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
                               job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
                               memory='6GB')
        cluster.adapt(maximum_jobs=30)
        client = Client(cluster)

    elif machine == 'local':
        import os
        import warnings
        # These imports are function-local, so every branch that uses the
        # names must import them itself.
        from dask.distributed import LocalCluster, Client
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count() # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif machine in ['esip-qhub-gateway-v0.4']:
        import sys, os
        sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
        import ebdpy as ebd
        aws_profile = 'esip-qhub'
        # NOTE(review): this first call looks redundant with the fuller call
        # below; kept because ebdpy's behaviour is not visible from here.
        ebd.set_credentials(profile=aws_profile)

        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 30
        client,cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                              region=aws_region, use_existing_cluster=True,
                                              adaptive_scaling=False, wait_for_cluster=False,
                                              worker_profile='Medium Worker', propagate_env=True)

    else:
        # Fail with a clear message instead of an UnboundLocalError at `return`.
        raise ValueError(
            f"Unknown machine {machine!r}; expected one of "
            "'denali', 'tallgrass', 'local', 'esip-qhub-gateway-v0.4'"
        )

    return client, cluster

In [23]:
machine = 'esip-qhub-gateway-v0.4'
client, cluster = configure_cluster(machine)

Region: us-west-2
Existing Dask clusters:
Cluster Index c_idx: 0 / Name: dev.9f4a47fe0ce449d98325a5b3f66c4829 ClusterStatus.RUNNING
Using existing cluster [0].
Setting Fixed Scaling workers=30
Reconnect client to clear cache
client.dashboard_link (for new browser tab/window or dashboard searchbar in Jupyterhub):
https://jupyter.qhub.esipfed.org/gateway/clusters/dev.9f4a47fe0ce449d98325a5b3f66c4829/status
Propagating environment variables to workers
Using environment: users/pangeo


In [31]:
fs_write = fsspec.filesystem('s3', anon=False)

In [17]:
%%time
ds_ndvi = ndvi.to_dataset(name='ndvi')

CPU times: user 171 µs, sys: 49 µs, total: 220 µs
Wall time: 225 µs


In [18]:
ds_ndvi.ndvi.encoding

{}

In [19]:
import zarr
compressor = zarr.Blosc(cname="zstd", clevel=3, shuffle=2)

In [20]:
# ndvi is 2-D (y, x) once a single band has been selected, so the Zarr chunk
# shape must be 2-D as well; (1024, 30900) matches the dask chunks of the array.
encoding = {'ndvi': dict(compressor=compressor, chunks=(1024, 30900))}

In [22]:
ds_ndvi

Unnamed: 0,Array,Chunk
Bytes,10.71 GiB,241.41 MiB
Shape,"(46500, 30900)","(1024, 30900)"
Count,507 Tasks,46 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 10.71 GiB 241.41 MiB Shape (46500, 30900) (1024, 30900) Count 507 Tasks 46 Chunks Type float64 numpy.ndarray",30900  46500,

Unnamed: 0,Array,Chunk
Bytes,10.71 GiB,241.41 MiB
Shape,"(46500, 30900)","(1024, 30900)"
Count,507 Tasks,46 Chunks
Type,float64,numpy.ndarray


In [27]:
fs_write.ls('s3://esip-qhub/usgs/testing/ndvi.zarr/ndvi/')

['esip-qhub/usgs/testing/ndvi.zarr/ndvi/.zarray',
 'esip-qhub/usgs/testing/ndvi.zarr/ndvi/.zattrs']

In [32]:
a = ds_ndvi.to_zarr(fs_write.get_mapper('s3://esip-qhub/usgs/testing/ndvi.zarr'), 
                encoding=encoding, mode='w', compute=False)

In [33]:
dask.compute(a, retries=10)

(None,)

In [None]:
import dask
from dask.distributed import Client, LocalCluster, Lock
import xarray as xr
import rioxarray

file_path = "C:/Users/kpostma/OneDrive - DOI/NLCDShrub/Shrubv4/3index/GreatBasin_region_leafoff_1985_0801_1015.img"
# `chunks=` already makes xarray return lazy, dask-backed arrays, so wrapping
# the call in dask.delayed only hides the Dataset behind an opaque Delayed.
xds = xr.open_dataset(file_path, engine="rasterio", chunks={'band':6,'x':1024,'y':1024})
# Cast to float before the arithmetic so integer rasters cannot wrap around.
nir = xds.sel(band=4).astype('float32')
red = xds.sel(band=3).astype('float32')
ndvi = (nir - red) / (nir + red)
print(ndvi)

client = Client(n_workers=8, threads_per_worker=1, memory_limit='4GB')
print(client)

# compute=False defers the write so it can be triggered explicitly below;
# the distributed lock serialises writes to the output file across workers.
preds_out = ndvi.band_data.rio.to_raster('dask_multiworker_output_ndvi.tif',compute=False,tiled=True,lock=Lock('rio',client=client))
dask.compute(preds_out)

In [None]:
file_path = "C:/Users/kpostma/OneDrive - DOI/NLCDShrub/Shrubv4/3index/GreatBasin_region_leafoff_1985_0801_1015.img"

# Open lazily through the rasterio engine (provided by rioxarray). `chunks=`
# already yields dask-backed arrays, so no dask.delayed wrapper is needed and
# `xds` is a real Dataset that can be inspected, persisted and map_blocks'ed.
xds = xr.open_dataset(file_path, engine="rasterio", chunks={'band':6,'x':1024,'y':1024})

In [None]:
#xds

In [None]:
#xds.sel(band=6)

In [None]:
#def func(da):
#    print(da.sizes)
#    return da.time

#mapped = xds.map_blocks(func, [xds.sel(band=6)])

In [None]:
# Band order in this file:
#1=blue
#2=green
#3=red
#4=nir08
#5=swir16
#6=swir22
# Cast to float first so the difference/sum cannot wrap around if the bands
# are stored as integers.
nir = xds.sel(band=4).astype('float32')
red = xds.sel(band=3).astype('float32')
ndvi = (nir - red) / (nir + red)
ndvi

In [None]:
client = Client(n_workers=8, threads_per_worker=1, memory_limit='4GB')
client

In [None]:
%%time
ndvi_out = ndvi.compute()

In [None]:
%%time
ndvi_out.rio.to_raster('dask_multiworker_output_ndvi.tif',tiled=True,lock=Lock('rio',client=client))

In [None]:
client.close()

In [None]:
%%time
ndvi_nc = dask.delayed(ndvi.to_netcdf)('dask_multiworker_output_ndvi.nc')
dask.compute(ndvi_nc)

In [None]:
#with LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') as cluster, Client(cluster) as client:
preds_out = ndvi.band_data.rio.to_raster('dask_multiworker_output_ndvi.tif',compute=True,tiled=True,lock=Lock('rio',client=client))
#preds_out = ndvi.band_data.rio.to_raster('dask_multiworker_output_ndvi.tif',compute=True,tiled=True,lock=Lock('rio',client=client))
##preds_raster = dask.delayed(rioxarray.raster_array.RasterArray)(ndvi)
##preds_out = preds_raster.rio.to_raster('dask_multiworker_output_ndvi.tif',tiled=True,lock=Lock('rio',client=client))
dask.compute(preds_out)

In [None]:
%%time
ndvi_out = ndvi.compute()

In [None]:
ndvi_out

In [None]:
%%time
#with LocalCluster() as cluster, Client(cluster) as client:
ndvi_out.rio.to_raster('dask_multiworker_output_ndvi.tif',tiled=True,lock=Lock('rio',client=client))

In [None]:
del ndvi
del ndvi_out

In [None]:
import numpy as np

def calc_nd(band_a, band_b):
    """
    Calculate the scaled normalized difference of two bands.

        (((a - b) / (a + b)) + 1) * 100

    Parameters
    ----------
    band_a, band_b : array-like with an ``astype`` method
        The two bands (e.g. numpy arrays or xarray objects).

    Returns
    -------
    float32 array of the same kind as the inputs. Pixels where
    ``a + b == 0`` are not special-cased and follow normal floating-point
    division rules.
    """
    # Cast BEFORE the arithmetic: with unsigned integer inputs ``a - b`` would
    # wrap around (and ``a + b`` could overflow) if the cast came afterwards.
    a = band_a.astype(np.float32)
    b = band_b.astype(np.float32)
    nd = (a - b) / (a + b)
    nd += 1
    nd *= 100
    return nd

def calc_nd2(band_a, band_b):
    """
    Calculate the scaled normalized difference, type 2.

        (((1.5 * (a - b)) / (a + b + 0.5)) + 1) * 100

    Parameters
    ----------
    band_a, band_b : array-like with an ``astype`` method
        The two bands (e.g. numpy arrays or xarray objects).

    Returns
    -------
    float32 array of the same kind as the inputs.
    """
    # Cast BEFORE the arithmetic: with unsigned integer inputs ``a - b`` would
    # wrap around (and ``a + b`` could overflow) if the cast came afterwards.
    a = band_a.astype(np.float32)
    b = band_b.astype(np.float32)
    numer = (a - b) * 1.5
    denom = (a + b) + 0.5
    nd = numer / denom
    nd += 1
    nd *= 100
    return nd
#def model(in_data):
#   out_data = {}
#   out_data['out(1)'] = calc_nd(in_data['in(2)'].astype(np.float),in_data['in(5)'].astype(np.float),in_data['AOI(1)'])
#   out_data['out(2)'] = calc_nd(in_data['in(5)'].astype(np.float),in_data['in(4)'].astype(np.float),in_data['AOI(1)'])
#   out_data['out(3)'] = calc_nd2(in_data['in(4)'].astype(np.float),in_data['in(3)'].astype(np.float),in_data['AOI(1)'])
#   return out_data

In [None]:
def calc_nd(ds, band_a, band_b):
    """
    Calculate the scaled normalized difference of two bands of ``ds``.

        (((a - b) / (a + b)) + 1) * 100

    Parameters
    ----------
    ds : object with ``sel(band=...)``
        Container holding the bands (e.g. an xarray Dataset/DataArray block).
    band_a, band_b : band labels passed to ``ds.sel(band=...)``.

    Returns
    -------
    The float32 result of the formula above.
    """
    # Select each band once and cast BEFORE the arithmetic, so unsigned
    # integer data cannot wrap around in ``a - b`` or overflow in ``a + b``.
    a = ds.sel(band=band_a).astype(np.float32)
    b = ds.sel(band=band_b).astype(np.float32)
    nd = (a - b) / (a + b)
    nd += 1
    nd *= 100
    return nd

def calc_nd2(ds, band_a, band_b):
    """
    Calculate the scaled normalized difference, type 2, of two bands of ``ds``.

        (((1.5 * (a - b)) / (a + b + 0.5)) + 1) * 100

    Parameters
    ----------
    ds : object with ``sel(band=...)``
        Container holding the bands (e.g. an xarray Dataset/DataArray block).
    band_a, band_b : band labels passed to ``ds.sel(band=...)``.

    Returns
    -------
    The float32 result of the formula above.
    """
    # Select each band once and cast BEFORE the arithmetic, so unsigned
    # integer data cannot wrap around in ``a - b`` or overflow in ``a + b``.
    a = ds.sel(band=band_a).astype(np.float32)
    b = ds.sel(band=band_b).astype(np.float32)
    numer = (a - b) * 1.5
    denom = (a + b) + 0.5
    nd = numer / denom
    nd += 1
    nd *= 100
    return nd

In [None]:
xds

In [None]:
from dask.distributed import Client
client = Client()
client

In [None]:
%%time
from dask.distributed import wait

xds = client.persist(xds)
_ = wait(xds)

In [None]:
%%time
three_index_1 = xds.map_blocks(func=calc_nd, kwargs={'band_a':2,'band_b':5}, template=xds.sel(band=2).drop(labels='band'))

In [None]:
%%time
# Drop the scalar 'band' coordinate from the template, as done for three_index_1:
# the two selected bands carry different 'band' values, so the result of
# calc_nd has no 'band' coordinate and the template must not promise one.
three_index_2 = xds.map_blocks(func=calc_nd, kwargs={'band_a':5,'band_b':4}, template=xds.sel(band=5).drop(labels='band'))

In [None]:
%%time
# Drop the scalar 'band' coordinate from the template, as done for three_index_1:
# the two selected bands carry different 'band' values, so the result of
# calc_nd2 has no 'band' coordinate and the template must not promise one.
three_index_3 = xds.map_blocks(func=calc_nd2, kwargs={'band_a':4,'band_b':3}, template=xds.sel(band=4).drop(labels='band'))

In [None]:
#%%time
#three_index_1 = calc_nd(xds.sel(band=2),xds.sel(band=5))
#three_index_2 = calc_nd(xds.sel(band=5),xds.sel(band=4))
#three_index_3 = calc_nd2(xds.sel(band=4),xds.sel(band=3))

In [None]:
three_index_1

In [None]:
%%time
#with LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') as cluster, Client(cluster) as client:
three_index_1.compute()

In [None]:
%%time
three_index_2.compute()

In [None]:
%%time
three_index_3.compute()

In [None]:
%%time
with LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') as cluster, Client(cluster) as client:
    three_index_1.rio.to_raster('dask_multiworker_output_3index_1.tif',tiled=True,lock=Lock('rio',client=client))

In [None]:
%%time
with LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') as cluster, Client(cluster) as client:
    three_index_2.rio.to_raster('dask_multiworker_output_3index_2.tif',tiled=True,lock=Lock('rio',client=client))

In [None]:
%%time
with LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') as cluster, Client(cluster) as client:
    three_index_3.rio.to_raster('dask_multiworker_output_3index_3.tif',tiled=True,lock=Lock('rio',client=client))

In [None]:
with LocalCluster() as cluster, Client(cluster) as client:
    xds = rioxarray.open_rasterio(
        file_path,
        chunks=True,
        lock=False,
        # lock=Lock("rio-read", client=client), # when too many file handles open
    )
    xds.rio.to_raster(
        "dask_multiworker_output.tif",
        tiled=True,
        lock=Lock("rio", client=client),
    )

In [None]:
# initial imports and reusable functions

#import holoviews as hv
#hv.extension('bokeh')

from copy import deepcopy
import geopandas as gpd
#import hvplot.pandas
import pandas as pd
#import pystac
from shapely.geometry import shape
import os
os.environ['AWS_REQUEST_PAYER'] = 'requester'

In [None]:

from shapely.geometry import mapping
from pystac_client import Client#%load_ext dotenv
#%dotenv

In [None]:
#%load_ext dotenv
#%dotenv

# logging 
import logging
logging.basicConfig()
import os
os.environ["AWS_REQUEST_PAYER"] = "requester" 

# set pystac_client logger to DEBUG to see API calls
logger = logging.getLogger('pystac_client')
logger.setLevel(logging.INFO)

<h1> AOI</h1>
We start with a spatial AOI (area of interest), which should be a single GeoJSON Feature with a geometry type of Point, LineString, Polygon, MultiPoint, MultiLineString, or MultiPolygon. A fast and simple tool for creating such an AOI is http://geojson.io/.

Save a GeoJSON Feature (not a FeatureCollection!) in a file accessible by this notebook.

In [None]:
#Read all ARD TILES
ARD_Tiles = gpd.read_file('/data/opt/c-experiments/2_Gridding_For_Scale/CONUS_C2_ARD_grid/conus_c2_ard_grid.shp')

In [None]:
ARD_lat_lon = ARD_Tiles.to_crs("EPSG:4326")

In [None]:
ARD_lat_lon.plot()

In [None]:
ARD_lat_lon.head()

In [None]:
h = 29

In [None]:
v = 5

In [None]:
# Select a class
selection = ARD_lat_lon[ARD_lat_lon["h"]==h]

In [None]:
# Narrow the selection to rows whose "v" column equals the v chosen above
# (use the variable rather than repeating the literal, so changing v takes effect).
one_gdf = selection[selection["v"]==v]

In [None]:
one_gdf.plot()

In [None]:
my_poly = one_gdf.geometry

In [None]:
my_poly

In [None]:
from shapely.geometry import mapping
# Take the first geometry by position instead of by the hard-coded index
# label 702, which only exists for one particular h/v selection.
# NOTE(review): assumes the h/v filter leaves exactly one tile -- confirm.
geom = mapping(one_gdf.geometry.iloc[0])

In [None]:
# read in AOI as a GeoDataFrame
#aoi = gpd.read_file('mini_square_maine.geojson')

# get the geometry of the AOI as a dictionary for use with PySTAC Client
#from shapely.geometry import mapping
#geom = mapping(aoi.to_dict()['geometry'][0])

In [None]:
# STAC API - Landsat Collection 2
url = "https://landsatlook.usgs.gov/stac-server"

# Search parameters
params = {
    #"collections": ["landsat-c2l2-sr"],
    #Access Collection 2 ARD Tiles
    "collections": ["landsat-c2ard-sr"],
    "intersects": geom,
    "datetime": "2019-05-01/2019-09-30",
    "limit": 100,
    "query": ["platform=LANDSAT_8", "eo:cloud_cover<40"]
}

# Import under an alias so this cell does not rebind the name `Client`,
# which other cells use for dask.distributed.Client.
from pystac_client import Client as StacClient
cat = StacClient.open(url)
search = cat.search(**params)

# Ask the API for the match count once and reuse it.
matched = search.matched()
print(f"{matched} scenes found")

In [None]:
import hvplot.pandas
import hvplot.xarray

# plot size settings
frame_width = 600
frame_height = 600

# line width of polygons
line_width = 3

# plot polygons as lines on a slippy map with background tiles.
def plot_polygons(data, *args, **kwargs):
    """Plot the geometries in ``data`` as paths via ``data.hvplot.paths``.

    Uses the notebook-level ``frame_width``, ``frame_height`` and
    ``line_width`` settings, with OSM tiles and both axes hidden. Extra
    positional and keyword arguments are forwarded unchanged.
    """
    draw_paths = data.hvplot.paths
    return draw_paths(
        *args,
        geo=True,
        tiles='OSM',
        xaxis=None,
        yaxis=None,
        frame_width=frame_width,
        frame_height=frame_height,
        line_width=line_width,
        **kwargs,
    )

from copy import deepcopy
import geopandas as gpd
import pandas as pd
from shapely.geometry import shape

# convert a list of STAC Items into a GeoDataFrame
def items_to_geodataframe(items):
    """Convert STAC Item dictionaries into a GeoDataFrame.

    Parameters
    ----------
    items : iterable of dict
        STAC Items in dictionary form; each must have a 'geometry' entry.
        The input dictionaries are not modified.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per item with nested keys flattened to dotted column names.
        The datetime-like property columns that are present are parsed, and
        the frame is indexed by 'properties.datetime' when that column exists
        (for example, it does not exist when ``items`` is empty).
    """
    records = []
    for item in items:
        # Work on a copy so the caller's item dictionaries stay untouched.
        record = deepcopy(item)
        record['geometry'] = shape(record['geometry'])
        records.append(record)

    gdf = gpd.GeoDataFrame(pd.json_normalize(records))
    for field in ['properties.datetime', 'properties.created', 'properties.updated']:
        if field in gdf:
            gdf[field] = pd.to_datetime(gdf[field])

    # Guard the index like the columns above: an empty search result has no
    # 'properties.datetime' column and would otherwise raise a KeyError.
    if 'properties.datetime' in gdf:
        gdf = gdf.set_index('properties.datetime')
    return gdf

In [None]:
%%time
from pystac import ItemCollection

# get all items found in search
items_dict = []
for item in search.get_all_items_as_dict()['features']:
    # Iterate over the asset dictionaries directly; this also avoids reusing
    # the notebook-level name `a` as a loop variable.
    for asset in item['assets'].values():
        # Prefer the S3 alternate location when the asset advertises one.
        if 'alternate' in asset and 's3' in asset['alternate']:
            asset['href'] = asset['alternate']['s3']['href']
        # NOTE(review): the original followed this with
        # href.replace('usgs-landsat-ard', 'usgs-landsat-ard'), a no-op that
        # has been removed. If a bucket rename was intended, restore it with
        # the correct source and target names.
    items_dict.append(item)

# Create GeoDataFrame from resulting Items
items_gdf = items_to_geodataframe(items_dict)
item_collection = ItemCollection(items_dict)

In [None]:
import yaml

# Collection key and `units` field spelled as in the configuration inside
# open_odc (the original had 'landat-c2l2-sr' and 'unit').
# NOTE(review): the search in this notebook uses the 'landsat-c2ard-sr'
# collection -- confirm which collection key this configuration should target.
cfg_yaml = """---
landsat-c2l2-sr:
  measurements:
    '*':
      dtype: uint16
      nodata: 0
      units: 'm'
"""
cfg = yaml.load(cfg_yaml, Loader=yaml.CSafeLoader)

In [None]:
# Convert the STAC item(s) to ODC datasets
import yaml

from odc import stac
from pyproj import CRS
from pystac.extensions.projection import ProjectionExtension

def open_odc(items, crs=None, resolution=None):
    """Load STAC items as a chunked dataset through odc-stac.

    Parameters
    ----------
    items : indexable collection of STAC items
        Items to load; the first one supplies the default CRS and resolution.
    crs : optional
        Output CRS. Defaults to the EPSG code from the first item's
        projection extension.
    resolution : optional
        Output resolution. Defaults to ``(transform[4], transform[0])`` of the
        first item's projection transform.

    Returns
    -------
    The object returned by ``stac.dc_load`` for the blue, green, red, nir08,
    swir16, swir22 and qa_pixel bands, chunked 1024 x 1024 and grouped by
    solar day.
    """
    configuration_str = """---
        landsat-c2l2-sr:
          measurements:
            '*':
              dtype: float32
              nodata: 0
              units: 'm'
        """
    # CSafeLoader needs PyYAML built against libyaml; fall back to the
    # pure-Python SafeLoader when it is unavailable.
    loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)
    configuration = yaml.load(configuration_str, Loader=loader)
    datasets = list(stac.stac2ds(items, configuration))

    proj = ProjectionExtension.ext(items[0])
    if crs is None:
        crs = CRS.from_epsg(proj.epsg)
    if resolution is None:
        resolution = (proj.transform[4], proj.transform[0])

    # Use the configuration built above rather than the notebook-global `cfg`,
    # so the function does not depend on another cell having been run.
    data = stac.dc_load(datasets,
                        bands=['blue','green','red','nir08','swir16','swir22','qa_pixel'],
                        chunks={"x": 1024, "y": 1024},
                        output_crs=crs,
                        resolution=resolution,
                        groupby='solar_day',
                        stac_cfg=configuration)
    return data

In [None]:
# open found items as an OpenDataCube

import rasterio as rio
import xarray as xr

_datacube = open_odc(item_collection,'epsg:5070') #resolution='30')
_datacube.to_array(dim='bands')

In [None]:
%%time
import rioxarray
datacube = _datacube.rio.clip([geom], crs='epsg:4326')

In [None]:
datacube

In [None]:
from odc.algo import to_rgba

vis = to_rgba(datacube, clamp=(1, 20000), bands=['red', 'green', 'blue'])
vis

In [None]:
# Cast to float before the arithmetic so integer bands cannot wrap around in
# the difference. NOTE(review): the bands may load as uint16, depending on the
# configuration passed to dc_load -- confirm.
nir = datacube['nir08'].astype('float32')
red = datacube['red'].astype('float32')
ndvi = ((nir - red) / (nir + red)).clip(0, 1)
ndvi.name = 'ndvi'
ndvi

In [None]:
def calc_nd(band_a, band_b):
    """
    Calculate the scaled normalized difference of two bands.

        (((a - b) / (a + b)) + 1) * 100

    Parameters
    ----------
    band_a, band_b : array-like
        The two bands; converted to float64 numpy arrays.

    Returns
    -------
    numpy.ndarray (float64). Where ``a + b == 0`` the ratio is taken as 0,
    so the result there is 100.
    """
    # np.float was removed in NumPy 1.24; use an explicit float64. Convert
    # BEFORE the arithmetic so unsigned integer inputs cannot wrap around.
    a = np.asarray(band_a, dtype=np.float64)
    b = np.asarray(band_b, dtype=np.float64)
    numer = a - b
    denom = a + b
    # Leave the pre-filled zeros wherever the denominator is zero.
    nd = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    nd += 1
    nd *= 100
    return nd
def calc_nd2(band_a, band_b):
    """
    Calculate the scaled normalized difference, type 2.

        (((1.5 * (a - b)) / (a + b + 0.5)) + 1) * 100

    Parameters
    ----------
    band_a, band_b : array-like
        The two bands; converted to float64 numpy arrays.

    Returns
    -------
    numpy.ndarray (float64). Where the denominator is 0 the ratio is taken
    as 0, so the result there is 100.
    """
    # Convert BEFORE the arithmetic so unsigned integer inputs cannot wrap
    # around in ``a - b``.
    a = np.asarray(band_a, dtype=np.float64)
    b = np.asarray(band_b, dtype=np.float64)
    numer = (a - b) * 1.5
    denom = (a + b) + 0.5
    # Leave the pre-filled zeros wherever the denominator is zero.
    nd = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    nd += 1
    nd *= 100
    return nd
#def model(in_data):
#   out_data = {}
#   out_data['out(1)'] = calc_nd(in_data['in(2)'].astype(np.float),in_data['in(5)'].astype(np.float),in_data['AOI(1)'])
#   out_data['out(2)'] = calc_nd(in_data['in(5)'].astype(np.float),in_data['in(4)'].astype(np.float),in_data['AOI(1)'])
#   out_data['out(3)'] = calc_nd2(in_data['in(4)'].astype(np.float),in_data['in(3)'].astype(np.float),in_data['AOI(1)'])
#   return out_data

In [None]:
# local Dask

from dask.distributed import Client
client = Client()
client

In [None]:
%%time
from dask.distributed import wait

vis = client.persist(vis)
_ = wait(vis)

In [None]:
%%time
vis_ = vis.compute()
vis_.plot.imshow(col='time', rgb='band', col_wrap=5, robust=True)

In [None]:
client.close()