# Test performance of NSIDC 0723 COGs

In [2]:
# 1) Test set: ten NSIDC-0723 mosaics, one URL per line inside a single string literal
cog_urls = ''' 
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2019.12.30/GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.05/GL_S1bks_mosaic_05Jan20_10Jan20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.11/GL_S1bks_mosaic_11Jan20_16Jan20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.17/GL_S1bks_mosaic_17Jan20_22Jan20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.23/GL_S1bks_mosaic_23Jan20_28Jan20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.29/GL_S1bks_mosaic_29Jan20_03Feb20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.04/GL_S1bks_mosaic_04Feb20_09Feb20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.10/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.16/GL_S1bks_mosaic_16Feb20_21Feb20_gamma0_50m_v03.0.tif
https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.22/GL_S1bks_mosaic_22Feb20_27Feb20_gamma0_50m_v03.0.tif            
'''
# Splitting on whitespace drops the blank first/last lines and any trailing spaces,
# leaving exactly one list entry per URL.
cogs = [entry for entry in cog_urls.split()]

In [3]:
import os
# Export the GDAL/curl settings through the process environment so that the
# GDAL command-line tools launched from later cells inherit them.
# (.urs_cookies is presumably the Earthdata/URS login cookie file - confirm.)
os.environ.update({
    'GDAL_HTTP_COOKIEFILE': '.urs_cookies',
    'GDAL_HTTP_COOKIEJAR': '.urs_cookies',
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    'CPL_CURL_VERBOSE': 'NO',
})

In [None]:
#check cog is valid
%run validate_cloud_optimized_geotiff.py /vsicurl/{url}

### GDALINFO

In [None]:
%%time
# gdalinfo should only require 1 GET request for header:
# stderr (which carries the --debug trace) is redirected to gdal.log and stdout to
# gdal.out, so the following cell can grep the log for download requests.
url = cogs[0]
cmd = f'gdalinfo --debug on /vsicurl/{url} 2>gdal.log 1>gdal.out'
print(cmd)
!{cmd}

In [None]:
# Count, then list, the "Downloading" lines in the debug log
# (presumably one line per HTTP range request - confirm against the log format).
!grep Downloading gdal.log | wc
!grep Downloading gdal.log

In [None]:
%%time
# Similarly, requesting a small window of data should be a single GET request
# (following https://trac.osgeo.org/gdal/wiki/CloudOptimizedGeoTIFF).
# -srcwin 1024 1024 256 256 reads a 256x256 pixel window at pixel offset (1024, 1024).
# Reuses `url` from the gdalinfo cell; gdal.log and gdal.out are overwritten.
cmd = f'gdal_translate --debug on /vsicurl/{url} -srcwin 1024 1024 256 256 out.tif 2>gdal.log 1>gdal.out'
print(cmd)
!{cmd}

In [None]:
# Count, then list, the "Downloading" lines logged by the gdal_translate window read above.
!grep Downloading gdal.log | wc
!grep Downloading gdal.log

#### Reconvert to COG and put on S3
https://gdal.org/drivers/raster/cog.html

`gdal_translate -of COG` will lead to the following changes (requires GDAL >= 3.1, the release that introduced the COG driver):

1. Increase the GeoTIFF version from 1.0 to 1.1 (approved NASA standard: https://earthdata.nasa.gov/esdis/eso/standards-and-references/geotiff)
1. Make the overviews use 512x512 tiling (currently only the main image is 512x512; the overviews are 128x128)
1. BIGTIFF=NO: currently all images are < 4 GB, so BigTIFF is not needed
1. Ensure all IFD headers come before the data (this was not a strict requirement in the past; it adds the flags LAYOUT=IFDS_BEFORE_DATA and LAYOUT=COG to the TIFF metadata to ensure efficient network reads)



In [None]:
# Print the installed GDAL version (see the GDAL version requirement for `-of COG` noted above).
!gdalinfo --version

In [None]:
#%%time
# NOTE: when operating on entire files, it is generally better to download to local disk and then translate.
# Converting straight from /vsicurl/ was taking 9+ min, probably due to GDAL_CACHE sizes,
# see http://osgeo-org.1560.x6.nabble.com/gdal-dev-optimal-vsicurl-settings-for-merging-range-requests-td5389484.html
# The remote-conversion attempt is kept below (disabled) for reference.
# NOTE(review): the disabled command never interpolates `options`.

#output = os.path.basename(url).replace('v03.0', 'v03.1')
#options = '-co COMPRESS=DEFLATE -co GEOTIFF_VERSION=1.1 -co BIGTIFF=NO'
#cmd = f'gdal_translate -of COG /vsicurl/{url} {output} 2>gdal.log 1>gdal.out'
#print(cmd)
#!{cmd}

In [None]:
%%time

# Download the current `url` into the working directory (-q silences wget's progress output).
cmd = f'wget -q {url}'
print(cmd)
!{cmd}

In [None]:
# Some command line tools for detailed COG information
# NOTE: tiffinfo gives a good summary and prints IFD byte offsets - 0x10 (16) for the first
#!tiffinfo GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif
# For GeoTIFFs in particular:
#!listgeo GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif

In [None]:
%%time

# Rewrite the locally downloaded file as a COG. The output keeps the same name
# with 'v03.0' replaced by 'v03.1'; gdal.log and gdal.out are overwritten.
filename = os.path.basename(url)
output = filename.replace('v03.0', 'v03.1')
options = '-co COMPRESS=DEFLATE -co GEOTIFF_VERSION=1.1 -co BIGTIFF=NO'
cmd = f'gdal_translate {options} -of COG {filename} {output} 2>gdal.log 1>gdal.out'
print(cmd)
!{cmd}

In [None]:
# Convert full list of COGs
for url in cogs:
    os.system(f'wget {url}')
    filename = os.path.basename(url)
    output = filename.replace('v03.0', 'v03.1')
    options = '-co COMPRESS=DEFLATE -co GEOTIFF_VERSION=1.1 -co BIGTIFF=NO'
    cmd = f'gdal_translate {options} -of COG {filename} {output} 2>gdal.log 1>gdal.out'
    print(cmd)
    !{cmd}   

In [None]:
%%time

#upload the file to S3
# NOTE(review): only the single file currently named by `output` is copied. After the
# conversion loop that is the last entry of `cogs` - confirm how the other files
# read from this bucket in later cells are uploaded.
cmd = f'aws s3 cp {output} s3://my-cog-server/'
print(cmd)
!{cmd}

### GDALWARP

In [None]:
# GDALWARP test NSIDC
# lots of GET requests and download errors
# actualy increases number off nsidc?! GDAL_MAX_RAW_BLOCK_CACHE_SIZE=120000000 
#env = 'GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies'
env = 'GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies '
env += 'GDAL_MAX_RAW_BLOCK_CACHE_SIZE=200000000 GDAL_SWATH_SIZE=200000000 VSI_CURL_CACHE_SIZE=200000000'
src = '/vsicurl/https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.10/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.0.tif'
dst = os.path.basename(src)
cmd = f'{env} gdalwarp --debug on -overwrite -t_srs EPSG:4326 -te -54.85 69.31 -52.18 70.26 {src} {dst} 2>gdalwarp.log 1>gdalwarp.out'
print(cmd)
!{cmd}

In [None]:
# GDALWARP 3.0 AWS (s3://my-cog-server/GL_S1bks_mosaic_13Jan15_24Jan15_gamma0_50m_v03.0.tif)
# Same warp as the NSIDC test, but reading the v03.0 file from the S3 bucket.
# Reuses `env` from the NSIDC gdalwarp cell. dst has the same file name as in that
# cell, so -overwrite replaces its output; only the log/out file names differ.
src = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.0.tif'
dst = os.path.basename(src)
cmd = f'{env} gdalwarp --debug on -overwrite -t_srs EPSG:4326 -te -54.85 69.31 -52.18 70.26 {src} {dst} 2>gdalwarpAWS.log 1>gdalwarpAWS.out'
print(cmd)
!{cmd}

In [None]:
# GDALWARP 3.1 AWS
# Same warp again, now reading the re-converted v03.1 file from the S3 bucket.
# Reuses `env` from the NSIDC gdalwarp cell; the debug trace goes to gdalwarpAWS31.log.
src = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.1.tif'
dst = os.path.basename(src)
cmd = f'{env} gdalwarp --debug on -overwrite -t_srs EPSG:4326 -te -54.85 69.31 -52.18 70.26 {src} {dst} 2>gdalwarpAWS31.log 1>gdalwarpAWS31.out'
print(cmd)
!{cmd}

In [None]:
# Network requests: count the "Downloading" lines in each gdalwarp debug log
# (order: NSIDC v03.0, AWS v03.0, AWS v03.1).
!grep Downloading gdalwarp.log | wc
!grep Downloading gdalwarpAWS.log | wc
!grep Downloading gdalwarpAWS31.log | wc

### GDALTRANSLATE

Getting one 512x512 tile from a COG should take 2 GET requests (1 for the header, 1 for the data).

In [None]:
# Test NSIDC 3.0
# lots of GET requests and download errors
# Reads the 512x512 window at the raster origin (-srcwin 0 0 512 512) from the NSIDC
# server. `env` is reset here without the cache-size settings used for gdalwarp.
# NOTE(review): dst is the bare file name of src, so any local file of that name is replaced.
env = 'GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies '
src = '/vsicurl/https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.02.10/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.0.tif'
dst = os.path.basename(src)
cmd = f'{env} gdal_translate --debug on -srcwin 0 0 512 512 {src} {dst} 2>gdaltranslate.log 1>gdaltranslate.out'
!{cmd}

In [None]:
# Test AWS 3.0
# Same 512x512 window read, against the v03.0 file in the S3 bucket;
# the debug trace goes to gdaltranslateAWS.log.
env = 'GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies'
src = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.0.tif'
dst = os.path.basename(src)
cmd = f'{env} gdal_translate --debug on -srcwin 0 0 512 512 {src} {dst} 2>gdaltranslateAWS.log 1>gdaltranslateAWS.out'
!{cmd}

In [None]:
# Test AWS 3.1
# Same 512x512 window read, against the re-converted v03.1 file in the S3 bucket;
# the debug trace goes to gdaltranslateAWS31.log.
env = 'GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies'
src = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.1.tif'
dst = os.path.basename(src)
cmd = f'{env} gdal_translate --debug on -srcwin 0 0 512 512 {src} {dst} 2>gdaltranslateAWS31.log 1>gdaltranslateAWS31.out'
!{cmd}

In [None]:
# Count the "Downloading" lines in each gdal_translate debug log
# (order: NSIDC v03.0, AWS v03.0, AWS v03.1).
!grep Downloading gdaltranslate.log | wc
!grep Downloading gdaltranslateAWS.log | wc
!grep Downloading gdaltranslateAWS31.log | wc

In [5]:
# Try to retrieve >10 MB worth of pixels from the 29520x53220 file:
# the first 5 rows of tiles, ~288 MB **uncompressed**.
# http://osgeo-org.1560.x6.nabble.com/gdal-dev-gdalwarp-doesn-t-use-the-VSIL-curl-cache-td5391617.html
# Can also use VSI_CACHE_SIZE and VSI_CACHE=FALSE.
# Values tried: GDAL_MAX_RAW_BLOCK_CACHE_SIZE=300000000 (300 MB)
#               GDAL_MAX_RAW_BLOCK_CACHE_SIZE=100000000 (100 MB)
# In addition there is a global least-recently-used cache, CPL_VSIL_CURL_CACHE_SIZE,
# with a 16 MB default (note: the value 160000000 tried earlier is 160 MB, not 16 MB).

# NOTE: these swath sizes and cache sizes must match the **uncompressed** raster size
nrows = 512*5
env = 'CPL_VSIL_CURL_CACHE_SIZE=300000000 GDAL_SWATH_SIZE=300000000 GDAL_MAX_RAW_BLOCK_CACHE_SIZE=300000000 GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies'
src = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.1.tif'
dst = 'subset.tif'
# Full raster width (29520 px) by nrows rows, starting at the raster origin.
cmd = f'{env} gdal_translate --debug on -srcwin 0 0 29520 {nrows} {src} {dst} 2>row.log 1>row.out'
print(cmd)
!{cmd}

CPL_VSIL_CURL_CACHE_SIZE=300000000 GDAL_SWATH_SIZE=300000000 GDAL_MAX_RAW_BLOCK_CACHE_SIZE=300000000 GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies gdal_translate --debug on -srcwin 0 0 29520 2560 /vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_10Feb20_15Feb20_gamma0_50m_v03.1.tif subset.tif 2>row.log 1>row.out


In [6]:
# Count the "Downloading" lines logged while reading the 5-tile-row subset.
!grep Downloading row.log | wc

       2       8     284


## VRT

subset

minx	miny	maxx	maxy
0	-256677.224952	≈-2.31e+06	-88385.054733	-2.224398e+06


In [7]:
with open('nsidc.txt', 'w') as f:
    vsis = ['/vsicurl/' + line +'\n' for line in cogs]
    f.writelines(vsis)
        
!head -n 2 nsidc.txt
cmd = f'{env} gdalbuildvrt -overwrite -allow_projection_difference -separate -input_file_list nsidc.txt nsidc.vrt'
print(cmd)
!{cmd}

/vsicurl/https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2019.12.30/GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif
/vsicurl/https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2020.01.05/GL_S1bks_mosaic_05Jan20_10Jan20_gamma0_50m_v03.0.tif
CPL_VSIL_CURL_CACHE_SIZE=300000000 GDAL_SWATH_SIZE=300000000 GDAL_MAX_RAW_BLOCK_CACHE_SIZE=300000000 GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies gdalbuildvrt -overwrite -allow_projection_difference -separate -input_file_list nsidc.txt nsidc.vrt
0...10...20...30...40...50...60...70...80...90...100 - done.


In [8]:
with open('aws-s3.txt', 'w') as f:
    prefix = '/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/'
    vsis = [prefix + os.path.basename(cog) +'\n' for cog in cogs]
    f.writelines(vsis)
        
!head -n 2 aws-s3.txt
cmd = f'{env} gdalbuildvrt -overwrite -allow_projection_difference -separate -input_file_list aws-s3.txt aws.vrt'
print(cmd)
!{cmd}

/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif
/vsicurl/https://my-cog-server.s3-us-west-2.amazonaws.com/GL_S1bks_mosaic_05Jan20_10Jan20_gamma0_50m_v03.0.tif
CPL_VSIL_CURL_CACHE_SIZE=300000000 GDAL_SWATH_SIZE=300000000 GDAL_MAX_RAW_BLOCK_CACHE_SIZE=300000000 GDAL_DISABLE_READDIR_ON_OPEN=EMPTY_DIR GDAL_HTTP_COOKIEFILE=.urs_cookies GDAL_HTTP_COOKIEJAR=.urs_cookies gdalbuildvrt -overwrite -allow_projection_difference -separate -input_file_list aws-s3.txt aws.vrt
0...10...20...30...40...50...60...70...80...90...100 - done.


In [9]:
%%time

import xarray as xr
# Open the NSIDC VRT lazily. Each chunk is one band, 512 rows, and the full
# 29520-pixel width.
# NOTE(review): xr.open_rasterio is deprecated in newer xarray releases in favour of
# rioxarray.open_rasterio - confirm against the xarray version in use.
da = xr.open_rasterio('nsidc.vrt', chunks=dict(band=1, x=29520, y=512)) #ensure data loaded as dask arrays
da.data

CPU times: user 3.51 s, sys: 181 ms, total: 3.69 s
Wall time: 756 ms


Unnamed: 0,Array,Chunk
Bytes,62.84 GB,60.46 MB
Shape,"(10, 53220, 29520)","(1, 512, 29520)"
Count,1041 Tasks,1040 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 62.84 GB 60.46 MB Shape (10, 53220, 29520) (1, 512, 29520) Count 1041 Tasks 1040 Chunks Type float32 numpy.ndarray",29520  53220  10,

Unnamed: 0,Array,Chunk
Bytes,62.84 GB,60.46 MB
Shape,"(10, 53220, 29520)","(1, 512, 29520)"
Count,1041 Tasks,1040 Chunks
Type,float32,numpy.ndarray


In [12]:
# Select the test region by coordinate (still lazy, nothing is read yet).
# The y bounds are given high-to-low - presumably because the raster's y coordinate
# is stored in descending order; confirm against da.y.
subset = da.sel(x=slice(-256677, -88385), y=slice(-2.22e6, -2.31e6))
subset.data

Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,1091 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 242.35 MB 6.89 MB Shape (10, 1800, 3366) (1, 512, 3366) Count 1091 Tasks 50 Chunks Type float32 numpy.ndarray",3366  1800  10,

Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,1091 Tasks,50 Chunks
Type,float32,numpy.ndarray


In [13]:
%%time
# Trigger the actual reads of the subset from the NSIDC server.
# In the recorded run this raised RasterioIOError (TIFFReadEncodedTile() failed).
subset.data.persist()

RasterioIOError: Read or write failed. /vsicurl/https://n5eil01u.ecs.nsidc.org/DP4/MEASURES/NSIDC-0723.003/2019.12.30/GL_S1bks_mosaic_30Dec19_04Jan20_gamma0_50m_v03.0.tif, band 1: IReadBlock failed at X offset 14, Y offset 63: TIFFReadEncodedTile() failed.

In [14]:
%%time

# Open the AWS VRT with the same chunking as the NSIDC one. This rebinds `da`,
# so the NSIDC array is no longer reachable under that name.
# NOTE(review): xr.open_rasterio is deprecated in newer xarray releases in favour of
# rioxarray.open_rasterio - confirm against the xarray version in use.
da = xr.open_rasterio('aws.vrt', chunks=dict(band=1, x=29520, y=512)) #ensure data loaded as dask arrays
da.data

CPU times: user 5.78 ms, sys: 1.18 ms, total: 6.96 ms
Wall time: 5.91 ms


Unnamed: 0,Array,Chunk
Bytes,62.84 GB,60.46 MB
Shape,"(10, 53220, 29520)","(1, 512, 29520)"
Count,1041 Tasks,1040 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 62.84 GB 60.46 MB Shape (10, 53220, 29520) (1, 512, 29520) Count 1041 Tasks 1040 Chunks Type float32 numpy.ndarray",29520  53220  10,

Unnamed: 0,Array,Chunk
Bytes,62.84 GB,60.46 MB
Shape,"(10, 53220, 29520)","(1, 512, 29520)"
Count,1041 Tasks,1040 Chunks
Type,float32,numpy.ndarray


In [15]:
# Same coordinate window as the NSIDC subset, now taken from the AWS-backed array
# (rebinds `subset`; still lazy).
subset = da.sel(x=slice(-256677, -88385), y=slice(-2.22e6, -2.31e6))
subset.data

Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,1091 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 242.35 MB 6.89 MB Shape (10, 1800, 3366) (1, 512, 3366) Count 1091 Tasks 50 Chunks Type float32 numpy.ndarray",3366  1800  10,

Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,1091 Tasks,50 Chunks
Type,float32,numpy.ndarray


In [16]:
%%time
# Trigger the actual reads of the subset from the S3 bucket
# (completed in the recorded run, wall time 18.9 s).
subset.data.persist()

CPU times: user 4.25 s, sys: 1.15 s, total: 5.41 s
Wall time: 18.9 s


Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,50 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 242.35 MB 6.89 MB Shape (10, 1800, 3366) (1, 512, 3366) Count 50 Tasks 50 Chunks Type float32 numpy.ndarray",3366  1800  10,

Unnamed: 0,Array,Chunk
Bytes,242.35 MB,6.89 MB
Shape,"(10, 1800, 3366)","(1, 512, 3366)"
Count,50 Tasks,50 Chunks
Type,float32,numpy.ndarray
