In [70]:
import geopandas as gpd
import pandas as pd
from shapely import wkt
import matplotlib.colors as colors
import tempfile
import numpy as np

## Define locations

In [90]:
# Bucket root and the area-of-interest (AOI) folder inside it.
BASEURI = 'gs://fdl2023-sar-datasets'
AOI     = "conus"
AOIURI  = "/".join([BASEURI, AOI])

# Names of the per-chip datasets downloaded further down in this notebook.
datasets = [
    'biomass-2020',
    'esaworldcover-2020',
    'gunw_2020-01-01_2020-03-31',
    's1grd-2020',
    's2rgbm-2020',
    'strmdem',
    'modis44b006veg',
    'ghsbuilts-2020',
    'gssic',
]


## Find and download definition files

observe that there are two files

- one defining the chips and their boundaries
- one assigning every chip to one of three splits, built so as to reduce spatial data leakage between them

The two are kept in separate files because usually you only need to know which split a given chip belongs to, and that should not require loading all the boundary definitions, which are relatively large.

In [80]:
# List the AOI folder; IPython captures the shell output as a list of lines.
files = !gsutil ls $AOIURI

# Collect candidates first so that a missing file produces a readable error
# instead of a bare IndexError. When several match, the first one is used.
chipdefs_matches  = [i for i in files if i.endswith(".geojson") and 'partitions_aschips' in i]
splitdefs_matches = [i for i in files if i.endswith(".csv") and '_splits_' in i]
assert chipdefs_matches,  f"no '*partitions_aschips*.geojson' file found under {AOIURI}"
assert splitdefs_matches, f"no '*_splits_*.csv' file found under {AOIURI}"

chipdefs_file  = chipdefs_matches[0]
splitdefs_file = splitdefs_matches[0]

# The datasets live in a folder named like the chip definitions file minus its
# extension. Strip only the trailing ".geojson": split(".")[0] would cut at the
# first dot anywhere in the URI (e.g. a dot in the bucket or folder name).
datasets_uri = chipdefs_file[:-len(".geojson")]
datasets_folder = datasets_uri.split("/")[-1]

print (f"using chip  definitions at {chipdefs_file}")
print (f"using datasets at          {datasets_uri}/")
print (f"using split definitions at {splitdefs_file}")


using chip  definitions at gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589.geojson
using datasets at          gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/
using split definitions at gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589_splits_60bands_angle09_60-20-20.csv


In [81]:
# download definitions

# Make sure the destination folder exists first: when `data` is not an existing
# directory, `gsutil cp` treats it as the destination *file name*, and the
# loading cells would then not find data/<file name>.
!mkdir -p data

# -n: do not overwrite files already downloaded, -c: continue on errors
!gsutil cp -nc $chipdefs_file data
!gsutil cp -nc $splitdefs_file data
!ls -las data

Skipping existing item: file://data/conus_partitions_aschips_293d95e3ee589.geojson
Skipping existing item: file://data/conus_partitions_aschips_293d95e3ee589_splits_60bands_angle09_60-20-20.csv
total 66592
    4 drwxrwxr-x 2 rlx rlx     4096 Dec  4 00:04 .
    4 drwxrwxr-x 5 rlx rlx     4096 Dec  4 10:49 ..
63404 -rw-rw-r-- 1 rlx rlx 64918760 Dec  3 20:03 conus_partitions_aschips_293d95e3ee589.geojson
 3176 -rw-rw-r-- 1 rlx rlx  3249458 Dec  3 20:03 conus_partitions_aschips_293d95e3ee589_splits_60bands_angle09_60-20-20.csv
    4 -rw-rw-r-- 1 rlx rlx      879 Dec  3 20:21 massachusetts.wkt


## Load chip definitions


In [82]:
# Local copies sit in data/ under the same base name as the remote objects.
chipdefs_localfile = "data/" + chipdefs_file.rsplit("/", 1)[-1]
splitdefs_localfile = "data/" + splitdefs_file.rsplit("/", 1)[-1]

# c: chip definitions (GeoDataFrame), s: split definitions indexed by the first CSV column
c = gpd.read_file(chipdefs_localfile)
s = pd.read_csv(splitdefs_localfile, index_col=0)

In [83]:
# (rows, columns) of the chip definitions and of the split definitions
(c.shape, s.shape)

((167406, 3), (167406, 1))

## Randomly select a few chips from the train split

We randomly select 20 chips from the train split that intersect Massachusetts.

In [84]:
# Read the area of interest as a shapely geometry from its WKT file.
with open("data/massachusetts.wkt", "r") as f:
    bounds = wkt.load(f)

# Keep the chips whose geometry intersects the area. The vectorized GeoPandas
# predicate replaces a Python-level loop over every chip geometry.
myarea = c[c.intersects(bounds)]

# Attach the split assignment to each selected chip.
myarea_splits = myarea.merge(s, left_on='identifier', right_on='identifier')
myarea_splits.shape

(1285, 4)

In [85]:
# Number of chips to draw, and a fixed seed so that re-running the notebook
# selects (and therefore downloads) the same chips.
N_CHIPS     = 20
SAMPLE_SEED = 0

myarea_splits   = myarea.merge(s, left_on='identifier', right_on='identifier')
# bracket access: attribute access to a column called `split` is fragile
myarea_train    = myarea_splits[myarea_splits["split"] == 'train']
# cap the sample size so that a small area does not raise in .sample()
myarea_selected = myarea_train.sample(n=min(N_CHIPS, len(myarea_train)), random_state=SAMPLE_SEED)
chipids = myarea_selected.identifier.values
chipids

array(['01be220189f77', '2c1a7f33fb7ff', '1f864ff2de2ff', '115145c182573',
       '2dbe6ebf00c92', '29301012c742e', '22be8f11396fe', '0f6826d7f42c7',
       '2b1f1cb4cc852', '2f666401552e1', '1cad5365f7d3b', '0b0999359260d',
       '2cfcab146184e', '1122d72bf5929', '303ca2a9aeb4f', '072e3a06bc49e',
       '07d00d673c2bd', '2fd48447443e1', '036baa35ffe0c', '015084acbf7dc'],
      dtype=object)

In [86]:
# interactive map of the selected chips, shown as the cell output
selected_map = myarea_selected.explore()
selected_map

## Download the corresponding chip files for each dataset

Note that some datasets are stored as GeoTIFF (`.tif`) and others as NetCDF (`.nc`), so the chip files are matched with a `<chipid>.*` wildcard.

In [87]:
# store the file list in a temp file

# `uri` was used here before being assigned anywhere above, so this cell raised
# NameError on a fresh kernel. NOTE(review): the first dataset is assumed to be
# the intended target of this standalone cell; confirm.
uri = f"{datasets_uri}/{datasets[0]}"

# Let tempfile choose a unique name instead of a random integer, which can
# collide with an existing file. delete=False keeps the file for later use.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    # one wildcard URI per chip, matching whatever extension the dataset uses
    f.write("\n".join([f"{uri}/{chipid}.*" for chipid in chipids]))
tmpfile = f.name

In [92]:
# Download the selected chips of every dataset into data/<datasets_folder>/<dataset>.
# (The former `dataset = datasets[0]` was a dead assignment: the loop rebinds it.)
for dataset in datasets:

    print (f"\n\n-----------\ndownloading {dataset}\n")

    # prepare local dir
    uri = f"{datasets_uri}/{dataset}"
    localdest = f"data/{datasets_folder}/{dataset}"
    !mkdir -p $localdest

    # store the file list in a temp file; tempfile picks a unique name
    # (a random integer as file name can collide with an existing file)
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        # one wildcard URI per chip, matching whatever extension the dataset uses
        f.write("\n".join([f"{uri}/{chipid}.*" for chipid in chipids]))
    tmpfile = f.name

    # download files for dataset (-I: read the URIs to copy from stdin)
    !cat $tmpfile | gsutil -m cp -nc -I $localdest

    # the list is no longer needed once the copy has finished
    !rm -f $tmpfile



-----------
downloading biomass-2020

Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/01be220189f77.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/2c1a7f33fb7ff.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/1f864ff2de2ff.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/115145c182573.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/2dbe6ebf00c92.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/29301012c742e.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/22be8f11396fe.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/biomass-2020/0f6826d7f42c7.nc...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips

## see downloaded sizes

In [62]:
# disk usage of each downloaded dataset folder
!du -hs data/{datasets_folder}/*

2,1M	data/conus_partitions_aschips_293d95e3ee589/biomass-2020
7,5M	data/conus_partitions_aschips_293d95e3ee589/esaworldcover-2020
41M	data/conus_partitions_aschips_293d95e3ee589/gunw_2020-01-01_2020-03-31
234M	data/conus_partitions_aschips_293d95e3ee589/s1grd-2020
263M	data/conus_partitions_aschips_293d95e3ee589/s2rgbm-2020
16M	data/conus_partitions_aschips_293d95e3ee589/strmdem


## firecci51 dataset

This dataset only contains tiles where fires were reported, so instead of reusing the chips selected above we pick some tiles from the ones that exist in the dataset.

In [94]:
# list every object stored under the firecci51 dataset folder
uri = "/".join([datasets_uri, "firecci51"])
firechips = !gsutil ls {uri}

In [97]:
# turn the captured listing into a NumPy array and show how many entries it has
firechips = np.asarray(firechips)
len(firechips)

19138

In [99]:
# Number of fire tiles to draw, and a fixed seed so that re-running the
# notebook selects (and therefore downloads) the same tiles.
N_FIRECHIPS    = 20
FIRECHIPS_SEED = 0

# shuffle a copy of the listing and keep the first N entries
# (slicing keeps this safe when fewer than N tiles exist)
firechips_selected = np.random.default_rng(FIRECHIPS_SEED).permutation(firechips)[:N_FIRECHIPS]
firechips_selected

array(['gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/1dbc74c01afa1.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/1b497cb779921.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/2f3459e7942eb.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/2c05448f11feb.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/0d425e7fee762.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/0d0b8b1146305.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/354fe6b995ac6.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/16428340014b5.tif',
       'gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/2422c7b4d4cb4.tif',
       'gs://fdl202

In [102]:
    # Download the selected firecci51 tiles into data/<datasets_folder>/firecci51.
    print ("\n\n-----------\ndownloading firecci51\n")

    localdest = f"data/{datasets_folder}/firecci51"
    !mkdir -p $localdest

    # store the file list in a temp file; tempfile picks a unique name
    # (a random integer as file name can collide with an existing file)
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        f.write("\n".join(firechips_selected))
    tmpfile = f.name

    # download files for dataset (-I: read the URIs to copy from stdin)
    !cat $tmpfile | gsutil -m cp -nc -I $localdest

    # the list is no longer needed once the copy has finished
    !rm -f $tmpfile



-----------
downloading firecci51

Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/1dbc74c01afa1.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/1b497cb779921.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/2f3459e7942eb.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/2c05448f11feb.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/0d425e7fee762.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/0d0b8b1146305.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/354fe6b995ac6.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/firecci51/16428340014b5.tif...
Copying gs://fdl2023-sar-datasets/conus/conus_partitions_aschips_293d95e3ee589/fire