# Datasets: Environmental Covariates

Datasets listed in [Supplementary_Data_File_1._environmental_covariates - Google Sheet](https://docs.google.com/spreadsheets/d/1hPw9G1A34SnlbDJ8sk3LYfwgoLN0Gbail2xdNb7viGc/edit#gid=106509025).

In [None]:
import os, sys, io, subprocess
from tqdm import tqdm
from pathlib import Path
import shutil
from urlpath import URL
import math
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray

In [None]:
def _list_dir(p):
    '''Return the entries of directory `p` as a list of paths.'''
    return list(p.iterdir())


# Expose the helper as a `Path.ls()` shorthand on every Path instance.
Path.ls = _list_dir

def _cp(src, dst):
    '''
    Copy the regular file `src` to `dst` using `shutil.copy`.

    `src` may be a `str` or a `Path`.  Raises `AssertionError` when `src`
    is not an existing regular file.
    '''
    src = Path(src)
    # Explicit check instead of `assert`: an assert statement is stripped
    # under `python -O`.  The exception type is kept for compatibility.
    if not src.is_file():
        raise AssertionError(f'Not a regular file: {src}')
    shutil.copy(src, dst)

In [None]:
def check_id(data_id):
    '''
    Guard against registering the same data id twice.

    Raises `RuntimeError` when `data_id` is already contained in the
    module-level `DATA_SRC`.
    '''
    # NOTE(review): `DATA_SRC` is not defined in the visible part of this
    # notebook -- confirm where it is created (or whether `ENCOV` was meant).
    if data_id in DATA_SRC:
        # A bare `raise` with no active exception only produced a generic
        # "No active exception to reraise" RuntimeError; keep the type but
        # carry the actual message.
        raise RuntimeError(
            f'Data_id "{data_id}" already exists. '
            'Check that it is not re-used.')

        
def add_download(id, download):
    '''
    Append a row holding `id` and its download URL to the global `ENCOV`.

    `download` is wrapped in `URL(...)` before being stored.
    '''
    # `ENCOV` is rebound below; without the `global` declaration Python
    # treats it as a local and the call fails with UnboundLocalError.
    global ENCOV
    row = pd.DataFrame([{'id': id, 'download': URL(download)}])
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported way to add rows.
    ENCOV = pd.concat([ENCOV, row], ignore_index=True)

    
def add_filename(id, filename):
    '''
    Append a row holding `id` and its local `filename` to the global `ENCOV`.
    '''
    # `ENCOV` is rebound below; without the `global` declaration Python
    # treats it as a local and the call fails with UnboundLocalError.
    global ENCOV
    row = pd.DataFrame([{'id': id, 'filename': filename}])
    # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
    # supported way to add rows.
    ENCOV = pd.concat([ENCOV, row], ignore_index=True)


In [None]:
def execute_wgt(url, dst='./'):
    '''
    Start `wget` as a child process to download `url` into directory `dst`.

    The output file is named after `url.name`.  Returns the
    `subprocess.Popen` handle without waiting for it; the child's stderr
    is connected to a pipe on that handle.
    '''
    target = Path(dst) / url.name
    wget_args = ['wget', url, '-O', target]
    return subprocess.Popen(wget_args, stderr=subprocess.PIPE)


# def monitor_wgt_proc(proc):
#     while True:
#         line = proc.stderr.readline()

#         if line=='' and proc.poll() is not None:
#             break
#         else:
#             print(f'\r{line}', end='')
     
#     proc.stderr.close()
    
#     return_code = proc.wait()
#     return return_code

In [None]:
def unzip(src, dst='./'):
    '''
    Unpack zip file at `src` to directory `dst`.

    `dst` may be a `str` or a `Path`; it is created if it does not exist
    (its parent must already exist).  Returns the `subprocess.Popen`
    handle of the `unzip` process without waiting for it; stdout and
    stderr are connected to pipes on that handle.
    '''
    # Coerce first: the default './' (and any other plain string) has no
    # `.mkdir`, so the original raised AttributeError for `str` input.
    dst = Path(dst)
    dst.mkdir(exist_ok=True)
    proc = subprocess.Popen(
        ['unzip', str(src), '-d', str(dst)],
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return proc

In [None]:
# Data directory, given relative to the current working directory.
DIR_DATA = Path('..') / 'data'

In [None]:
# Read the CSV export of the Google Sheet that lists the data sources
# and their info.
_csv_name = ('Supplementary_Data_File_1._environmental_covariates'
             ' - sheet1.csv')
pth_csv = DIR_DATA.joinpath(_csv_name)

ENCOV = pd.read_csv(pth_csv)

In [None]:
# Show the column names of the covariate table as loaded from the CSV.
ENCOV.columns

Index(['#', 'Variable', 'Description', 'Source / Link', 'doi',
       'File Size (GB)', 'GEE ID', 'Unnamed: 7'],
      dtype='object')

## Earthenv Cloud
Global 1-km Cloud Cover: https://www.earthenv.org/cloud

In [None]:
# (Variable name, direct download URL) pairs for the EarthEnv cloud layers.
_cloud_layers = (
    'MODCF_interannualSD',
    'MODCF_intraannualSD',
    'MODCF_meanannual',
    'MODCF_seasonality_concentration',
    'MODCF_seasonality_theta',
)

var_downloads = [
    (f'EarthEnvCloudCover_{layer}',
     f'https://data.earthenv.org/cloud/{layer}.tif')
    for layer in _cloud_layers
]

In [None]:
# Store the direct download link on the row(s) of each cloud variable.
for variable, download in var_downloads:
    matches = ENCOV['Variable'] == variable
    ENCOV.loc[matches, 'Source / Link'] = download

## Earthenv Topography

Global 1, 5, 10, 100-km Topography: https://www.earthenv.org/topography.

Couldn't obtain the direct download URLs for these using the browser's developer tools.

In [None]:
# Variable-name suffix -> file-name stem of the EarthEnv topography layers.
_topo_layers = {
    '1stOrderPartialDerivEW': 'dx',
    '1stOrderPartialDerivNS': 'dy',
    '2ndOrderPartialDerivEW': 'dxx',
    '2ndOrderPartialDerivNS': 'dyy',
    'AspectCosine': 'aspectcosine',
    'AspectSine': 'aspectsine',
    'Eastness': 'eastness',
    'Elevation': 'elevation',
    'Northness': 'northness',
    'ProfileCurvature': 'pcurv',
    'Roughness': 'roughness',
    'Slope': 'slope',
    'TangentialCurvature': 'tcurv',
    'TerrainRuggednessIndex': 'tri',
    'TopoPositionIndex': 'tpi',
    'VectorRuggednessMeasure': 'vrm',
}

# (Variable name, file name) pairs, in the order listed above.
var_fns = [
    (f'EarthEnvTopoMed_{suffix}', f'{stem}_1KMmd_GMTEDmd.tif')
    for suffix, stem in _topo_layers.items()
]

In [None]:
# Record the local file name on the row(s) of each topography variable.
for variable, filename in var_fns:
    matches = ENCOV['Variable'] == variable
    ENCOV.loc[matches, 'filename'] = filename


After the files have been uploaded to GEE (here by user `bingosaucer`), set the `GEE ID` column:

In [None]:
# GEE user name; used below to build asset ids of the form
# users/<gee_user>/<Variable>.
gee_user = 'bingosaucer'

In [None]:
def _is_topo_variable(name):
    '''True when the stringified variable name starts with EarthEnvTopoMed.'''
    return str(name).startswith('EarthEnvTopoMed')


# Boolean mask over the ENCOV rows selecting the topography variables.
is_topo = ENCOV.Variable.apply(_is_topo_variable)

In [None]:
# Asset ids follow the pattern users/<gee_user>/<Variable>.
_topo_variables = ENCOV.loc[is_topo, 'Variable']
ENCOV.loc[is_topo, 'GEE ID'] = _topo_variables.map(
    lambda name: f'users/{gee_user}/{name}')

## FanEtAl_Depth_to_Water_Table_AnnualMean 

http://thredds-gfnl.usc.es/thredds/catalog/GLOBALWTDFTP/catalog.html

In [None]:
# Name of the water-table row in ENCOV; used below to look that row up.
var_wtb = 'FanEtAl_Depth_to_Water_Table_AnnualMean'

In [None]:
# (Variable name, download URL) pairs, one per regional file.
_wtd_base = ('http://thredds-gfnl.usc.es/thredds/fileServer/'
             'GLOBALWTDFTP/annualmeans')
_wtd_regions = ('NAMERICA', 'SAMERICA', 'OCEANA', 'EURASIA', 'AFRICA')

var_downloads = [
    (f'{region}_WTD_annualmean',
     f'{_wtd_base}/{region}_WTD_annualmean.nc')
    for region in _wtd_regions
]

In [None]:
# Show the current column names of the covariate table.
ENCOV.columns

Index(['#', 'Variable', 'Description', 'Source / Link', 'doi',
       'File Size (GB)', 'GEE ID', 'Unnamed: 7', 'filename'],
      dtype='object')

In [None]:
# Template record for the water-table variable: its ENCOV row as a dict.
# squeeze() collapses the selection to a Series only when exactly one row
# matches.
_wtb_rows = ENCOV.loc[ENCOV['Variable'] == var_wtb]
dict_wtb = _wtb_rows.squeeze().to_dict()

In [None]:
# Print one record per regional file: a copy of the template carrying the
# regional variable name and download link.
for variable, download in var_downloads:
    d = {**dict_wtb, 'Variable': variable, 'Source / Link': download}
    print(d, end='\n\n')

{'#': 26.0, 'Variable': 'NAMERICA_WTD_annualmean', 'Description': 'Mean annual depth of the water table on the terrestrial land surface (in m below land surface)', 'Source / Link': 'http://thredds-gfnl.usc.es/thredds/fileServer/GLOBALWTDFTP/annualmeans/NAMERICA_WTD_annualmean.nc', 'doi': '10.1126/science.1229881', 'File Size (GB)': 0.09, 'GEE ID': 'users/bingosaucer/FanEtAl_Depth_to_Water_Table_AnnualMean', 'Unnamed: 7': nan, 'filename': nan}

{'#': 26.0, 'Variable': 'SAMERICA_WTD_annualmean', 'Description': 'Mean annual depth of the water table on the terrestrial land surface (in m below land surface)', 'Source / Link': 'http://thredds-gfnl.usc.es/thredds/fileServer/GLOBALWTDFTP/annualmeans/SAMERICA_WTD_annualmean.nc', 'doi': '10.1126/science.1229881', 'File Size (GB)': 0.09, 'GEE ID': 'users/bingosaucer/FanEtAl_Depth_to_Water_Table_AnnualMean', 'Unnamed: 7': nan, 'filename': nan}

{'#': 26.0, 'Variable': 'OCEANA_WTD_annualmean', 'Description': 'Mean annual depth of the water table on

In [None]:
%%time

# Open the downloaded file as a raster, tag it with EPSG:4326 and drop
# length-1 dimensions.
# NOTE(review): `url` is not defined in the visible cells -- presumably one
# of the `var_downloads` links wrapped in `URL(...)`; confirm before
# re-running.
ds = rioxarray.open_rasterio(f'../data/{url.name}')
ds.rio.write_crs('epsg:4326', inplace=True)
ds = ds.squeeze()

CPU times: user 131 ms, sys: 43.4 ms, total: 174 ms
Wall time: 280 ms


In [None]:
# Show the CRS currently attached to the dataset.
ds.rio.crs

CRS.from_epsg(4326)

In [None]:
%%time

# Write the `WTD` variable of the dataset to a raster file.
# NOTE(review): the output name is hard-coded to the North America file,
# whereas the following cell reads
# 'FanEtAl_Depth_to_Water_Table_AnnualMean.tiff', which this cell does not
# produce -- confirm how that file is created.
# ds.WTD.rio.to_raster(f'../data/{url.stem}.tiff')
ds.WTD.rio.to_raster(f'../data/NAMERICA_WTD_annualmean.tiff')

CPU times: user 21min 11s, sys: 3min 35s, total: 24min 47s
Wall time: 25min 59s


In [None]:
# `xr.open_rasterio` is deprecated and no longer available in current
# xarray releases; use the rioxarray reader that this notebook already
# uses for the source file.
rioxarray.open_rasterio(
    '../data/FanEtAl_Depth_to_Water_Table_AnnualMean.tiff')

## MODIS_LAI

https://explorer.earthengine.google.com/#detail/MODIS%2F006%2FMCD15A3H

## ISRIC Data
ISRIC World Soil Information.  
Data Hub: https://data.isric.org/geonetwork/srv/eng/catalog.search#/home

## WCS_Human_Footprint_2009
> Human Footprint 2009

http://wcshumanfootprint.org/

### Full dataset

How to unpack the full dataset download:
```
$ unzip doi_10.5061_dryad.052q5__v2.zip
$ brew install p7zip
$ 7za x HumanFootprintv2.7z
```

In [None]:
# Total size, in GB (1e9 bytes), of the .tif files in the Maps directory.
path = '../data/Dryadv3/Maps'
_tif_names = (n for n in os.listdir(path) if n.endswith('.tif'))
ns = [f'{path}/{n}' for n in _tif_names]
sum(map(os.path.getsize, ns)) / 1e9

3.296027813

In [None]:
%%time

da_fullset = xr.open_rasterio(
    '../data/Dryadv3/Maps/HFP2009.tif', chunks={'x':10, 'y':10})

CPU times: user 15.5 s, sys: 37.4 s, total: 53 s
Wall time: 1min 8s


In [None]:
%%time

# Sum of the element-wise difference between the full-dataset map and `da`.
# NOTE(review): `da` is not defined in the visible cells -- presumably the
# same map loaded from another source; confirm before re-running.
(da_fullset - da).sum()

CPU times: user 34.3 s, sys: 47.6 s, total: 1min 21s
Wall time: 1min 36s


### Summary 2009

## WorldClim2

https://www.worldclim.org/data/index.html 

In [None]:
def worldclim2_histdata_src():
    '''
    Build the table of WorldClim v2.1 historical climate data sources.

    Source page: https://www.worldclim.org/data/worldclim21.html

    Returns a dict keyed by variable name; each value is a dict with the
    `download` URL of the variable's zip archive and its `units`.
    '''
    base = 'http://biogeo.ucdavis.edu/data/worldclim/v2.1/base/'

    # (variable name, code used in the archive name, units)
    layers = [
        ('minimum temperature', 'tmin', 'C'),
        ('maximum temperature', 'tmax', 'C'),
        ('average temperature', 'tavg', 'C'),
        ('precipitation', 'prec', 'mm'),
        ('solar radiation', 'srad', 'kJ m^-2 day^-1'),
        ('wind speed', 'wind', 'm s^-1'),
        ('water vapor pressure', 'vapr', 'kPa'),
    ]

    return {
        name: {
            'download': URL(f'{base}wc2.1_30s_{code}.zip'),
            'units': units,
        }
        for name, code, units in layers
    }


WORLDCLIM2 = worldclim2_histdata_src()

Download a couple of variables.

In [None]:
# Variables to fetch, their archive URLs, and one background wget process
# per URL (downloads land in DIR_DATA).
vns = ['wind speed', 'water vapor pressure']
urls = [WORLDCLIM2[name]['download'] for name in vns]
procs = [execute_wgt(link, dst=DIR_DATA) for link in urls]

Unpack a variable and set it up for GEE.

In [None]:
vn = 'water vapor pressure'

# Collection id: 'WorldClim2_' followed by the variable name with its
# whitespace replaced by underscores.
collection_id = 'WorldClim2_' + '_'.join(vn.split())
# The source table is bound to `WORLDCLIM2`; the previous name
# `WORLD_CLIM2_SRC` is not defined anywhere and raised NameError.
collection_download = WORLDCLIM2[vn]['download']

# Location of the downloaded archive and the directory to unpack it into.
pth_zip = DIR_DATA / collection_download.name
dir_unzip = DIR_DATA / collection_id

# proc = unzip(pth_zip, dir_unzip)

In [None]:
# Sorted so that the file/asset order is reproducible; the order returned
# by a directory listing is arbitrary.
fns = sorted(n.name for n in dir_unzip.ls() if n.name.endswith('.tif'))
# The per-file tag is the last '_'-separated field of the file stem.  Using
# the stem avoids depending on how many dots the file name contains.
months = [Path(n).stem.rsplit('_', 1)[-1] for n in fns]
asset_ids = [f'{collection_id}_{m}' for m in months]

Register the Image Collection.

In [None]:
# `DataFrame.append` was removed in pandas 2.0; add the collection row
# with `pd.concat` instead.
ENCOV = pd.concat(
    [ENCOV,
     pd.DataFrame([{'id': collection_id, 'download': collection_download}])],
    ignore_index=True)

Register Images.

In [None]:
# `DataFrame.append` was removed in pandas 2.0; add one row per image with
# `pd.concat`.  `ignore_index=True` renumbers the rows so the new ones do
# not reuse index labels that already exist in ENCOV.
ENCOV = pd.concat(
    [ENCOV, pd.DataFrame({'id': asset_ids, 'filename': fns})],
    ignore_index=True)

# Every image of the collection comes from the same archive.
ENCOV.loc[ENCOV.id.isin(asset_ids),
          'download'] = collection_download

Create the Image Collection in GEE (in the browser).

Set `gee_user` for the collection and individual assets.

In [None]:
# Use the `gee_user` defined earlier in the notebook rather than repeating
# the user name as a literal.
ENCOV.loc[ENCOV.id.isin(asset_ids), 'gee_user'] = gee_user

In [None]:
# Write the table back to the CSV it was loaded from (`pth_csv`), without
# the index column, instead of repeating the path as a literal.
ENCOV.to_csv(pth_csv, index=False)

## GEE Access Control List

Sets all assets to be public.

In [None]:
# NOTE(review): `available` is not defined in the visible cells --
# presumably the ENCOV rows already uploaded to GEE; confirm.
failed = []
for _, r in tqdm(available.iterrows(), total=len(available)):
    asset = f'users/{r.gee_user}/{r.id}'
    try:
        subprocess.check_call(
            ['earthengine', 'acl', 'set', 'public', asset])
    except subprocess.CalledProcessError as err:
        # Best effort: keep going with the remaining assets, but record
        # the failure instead of dropping it silently.
        failed.append((asset, err.returncode))

if failed:
    print(f'Could not set ACL for {len(failed)} asset(s): {failed}')

28it [02:44,  5.88s/it]


# Google Earth Engine Access

Available

In [None]:
# Lift the row and column-width limits so the table is shown in full.
for _option in ('display.max_rows', 'display.max_colwidth'):
    pd.set_option(_option, None)

# Rows that have a GEE asset id.
_has_gee_id = ENCOV['GEE ID'].notnull()
ENCOV.loc[_has_gee_id, ['Variable', 'GEE ID', 'Description']]

Unnamed: 0,Variable,GEE ID,Description
4,EarthEnvCloudCover_MODCF_interannualSD,users/bingosaucer/MODCF_interannualSD,Mean between-year seasonality represented as the mean of the 2000-2014 monthly standard deviations;
5,EarthEnvCloudCover_MODCF_intraannualSD,users/bingosaucer/MODCF_intraannualSD,Within-year seasonality represented as the standard deviation of mean 2000-2014 monthly cloud frequencies;
9,EarthEnvTopoMed_1stOrderPartialDerivEW,users/bingosaucer/EarthEnvTopoMed_1stOrderPartialDerivEW,1st Order Partial Derivative of the Elevation (with reference to the East-West / X direction)
10,EarthEnvTopoMed_1stOrderPartialDerivNS,users/bingosaucer/EarthEnvTopoMed_1stOrderPartialDerivNS,1st Order Partial Derivative of the Elevation (with reference to the North-South / Y direction)
11,EarthEnvTopoMed_2ndOrderPartialDerivEW,users/bingosaucer/EarthEnvTopoMed_2ndOrderPartialDerivEW,2nd Order Partial Derivative of the Elevation (with reference to the East-West / X direction)
12,EarthEnvTopoMed_2ndOrderPartialDerivNS,users/bingosaucer/EarthEnvTopoMed_2ndOrderPartialDerivNS,2nd Order Partial Derivative of the Elevation (with reference to the North-South / Y direction)
13,EarthEnvTopoMed_AspectCosine,users/bingosaucer/EarthEnvTopoMed_AspectCosine,Aspect Cosine
14,EarthEnvTopoMed_AspectSine,users/bingosaucer/EarthEnvTopoMed_AspectSine,Aspect Sine
15,EarthEnvTopoMed_Eastness,users/bingosaucer/EarthEnvTopoMed_Eastness,Eastness
16,EarthEnvTopoMed_Elevation,users/bingosaucer/EarthEnvTopoMed_Elevation,Elevation (in meters)


Not yet available

In [None]:
# Rows that have no GEE asset id.
ENCOV.loc[ENCOV['GEE ID'].isna()]

Unnamed: 0,#,Variable,Description,Source / Link,doi,File Size (GB),GEE ID,Unnamed: 7,filename
0,1.0,CGIAR_Aridity_Index,Global AI (Aridity Index),http://www.cgiar-csi.org/data/global-aridity-and-pet-database,10.1016/j.agee.2008.01.014,,,,
1,2.0,CGIAR_PET,Global PET (Potential Evapotranspiration),http://www.cgiar-csi.org/data/global-aridity-and-pet-database,10.1016/j.agee.2008.01.014,,,,
2,3.0,ConsensusLandCover_Human_Development_Percentage,Percentage of urban/built-up areas summed with cultivated/managed vegetation (summed via composite code),https://www.earthenv.org/landcover,10.1111/geb.12182,,,,
3,4.0,CSP_Global_Human_Modification,The global Human Modification dataset (gHM) provides a cumulative measure of human modification of terrestrial lands globally at 1 square-kilometer resolution.,https://developers.google.com/earth-engine/datasets/catalog/CSP_HM_GlobalHumanModification,10.1111/gcb.14549,,,,
6,7.0,EarthEnvCloudCover_MODCF_meanannual,Mean annual cloud frequency (%) over 2000-2014;,https://data.earthenv.org/cloud/MODCF_meanannual.tif,10.1371/journal. pbio.1002415,0.67,,,
7,8.0,EarthEnvCloudCover_MODCF_seasonality_concentration,Seasonal cloud concentration index (see methods for full description) ranging from 0 (all months equally cloudy) to 100 (all clouds are observed in a single month);,https://data.earthenv.org/cloud/MODCF_seasonality_concentration.tif,10.1371/journal. pbio.1002415,0.67,,,
8,9.0,EarthEnvCloudCover_MODCF_seasonality_theta,Timing of peak seasonal cloud concentration (see methods for full description) expressed as an angle (degrees) ranging from 0 (peak cloudiness observed on January 1st) to 360 (peak cloudiness observed on December 31st).,https://data.earthenv.org/cloud/MODCF_seasonality_theta.tif,10.1371/journal. pbio.1002415,0.67,,,
26,27.0,GPWv4_Population_Density,"Gridded Population of the World, Version 4 (GPWv4): UN-Adjusted Population Density",https://explorer.earthengine.google.com/#detail/CIESIN%2FGPWv4%2Funwpp-adjusted-population-density,10.7927/H4HX19NJ,,,,
152,,,,,,,,,
153,,,,,,264.05,,,


# Reference

Reading GeoTIFF:
- http://xarray.pydata.org/en/stable/io.html#rasterio
- http://xarray.pydata.org/en/stable/generated/xarray.open_rasterio.html#xarray-open-rasterio