# Code for reprocessing the reanalysis datasets used in Aguayo et al. (in review)
Developed by Rodrigo Aguayo (2020-2022)

In [14]:
import pandas as pd
import xarray as xr
import regionmask
import rioxarray as rioxr
import geopandas as gpd
import numpy as np
import os

# Working directory: every relative output path used below resolves against it.
os.chdir('/home/rooda/Dropbox/Patagonia/Data/')

# Root folder of the raw datasets read by the cells below.
local = "/media/rooda/Local Disk/Datasets"

# Days per calendar month, January to December (February fixed at 28).
days = np.array([
    31, 28, 31, 30, 31, 30,
    31, 31, 30, 31, 30, 31,
])

# netCDF encodings handed to ``to_netcdf``: same compression, different dtype.
_compression = {"zlib": True, "complevel": 9}
encode_pp = {"pp": {**_compression, "dtype": "int16"}}
encode_t2m = {"t2m": {**_compression, "dtype": "float32"}}

# Polygon layer used to mask the hourly ERA5 grids.
mask = gpd.read_file("/home/rooda/Dropbox/Patagonia/GIS South/dem_mask.shp")

## ERA5

In [7]:
# ERA5 monthly averaged reanalysis: precipitation and 2 m temperature.
era5_monthly = os.path.join(local, "ERA5", "Monthly", "ERA5_1959_2021m.nc")

# --- Precipitation -----------------------------------------------------------
dict_pp  = {'tp':'pp', 'longitude':'lon', 'latitude':'lat'}
stack_pp = xr.open_dataset(era5_monthly, chunks="auto").rename(dict_pp)["pp"]
stack_pp = stack_pp.where((stack_pp.lon >= -76) & (stack_pp.lon <= -65) & (stack_pp.lat >= -56) & (stack_pp.lat <= -40), drop=True)

# Month length taken from the time axis itself. The previous
# ``days.repeat(n_years)`` repeated every element n_years times
# (31, 31, ..., 28, 28, ...) instead of cycling through the calendar, so the
# lengths were misaligned with the months; it also ignored leap years.
months = stack_pp.time.dt.days_in_month.rename("month_length")

# Scale by month length and by 1000 (presumably m/day -> mm/month; confirm
# against the source file units). Multiplying two DataArrays with different
# names drops the name, so restore "pp" before writing.
stack_pp = (stack_pp*months*1000).round(0).rename("pp")
stack_pp.to_netcdf("Precipitation/PP_ERA5_1959_2021m.nc")

# --- Temperature -------------------------------------------------------------
dict_t2m  = {'longitude':'lon', 'latitude':'lat'}
stack_t2m = xr.open_dataset(era5_monthly, chunks="auto").rename(dict_t2m)["t2m"]
stack_t2m = stack_t2m.where((stack_t2m.lon >= -76) & (stack_t2m.lon <= -65) & (stack_t2m.lat >= -56) & (stack_t2m.lat <= -40), drop=True)
stack_t2m = (stack_t2m-273.15).round(2)  # Kelvin -> degrees Celsius
stack_t2m.to_netcdf("Temperature/T2M_ERA5_1959_2021m.nc")

In [None]:
# Daily precipitation built from the 3-hourly ERA5 files
dict_pp = {'tp':'pp', 'longitude':'lon', 'latitude':'lat'}
hourly_files = local + "/ERA5/Hourly/ERA5*.nc"
stack_pp = xr.open_mfdataset(hourly_files, concat_dim='time', combine='nested', chunks="auto")
stack_pp = stack_pp.rename(dict_pp)["pp"]

# Keep only the grid cells that regionmask assigns to a polygon (value >= 0)
mask_pp = regionmask.mask_geopandas(mask, stack_pp)
stack_pp = stack_pp.where(mask_pp >= 0, drop=True)

# Daily sums; skipna=False keeps a day missing when any of its steps is missing
stack_pp = stack_pp.sortby("time")
stack_pp = stack_pp.resample(time='1D').sum(skipna=False)

# Scale by the 3 h step and by 1000, then round to whole units
stack_pp = (stack_pp * 3 * 1000).round(0)
stack_pp.to_netcdf("Precipitation/PP_ERA5_1959_2021d.nc")

In [None]:
# Daily maximum / minimum 2 m temperature from the ERA5 hourly files
dict_t2m = {'longitude':'lon', 'latitude':'lat'}
stack_t2m = xr.open_mfdataset(local + "/ERA5/Hourly/ERA5*.nc", concat_dim='time', combine='nested', chunks="auto")
stack_t2m = stack_t2m.rename(dict_t2m)["t2m"]

# Keep only the grid cells that regionmask assigns to a polygon (value >= 0)
mask_t2m = regionmask.mask_geopandas(mask, stack_t2m)
stack_t2m = stack_t2m.where(mask_t2m >= 0, drop=True)

# Daily extremes, converted by subtracting 273.15 and rounded to 2 decimals
t2m_sorted = stack_t2m.sortby("time")
stack_t2m_max = (t2m_sorted.resample(time='1D').max() - 273.15).round(2)
stack_t2m_min = (t2m_sorted.resample(time='1D').min() - 273.15).round(2)

stack_t2m_max.to_netcdf("Temperature/Tmax_ERA5_1959_2021d.nc")
stack_t2m_min.to_netcdf("Temperature/Tmin_ERA5_1959_2021d.nc")

## ERA5-LAND



In [None]:
 # monthly averaged reanalysis
# ERA5-Land monthly averaged reanalysis: precipitation and 2 m temperature.
era5l_monthly = os.path.join(local, "ERA5_LAND", "Monthly")

dict_pp   = {'tp':'pp', 'longitude':'lon', 'latitude':'lat'}
stack_pp  = xr.open_dataset(os.path.join(era5l_monthly, "PP_ERA5L_1950_2021m.nc"), chunks="auto").rename(dict_pp)
stack_pp  = stack_pp.where((stack_pp.lon >= -79) & (stack_pp.lon <= -64) & (stack_pp.lat >= -57) & (stack_pp.lat <= -40), drop=True)
# NOTE(review): unlike the ERA5 monthly cell, no month-length factor is applied
# here — confirm the input file already holds monthly totals.
stack_pp  = (stack_pp*1000).astype("int32")
stack_pp.to_netcdf("Precipitation/PP_ERA5L_1950_2021m.nc")

dict_t2m  = {'longitude':'lon', 'latitude':'lat'}
# The path components must be relative: ``os.path.join(local, "/ERA5_LAND/...")``
# silently dropped ``local`` because the second component was absolute.
stack_t2m = xr.open_dataset(os.path.join(era5l_monthly, "T2M_ERA5L_1950_2021m.nc"), chunks="auto").rename(dict_t2m)
stack_t2m = stack_t2m.where((stack_t2m.lon >= -79) & (stack_t2m.lon <= -64) & (stack_t2m.lat >= -57) & (stack_t2m.lat <= -40), drop=True)
stack_t2m = (stack_t2m-273.15).round(2)  # Kelvin -> degrees Celsius
# Output name aligned with the precipitation file (was "T2M_ERA5__1950_2021m.nc").
stack_t2m.to_netcdf("Temperature/T2M_ERA5L_1950_2021m.nc")

 # reanalysis 3-hourly data
# ERA5-Land hourly precipitation aggregated to daily values.
# NOTE(review): no longitude/latitude -> lon/lat rename is applied here, unlike
# the wind-speed step further down — confirm the PP files already use lon/lat.
stack_pp  = xr.open_mfdataset(os.path.join(local, "ERA5_LAND", "Hourly", "PP_ERA5L_*.nc"), concat_dim='time', combine='nested', chunks="auto")
stack_pp  = stack_pp.where((stack_pp.lon >= -79) & (stack_pp.lon <= -64) & (stack_pp.lat >= -57) & (stack_pp.lat <= -40), drop=True)
# The nested concat follows file order, so sort by time before resampling.
stack_pp  = stack_pp.sortby("time").resample(time='1D').mean()
# Apply the x1000 scaling exactly once (it used to be applied both when the
# files were opened and again here, inflating the values by a factor of 1000).
stack_pp  = (stack_pp*1000).round(1)
stack_pp.to_netcdf("Precipitation/PP_ERA5L_1950_2021d.nc")

# ERA5-Land hourly 2 m temperature aggregated to daily maximum and minimum.
# NOTE(review): no longitude/latitude -> lon/lat rename is applied here, and the
# glob "T2M_ERA5L.*.nc" requires a literal dot after "ERA5L" — confirm both
# against the actual files.
stack_t2m  = xr.open_mfdataset(os.path.join(local, "ERA5_LAND", "Hourly", "T2M_ERA5L.*.nc"), concat_dim='time', combine='nested', chunks="auto")
stack_t2m  = stack_t2m.where((stack_t2m.lon >= -79) & (stack_t2m.lon <= -64) & (stack_t2m.lat >= -57) & (stack_t2m.lat <= -40), drop=True)
# The nested concat follows file order, so sort by time before resampling.
stack_t2m  = stack_t2m.sortby("time")
stack_t2m_max = stack_t2m.resample(time='1D').max()
stack_t2m_min = stack_t2m.resample(time='1D').min()
# Kelvin -> degrees Celsius, applied exactly once (273.15 used to be subtracted
# both when the files were opened and again here).
stack_t2m_min = (stack_t2m_min-273.15).round(2)
stack_t2m_max = (stack_t2m_max-273.15).round(2)
stack_t2m_max.to_netcdf("Temperature/T2M_max_ERA5L_1950_2021d.nc")
stack_t2m_min.to_netcdf("Temperature/T2M_min_ERA5L_1950_2021d.nc")

# Daily mean wind speed from the ERA5-Land hourly u10 / v10 components
dict_ws = {'longitude':'lon', 'latitude':'lat'}
factor = (2/10)**0.25  # Correction from 10m to 2m

ws_files = local + "/ERA5_LAND/Hourly/WS_ERA5L*.nc"
stack_ws = xr.open_mfdataset(ws_files, concat_dim='time', combine='nested', chunks="auto")
stack_ws = stack_ws.rename(dict_ws)

# Clip to the study-area bounding box and put the time axis in order
ws_box = (stack_ws.lon >= -79) & (stack_ws.lon <= -64) & (stack_ws.lat >= -57) & (stack_ws.lat <= -40)
stack_ws = stack_ws.where(ws_box, drop=True).sortby("time")

# Magnitude of the horizontal wind vector, then the height correction
stack_ws["ws"] = (stack_ws.u10**2 + stack_ws.v10**2)**0.5
stack_ws["ws"] = stack_ws.ws*factor

stack_ws = stack_ws["ws"].resample(time='1D').mean()
stack_ws.to_netcdf("Wind_speed/WS_ERA5L_1950_2021d.nc")

# Daily mean relative humidity (as a fraction) from ERA5-Land hourly dew-point
# (d2m) and air (t2m) temperature, using the ratio of two Magnus-type
# exponentials with coefficients 17.625 and 243.04.
dict_hr   = {'longitude':'lon', 'latitude':'lat'}
era5l_hourly = os.path.join(local, "ERA5_LAND", "Hourly")
stack_d2m = xr.open_mfdataset(os.path.join(era5l_hourly, "T2Md*.nc"), concat_dim='time', combine='nested', chunks="auto")-273.15
# "T2M_*.nc" rather than "T2M*.nc": the shorter glob also matches the "T2Md*"
# dew-point files opened just above.
# NOTE(review): assumes the air-temperature files are named "T2M_..." — confirm.
stack_t2m = xr.open_mfdataset(os.path.join(era5l_hourly, "T2M_*.nc"), concat_dim='time', combine='nested', chunks="auto")-273.15
# Merge dew point WITH air temperature (d2m used to be merged with itself, so
# ``stack_rh.t2m`` below did not exist).
stack_rh  = xr.merge([stack_d2m, stack_t2m]).rename(dict_hr)
stack_rh  = stack_rh.where((stack_rh.lon >= -79) & (stack_rh.lon <= -64) & (stack_rh.lat >= -57) & (stack_rh.lat <= -40), drop=True).sortby("time")
# ``exp`` was an undefined name; use NumPy's, which works on xarray objects.
stack_rh["hr"] = np.exp((17.625*stack_rh.d2m)/(243.04+stack_rh.d2m)) / np.exp((17.625*stack_rh.t2m)/(243.04+stack_rh.t2m))
stack_rh = stack_rh["hr"].resample(time='1D').mean()
stack_rh.to_netcdf("Relative_humidity/RH_ERA5L_1950_2021d.nc")

In [None]:
# Quick-look plot of the "pp" variable of pp_cr2met.
# NOTE(review): pp_cr2met is not defined anywhere in the visible file —
# presumably a leftover from an interactive session; confirm or remove.
pp_cr2met.pp.plot()

## MERRA2 (ok)

In [None]:
# MERRA2 monthly precipitation
# NOTE(review): 'TPRECMAX' is the variable read as precipitation — confirm it is
# the intended field.
dict_pp   = {'TPRECMAX':'pp'}
stack_pp  = xr.open_mfdataset(os.path.join(local, "MERRA2", "MERRA2_*.nc4"), concat_dim='time', combine='nested', chunks="auto").rename(dict_pp)[["pp"]]
# The nested concat follows file-name order, which is not guaranteed to be
# chronological; sort explicitly.
stack_pp  = stack_pp.sortby("time")
stack_pp  = stack_pp.where((stack_pp.lon >= -79) & (stack_pp.lon <= -64) & (stack_pp.lat >= -57) & (stack_pp.lat <= -40), drop=True)
# Month length taken from the time axis itself. ``days.repeat(n_years)``
# repeated every element n_years times instead of cycling through the calendar,
# so the lengths were misaligned with the months (and leap years were ignored).
months  = stack_pp.time.dt.days_in_month.rename("month_length")
# Scale by 86400 and by the month length (presumably a per-second rate to a
# monthly total — confirm against the source units).
stack_pp  = (stack_pp*months*86400).astype("int32")
stack_pp.to_netcdf("Precipitation/PP_MERRA2_1980_2021m.nc")

# MERRA2 monthly mean 2 m temperature
dict_t2m = {'T2MMEAN':'t2m'}
merra2_files = local + "/MERRA2/MERRA2_*.nc4"
stack_t2m = xr.open_mfdataset(merra2_files, combine='by_coords', chunks="auto")
stack_t2m = stack_t2m.rename(dict_t2m)[["t2m"]]

# Clip to the study-area bounding box
merra2_box = (stack_t2m.lon >= -79) & (stack_t2m.lon <= -64) & (stack_t2m.lat >= -57) & (stack_t2m.lat <= -40)
stack_t2m = stack_t2m.where(merra2_box, drop=True)

# Subtract 273.15 and keep two decimals
stack_t2m = (stack_t2m - 273.15).round(2)
stack_t2m.to_netcdf("Temperature/T2M_MERRA2_1980_2021m.nc")

## CFSR (ok)

In [9]:
# Precipitation: monthly mean (4 per day) of 6-hour accumulation
dict_pp  = {'A_PCP_L1_AccumAvg':'pp'}
stack_pp  = xr.open_mfdataset(os.path.join(local, "CSFR", "PP", "*.nc"), concat_dim='time', combine='nested').sortby("time").rename(dict_pp)[["pp"]]
# Wrap longitudes from [0, 360) to [-180, 180) so the bounding box below applies
stack_pp.coords['lon'] = (stack_pp.coords['lon'] + 180) % 360 - 180
stack_pp = stack_pp.where((stack_pp.lon >= -79) & (stack_pp.lon <= -64) & (stack_pp.lat >= -57) & (stack_pp.lat <= -40), drop=True)
# Replace the time axis with month-start stamps
stack_pp["time"] = pd.date_range(start='1979/01/01', end='2019/12/01', freq='MS')
# Month length taken from the time axis itself. ``days.repeat(n_years)``
# repeated every element n_years times instead of cycling through the calendar,
# so the lengths were misaligned with the months (and leap years were ignored).
months  = stack_pp.time.dt.days_in_month.rename("month_length")
# 4 six-hour accumulations per day x days in the month
stack_pp  = (stack_pp*months*4).astype("int32")
stack_pp.to_netcdf("Precipitation/PP_CSFR_1979_2019m.nc", encoding = encode_pp)

# Monthly 2 m temperature read from the 'TMP_L103_Avg' variable
dict_t2m = {'TMP_L103_Avg':'t2m'}
stack_t2m = xr.open_mfdataset(local + "/CSFR/T2M/*.nc", concat_dim='time', combine='nested')
stack_t2m = stack_t2m.sortby("time").rename(dict_t2m)[["t2m"]]

# Wrap longitudes from [0, 360) to [-180, 180) before clipping to the box
stack_t2m.coords['lon'] = (stack_t2m.coords['lon'] + 180) % 360 - 180
csfr_box = (stack_t2m.lon >= -79) & (stack_t2m.lon <= -64) & (stack_t2m.lat >= -57) & (stack_t2m.lat <= -40)
stack_t2m = stack_t2m.where(csfr_box, drop=True)

# Replace the time axis with month-start stamps
stack_t2m["time"] = pd.date_range(start='1979/01/01', end='2019/12/01', freq='MS')

# Subtract 273.15 and keep two decimals
stack_t2m = (stack_t2m - 273.15).round(2)
stack_t2m.to_netcdf("Temperature/T2M_CSFR_1979_2019m.nc", encoding = encode_t2m)

## REGCR2: RegCM4-CR2 (ok)

In [6]:
# RegCM4-CR2 monthly precipitation
dict_pp  = {'pr':'pp'}
stack_pp = xr.open_mfdataset(os.path.join(local, "REGCR2", "pr_*.nc"), combine='by_coords').rename(dict_pp).sortby("time")[["pp"]]
stack_pp = stack_pp.where((stack_pp.lon >= -79) & (stack_pp.lon <= -64) & (stack_pp.lat >= -57) & (stack_pp.lat <= -40), drop=True)
# Replace the time axis with month-start stamps
stack_pp["time"] = pd.date_range(start='1980/01/01', end='2015/12/01', freq='MS')
# Month length taken from the time axis itself. ``days.repeat(n_years)``
# repeated every element n_years times instead of cycling through the calendar,
# so the lengths were misaligned with the months (and leap years were ignored).
months   = stack_pp.time.dt.days_in_month.rename("month_length")
# Scale by 86400 and by the month length (presumably a per-second rate to a
# monthly total — confirm against the source units).
stack_pp = (stack_pp*months*86400).astype("int32")
stack_pp.to_netcdf("Precipitation/PP_REGCR2_1980_2015m.nc", encoding = encode_pp)

# RegCM4-CR2 monthly near-surface temperature ('tas')
dict_t2m = {'tas':'t2m'}
stack_t2m = xr.open_mfdataset(local + "/REGCR2/tas_*.nc", combine='by_coords')
stack_t2m = stack_t2m.rename(dict_t2m).sortby("time")["t2m"]

# Clip to the study-area bounding box
regcr2_box = (stack_t2m.lon >= -79) & (stack_t2m.lon <= -64) & (stack_t2m.lat >= -57) & (stack_t2m.lat <= -40)
stack_t2m = stack_t2m.where(regcr2_box, drop=True)

# Replace the time axis with month-start stamps
stack_t2m["time"] = pd.date_range(start='1980/01/01', end='2015/12/01', freq='MS')

# Subtract 273.15 and keep two decimals
stack_t2m = (stack_t2m - 273.15).round(2)
stack_t2m.to_netcdf("Temperature/T2M_REGCR2_1980_2015m.nc", encoding = encode_t2m)

## MSWEP v2.8 (ok)

In [None]:
# MSWEP precipitation clipped to the study-area bounding box
dict_pp = {'precipitation':'pp'}


def _clip_mswep(ds):
    """Return *ds* restricted to lon [-79, -64] and lat [-57, -40]."""
    inside = (ds.lon >= -79) & (ds.lon <= -64) & (ds.lat >= -57) & (ds.lat <= -40)
    return ds.where(inside, drop=True)


# Daily time step
stack_pp = xr.open_mfdataset(local + "/MSWEP/Daily/*.nc", combine='by_coords', chunks="auto", parallel=True)
stack_pp = _clip_mswep(stack_pp.rename(dict_pp))
stack_pp.to_netcdf("Precipitation/PP_MSWEPv28_1979_2020d.nc")

# Monthly time step (read from the Monthly files), stored as int32
stack_pp = xr.open_mfdataset(local + "/MSWEP/Monthly/*.nc", combine='by_coords', chunks="auto", parallel=True)
stack_pp = _clip_mswep(stack_pp.rename(dict_pp))
stack_pp = stack_pp.astype("int32")
stack_pp.to_netcdf("Precipitation/PP_MSWEPv28_1979_2020m.nc")

## CR2MET v2.0 (ok)

In [None]:
# CR2MET: clip the daily files to the study area, then derive monthly series
def _clip_cr2met(ds):
    """Return *ds* restricted to lon [-79, -64] and lat [-57, -40]."""
    inside = (ds.lon >= -79) & (ds.lon <= -64) & (ds.lat >= -57) & (ds.lat <= -40)
    return ds.where(inside, drop=True)


# Daily precipitation
dict_pp = {'pr':'pp'}
stack_pp = xr.open_dataset(local + "/CR2MET/PP_CR2METv2_1979_2020d.nc", chunks="auto")
stack_pp = _clip_cr2met(stack_pp[["pr"]].rename(dict_pp))
stack_pp.to_netcdf("Precipitation/PP_CR2MET_1979_2020d.nc")

# Daily maximum temperature
stack_t2m_max = _clip_cr2met(xr.open_dataset(local + "/CR2MET/T2M_MAX_CR2METv2_1979_2020d.nc", chunks="auto"))
stack_t2m_max.to_netcdf("Temperature/T2M_max_CR2MET_1979_2020d.nc")

# Daily minimum temperature
stack_t2m_min = _clip_cr2met(xr.open_dataset(local + "/CR2MET/T2M_MIN_CR2METv2_1979_2020d.nc", chunks="auto"))
stack_t2m_min.to_netcdf("Temperature/T2M_min_CR2MET_1979_2020d.nc")

# Monthly timestep: precipitation is summed, temperatures are averaged
stack_pp = stack_pp.resample(time='MS').sum()
stack_t2m_max = stack_t2m_max.resample(time='MS').mean()
stack_t2m_min = stack_t2m_min.resample(time='MS').mean()

# Monthly mean temperature taken as the midpoint of the monthly tmax and tmin
stack_t2m = (stack_t2m_max["tmax"] + stack_t2m_min["tmin"]) / 2
stack_t2m = stack_t2m.to_dataset(name="t2m")

stack_pp.to_netcdf("Precipitation/PP_CR2MET_1979_2020m.nc")
stack_t2m.to_netcdf("Temperature/T2M_CR2MET_1979_2020m.nc")

## GLEAM v3.6a

In [15]:
# GLEAM potential evaporation ('Ep'), daily and monthly
pet_compression = {"zlib": True, "complevel": 9}

# Daily timestep
pet_stack = xr.open_mfdataset(local + "/GLEAM/Daily/Ep_*.nc", combine='by_coords', chunks="auto")
pet_stack = pet_stack.rename({'Ep':'pet'})
pet_box = (pet_stack.lon >= -79) & (pet_stack.lon <= -64) & (pet_stack.lat >= -57) & (pet_stack.lat <= -40)
pet_stack = pet_stack.where(pet_box, drop=True)

# Zeros are turned into missing values, then gaps of at most one cell are
# filled by linear interpolation along longitude
pet_stack = pet_stack["pet"].where(pet_stack != 0)
pet_stack = pet_stack.interpolate_na(dim="lon", method="linear", limit=1)
pet_stack = pet_stack.round(2)

pet_stack.lon.attrs['long_name'] = 'longitude'
pet_stack.lat.attrs["long_name"] = "latitude"
pet_stack.to_netcdf(
    "Evapotranspiration/PET_GLEAM36a_1980_2021d.nc",
    encoding={"pet": {**pet_compression, "dtype": "float32"}},
)

# Monthly timestep: monthly sums, with zero totals set back to missing
pet_stack = pet_stack.resample(time='MS').sum()
pet_stack = pet_stack.where(pet_stack != 0)
pet_stack = pet_stack.round(0)
pet_stack.to_netcdf(
    "Evapotranspiration/PET_GLEAM36a_1980_2021m.nc",
    encoding={"pet": {**pet_compression, "dtype": "int16"}},
)

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]
  pet_stack.to_netcdf("Evapotranspiration/PET_GLEAM36a_1980_2021m.nc", encoding = {"pet": {"zlib": True, "complevel": 9, "dtype": "int16"}})


## WATER BALANCE III & IV

In [5]:
# BH3 water balance: build mean annual fields of pr, ET and PET for 1985-2015.
# The file's dimensions are renamed to lon / lat / time.
dict_bh3  = {'dim_lon':'lon', 'dim_lat':'lat', 'dim_time':'time'}
bh3_stack = xr.open_dataset(os.path.join(local + "/DGA_BH/BH3/netcdf/1_Historico/regionalizacion_1979_2015.nc")).rename(dict_bh3)
# Attach a month-start time axis and promote lon / lat to coordinates.
bh3_stack = bh3_stack.assign_coords(time = pd.date_range(start='1979/01/01', end='2015/12/01', freq='MS'))
bh3_stack = bh3_stack.assign_coords(lon  = bh3_stack.lon)
bh3_stack = bh3_stack.assign_coords(lat  = bh3_stack.lat)
bh3_stack = bh3_stack.transpose("time", "lat", "lon")
# Clip to the sub-domain covered with BH3 (lon -75..-71, lat -45.8..-40).
bh3_stack = bh3_stack.where((bh3_stack.lon >= -75) & (bh3_stack.lon <= -71) & (bh3_stack.lat >= -45.8) & (bh3_stack.lat <= -40), drop=True)
bh3_stack = bh3_stack.sel(time = slice("1985-01-01", "2016-01-01"))
# Annual sums, then the mean over all years.
bh3_stack = bh3_stack[["pr", "ET", "PET"]].resample(time='1Y').sum()
bh3_stack = bh3_stack.mean("time")
# Cells whose mean annual precipitation is exactly 0 are set to missing in
# every variable.
bh3_stack = bh3_stack.where(bh3_stack.pr != 0)
# NOTE(review): the factor 30 presumably converts per-day values to a 30-day
# month — confirm against the BH3 units.
bh3_stack["ET"] = bh3_stack.ET*30
bh3_stack["PET"] = bh3_stack.PET*30

# BH4 annual rasters, gap-filled with the BH3 fields and exported as GeoTIFF
def _read_bh4(path, name):
    """Open a BH4 raster, keep band 1 and expose it as variable *name* on lon/lat."""
    raster = xr.open_dataset(path)
    raster = raster.sel(band=1, drop=True).drop("spatial_ref")
    return raster.rename({'x':'lon', 'y':'lat', 'band_data':name})


bh4_stack_pp = _read_bh4(local + "/DGA_BH/BH4/Archivos_raster/BH_85-15/Forzantes/1_Historico/pr_Anual_LatLon.tif", 'pr')
bh4_stack_pet = _read_bh4(local + "/DGA_BH/BH4/Archivos_raster/BH_85-15/VIC/1_Historico/pet_Anual_LatLon.tif", 'PET')
bh4_stack_et = _read_bh4(local + "/DGA_BH/BH4/Archivos_raster/BH_85-15/VIC/1_Historico/et_Anual_LatLon.tif", 'ET')

# BH4 values take priority; BH3 fills whatever BH4 leaves missing. The
# dimensions go back to x / y before writing with rioxarray.
to_xy = {'lon':'x', 'lat':'y'}
bh4_stack_pp = bh4_stack_pp.pr.combine_first(bh3_stack.pr).rename(to_xy)
bh4_stack_pet = bh4_stack_pet.PET.combine_first(bh3_stack.PET).rename(to_xy)
bh4_stack_et = bh4_stack_et.ET.combine_first(bh3_stack.ET).rename(to_xy)

bh4_stack_pp.rio.to_raster("Precipitation/PP_WB_DGA_1985_2015.tif")
bh4_stack_pet.rio.to_raster("Evapotranspiration/PET_WB_DGA_1985_2015.tif")
bh4_stack_et.rio.to_raster("Evapotranspiration/ET_WB_DGA_1985_2015.tif")

In [27]:
# Daily mean temperature, taken as the midpoint of daily Tmax and Tmin, for
# PMET, CR2MET and ERA5. Each result is built as a fresh one-variable dataset
# ("t2m"): the previous code assigned into t2m_pmet / t2m_cr2met / t2m_era5d,
# none of which is defined in this file. The inputs are opened as context
# managers so their file handles are released once the output is written.
os.chdir('/home/rooda/Dropbox/Patagonia/Data/Temperature/')

with xr.open_dataset("Tmax_PMET_1980_2020d.nc") as ds_max, xr.open_dataset("Tmin_PMET_1980_2020d.nc") as ds_min:
    t2m_pmet = ((ds_max.Tmax + ds_min.Tmin)/2).to_dataset(name="t2m")
    t2m_pmet.to_netcdf("Tavg_PMET_1980_2020d.nc")

with xr.open_dataset("Tmax_CR2MET_1979_2020d.nc") as ds_max, xr.open_dataset("Tmin_CR2MET_1979_2020d.nc") as ds_min:
    t2m_cr2met = ((ds_max.tmax + ds_min.tmin)/2).to_dataset(name="t2m")
    t2m_cr2met.to_netcdf("Tavg_CR2MET_1979_2020d.nc")

with xr.open_dataset("Tmax_ERA5_hr_1980_2020d.nc") as ds_max, xr.open_dataset("Tmin_ERA5_hr_1980_2020d.nc") as ds_min:
    t2m_era5d = ((ds_max.tmax + ds_min.tmin)/2).to_dataset(name="t2m")
    t2m_era5d.to_netcdf("Tavg_ERA5_hr_1980_2020d.nc")