## Join satellite image with another dataset using dictionaries

In [1]:
import os
import xarray as xr
import numpy as np
import pandas as pd
import csv
#import glob    #This one lets you read all the csv files in a directory
import rasterio.crs
from tqdm.auto import tqdm #this one is a loading bar, it's cool to add loading bars to loops
from pandas import DataFrame
import geopandas as gpd
import matplotlib.gridspec as gs
import matplotlib.pyplot as plt
from matplotlib import pyplot
import datacube

import sys
sys.path.append('../Scripts')
from dea_spatialtools import xr_rasterize
from datacube.utils import geometry 
from datacube.utils.geometry import CRS
from datacube.utils import masking
from datacube.helpers import ga_pq_fuser, write_geotiff
#from digitalearthau.utils import wofs_fuser
#import DEAPlotting, DEADataHandling
import warnings
warnings.filterwarnings('ignore', module='datacube')
%load_ext autoreload
%autoreload 2



## Make dictionary of reservoir images

The following code block was copied from a DEA notebook called 'Open and run analysis on multiple polygons'.

In [2]:
# Polygons delimiting the extent to load; each row carries a 'gauge_ID' attribute.
gdf = gpd.read_file('00_Lib_bound/00_Lib_bound.shp')

# NOTE(review): these date strings are not ISO formatted, so '09-12-2020' can be
# read as either 12 Sep or 9 Dec - confirm the intended end date and prefer
# 'YYYY-MM-DD'.
query = {'time': ('01-01-1988', '09-12-2020')}
dc = datacube.Datacube(app='dc-WOfS')

# Maps str(gauge_ID) -> dask-backed WOfS dataset masked to that gauge's polygon.
wofs_dict = {}

# iterrows() yields index *labels*, while .iloc selects by *position*. The two
# only coincide for a default 0..n-1 index, so track the position explicitly to
# rasterize the correct polygon even if the shapefile index is filtered/reordered.
for position, (index, row) in enumerate(gdf.iterrows()):
    geom = geometry.Geometry(geom=row.geometry, crs=gdf.crs)
    query.update({'geopolygon': geom})

    # dask_chunks={} keeps the load lazy; nothing is read until it is computed.
    wofs_albers = dc.load(product='wofs_albers', dask_chunks={}, group_by='solar_day', **query)

    # Rasterize only this row's polygon onto the grid of the loaded dataset.
    poly_mask = xr_rasterize(gdf.iloc[[position]], wofs_albers)
    # Pixels outside the polygon are set to the band's nodata value
    # (original author's note: without `other`, all the data turned into 0).
    wofs_albers = wofs_albers.where(poly_mask, other=wofs_albers.water.nodata)

    # The key for dictionary objects is the gauge ID.
    wofs_dict[str(row['gauge_ID'])] = wofs_albers

## Make dictionary of gauge data

In [3]:
# Folder holding one CSV export per gauge.
directory = '00_Library'

# os.listdir() returns entries in arbitrary order; sort the paths so that
# file_list (and anything later indexed by position, such as the list of gauge
# IDs built from it) is identical on every run and every machine.
file_list = sorted(
    os.path.join(directory, filename)
    for filename in os.listdir(directory)
    if filename.endswith(".csv")
)

In [4]:
def _parse_timestamp(raw):
    """Parse a timestamp string after discarding everything from its last '+' onwards."""
    return pd.to_datetime(raw.rsplit('+', 1)[0])


# Options shared by every gauge-data read.
# NOTE(review): `error_bad_lines` and `date_parser` are deprecated in newer
# pandas releases - confirm the pinned pandas version before upgrading.
_gauge_read_options = dict(
    error_bad_lines=False,
    skiprows=9,
    escapechar='#',
    parse_dates=['Timestamp'],
    index_col='Timestamp',
    date_parser=_parse_timestamp,
)

data_dict = {}
ID_list = []  # gauge IDs as strings, in file order (these are the dictionary keys)

for csv_path in file_list:
    # The gauge ID is the value in the second column of the file's first row.
    header = pd.read_csv(csv_path, nrows=1, escapechar='#')
    id_column = header.columns[1]
    gauge_key = str(header.at[0, id_column])
    ID_list.append(gauge_key)

    # The gauge readings follow the header lines; keep only the measured values.
    readings = pd.read_csv(csv_path, **_gauge_read_options)
    readings = readings.drop(columns=['Quality Code', 'Interpolation Type'])

    # Again, the key is the gauge ID.
    data_dict[gauge_key] = readings

## Have a look at the dictionaries we want to join

In [14]:
# Pick one gauge and show the entry each dictionary holds for it.
test_ID = ID_list[1]
wofs_sample = wofs_dict[test_ID]
gauge_sample = data_dict[test_ID]
print('ID =', test_ID, '\n\n', 'wofs data: \n', wofs_sample,
      '\n \n \n \n', 'gauge data: \n', gauge_sample)

ID = 604.1 

 wofs data: 
 <xarray.Dataset>
Dimensions:      (time: 667, x: 605, y: 588)
Coordinates:
  * time         (time) datetime64[ns] 1988-01-15T23:27:33.500000 ... 2019-07...
  * y            (y) float64 -4.587e+06 -4.587e+06 ... -4.602e+06 -4.602e+06
  * x            (x) float64 1.204e+06 1.204e+06 ... 1.219e+06 1.219e+06
    spatial_ref  int32 3577
Data variables:
    water        (time, y, x) int16 dask.array<chunksize=(1, 588, 605), meta=np.ndarray>
Attributes:
    crs:           EPSG:3577
    grid_mapping:  spatial_ref 
 
 
 
 gauge data: 
               Value
Timestamp          
2000-01-01  121.532
2000-01-02  121.540
2000-01-03  121.565
2000-01-04  121.556
2000-01-05  121.553
...             ...
2020-11-11  120.085
2020-11-12  120.174
2020-11-13  120.195
2020-11-14  120.208
2020-11-15      NaN

[7625 rows x 1 columns]


## Combine the two dictionaries into one, keyed by gauge ID

In [18]:
from collections import defaultdict

# Dictionaries to combine; add as many dictionaries with the same keys as you want.
# Values are appended in this order, so each list reads [gauge data, wofs data].
sources = (data_dict, wofs_dict)

# Every key maps to the list of values the source dictionaries hold for it.
new_dict = defaultdict(list)

for source in sources:
    for gauge_id, payload in source.items():
        new_dict[gauge_id].append(payload)

In [19]:
# Show the combined entry for the sample gauge: a list holding the gauge
# DataFrame followed by the WOfS Dataset.
new_dict[test_ID]

[              Value
 Timestamp          
 2000-01-01  121.532
 2000-01-02  121.540
 2000-01-03  121.565
 2000-01-04  121.556
 2000-01-05  121.553
 ...             ...
 2020-11-11  120.085
 2020-11-12  120.174
 2020-11-13  120.195
 2020-11-14  120.208
 2020-11-15      NaN
 
 [7625 rows x 1 columns],
 <xarray.Dataset>
 Dimensions:      (time: 667, x: 605, y: 588)
 Coordinates:
   * time         (time) datetime64[ns] 1988-01-15T23:27:33.500000 ... 2019-07...
   * y            (y) float64 -4.587e+06 -4.587e+06 ... -4.602e+06 -4.602e+06
   * x            (x) float64 1.204e+06 1.204e+06 ... 1.219e+06 1.219e+06
     spatial_ref  int32 3577
 Data variables:
     water        (time, y, x) int16 dask.array<chunksize=(1, 588, 605), meta=np.ndarray>
 Attributes:
     crs:           EPSG:3577
     grid_mapping:  spatial_ref]