# Process GRACE Indicators

### Prepare Workspace

In [None]:
#!pip install geopandas xarray netCDF4

import os

import numpy as np
import pandas as pd

import geopandas as gpd
import xarray as xr

# Repository root on the local machine and the input locations relative to it
base_dir = '/Users/jessicarapson/Documents/GitHub/water-supply-forecast' # Change this
gpkg_file_path = 'assets/data/geospatial.gpkg'
nc_files_location = 'assets/data/grace_indicators'

# Absolute path of the folder that holds the GRACE NetCDF files
nc_files_folder = os.path.join(base_dir, nc_files_location)

# Work from the repository root so the relative paths above resolve
os.chdir(base_dir)

# Read the polygons (one row per catchment) from the GeoPackage
gdf = gpd.read_file(gpkg_file_path)

### Export Catchment Averages

In [293]:
# Aggregate every GRACE NetCDF file to one mean value per catchment and
# variable, then export the result as a single CSV.

# Collect one single-row dataframe per (file, catchment) pair
dfs = []

# List all folders (years) in the subfolder. os.scandir yields entries in
# arbitrary order, so sort to keep the exported row order reproducible.
nc_files_folder = sorted(f.path for f in os.scandir(nc_files_location) if f.is_dir())

# Loop through each year's folder
for year_folder in nc_files_folder:

    # List all .nc4 files in the current year's folder
    nc_files = sorted(f.path for f in os.scandir(year_folder)
                      if f.is_file() and f.name.endswith('.nc4'))

    # Loop through each .nc4 file in the year's folder
    for nc_file in nc_files:

        # The context manager closes the file even if a catchment fails
        with xr.open_dataset(nc_file) as data:

            # The time stamp does not depend on the catchment, so read it once.
            # .item() raises unless the file holds exactly one time step.
            time = pd.to_datetime(data.coords['time'].values.item())

            # Loop through the catchments
            for _, row in gdf.iterrows():

                # Select catchment area
                polygon = row['geometry']
                min_lon, min_lat, max_lon, max_lat = polygon.bounds

                # Keep only the grid cells inside the catchment's bounding box
                # (the box, not the exact polygon outline).
                # NOTE(review): slice() assumes lat/lon are stored in
                # ascending order - confirm against the files.
                data_subset = data.sel(lat=slice(min_lat, max_lat),
                                       lon=slice(min_lon, max_lon))

                # Take the id from the row itself: iterrows() yields index
                # *labels*, which are not valid positions for gdf.iloc unless
                # the index happens to be 0..n-1.
                site_id = row['site_id']

                # Take the catchment mean for each value, ignoring NaN cells
                gws_inst = np.nanmean(data_subset['gws_inst'].values)
                rtzsm_inst = np.nanmean(data_subset['rtzsm_inst'].values)
                sfsm_inst = np.nanmean(data_subset['sfsm_inst'].values)

                # Create a single-row dataframe with the selected data
                subset_df = pd.DataFrame({
                    'site_id': site_id,
                    'time': time,
                    'mean_gws_inst': gws_inst,
                    'mean_rtzsm_inst': rtzsm_inst,
                    'mean_sfsm_inst': sfsm_inst}, index=[0])

                # Append the subset dataframe to the output list
                dfs.append(subset_df)

# Concatenate all dataframes into a single dataframe
result_df = pd.concat(dfs, ignore_index=True)

# Export the dataframe
result_df.to_csv("assets/data/grace_indicators/grace_aggregated.csv", index=False)

### Export Grid Values for Each Catchment

In [290]:
# Export the individual gws_inst grid-cell values that fall inside each
# catchment's bounding box, one CSV row per (catchment, file, grid cell).

# Collect one dataframe per (catchment, file) pair
dfs = []

# List all folders (years) in the subfolder. os.scandir yields entries in
# arbitrary order, so sort to keep the exported row order reproducible.
nc_files_folder = sorted(f.path for f in os.scandir(nc_files_location) if f.is_dir())

# Gather every .nc4 file once instead of rescanning for each catchment
all_nc_files = []
for year_folder in nc_files_folder:
    all_nc_files.extend(sorted(f.path for f in os.scandir(year_folder)
                               if f.is_file() and f.name.endswith('.nc4')))

# Loop through the catchments
for _, row in gdf.iterrows():

    # Select catchment area and its id. The id comes from the row itself:
    # iterrows() yields index *labels*, which are not valid positions for
    # gdf.iloc unless the index happens to be 0..n-1.
    polygon = row['geometry']
    site_id = row['site_id']
    min_lon, min_lat, max_lon, max_lat = polygon.bounds

    # Loop through each .nc4 file
    for nc_file in all_nc_files:

        # Open the file for this iteration; the context manager closes it
        # even if processing fails.
        with xr.open_dataset(nc_file) as data:

            # Keep only the grid cells inside the catchment's bounding box
            # (the box, not the exact polygon outline).
            # NOTE(review): slice() assumes lat/lon are stored in ascending
            # order - confirm against the files.
            data_subset = data.sel(lat=slice(min_lat, max_lat),
                                   lon=slice(min_lon, max_lon))

            # .item() raises unless the file holds exactly one time step
            time = pd.to_datetime(data_subset.coords['time'].values.item())

            # Drop the single time step and force a (lat, lon) layout so the
            # flattened values line up with the flattened coordinates below.
            # NOTE(review): assumes gws_inst has no dimensions other than
            # time, lat and lon - transpose() raises otherwise.
            gws = data_subset['gws_inst']
            if 'time' in gws.dims:
                gws = gws.isel(time=0)
            gws = gws.transpose('lat', 'lon')

            # indexing='ij' yields (lat, lon)-shaped grids, matching gws;
            # the default 'xy' would pair values with the wrong coordinates.
            lat_mesh, lon_mesh = np.meshgrid(gws.coords['lat'].values,
                                             gws.coords['lon'].values,
                                             indexing='ij')

            # Create a dataframe with one row per grid cell
            subset_df = pd.DataFrame({
                'site_id': site_id,
                'time': time,
                'lat': lat_mesh.ravel(),
                'lon': lon_mesh.ravel(),
                'gws_inst': gws.values.ravel()})

        # Append the subset dataframe to the output list
        dfs.append(subset_df)

# Concatenate all dataframes into a single dataframe
result_df = pd.concat(dfs, ignore_index=True)

# Export the dataframe
result_df.to_csv("assets/data/grace_indicators/grace_pixels.csv", index=False)