# Validation Workflow: October 2022 Flooding

Notebook to validate the flood events in eastern Australia during October 2022.

**Validation LGAs:** 
```
 Shepparton = 'Greater Shepparton'
 Rochester  = 'Campaspe'
 Moree      = 'Moree Plains'
 Forbes     = 'Forbes'
 ```

In [None]:
# Necessary imports
import os
os.environ['USE_PYGEOS'] = '0'
from itertools import product
from dotenv import load_dotenv
import fsspec
import geopandas as gpd
import matplotlib.pyplot as plt

from ml4floods.data import utils
from ml4floods.visualization import plot_utils
from ml4floods.models.postprocess import spatial_aggregation
from ml4floods.data.ee_download import process_metadata

from datetime import datetime, timezone, timedelta
import ee
from georeader.readers import ee_query
import pandas as pd
import folium
#from backports.zoneinfo import ZoneInfo
from zoneinfo import ZoneInfo
import geemap.foliumap as geemap
import shapely

# Suppress deprecation warnings (comment out the lines below to see them)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 

## Load environment and project details

The notebook reads the location of the GCP access key file and project name from a hidden ```.env``` file in the root directory. See [SETUP]() file for instructions on creating these.

In [None]:
# Load environment variables (including path to credentials) from '.env' file
env_file_path = "../.env"

# load_dotenv() already returns a bool, so it is asserted on directly
assert load_dotenv(dotenv_path=env_file_path), "[ERR] failed to load environment!"
# Both variables are required by the rest of the notebook
assert "GOOGLE_APPLICATION_CREDENTIALS" in os.environ, "[ERR] missing $GOOGLE_APPLICATION_CREDENTIALS!"
assert "GS_USER_PROJECT" in os.environ, "[ERR] missing $GS_USER_PROJECT!"
# The credentials variable must point at a key file that exists on disk
key_file_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
assert os.path.exists(key_file_path), f"[ERR] Google credential key file does not exist: \n{key_file_path} "

In [None]:
# Authenticate with Google Earth Engine
# Execute 'earthengine authenticate' from a terminal

In [None]:
# Initialise the Google Earth Engine connection.
# Follow instructions on login prompt, if required.
# Called without arguments; presumably relies on the credentials stored by
# the 'earthengine authenticate' step mentioned in the previous cell.
ee.Initialize()

## Set flood event & validation parameters

Set the session name, flooding date range and LGAs to be validated.

In [None]:
# All work is conducted under a unique session name.
# The name is also embedded in the bucket paths of the sampling grid and of
# the post-flood maps read further down.
session_name = "NEMA002"

# Pre-flood date range ('YYYY-MM-DD' strings, parsed in a later cell)
date_pre_flood_start = "2022-09-05"
date_pre_flood_end = "2022-09-22"

# Flooding date range ('YYYY-MM-DD' strings, parsed in a later cell)
date_flood_start = "2022-10-11"
date_flood_end = "2022-11-10"
#flood_duration_days = 15

# List of LGAs to check; each entry is matched against the 'LGA_NAME22'
# column of the LGA shapefile
lga_req_list = ["Greater Shepparton", "Campaspe", "Moree Plains", "Forbes"]

## Parse dates, load the grid and LGA shapes

In [None]:
# Convert the configured date strings into timezone-aware datetimes
tz = ZoneInfo("Australia/Sydney")


def _parse_local_date(date_string):
    """Parse a 'YYYY-MM-DD' string and attach the notebook time zone."""
    return datetime.strptime(date_string, "%Y-%m-%d").replace(tzinfo=tz)


date_event = _parse_local_date(date_flood_start)
period_pre_flood_start = _parse_local_date(date_pre_flood_start)
period_pre_flood_end = _parse_local_date(date_pre_flood_end)
period_flood_start = _parse_local_date(date_flood_start)
period_flood_end = _parse_local_date(date_flood_end)

# Echo both search windows so they can be checked by eye
print(f"Pre-flood search period: \n{period_pre_flood_start} to \n{period_pre_flood_end}")
print(f"\nFlood search period: \n{period_flood_start} to \n{period_flood_end}")

In [None]:
# The sampling grid is stored in a GeoJSON file under the session name
path_to_aois = f"gs://ml4floods_nema/0_DEV/1_Staging/operational/{session_name}/aois.geojson"
# Filesystem handle for the bucket; reused later to glob for flood-map files
grid_fs = utils.get_filesystem(path_to_aois)
# Read the grid and reproject to EPSG:4326, the same CRS the LGA shapes are converted to
grid_df = gpd.read_file(path_to_aois).to_crs('epsg:4326')
# Quick-look plot of the whole grid
grid_df.plot()

In [None]:
# We store the LGA shapefile locally
lga_file_path = "../resources/LGAs/LGA_2022_AUST_GDA2020.shp"
# Reproject to EPSG:4326 so the LGA shapes share a CRS with the sampling grid
lga_df = gpd.read_file(lga_file_path).to_crs('epsg:4326')
# Quick-look plot of all LGA boundaries
lga_df.plot()

## Define useful functions

In [None]:
def intersect_lga_grid(lga_name, lga_df, grid_df):
    """Return the grid patches that overlap an LGA.

    Parameters
    ----------
    lga_name : str
        Value to match in the ``LGA_NAME22`` column of ``lga_df``.
    lga_df : GeoDataFrame
        LGA boundaries; must have an ``LGA_NAME22`` column.
    grid_df : GeoDataFrame
        Sampling grid; must have a ``name`` column.

    Returns
    -------
    grid_list : list
        Names of the grid patches that intersect the LGA.
    grid_lga_df : GeoDataFrame
        The rows of ``grid_df`` whose name is in ``grid_list`` (the original
        patches, not the clipped intersection geometries).
    """
    lga_shape = lga_df[lga_df["LGA_NAME22"] == lga_name]
    grid_list = grid_df.overlay(lga_shape, how='intersection').name.to_list()
    # Vectorised membership test instead of a per-row Python lambda
    grid_lga_df = grid_df.loc[grid_df.name.isin(grid_list)]
    return grid_list, grid_lga_df

def get_metadata_grid(grid_name):
    """Load the S2 and Landsat metadata tables of one grid patch.

    The two CSV files are parsed with ``process_metadata``, tagged with a
    ``satellite`` column and stacked. ``cloud_probability`` is multiplied by
    100 and only rows with ``valids`` >= 0.8 are returned.
    """
    sources = (
        ("S2", f"gs://ml4floods_nema/0_DEV/1_Staging/GRID/{grid_name}/S2/s2info.csv"),
        ("Landsat", f"gs://ml4floods_nema/0_DEV/1_Staging/GRID/{grid_name}/Landsat/landsatinfo.csv"),
    )
    tables = []
    for satellite, csv_path in sources:
        table = process_metadata(csv_path)
        table["satellite"] = satellite
        tables.append(table)

    combined = pd.concat(tables, ignore_index=True)
    # Rescale by 100 (the plots label this axis in percent)
    combined["cloud_probability"] *= 100
    # Discard acquisitions with a low 'valids' value
    return combined[combined["valids"] >= .8]

def plot_data_timeseries(metadata, period_flood_start, period_flood_end, period_pre_flood_start,
                         ax=None):
    """Plot the time-series of available satellite data.

    Parameters
    ----------
    metadata : DataFrame
        Table with ``satellite``, ``s2available``, ``datetime`` and
        ``cloud_probability`` columns (as returned by ``get_metadata_grid``).
    period_flood_start, period_flood_end : datetime
        Flood window, drawn as a shaded band.
    period_pre_flood_start : datetime
        Start of the pre-flood window; sets the left edge of the x-range.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes, so existing calls made
        right after ``plt.subplots`` keep working without relying on a
        global variable named ``ax``.
    """
    if ax is None:
        ax = plt.gca()

    # One scatter series per (satellite, downloaded?) combination
    for sat, downloaded in product(["Landsat", "S2"], [True, False]):
        label = f"{sat} ({'' if downloaded else 'NOT '}Downloaded)"
        selection_mask = (metadata.satellite == sat) & (metadata.s2available == downloaded)
        color = "C0" if sat == "S2" else "C1"
        marker = "o" if downloaded else "x"
        ax.scatter(x=metadata[selection_mask].datetime,
                   y=metadata[selection_mask].cloud_probability,
                   label=label, c=color, marker=marker, s=100)

    # Format plot to look nice
    ax.legend()
    # Connecting line through all points, kept out of the legend
    metadata.plot(x="datetime", y="cloud_probability", ax=ax, legend=None)
    ax.axvspan(period_flood_start, period_flood_end, alpha=0.2)
    # Pad the x-range by 10% of the pre-flood-start to flood-end span on each side
    datespan = abs(period_flood_end - period_pre_flood_start)
    ax.set_xlim(period_pre_flood_start - datespan * .1, period_flood_end + datespan * .1)
    # Label through the axes object so the function only touches `ax`
    ax.set_ylabel("Mean Cloud Coverage (%)")
    ax.set_xlabel("")
    ax.grid()

def create_aggregate_floodmap(grid_fs, grid_list):
    """Create a mosaiced floodmap covering the specified grid patches.

    NOTE: reads the module-level ``session_name`` to build the bucket paths.

    Parameters
    ----------
    grid_fs : filesystem object
        Must provide ``glob(pattern)`` returning a list of paths without the
        ``gs://`` prefix.
    grid_list : iterable of str
        Names of the grid patches to include.

    Returns
    -------
    The result of ``spatial_aggregation`` over one post-flood map per patch.

    Raises
    ------
    IndexError
        If a patch has no ``postflood*.geojson`` file. The exception type is
        the one the bare ``[0]`` lookup used to raise; only the message is new.
    """
    floodmap_paths = []
    for g in grid_list:
        pattern = (f"gs://ml4floods_nema/0_DEV/1_Staging"
                   f"/operational/{session_name}/{g}"
                   "/pre_post_products/postflood*.geojson")
        matches = grid_fs.glob(pattern)
        if not matches:
            raise IndexError(f"[ERR] No post-flood map found for grid {g}: {pattern}")
        # Only the first match is used when several files fit the pattern
        floodmap_paths.append("gs://" + matches[0])
    return spatial_aggregation(floodmap_paths)

def query_ee_images(poly_outline, period_start, period_end):
    """
    Search GEE for imagery over a polygon within a date range.

    Thin wrapper around ``ee_query.query`` called with ``producttype="both"``
    and ``return_collection=True``. Returns the pair
    (table of available images, image collection).
    """
    available, image_collection = ee_query.query(
        poly_outline,
        period_start,
        period_end,
        producttype="both",
        return_collection=True,
    )
    return available, image_collection

def _add_image_layer(m, image_collection, images_day, satellite, day, eegeom_clip,
                     max_s2, max_landsat):
    """Add the images of one (day, satellite) group to the map as a three-band layer."""
    is_s2 = satellite.startswith("S2")
    # Keep only the images listed for this group and clip them to the AOI
    image_col_day_sat = (image_collection
                         .filter(ee.Filter.inList("title", images_day.index.tolist()))
                         .map(lambda x: x.clip(eegeom_clip)))
    bands = ["B11", "B8", "B4"] if is_s2 else ["B6", "B5", "B4"]
    # The two satellites are stretched with different 'max' values
    # Trailing False: presumably geemap's 'shown' flag (layer starts hidden) - confirm
    m.addLayer(image_col_day_sat,
               {"min": 0, "max": max_s2 if is_s2 else max_landsat, "bands": bands},
               f"{satellite}: {day}",
               False)


def build_interactive_map(grid_outline, floodmap, images_available_gee, images_available_gee_pre,
                          do_plot_day=False):
    """Build an interactive Folium map to visualise the LGA.

    NOTE: besides its arguments, this function reads the module-level
    variables ``collection``, ``collection_pre``, ``grid_list`` and
    ``grid_fs``; the cells defining them for the current LGA must run first.

    Parameters
    ----------
    grid_outline : shapely geometry
        Outline of the gridded LGA; used to clip imagery and centre the map.
    floodmap : GeoDataFrame
        Aggregated flood map with a ``class`` column.
    images_available_gee, images_available_gee_pre : DataFrame
        Flood-period and pre-flood image tables, indexed by image title and
        holding ``solarday`` and ``satellite`` columns.
    do_plot_day : bool, optional
        If True, also overlay a per-day, per-satellite water map for every
        flood-period image group (best effort; failures are reported and skipped).

    Returns
    -------
    The populated ``geemap.Map``.
    """
    water_classes = ['flood_trace', 'water']

    # Create a clipping Geometry
    eegeom_clip = ee.Geometry(shapely.geometry.mapping(grid_outline))
    # Filter for cloud polygons
    clouds = floodmap.loc[floodmap['class'].isin(['cloud'])]
    # Filter for 'flood_trace' and 'water' polygons
    floodmap_post_intersect = floodmap.loc[floodmap['class'].isin(water_classes)]
    # Initialise the map base layer at the LGA centroid
    # (coordinate pair is reversed - presumably (x, y) -> (lat, lon) for folium)
    m = geemap.Map(location=grid_outline.centroid.coords[0][-1::-1], zoom_start=10)

    # Load the pre-flood images
    for (day, satellite), images_day in images_available_gee_pre.groupby(["solarday", "satellite"]):
        _add_image_layer(m, collection_pre, images_day, satellite, day, eegeom_clip,
                         max_s2=3500, max_landsat=0.35)

    # Load the post-flood images and post-flood maps
    for (day, satellite), images_day in images_available_gee.groupby(["solarday", "satellite"]):
        _add_image_layer(m, collection, images_day, satellite, day, eegeom_clip,
                         max_s2=3000, max_landsat=0.3)
        if not do_plot_day:
            continue
        satellite_plot = 'S2' if satellite.startswith("S2") else 'Landsat'
        print(f'Aggregating floodmap for day {day} and {satellite_plot} images')
        try:
            floodmaps_aggregate = []
            for g in grid_list:
                floodmaps_aggregate.extend(["gs://"
                                            + f for f in grid_fs.glob(f"gs://ml4floods_nema/0_DEV/1_Staging"
                                            + f"/GRID/{g}/WF2_*_vec/{satellite_plot}/{day}.geojson")])
            floodmap_day_sat = spatial_aggregation(floodmaps_aggregate)
            floodmap_day_sat = floodmap_day_sat.loc[floodmap_day_sat['class'].isin(water_classes)]
            floodmap_day_sat.explore(m=m, name=f'Water map {satellite_plot} {day}', color='violet')
        except Exception as err:
            # Still best effort, but say what was skipped and let
            # KeyboardInterrupt / SystemExit propagate
            print(f"[WARN] Skipping water map for {satellite_plot} {day}: {err}")

    # Format the map and add controls
    #m.addLayer(eegeom_clip, name="AOI", color="red", style_kwds={"fillOpacity": 0.0})
    floodmap_post_intersect.explore(m=m, name="Water Post-Flood", color="violet")
    clouds.explore(m=m, name="Clouds", color="gray")
    folium.LayerControl(collapsed=False).add_to(m)
    return m

## Shepparton

Explore the Shepparton LGA and visualise available data.

In [None]:
# Choose Shepparton (LGA[0]) and fetch the grid polygons covering the LGA
lga_name = lga_req_list[0]
# grid_list (patch names) and grid_lga_df (patch rows) are reused by the cells below
grid_list, grid_lga_df = intersect_lga_grid(lga_name, lga_df, grid_df)
# Interactive view for picking a representative patch
grid_lga_df.explore()

**Visualise data availability in a representative grid patch**

For Shepparton choose GRID06189

In [None]:
# Choose a representative grid from the map above
grid_name = "GRID06189"

# Fetch the metadata from the bucket
# (get_metadata_grid has already dropped rows with valids < 0.8,
# so the count below is post-filter)
metadata = get_metadata_grid(grid_name)
print(f"There are {metadata.shape[0]} entries in the table.")
metadata.head(5)

In [None]:
# Plot the timeseries of data
# plot_data_timeseries draws onto the axes created on the next line
fig, ax = plt.subplots(1,1, figsize=(15,5))
plot_data_timeseries(metadata, period_flood_start, period_flood_end, period_pre_flood_start)

**Create the aggregate flood map and explore**

In [None]:
# Create an aggregate floodmap from the images in GCP
# (one postflood*.geojson per grid patch in grid_list)
floodmap = create_aggregate_floodmap(grid_fs, grid_list)
# Static quick-look of the mosaic
plot_utils.plot_floodmap(floodmap)

In [None]:
# Create an outline of the gridded LGA to query data
# dissolve() merges the patches; the first (merged) geometry is taken as the outline
grid_outline = grid_lga_df.dissolve().geometry.values[0]
# Bare expression so the notebook renders the shape
grid_outline

In [None]:
# Query GEE for imagery covering the flood period
images_available_gee, collection = query_ee_images(
    grid_outline, period_flood_start, period_flood_end
)

# Query GEE for imagery from before the flood period
images_available_gee_pre, collection_pre = query_ee_images(
    grid_outline, period_pre_flood_start, period_pre_flood_end
)

# Stop here if the flood period returned nothing
n_images_flood = len(images_available_gee)
assert n_images_flood > 0, "[ERR] No images found for date and location!"
print("Total images available:", n_images_flood)

In [None]:
# Build a Folium map
# do_plot_day=True also overlays the per-day, per-satellite water maps
m = build_interactive_map(grid_outline, floodmap, images_available_gee, images_available_gee_pre,
                         do_plot_day=True)

In [None]:
# Display the map (bare expression so the notebook renders it)
m

## Rochester

In [None]:
# Explore the second LGA (Rochester, in the 'Campaspe' LGA)
lga_name = lga_req_list[1]
# grid_list (patch names) and grid_lga_df (patch rows) are reused by the cells below
grid_list, grid_lga_df = intersect_lga_grid(lga_name, lga_df, grid_df)
# Interactive view for picking a representative patch
grid_lga_df.explore()

**Visualise data availability in a representative grid patch**

For Rochester choose GRID05748

In [None]:
# Choose a representative grid from the map above
grid_name = "GRID05748"

# Fetch the metadata from the bucket
# (get_metadata_grid has already dropped rows with valids < 0.8,
# so the count below is post-filter)
metadata = get_metadata_grid(grid_name)
print(f"There are {metadata.shape[0]} entries in the table.")
metadata.head(5)

In [None]:
# Plot the timeseries of data
# plot_data_timeseries draws onto the axes created on the next line
fig, ax = plt.subplots(1,1, figsize=(15,5))
plot_data_timeseries(metadata, period_flood_start, period_flood_end, period_pre_flood_start)

**Create the aggregate flood map and explore**

In [None]:
# Create an aggregate floodmap from the images in GCP
# (one postflood*.geojson per grid patch in grid_list)
floodmap = create_aggregate_floodmap(grid_fs, grid_list)
# Static quick-look of the mosaic
plot_utils.plot_floodmap(floodmap)

In [None]:
# Create an outline of the gridded LGA to query data
# dissolve() merges the patches; the first (merged) geometry is taken as the outline
grid_outline = grid_lga_df.dissolve().geometry.values[0]
# Bare expression so the notebook renders the shape
grid_outline

In [None]:
# Query GEE for imagery covering the flood period
images_available_gee, collection = query_ee_images(
    grid_outline, period_flood_start, period_flood_end
)

# Query GEE for imagery from before the flood period
images_available_gee_pre, collection_pre = query_ee_images(
    grid_outline, period_pre_flood_start, period_pre_flood_end
)

# Stop here if the flood period returned nothing
n_images_flood = len(images_available_gee)
assert n_images_flood > 0, "[ERR] No images found for date and location!"
print("Total images available:", n_images_flood)

In [None]:
# Build a Folium map and show
# do_plot_day is left at its default (False): no per-day water maps are added
m = build_interactive_map(grid_outline, floodmap, images_available_gee, images_available_gee_pre)
m

## Moree

In [None]:
# Explore the third LGA (Moree, in the 'Moree Plains' LGA)
lga_name = lga_req_list[2]
# grid_list (patch names) and grid_lga_df (patch rows) are reused by the cells below
grid_list, grid_lga_df = intersect_lga_grid(lga_name, lga_df, grid_df)
# Interactive view for picking a representative patch
grid_lga_df.explore()

**Visualise data availability in a representative grid patch**

For Moree choose GRID09458

In [None]:
# Choose a representative grid from the map above
# (the commented-out line is an alternative patch)
grid_name = "GRID09458"
#grid_name = "GRID09898"

# Fetch the metadata from the bucket
# (get_metadata_grid has already dropped rows with valids < 0.8,
# so the count below is post-filter)
metadata = get_metadata_grid(grid_name)
print(f"There are {metadata.shape[0]} entries in the table.")
metadata.head(5)

In [None]:
# Plot the timeseries of data
# plot_data_timeseries draws onto the axes created on the next line
fig, ax = plt.subplots(1,1, figsize=(15,5))
plot_data_timeseries(metadata, period_flood_start, period_flood_end, period_pre_flood_start)

**Create the aggregate flood map and explore**

In [None]:
# Create an aggregate floodmap from the images in GCP
# (one postflood*.geojson per grid patch in grid_list)
floodmap = create_aggregate_floodmap(grid_fs, grid_list)

In [None]:
# Show the floodmap (static quick-look of the mosaic)
plot_utils.plot_floodmap(floodmap)

In [None]:
# Create an outline of the gridded LGA to query data
# dissolve() merges the patches; the first (merged) geometry is taken as the outline
grid_outline = grid_lga_df.dissolve().geometry.values[0]
# Bare expression so the notebook renders the shape
grid_outline

In [None]:
# Query GEE for imagery covering the flood period
images_available_gee, collection = query_ee_images(
    grid_outline, period_flood_start, period_flood_end
)

# Query GEE for imagery from before the flood period
images_available_gee_pre, collection_pre = query_ee_images(
    grid_outline, period_pre_flood_start, period_pre_flood_end
)

# Stop here if the flood period returned nothing
n_images_flood = len(images_available_gee)
assert n_images_flood > 0, "[ERR] No images found for date and location!"
print("Total images available:", n_images_flood)

In [None]:
# Build a Folium map
# do_plot_day is left at its default (False): no per-day water maps are added
m = build_interactive_map(grid_outline, floodmap, images_available_gee, images_available_gee_pre)

In [None]:
# Show the map (bare expression so the notebook renders it)
m

## Forbes

In [None]:
# Explore the fourth LGA (Forbes)
lga_name = lga_req_list[3]
# grid_list (patch names) and grid_lga_df (patch rows) are reused by the cells below
grid_list, grid_lga_df = intersect_lga_grid(lga_name, lga_df, grid_df)
# Interactive view for picking a representative patch
grid_lga_df.explore()

**Visualise data availability in a representative grid patch**

For Forbes choose GRID08115

In [None]:
# Choose a representative grid from the map above
grid_name = "GRID08115"

# Fetch the metadata from the bucket
# (get_metadata_grid has already dropped rows with valids < 0.8,
# so the count below is post-filter)
metadata = get_metadata_grid(grid_name)
print(f"There are {metadata.shape[0]} entries in the table.")
metadata.head(5)

In [None]:
# Plot the timeseries of data
# plot_data_timeseries draws onto the axes created on the next line
fig, ax = plt.subplots(1,1, figsize=(15,5))
plot_data_timeseries(metadata, period_flood_start, period_flood_end, period_pre_flood_start)

**Create the aggregate flood map and explore**

In [None]:
# Create an aggregate floodmap from the images in GCP
# (one postflood*.geojson per grid patch in grid_list)
floodmap = create_aggregate_floodmap(grid_fs, grid_list)

In [None]:
# Show the floodmap (static quick-look of the mosaic)
plot_utils.plot_floodmap(floodmap)

In [None]:
# Create an outline of the gridded LGA to query data
# dissolve() merges the patches; the first (merged) geometry is taken as the outline
grid_outline = grid_lga_df.dissolve().geometry.values[0]
# Bare expression so the notebook renders the shape
grid_outline

In [None]:
# Query GEE for imagery covering the flood period
images_available_gee, collection = query_ee_images(
    grid_outline, period_flood_start, period_flood_end
)

# Query GEE for imagery from before the flood period
images_available_gee_pre, collection_pre = query_ee_images(
    grid_outline, period_pre_flood_start, period_pre_flood_end
)

# Stop here if the flood period returned nothing
n_images_flood = len(images_available_gee)
assert n_images_flood > 0, "[ERR] No images found for date and location!"
print("Total images available:", n_images_flood)

In [None]:
# Build a Folium map, including the per-day water maps
m = build_interactive_map(
    grid_outline,
    floodmap,
    images_available_gee,
    images_available_gee_pre,
    do_plot_day=True,
)

In [None]:
# Show the map (bare expression so the notebook renders it)
m