# Generate the sampling grid over Australia

This notebook generates the grid of square 'patches' used for sampling areas of interest. Each patch has a unique name and is considered immutable in the FloodMapper system. This means that the patches defined here **should not be changed**. However, the grid can be expanded, provided the new patches conform to the sampling scheme (spacing and size).

The geometries of these patches are stored in the database and in a file on the GCP bucket.

Note: the schema of the database is available in the file [floodmapper-db-schema.sql](floodmapper-db-schema.sql).

In [None]:
# Necessary imports
import os
# NOTE(review): presumably selects the Shapely backend (rather than PyGEOS)
# for geopandas; it is set here before geopandas is imported - confirm.
os.environ['USE_PYGEOS'] = '0'
import itertools
from shapely.geometry import box
import warnings
import numpy as np
import pandas as pd
import geopandas as gpd
from collections import defaultdict
# Silence UserWarnings whose message starts with 'Geometry is in a '
warnings.filterwarnings('ignore', 'Geometry is in a ', UserWarning)
# Silence every RuntimeWarning (an empty message pattern matches all messages)
warnings.filterwarnings('ignore', '', RuntimeWarning)
import logging
# Configure the root logger so only errors (and above) are reported
logger = logging.getLogger()
logger.setLevel(logging.ERROR)

from ml4floods.data import utils

# Do not treat the bucket as 'requester pays' by default
utils.REQUESTER_PAYS_DEFAULT = False

from dotenv import load_dotenv
from tqdm import tqdm
from db_utils import DB

## Load environment and project details

As with the other notebooks, we load credentials and project details from a hidden ```.env``` file.

In [None]:
# Load environment variables (including path to credentials) from '.env' file
env_file_path = "../.env"

# Uncomment for alternative version for Windows (r"" indicates raw string)
#env_file_path = r"C:/Users/User/floodmapper/.env"

# load_dotenv() returns a bool, so it can be asserted on directly
assert load_dotenv(dotenv_path=env_file_path), "[ERR] Failed to load environment!"

# Google credentials: the variable must be set and point at an existing key file
assert "GOOGLE_APPLICATION_CREDENTIALS" in os.environ, "[ERR] Missing $GOOGLE_APPLICATION_CREDENTIALS!"
assert "GS_USER_PROJECT" in os.environ, "[ERR] Missing $GS_USER_PROJECT!"
key_file_path = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
assert os.path.exists(key_file_path), f"[ERR] Google credential key file does not exist: \n{key_file_path} "

# Local base directory used to locate resources on disk
assert "ML4FLOODS_BASE_DIR" in os.environ, "[ERR] Missing $ML4FLOODS_BASE_DIR!"
base_path = os.environ["ML4FLOODS_BASE_DIR"]
assert os.path.exists(base_path), f"[ERR] Base path does not exist: \n{base_path} "

# Use .get() so that a missing BUCKET_URI is reported by the assertion below
# instead of surfacing as a bare KeyError
bucket_name = os.environ.get("BUCKET_URI")
assert bucket_name is not None and bucket_name != "", f"Bucket name not defined {bucket_name}"
print("[INFO] Successfully loaded FloodMapper environment.")

Set the path to the grid backup file.

In [None]:
# Name of the master grid file
outfile = "grid_australia.geojson"

# Build the location of the backup grid file on the GCP bucket, then
# normalise any Windows-style separators to forward slashes
grid_geojson_path = os.path.join(bucket_name, "0_DEV/1_Staging/operational", outfile)
grid_geojson_path = grid_geojson_path.replace("\\", "/")
print(f"[INFO] Will save master grid file to:\n\t{grid_geojson_path}")

## Load and visualise existing grid (if it exists)

If you want to re-make the grid from scratch, set the variable `force_remake = True`

In [None]:
# Set to True to ignore any grid file already on the bucket and rebuild
# the grid from scratch
force_remake = True

# Look for the grid file on the bucket
fs = utils.get_filesystem(grid_geojson_path)
grid_exists = fs.exists(grid_geojson_path)
m = 0  # placeholder; replaced by a map object when a grid is plotted

if force_remake or not grid_exists:
    # Either nothing is stored yet or the user asked for a rebuild:
    # downstream cells treat this as 'no existing grid'
    grid_exists = False
    existing_grid = None
    existing_grid_full_extent = None
    print("[INFO]: No existing grid was found on GCP (or forcing recreation).")
else:
    print("[INFO] Existing grid found on GCP bucket - plotting.")
    existing_grid = utils.read_geojson_from_gcp(grid_geojson_path)
    existing_grid = existing_grid.drop_duplicates()
    # Show only the bounding box of the whole grid rather than every patch
    full_extent_box = box(*existing_grid.total_bounds)
    existing_grid_full_extent = gpd.GeoDataFrame(geometry=[full_extent_box], crs="EPSG:4326")
    m = existing_grid_full_extent.explore(style_kwds={"fillOpacity": 0.3,})
    display(m)

## Extend the grid, or create from scratch

A candidate patch is added to the grid only if it is not already covered by the previous grid, i.e. less than 90% of its area intersects the existing patches.

**Define the size and overlap and bounding box of the grid**

The patches in the grid are defined in units of degrees under the [EPSG:4326](https://epsg.io/4326) coordinate system, which is also known as WGS84. This is an angular CRS, meaning that the patches are 'square' in angular units, but not when projected onto the surface of the earth. The projected patches will be increasingly distorted in width at latitudes away from the equator.

In [None]:
# Spacing between patch origins as (step_x, step_y), and the patch edge
# length; the edge is larger than the step, so neighbouring patches overlap
step = (0.20, 0.20)
size = 0.21

# Initial bounding box covering all of Australia, given as
# (long_min, lat_min, long_max, lat_max)
bounds_initial = (
    112.900000000000,
    -44.00516044138397,
    153.63872785102905,
    -10.244936010554465,
)

**Lay out the new grid, respecting existing grid patch names.**

In [None]:
# Get the outline of the existing grid and the highest patch number in use
existing_grid_union = None
grid_number = 0
if existing_grid is not None:
    existing_grid_union = existing_grid.geometry.unary_union
    # Compare patch numbers numerically: a string max() is lexicographic and
    # would rank "GRID99999" above "GRID100000", re-issuing existing names.
    # default=0 keeps an empty existing grid from raising.
    grid_number = max(
        (int(str(name).replace("GRID", "")) for name in existing_grid["name"]),
        default=0,
    )
# New patches are numbered from the next free number
grid_number += 1

# Lower-left corners of the candidate patches
# bounds_initial = [min_x, min_y, max_x, max_y]
x_starts = np.arange(bounds_initial[0], bounds_initial[2], step[0])
y_starts = np.arange(bounds_initial[1], bounds_initial[3], step[1])

pols_add = []
for x_left, y_bottom in itertools.product(x_starts, y_starts):

    # Generate patch of type Shapely box
    pol = box(x_left, y_bottom, x_left + size, y_bottom + size)

    # Skip the candidate if >= 90% of it is already covered by the existing grid
    if existing_grid_union is not None:
        intersection = pol.intersection(existing_grid_union)
        if (intersection.area / pol.area) >= 0.9:
            continue

    # Append new grid patch to the list, named with a zero-padded number
    pols_add.append({"geometry": pol, "name": f"GRID{grid_number:05d}"})
    grid_number += 1

# Collect the new patches in a GeoDataFrame (None when nothing was added)
grid_add = None
if pols_add:
    grid_add = gpd.GeoDataFrame(pols_add, crs="EPSG:4326")
    print(f"[INFO] Added {grid_add.shape[0]} grid patches.")
    print(grid_add.head())
else:
    print("[INFO] No new grid patches added.")

In [None]:
# Merge new and existing grids.
# 'do_upload' records whether the grid changed (i.e. new patches were added).
do_upload = True
if grid_add is not None:
    if existing_grid is not None:
        # Extend the existing grid with the new patches
        merged_grid = gpd.GeoDataFrame(
            pd.concat([existing_grid, grid_add], ignore_index=True),
            crs=existing_grid.crs,
        )
    else:
        # Brand new grid
        merged_grid = grid_add
elif existing_grid is not None:
    # Nothing new to add: keep the existing grid as-is
    merged_grid = existing_grid
    do_upload = False
else:
    # Neither an existing grid nor new patches: the following cells cannot
    # run, so fail here with a clear message instead of a later NameError
    do_upload = False
    raise RuntimeError(
        "[ERR] No grid available: no existing grid was loaded and no new "
        "patches were generated. Check 'bounds_initial' and 'step'."
    )

In [None]:
# Visualise the merged grid on an interactive map (patch outlines only, no fill)
merged_grid.explore(style_kwds={"fill":False})

## Load the LGA table into the database

For convenience, we want to mark each grid patch with the local government area it covers. If the patch covers more than one LGA, we make multiple entries in the 'grid_loc' table.

In [None]:
# Locate the LGA shapefile under the local base directory
# (separators normalised to forward slashes)
lga_file = os.path.join(base_path, "resources/LGAs/LGA_2022_AUST_GDA2020.shp")
lga_file = lga_file.replace("\\", "/")

# Read the file and discard any row that contains a missing value
lga_gdf = gpd.read_file(lga_file).dropna()
lga_gdf.head()

In [None]:
# Keep only the LGAs whose geometry is valid, then report the table size
valid_geometry = lga_gdf.geometry.is_valid
lga_gdf = lga_gdf[valid_geometry]
lga_gdf.shape

In [None]:
# Connect to the database; DB is given the path to the same '.env' file
# loaded earlier in this notebook so it can find its credentials
db_conn = DB(env_file_path)

In [None]:
# Empty the 'lgas_info' table before re-inserting the LGA entries
query = "DELETE FROM lgas_info"
db_conn.run_query(query)

In [None]:
%%time
# Insert the LGA entries into the database using a batch query
#https://naysan.ca/2020/05/09/pandas-to-postgresql-using-psycopg2-bulk-insert-performance-benchmark/
data = []
print("[INFO] Formatting rows:")
for row in tqdm(lga_gdf.itertuples(), total=len(lga_gdf)):
    data.append(
        (row.LGA_CODE22, row.LGA_NAME22, row.STE_CODE21, row.STE_NAME21,
         row.AUS_CODE21, row.AUS_NAME21, row.AREASQKM, row.LOCI_URI21,
         row.SHAPE_Leng, row.SHAPE_Area, str(row.geometry))
    )
query = (f"INSERT INTO lgas_info"
         f"(lga_code22, lga_name22, ste_code21, ste_name21, "
         f"aus_code21, aus_name21, areasqkm, loci_uri21, "
         f"shape_leng, shape_area, geometry_col) "
         f"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ST_GeomFromText(%s, 25832)) "
         f"ON CONFLICT (lga_name22) DO NOTHING;")
print("[INFO] Inserting table in batch mode.", flush=True)
db_conn.run_batch_insert(query, data)

## Mark each grid patch with the LGA it covers

In [None]:
# Analyse the intersection of the grid with the LGAs.
# A patch is assigned to an LGA when at least 'overlap_frac' of the patch
# area lies inside that LGA.
overlap_frac = 0.01
all_rows = []
res_df = None  # last per-LGA result; stays None if no LGA is processed

# Build the spatial index once so each LGA is only tested against the
# patches whose bounding boxes are nearby, not against every patch
grid_sindex = merged_grid.sindex

for name, g in tqdm(zip(lga_gdf.LGA_NAME22, lga_gdf.geometry), total=len(lga_gdf)):
    # Skip missing or empty geometries
    if g is None or g.is_empty:
        continue

    # Positions of the patches that intersect this LGA (sorted to keep
    # the patches in their original table order)
    candidate_pos = sorted(grid_sindex.query(g, predicate="intersects"))
    candidates = merged_grid.iloc[candidate_pos]

    # Fraction of each candidate patch's area that falls inside the LGA
    patch_geoms = candidates.geometry
    area_overlap_lga = patch_geoms.intersection(g).area / patch_geoms.area
    keep = (area_overlap_lga >= overlap_frac).to_numpy()

    # Work on a copy so tagging the rows never writes into 'merged_grid'
    res_df = candidates[keep].copy()
    res_df['LGA_NAME22'] = name
    all_rows.extend(res_df.to_dict(orient='records'))

# Show the patches matched to the last LGA processed
print("Residual LGAs:")
res_df

In [None]:
# Attach the LGA names to the grid table. A patch that touches several
# LGAs appears once per LGA; the inner join drops patches matched to no LGA.
df = pd.DataFrame(all_rows)
lga_by_patch = df[['name', 'LGA_NAME22']]
all_g = merged_grid.merge(lga_by_patch, on='name', how='inner')

# Wrap as a GeoDataFrame and remove exact duplicate rows
final_grid_gdf = gpd.GeoDataFrame(all_g, geometry='geometry').drop_duplicates()
final_grid_gdf

In [None]:
# If the grid was loaded from an existing file, 'merged_grid' already had a
# 'LGA_NAME22' column, so the merge above produced two suffixed columns:
#   LGA_NAME22_x - value carried over from the old grid file
#                  (missing for patches that were newly added)
#   LGA_NAME22_y - value freshly computed from the LGA intersection
# Keep the freshly computed name so that new patches are labelled too, then
# drop the duplicate rows left behind once the stale column is removed.
# Checking for the columns (rather than 'grid_exists') also covers an
# existing grid file that carries no LGA column.
if {"LGA_NAME22_x", "LGA_NAME22_y"}.issubset(final_grid_gdf.columns):
    final_grid_gdf = (
        final_grid_gdf
        .drop(columns="LGA_NAME22_x")
        .rename(columns={"LGA_NAME22_y": "LGA_NAME22"})
        .drop_duplicates()
    )
final_grid_gdf

In [None]:
# Show the final grid (one row per patch/LGA pair) on an interactive map
final_grid_gdf.explore()

Note that the grid table now has multiple entries for patches that overlap LGAs. This means the grid for each LGA can be selected by filtering the table by LGA Name.

In [None]:
# Example: show the patches assigned to the Cessnock LGA together with
# the LGA outline on the same map
lga = "Cessnock"

# Patches labelled with this LGA form the base map
lga_patches = final_grid_gdf[final_grid_gdf.LGA_NAME22 == lga]
m = lga_patches.explore(style_kwds={"fillOpacity": 0.3,})

# Draw the LGA geometry onto the existing map
lga_outline = lga_gdf[lga_gdf.LGA_NAME22 == lga]
lga_outline.explore(m=m)
m

## Upload new grid to GCP

Upload the new grid to GCP as a GeoJSON file

In [None]:
# Write the final grid (with LGA names) to the GeoJSON file on the bucket.
# NOTE(review): the 'do_upload' flag set when merging the grids is not
# checked here, so the file is written on every run - confirm this is intended.
print(f"[INFO] Uploading grid to: \n\t{grid_geojson_path}")
utils.write_geojson_to_gcp(grid_geojson_path, final_grid_gdf)

## Update the grid in the database.

Now we run a SQL command to re-write the 'grid_loc' table in the database. This will take a few minutes to complete.

In [None]:
# Empty the 'grid_loc' table before re-inserting the grid
query = "DELETE FROM grid_loc"
db_conn.run_query(query)

In [None]:
# Insert the new grid entries in batch mode
print("[INFO] Formatting rows:")
# One (patch name, LGA name, geometry string) tuple per row of the final grid
data = [
    (row.name, row.LGA_NAME22, str(row.geometry))
    for row in tqdm(final_grid_gdf.itertuples(), total=len(final_grid_gdf))
]

# Geometries are parsed with SRID 4326; a (patch_name, lga_name22) pair
# that already exists is skipped
query = (
    "INSERT INTO grid_loc"
    "(patch_name, lga_name22, geometry) "
    "VALUES (%s, %s, ST_GeomFromText(%s, 4326)) "
    "ON CONFLICT (patch_name, lga_name22) DO NOTHING;"
)
print("[INFO] Inserting table in batch mode.", flush=True)
db_conn.run_batch_insert(query, data)

Once the INSERT query has completed, we can check for a successful upload by querying and plotting the grid.

In [None]:
# Read the whole 'grid_loc' table back, with the geometry returned as text
query = "SELECT patch_name, lga_name22, ST_AsText(geometry) FROM grid_loc"
grid_df = db_conn.run_query(query, fetch=True)

n_rows = len(grid_df)
print(f"[INFO] Returned {n_rows} rows.")

In [None]:
# Turn the query result into a GeoDataFrame: remove the text column from
# the table and store its parsed geometries in a 'geometry' column
wkt_column = grid_df.pop('st_astext')
grid_df['geometry'] = gpd.GeoSeries.from_wkt(wkt_column)
grid_gdf = gpd.GeoDataFrame(grid_df, geometry='geometry', crs="EPSG:4326")
grid_gdf

In [None]:
# Plot an interactive map of the grid read back from the database
# (semi-transparent fill)
grid_gdf.explore(style_kwds={"fillOpacity": 0.3,})

In [None]:
# Clean up by closing the connection to the database opened earlier
# in this notebook
db_conn.close_connection()