# NDWI-tide correlation tide model rankings

This notebook correlates multiple global ocean tide models against satellite-derived Normalised Difference Water Index (NDWI) data at regularly spaced points along the coastline, and returns tide model performance and rankings in a standardised format for further analysis.

## Getting started
Set working directory to top level of repo to ensure links work correctly:

In [None]:
cd ../..

Install additional packages directly from the requirements file:

In [None]:
pip install -r dev-requirements.in --quiet

In [None]:
# pip install -e /home/jovyan/Robbi/eo-tides/

In [None]:
# pip install git+https://github.com/tsutterley/pyTMD.git

Now restart the notebook kernel so that the newly installed packages are picked up.

### Load packages

In [None]:
%load_ext autoreload
%autoreload 2


import datacube
import geopandas as gpd
import pandas as pd
from dea_tools.dask import create_local_dask_cluster
from eo_tides.utils import list_models
from intertidal.ensemble import correlation_loop
from odc.geo.geom import Geometry
from tqdm.auto import tqdm

## Setup


### Set analysis parameters

In [None]:
# --- Intertidal Elevation variables ---

# Analysis period: a non La Nina/El Nino period, informed by
# http://www.bom.gov.au/climate/enso/soi/?
start_date = "2017"
end_date = "2019"

# Spatial resolution used for output files
resolution = 10

# Coordinate Reference System (CRS) to use for output files
crs = "EPSG:3577"

# Sensor selection: include Sentinel-2 and/or Landsat data in the analysis?
include_s2 = True
include_ls = True

# Filter to remove poorly georeferenced scenes?
filter_gqa = True

# Directory containing tide model files
directory = "/var/share/tide_models/"

# --- Models to run ---
models = [
    # EOT
    "EOT20",
    # FES
    "FES2012",
    "FES2014_extrapolated",
    "FES2022_extrapolated",
    # HAMTIDE
    "HAMTIDE11",
    # GOT
    "GOT4.10",
    "GOT5.6_extrapolated",
    # TPXO
    "TPXO10-atlas-v2-nc",
    "TPXO8-atlas-nc",
    "TPXO9-atlas-v5-nc",
]

In [None]:
# Report the tide models found in `directory` as a sanity check before running
# the analysis (show_supported=False presumably restricts the listing to models
# actually present - confirm against the eo_tides docs). The trailing `;`
# stops the notebook from echoing the call's return value.
list_models(directory=directory, show_supported=False);

### Load correlation points

In [None]:
# import pyogrio
# processed_gdf = gpd.read_file("data/raw/rankings_correlations_2017-2019.fgb", engine="pyogrio")
# missing = processed_gdf.query("missing > 0").point_id.tolist()

# # Load points
# points_gdf = gpd.read_file("data/raw/tide_correlation_points_input.geojson").to_crs("EPSG:3577")
# points_gdf, name = points_gdf.query("id in @missing"), "nan_missing"
# points_gdf.plot()

# # Apply buffer
# points_gdf["geometry"] = points_gdf.geometry.buffer(2500)

In [None]:
# Load the input points and reproject them to EPSG:3577 so that the buffer
# distance applied below is expressed in that CRS's units
points_gdf = gpd.read_file("data/raw/tide_correlation_points_input.geojson").to_crs("EPSG:3577")

# Label inserted into the exported file name
# (`data/raw/tide_correlation_points_{name}.geojson`). It defaults to the full
# dataset so the export step cannot fail with a NameError when no subset line
# below has been uncommented.
name = "all"

# Select subset: uncomment ONE line to override both `points_gdf` and `name`
# points_gdf, name = points_gdf.iloc[0:3], "test"
# points_gdf, name = points_gdf.iloc[0:750], "claire_part1"
# points_gdf, name = points_gdf.iloc[750:1500], "claire_part2"
# points_gdf, name = points_gdf.iloc[1500:2250], "robbi_part1"
# points_gdf, name = points_gdf.iloc[2250:3059], "robbi_part2"
# points_gdf, name = points_gdf.cx[1162870:1192673, -4273329:-4236761].head(1), "westernport"

# missing = points_gdf.set_index("id").index.difference(tide_correlation_points.set_index("point_id").index)
# points_gdf, name = points_gdf.query("id in @missing"), "robbi_missing"

# Quick visual check of the selected points (before buffering)
points_gdf.plot()

# Apply buffer: each point becomes a polygon of radius 2500 CRS units, which is
# the area passed to the correlation analysis for that point
points_gdf["geometry"] = points_gdf.geometry.buffer(2500)

## Run correlation analysis

In [None]:
# Connect to datacube
dc = datacube.Datacube(app="NDWI-tide correlations")

# Create local dask cluster to improve data load time
client = create_local_dask_cluster(return_client=True)

# One dataframe of per-model results is collected for every successful point
out_list = []

try:
    for _, row in tqdm(points_gdf.iterrows(), total=points_gdf.shape[0]):
        # Bind the ID before entering the `try` so the failure message below
        # always refers to the point currently being processed
        point_id = row.id
        print(f"Processing ID {point_id}")

        try:
            geom = Geometry(row.geometry, crs="EPSG:3577")

            output_df = correlation_loop(
                dc,
                point_id,
                geom,
                start_date,
                end_date,
                resolution,
                crs,
                filter_gqa,
                models,
                directory,
                min_freq=0.01,
                max_freq=0.99,
                # corr_method="pearson",
                # apply_threshold=True,
                # return_arrays=True,
            )

            # Record the centre of the buffered polygon as the point location
            centroid = row.geometry.centroid
            output_df["x"] = centroid.x
            output_df["y"] = centroid.y

            out_list.append(output_df)

            # Restart the cluster between points (presumably to release worker
            # memory before the next load - confirm)
            client.restart()

        except Exception as e:
            # Best-effort batch run: report the failure and carry on with the
            # remaining points rather than aborting the whole job
            print(f"{point_id} failed with {e}; skipping")
finally:
    # Always shut the client down, even if the loop is interrupted part-way
    client.close()

### Combine and process data

In [None]:
# Stack the per-point outputs, then pivot so that each tide model gets its own
# column holding the "tide_m" values, keeping x/y/valid_perc alongside
stacked_df = pd.concat(out_list)
wide_df = stacked_df.set_index(["x", "y", "valid_perc"], append=True).unstack("tide_model")["tide_m"]
wide_df = wide_df.assign(source="ndwi correlation", statistic="correlation")
wide_df = wide_df.reset_index(["x", "y", "valid_perc"])

# Fix the column order: metadata first, then one column per requested model
# (models absent from the results become all-NaN columns)
leading_cols = ["x", "y", "valid_perc", "source", "statistic"]
combined_df = wide_df.reindex(leading_cols + models, axis=1)

# Per-point summary statistics across the model columns
model_scores = combined_df[models]
lowest = model_scores.min(axis=1)
highest = model_scores.max(axis=1)
combined_df = combined_df.assign(
    **{
        "min": lowest,
        "max": highest,
        "diff": highest - lowest,
        "ave": model_scores.mean(axis=1),
        "median": model_scores.median(axis=1),
        "std": model_scores.std(axis=1),
        "missing": model_scores.isna().sum(axis=1),
    }
)

# Rank models within each point; the highest value receives rank 1
rank_df = model_scores.rank(axis=1, ascending=False).add_prefix("rank_")
combined_df = pd.concat([combined_df, rank_df], axis=1)

# Keep only the points that have a rank for at least one model
ranked_index = combined_df.filter(regex="^rank").dropna(axis=0, how="all").index
combined_df = combined_df.loc[ranked_index]

# Best/worst model per point; `.str[5:]` strips the leading "rank_" prefix
rank_only = combined_df.filter(regex="^rank")
combined_df["top_model"] = rank_only.idxmin(axis=1).str[5:]
combined_df["worst_model"] = rank_only.idxmax(axis=1).str[5:]
combined_df

### Export

In [None]:
# Rebuild point geometries from the stored x/y columns (EPSG:3577 coordinates)
point_geoms = gpd.points_from_xy(combined_df.x, combined_df.y, crs="EPSG:3577")

# Attach the geometries and reproject to EPSG:4326 for export
combined_gdf = gpd.GeoDataFrame(data=combined_df, geometry=point_geoms).to_crs("EPSG:4326")

# Write this batch to a file labelled with the subset name, then show it on an
# interactive map
out_path = f"data/raw/tide_correlation_points_{name}.geojson"
combined_gdf.to_file(out_path)
combined_gdf.explore()

### Combine (once all data is processed)

In [None]:
# pip install pyogrio

In [None]:
# Directory holding the per-batch outputs of the correlation analysis
correlations_dir = "/gdata1/projects/coastal/intertidal/correlations"

# Batches that together make up the original run
batch_names = [
    "claire_part1a",
    "claire_part1ba",
    "claire_part2",
    "robbi_part1",
    "robbi_part2",
    "robbi_missing",
]

# Load original data
tide_correlation_points_all = (
    pd.concat([
        gpd.read_file(f"{correlations_dir}/tide_correlation_points_{batch}.geojson") for batch in batch_names
    ])
    # Old files included GOT5.5 which was dropped for being too similar to GOT5.6
    .drop(["GOT5.5_extrapolated", "rank_GOT5.5_extrapolated"], axis=1)
    .set_index("point_id")
)

# Load fixed points and update original data in-place. Both frames are indexed
# by point_id so that `update` aligns rows on the point identifier.
tide_correlation_points_fix = gpd.read_file(
    f"{correlations_dir}/tide_correlation_points_nan_missing.geojson"
).set_index("point_id")
tide_correlation_points_all.update(tide_correlation_points_fix)

# Continue with the patched data. `point_id` is restored as a regular column
# because it is selected by name in the column ordering below. (Previously the
# following steps referenced `tide_correlation_points` without this cell ever
# defining it, so the patched frame was never used.)
tide_correlation_points = tide_correlation_points_all.reset_index()

# Recalculate additional columns from the patched model values
model_values = tide_correlation_points[models]
tide_correlation_points["min"] = model_values.min(axis=1)
tide_correlation_points["max"] = model_values.max(axis=1)
tide_correlation_points["diff"] = tide_correlation_points["max"] - tide_correlation_points["min"]
tide_correlation_points["ave"] = model_values.mean(axis=1)
tide_correlation_points["median"] = model_values.median(axis=1)
tide_correlation_points["std"] = model_values.std(axis=1)
tide_correlation_points["missing"] = model_values.isna().sum(axis=1)

# Recalculate ranks (highest value receives rank 1) and overwrite the existing
# rank_* columns with the new values
tide_correlation_points.update(model_values.rank(axis=1, ascending=False).add_prefix("rank_"))

# Best/worst model per point; `.str[5:]` strips the leading "rank_" prefix
rank_values = tide_correlation_points.filter(regex="^rank")
tide_correlation_points["top_model"] = rank_values.idxmin(axis=1).str[5:]
tide_correlation_points["worst_model"] = rank_values.idxmax(axis=1).str[5:]

# Convert x/y to lat/lon by replacing them with the coordinates of each
# point's geometry
tide_correlation_points["x"] = tide_correlation_points.geometry.x
tide_correlation_points["y"] = tide_correlation_points.geometry.y

# Reorder columns
cols = [
    "point_id",
    "x",
    "y",
    "valid_perc",
    "source",
    "statistic",
    *models,
    "min",
    "max",
    "diff",
    "ave",
    "median",
    "std",
    "missing",
    *[f"rank_{m}" for m in models],
    "top_model",
    "worst_model",
    "geometry",
]
tide_correlation_points = tide_correlation_points[cols]

# Export
tide_correlation_points.to_file("data/raw/rankings_correlations_2017-2019.fgb", engine="pyogrio")
tide_correlation_points.head()

#### Combine with altimetry

In [None]:
# geopandas is imported again here, presumably so that this section can be run
# on its own in a fresh kernel
import geopandas as gpd

# Load the NDWI-correlation rankings exported by the previous section, plus the
# altimetry rankings they are combined with (the altimetry file is not produced
# by the code visible in this notebook)
tide_correlation_points = gpd.read_file("data/raw/rankings_correlations_2017-2019.fgb", engine="pyogrio")
coastal_altimetry_points = gpd.read_file("data/raw/rankings_altimetry_2017-2019_v2.fgb", engine="pyogrio")

In [None]:
import pandas as pd

# Tide model value columns
value_cols = [
    "EOT20",
    "FES2012",
    "FES2014_extrapolated",
    "FES2022_extrapolated",
    "HAMTIDE11",
    "GOT4.10",
    "GOT5.6_extrapolated",
    "TPXO10-atlas-v2-nc",
    "TPXO8-atlas-nc",
    "TPXO9-atlas-v5-nc",
]

# Matching rank columns, in the same order as the value columns
rank_cols = [f"rank_{model}" for model in value_cols]

# Stack both rankings datasets and replace their separate point IDs with one
# fresh running index named "point_id"
stacked_points = pd.concat([tide_correlation_points, coastal_altimetry_points])
stacked_points = stacked_points.drop("point_id", axis=1).reset_index(drop=True).rename_axis("point_id")

# Discard points that have no rank for any model
combined_gdf = stacked_points.dropna(subset=rank_cols, how="all")

# Convert dtypes: round float64 columns to 3 decimals and store as float32
float64_cols = combined_gdf.select_dtypes(include="float64").columns.tolist()
combined_gdf[float64_cols] = combined_gdf[float64_cols].round(3).astype("float32")

combined_gdf.to_file("data/raw/rankings_ensemble_2017-2019_v2.fgb", engine="pyogrio")

In [None]:
# Preview the combined table: limit the output to 5 rows but show every column.
# The display options only apply inside this `with` block.
with pd.option_context("display.max_rows", 5, "display.max_columns", None):
    display(combined_gdf)