# Ensemble tide model tide gauge validation

This code compares tides modelled using custom ensemble tide modelling against results from various other global ocean tide models at [Global Extreme Sea Level Analysis (GESLA) tide gauges](https://gesla787883612.wordpress.com/) across Australia.

> Haigh, I.D., Marcos, M., Talke, S.A., Woodworth, P.L., Hunter, J.R., Hague, B.S., Arns, A., Bradshaw, E. and Thompson, P., 2023. GESLA version 3: A major update to the global higher‐frequency sea‐level dataset. Geoscience Data Journal, 10(3), pp.293-314.

## Getting started
Set working directory to top level of repo to ensure links work correctly:

In [None]:
cd ../..

Install additional packages directly from the requirements file:

In [None]:
pip install -r dev-requirements.in --quiet

In [None]:
# pip install -e /home/jovyan/Robbi/eo-tides

### Load packages

In [None]:
%load_ext autoreload
%autoreload 2

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from eo_tides.model import model_tides
from eo_tides.validation import eval_metrics, load_gauge_gesla

tide_model_dir = "/var/share/tide_models/"

### GESLA tide gauge data


In [None]:
# # Load tide gauge metadata
# metadata_df, metadata_gdf = _load_gauge_metadata(metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv")
# metadata_gdf.to_file("gesla_stations.geojson")

In [None]:
# Read the Collection 3 summary grid and take its overall bounding box,
# which is used as the spatial extent when loading tide gauges
c3_path = "https://data.dea.ga.gov.au/derivative/ga_summary_grid_c3.geojson"
c3_grid = gpd.read_file(c3_path)
grid_bounds = c3_grid.total_bounds
xmin, ymin, xmax, ymax = grid_bounds

In [None]:
# Load GESLA tide gauge records for every site inside the summary grid
# extent, covering 2017 to 2019
x_extent = (xmin, xmax)
# NOTE(review): y is deliberately passed as (max, min), exactly as in the
# original call - confirm this ordering against `load_gauge_gesla`
y_extent = (ymax, ymin)
gauge_df = load_gauge_gesla(
    x=x_extent,
    y=y_extent,
    time=("2017", "2019"),
    data_path="/gdata1/data/sea_level/GESLA3.0_ALL",
    metadata_path="/gdata1/data/sea_level/GESLA3_ALL 2.csv",
)

In [None]:
# Clean to restrict to Australia and remove duplicate sites/contributors
bad_sites = (
    "H033007A",  # Cape Ferguson, duplicate
    "H057022B",  # Thursday Island, duplicate
    "PLPEE01",  # Peel Inlet, inland
    "DVHAR01",  # Harvey, inland
    "H060010A",  # Half Tide Tug, duplicate
)
bad_contributers = ("UHSLC",)

# Take an explicit copy of the filtered rows so that the in-place
# normalisation below operates on an independent frame (avoids pandas'
# SettingWithCopyWarning when assigning into a filtered result)
gauge_df = gauge_df.query(
    "(country == 'AUS') & (site_code not in @bad_sites) & (contributor_abbreviated not in @bad_contributers)"
).copy()

# Normalise to mean sea level: subtract each site's own mean so that every
# gauge record is centred on zero
gauge_df["sea_level"] -= gauge_df.groupby(["site_code"])["sea_level"].transform("mean")

# Select 3-hourly subset
# gauge_df = gauge_df.iloc[::3]

In [None]:
# Export stations being processed to file.
# One row per site: take the first record of each site and drop the first
# three columns, leaving the remaining per-site columns
sites_df = gauge_df.groupby("site_code").first().iloc[:, 3:]

# Build point geometries from longitude/latitude. The CRS is set explicitly
# so the exported layer is georeferenced; EPSG:4326 matches how the same
# longitude/latitude values are passed to `model_tides` later in this notebook
# NOTE(review): assumes gauge coordinates are WGS84 lon/lat - confirm
sites_gdf = gpd.GeoDataFrame(
    data=sites_df,
    geometry=gpd.points_from_xy(sites_df.longitude, sites_df.latitude),
    crs="EPSG:4326",
)
sites_gdf.to_file("gesla_stations_aus.geojson")

## Model tides at each gauge

In [None]:
def _rank_equals(target):
    """Return a function testing whether the "rank" entry of its input equals `target`."""
    return lambda ranked: ranked["rank"] == target


def _rank_at_most(limit):
    """Return a function testing whether the "rank" entry of its input is <= `limit`."""
    return lambda ranked: ranked["rank"] <= limit


# Custom ensemble definitions, keyed by output column name. Each function
# takes an object with a "rank" entry and returns the comparison result.
# NOTE(review): presumably `model_tides` uses the result to select/weight
# ranked models for each ensemble - confirm against the eo-tides docs
ensemble_funcs = {
    "ensemble-top": _rank_equals(1),
    "ensemble-mean-top3": _rank_at_most(3),
    "ensemble-mean": _rank_at_most(10),
    "ensemble-bottom": _rank_equals(9),  # 10,
    # "ensemble-mean-top5": lambda x: x["rank"] <= 5,
    # "ensemble-mean-weighted": lambda x: 8 - x["rank"],
    # "ensemble-mean-weightedtop3": lambda x: (4 - x["rank"]).clip(0, 3),
}

# Select subset of data to model (for testing)
# gauge_df_subset = gauge_df.head(100)
gauge_df_subset = gauge_df

# Individual tide models to run; the same set is also supplied as the
# members of the ensemble
individual_models = [
    "EOT20",
    "FES2012",
    "FES2014_extrapolated",
    "FES2022_extrapolated",
    # "HAMTIDE11",
    "GOT4.10",
    "GOT5.6_extrapolated",
    "TPXO10-atlas-v2-nc",
    "TPXO8-atlas-nc",
    "TPXO9-atlas-v5-nc",
]

# Shorter column names used for the rest of the analysis
short_model_names = {
    "FES2014_extrapolated": "FES2014",
    "FES2022_extrapolated": "FES2022",
    "TPXO10-atlas-v2-nc": "TPXO10",
    "TPXO8-atlas-nc": "TPXO8",
    "TPXO9-atlas-v5-nc": "TPXO9",
    "GOT5.6_extrapolated": "GOT5.6",
}

# Model tides at every gauge observation (one modelled value per gauge
# location/time pair), returned with one column per model, then shorten
# the column names
tide_df = model_tides(
    x=gauge_df_subset.longitude,
    y=gauge_df_subset.latitude,
    time=gauge_df_subset.index.get_level_values("time"),
    model=[*individual_models, "ensemble"],
    directory=tide_model_dir,
    mode="one-to-one",
    output_format="wide",
    ensemble_func=ensemble_funcs,
    # ranking_points="/home/jovyan/Robbi/dea-intertidal/data/raw/rankings_ensemble_2017-2019.fgb",
    ranking_points="/home/jovyan/Robbi/dea-intertidal/data/raw/rankings_ensemble_2017-2019_v2.fgb",
    ensemble_models=list(individual_models),
    p=2,
).rename(short_model_names, axis=1)
tide_df

Add tide gauge data as a new column, then process back into long format:

In [None]:
# Attach the gauge observations and site identifiers to the modelled tides.
# Values are assigned positionally (`.values`), relying on `tide_df` having
# the same row order as `gauge_df_subset`
gauge_columns = {
    "site_code": gauge_df_subset.index.get_level_values("site_code").values,
    "site_name": gauge_df_subset.site_name.values,
    "tide_gauge": gauge_df_subset.sea_level.values,
}
for column_name, column_values in gauge_columns.items():
    tide_df[column_name] = column_values

# Columns holding modelled tide heights: individual models first, then the
# custom ensembles
single_model_columns = [
    "EOT20",
    "FES2012",
    "FES2014",
    "FES2022",
    # "HAMTIDE11",
    "GOT4.10",
    "GOT5.6",
    "TPXO10",
    "TPXO8",
    "TPXO9",
]
ensemble_columns = [
    "ensemble-top",
    "ensemble-mean-top3",
    "ensemble-mean",
    "ensemble-bottom",
    # "ensemble-mean-weightedtop3",
    # "ensemble-mean-top5",
    # "ensemble-mean-weighted",
]

# Reshape to long format: one row per observation per model, keeping the
# original index and the gauge/site columns alongside each modelled value
tide_df_long = tide_df.melt(
    ignore_index=False,
    id_vars=["tide_gauge", "site_code", "site_name"],
    value_vars=single_model_columns + ensemble_columns,
    value_name="tide_m",
)

tide_df_long

### Inspect model rankings
To do: expose these via `eo-tides` code

In [None]:
# from eo_tides.utils import idw

# ensemble_func=None
# ensemble_top_n=3
# ranking_points="https://dea-public-data-dev.s3-ap-southeast-2.amazonaws.com/derivative/dea_intertidal/supplementary/rankings_ensemble_2017-2019.fgb"
# ranking_valid_perc=0.02
# crs="EPSG:4326"


# # Extract x and y coords from dataframe
# x = [145.2228178]
# y = [-38.3739033]

# # Load model ranks points and reproject to same CRS as x and y
# ensemble_models = [
#                 "EOT20",
#                 "FES2012",
#                 "FES2014_extrapolated",
#                 "FES2022_extrapolated",
#                 "GOT4.10",
#                 "GOT5.6_extrapolated",
#                 "TPXO10-atlas-v2-nc",
#                 "TPXO8-atlas-nc",
#                 "TPXO9-atlas-v5-nc",
#             ]
# model_ranking_cols = [f"rank_{m}" for m in ensemble_models]

# model_ranks_gdf = (
#     gpd.read_file(ranking_points, engine="pyogrio")
#     .to_crs(crs)
#     .query(f"valid_perc > {ranking_valid_perc}")
#     .dropna(how="all")[model_ranking_cols + ["geometry"]]
# )


# # Use points to interpolate model rankings into requested x and y
# idw_kwargs = {"p": 2}
# id_kwargs_str = "" if idw_kwargs == {} else idw_kwargs
# print(f"Interpolating model rankings using IDW interpolation {id_kwargs_str}")
# ensemble_ranks_df = (
#     # Run IDW interpolation on subset of ranking columns
#     pd.DataFrame(
#         idw(
#             input_z=model_ranks_gdf[model_ranking_cols],
#             input_x=model_ranks_gdf.geometry.x,
#             input_y=model_ranks_gdf.geometry.y,
#             output_x=x,
#             output_y=y,
#             **idw_kwargs,
#         ),
#         columns=model_ranking_cols,
#     )
#     .assign(x=x, y=y)
#     # Drop any duplicates then melt columns into long format
#     .drop_duplicates()
#     .melt(id_vars=["x", "y"], var_name="tide_model", value_name="rank")
#     # Remove "rank_" prefix to get plain model names
#     .replace({"^rank_": ""}, regex=True)
#     # Set index columns and rank across groups
#     .set_index(["tide_model", "x", "y"])
#     .groupby(["x", "y"])
#     .rank()
# )
# ensemble_ranks_df.sort_values("rank").index

## Analysis


### Overall accuracy stats for each model

In [None]:
# Compare modelled against observed tide heights for each model, pooling
# all gauges together
per_model = tide_df_long.groupby(["tide_model"])[["tide_gauge", "tide_m"]]
accuracy_df = per_model.apply(lambda group: eval_metrics(x=group.tide_gauge, y=group.tide_m, round=4))

accuracy_df.to_csv("data/figures/all_gauges.csv")

# Display models ordered by RMSE, with the RMSE column colour-shaded
accuracy_styled = accuracy_df.sort_values("RMSE").style.background_gradient(cmap="RdBu_r", subset=["RMSE"])
accuracy_styled

### Model stats per site

In [None]:
# Accuracy metrics for every site/model combination, leaving out
# Lord Howe Island
mainland_df = tide_df_long.query("site_name != 'Lord_Howe_Island'")
per_site_model = mainland_df.groupby(["site_name", "tide_model"])[["tide_gauge", "tide_m"]]
accuracy_sites_df = per_site_model.apply(lambda group: eval_metrics(x=group.tide_gauge, y=group.tide_m, round=4))

#### Performance at sites with most model disagreement

In [None]:
# Standard (non-ensemble) models used to judge how hard each site is
default_models = [
    "EOT20",
    "FES2012",
    "FES2014",
    "FES2022",
    # "HAMTIDE11",
    "GOT4.10",
    "GOT5.6",
    "TPXO10",
    "TPXO8",
    "TPXO9",
]

# Number of sites to keep: 20% of the distinct sites, rounded down
unique_sites = accuracy_sites_df.index.droplevel("tide_model").unique()
n_sites = len(unique_sites)
threshold = int(n_sites * 0.2)

# Average RMSE of the standard models at each site, then keep the sites
# with the highest averages (i.e. the worst overall model performance)
mean_site_rmse = accuracy_sites_df.query("tide_model in @default_models").groupby("site_name")["RMSE"].mean()
most_disagrement = mean_site_rmse.nlargest(threshold).index
most_disagrement

In [None]:
# Per-model accuracy restricted to the sites selected in `most_disagrement`
worst_sites_df = tide_df_long.query("site_name in @most_disagrement")
per_model = worst_sites_df.groupby(["tide_model"])[["tide_gauge", "tide_m"]]
accuracy_df = per_model.apply(lambda group: eval_metrics(x=group.tide_gauge, y=group.tide_m, round=4))

accuracy_df.to_csv("data/figures/worst_gauges.csv")

# Display models ordered by RMSE, with the RMSE column colour-shaded
accuracy_styled = accuracy_df.sort_values("RMSE").style.background_gradient(cmap="RdBu_r", subset=["RMSE"])
accuracy_styled

#### Faceted plot for a single site

Good sites: 
* 'Port_Alma'
* 'Western_Port_Stony_Point'
* 'Port_Welshpool_Pier'

In [None]:
# Display the single-site data used for the faceted plot.
# NOTE(review): `tide_df_site` is not assigned anywhere above this cell, so
# this only works if a later cell has already been run in the same kernel
tide_df_site

In [None]:
# Create faceted scatterplot
# Site to plot, chosen by position within the worst-performing sites
# (requires `most_disagrement` to contain at least 10 sites)
site_name = most_disagrement[9]

# Select data for that site. Re-selecting on every run ensures
# `tide_df_site` is defined in a fresh kernel and stops the dummy row below
# from being appended again each time the cell is re-run
tide_df_site = tide_df_long.query("site_name == @site_name").copy()

# Pick first row as template and set dummy value: an extra row with an empty
# model name provides the "" category listed in the facet order
dummy = tide_df_site.iloc[0].copy()
dummy["tide_model"] = ""
tide_df_site = pd.concat([tide_df_site, pd.DataFrame([dummy], index=[tide_df_site.index[0]])])

# Custom sorting of the facets. The "" entry corresponds to the dummy row
# with an empty model name and sits between the individual models and the
# ensembles.
# NOTE(review): the order is hard-coded; it presumably reflects the model
# ranking at one particular site - update by hand if the site changes
order = [
    "FES2022",
    "TPXO10",
    "TPXO9",
    "TPXO8",
    "FES2012",
    "GOT4.10",
    "GOT5.6",
    "EOT20",
    "FES2014",
    "",
    "ensemble-top",
    "ensemble-mean-top3",
    "ensemble-mean",
    "ensemble-bottom",
]
# An ordered categorical makes the facets follow `order`
tide_df_site["tide_model"] = pd.Categorical(tide_df_site["tide_model"], categories=order, ordered=True)

g = sns.FacetGrid(tide_df_site, col="tide_model", col_wrap=5, height=2, aspect=0.83)
# Use `figure` (as the rest of this cell does) rather than the older `fig` alias
g.figure.subplots_adjust(wspace=0.05, hspace=0.05)


def hexbin(x, y, color, **kwargs):
    """Draw a hexbin density plot of `y` against `x` with fixed styling.

    Intended as a plotting callback for `FacetGrid.map_dataframe`.

    Parameters
    ----------
    x, y
        Passed straight through to `plt.hexbin`.
    color
        Accepted so the function can be called with a `color` argument, but
        not used: the colour map is fixed to "Blues".
    **kwargs
        Forwarded unchanged to `plt.hexbin`.
    """
    fixed_style = {
        "gridsize": 40,
        "cmap": "Blues",  # sns.light_palette(color, as_cmap=True)
        "mincnt": 1,
        "bins": "log",
        "linewidths": 0,
        "edgecolors": "black",
        "extent": (-1.9, 1.9, -1.9, 1.9),
    }
    plt.hexbin(x, y, **fixed_style, **kwargs)


# Draw observed vs modelled tide heights in every facet
g.map_dataframe(hexbin, x="tide_gauge", y="tide_m")


def one_to_one_line(**kwargs):
    """Draw a dashed black 1:1 reference line on the current axes.

    The line runs from the smaller of the lower x/y axis limits to the larger
    of the upper x/y axis limits. Any keyword arguments are accepted and
    ignored.
    """
    ax = plt.gca()
    x_low, x_high = ax.get_xlim()
    y_low, y_high = ax.get_ylim()
    span = [min(x_low, y_low), max(x_high, y_high)]
    ax.plot(span, span, ls="--", linewidth=0.8, c="black")


# Add the reference line to every facet
g.map(one_to_one_line)


# Title each facet with its model name and replace the per-axes labels with
# shared figure-level axis labels
g.set_titles("{col_name}")
g.set_axis_labels("", "")
g.figure.supxlabel("Observed tide height (m)")
g.figure.supylabel("Modelled tide height (m)", x=0)

# plt.suptitle(site_name.replace("_", " "), y=1.02)
axis_limits = (-1.9, 1.9)
g.set(ylim=axis_limits, xlim=axis_limits)

# One annotation per facet, in facet order.
# NOTE(review): the labels are hard-coded and matched to facets purely by
# position - confirm they still suit the facet order and selected site
ranking_labels = [f"Ranking: {i}" for i in range(1, 11)]
ensemble_labels = ["FES2022", "  FES2022, TPXO10, TPXO9", "All models", "FES2014"]
labels = ranking_labels + ensemble_labels

tick_positions = [-1.5, 0, 1.5]
for panel_number, panel in enumerate(g.axes.flat):
    panel.set_xticks(tick_positions)
    panel.set_yticks(tick_positions)
    panel.text(0, 1.7, labels[panel_number], fontsize=7, horizontalalignment="center", verticalalignment="center")

In [None]:
# Write the faceted scatterplot to disk as a PNG at 200 dpi, using a tight
# bounding box around the figure contents
g.savefig("data/figures/figure_scatterplot.png", bbox_inches="tight", dpi=200)

#### Top results per site per model

In [None]:
# Find the lowest-RMSE row at each site, then count how many sites each
# model wins
best_rows = accuracy_sites_df.groupby("site_name").RMSE.idxmin()
best_models = accuracy_sites_df.loc[best_rows].reset_index().tide_model
best_models.value_counts()

In [None]:
# Excluding EOT20: repeat the per-site winner count with EOT20 rows removed
# before the lowest-RMSE row is chosen
without_eot20 = accuracy_sites_df.query("tide_model not in ['EOT20']")
best_rows = without_eot20.groupby("site_name").RMSE.idxmin()
best_models = accuracy_sites_df.loc[best_rows].reset_index().tide_model
best_models.value_counts()

#### Bottom results per site per model

In [None]:
# Find the highest-RMSE row at each site, then count how many sites each
# model comes last at
worst_rows = accuracy_sites_df.groupby("site_name").RMSE.idxmax()
worst_models = accuracy_sites_df.loc[worst_rows].reset_index().tide_model
worst_models.value_counts()

In [None]:
# Rank models by RMSE within each site (ties share the lowest rank), then
# count how often each model receives each rank
site_ranks = accuracy_sites_df.groupby("site_name").RMSE.rank(method="min")
rank_counts = site_ranks.groupby("tide_model").value_counts().rename_axis(["tide_model", "Rank (RMSE)"])
rank_counts

#### Plot rankings as histogram

In [None]:
# Rank models by RMSE within each site (ties share the lowest rank), then
# count how often each model receives each rank. The counts are named
# explicitly so the plotting code below does not depend on the default name
rank_counts = (
    accuracy_sites_df.groupby("site_name")
    .RMSE.rank(method="min")
    .groupby("tide_model")
    .value_counts()
    .rename_axis(["tide_model", "Rank (RMSE)"])
    .rename("count")
)
rank_counts_df = rank_counts.reset_index()

# Preferred panel order. Only models actually present in the results are
# kept, so models that are currently disabled do not produce empty panels
preferred_order = [
    "EOT20",
    "FES2012",
    "FES2014",
    "FES2022",
    "HAMTIDE11",
    "GOT4.10",
    "GOT5.6",
    "TPXO10",
    "TPXO8",
    "TPXO9",
    "ensemble-top",
    "ensemble-bottom",
    "ensemble-mean-top3",
    "ensemble-mean",
    "ensemble-mean-weightedtop3",
]
models_present = set(rank_counts_df["tide_model"])
col_order = [model for model in preferred_order if model in models_present]

# Use one shared, sorted set of rank categories for every panel. Without an
# explicit `order`, each panel builds its bars from only the ranks present
# in its own subset, so bars can be misaligned between panels
rank_order = sorted(rank_counts_df["Rank (RMSE)"].unique())

g = sns.FacetGrid(
    rank_counts_df,
    col_order=col_order,
    col="tide_model",
    col_wrap=3,
    aspect=1.5,
    height=3,
)
g.set_titles(col_template="{col_name}")
g.map(sns.barplot, "Rank (RMSE)", "count", order=rank_order)

### Export tide validation plots

In [None]:
# # Calculate tide range per site
# tide_range = (
#     tide_df_long.groupby("site_name")["tide_gauge"]
#     .apply(lambda x: np.abs(x).max())
#     .sort_values()
# )

# Select subset
# sites, limits, title = tide_range.loc[tide_range >= 2].index, 5, "Macrotidal sites"
# sites, limits, title = tide_range.loc[(tide_range >= 1) & (tide_range < 2)].index, 2, "Mesotidal sites"
# sites, limits, title = tide_range.loc[tide_range < 1].index, 1, "Microtidal sites"

# Optional hand-picked subset, kept for reference but disabled: this
# assignment was previously live but was overwritten by the next statement,
# so it never took effect.
# NOTE(review): these entries look like site codes, whereas the filter below
# matches on `site_name` - confirm before re-enabling
# site_filter = [
#     "DYDBY01",
#     "63090",
#     "59511",
#     "59510",
#     "61800",
#     "59690",
#     "61840",
#     "61600",
#     "58170",
#     "60780",
#     "60739",
#     "60590",
#     "63511",
#     "59980",
#     "60710",
#     "60730",
#     "59850",
# ]
site_filter = most_disagrement.tolist()
# site_filter = ["Milner_Bay_Groote_Eylandt"]
sites, limits, title = site_filter, 3, "Problematic sites"

# Models to show as columns. "HAMTIDE11" is listed but is commented out of
# the modelling step in this notebook, so it currently yields no panels
models_to_plot = ["EOT20", "FES2012", "FES2014", "GOT4.10", "HAMTIDE11", "TPXO9", "TPXO8", "ensemble-mean-top3"]

# Plot faceted: one row per site, one column per model
g = sns.FacetGrid(
    tide_df_long.query("(site_name in @sites) & (tide_model in @models_to_plot)").reset_index(),
    col="tide_model",
    row="site_name",
    margin_titles=True,
    # xlim=(-limits, limits),
    # ylim=(-limits, limits),
)
g.figure.suptitle(title, size=20)
g.set_titles(row_template="{row_name}", col_template="{col_name}")
g.map(sns.scatterplot, "tide_gauge", "tide_m", alpha=0.1, linewidth=0, s=3)

# Dashed 1:1 reference line spanning +/- `limits` in every panel
for a in g.axes.flat:
    a.plot([-limits, limits], [-limits, limits], "--", c="black")

# Output file name is the title with spaces removed
g.savefig(f"{title.replace(' ', '')}.jpg")