# domain_title summary statistics of model performance

In [None]:
chunk_start
shelf = shelf_mask
import glob

import nctoolkit as nc
from mask import mask_all, mask_shelf
from ecoval import tidy_name

In [None]:
%%capture --no-display

# Set to True only when the region map below is actually drawn.
show_map = False

# The map is drawn only if a notebook whose file name contains "summary_shelf"
# exists in the working directory.
if any("summary_shelf" in x for x in glob.glob("*.ipynb")):
    ds_regions = nc.open_data(f"{data_dir}/amm7_val_subdomains.nc")
    ds_regions.subset(variables=["Shelf", "Ocean"])
    ds_regions.sum_all()
    ds_regions.as_missing(0)
    if shelf:
        mask_shelf(ds_regions)

    ds_plot = ds_regions.pub_plot(legend_position=None, land="lightgrey")
    show_map = True

# Best effort: every later cell is skipped when the annual-mean results cannot
# be loaded. `except Exception` keeps that fallback without also swallowing
# KeyboardInterrupt / SystemExit the way a bare `except:` does.
try:
    ensemble = nc.create_ensemble("../../results/annual_mean")
    ensemble = tidy_summary_paths(ensemble)
except Exception:
    ensemble = None


In [None]:
# Caption for the region map. The figure counter is advanced afterwards so the
# next figure caption does not reuse the same number.
if ensemble is not None and show_map:
    region = "shelf" if shelf else "ocean"
    md(f"**Figure {chapter}{i_figure}**: Map of the {region} area used for the evaluation.")
    i_figure += 1

In [None]:
# Section heading; skipped when `ensemble` is None.
if ensemble is not None:
    md("## Taylor diagrams for the sea surface")

In [None]:
import nctoolkit as nc
import pandas as pd
import geopandas as gpd
from IPython.display import display_markdown 
import warnings
warnings.filterwarnings('ignore')
from plotnine import *
import numpy as np
import os
import glob as glob
from mask import mask_all, mask_shelf
%load_ext rpy2.ipython

# Counter used by the "Table <chapter><n>" captions further down.
i_table = 1

# Leave an empty marker file, named after the nctoolkit session stamp,
# in the .trackers directory (created on first use).
stamp = nc.session_info["stamp"]
out = ".trackers/" + stamp
if not os.path.exists(".trackers"):
    os.makedirs(".trackers")
with open(out, "w"):
    # Opening in write mode is enough to create/truncate the file.
    pass

In [None]:
def fix_variable(vv):
    """Return the upper-case acronym for "poc"/"doc" (any casing), else `vv` unchanged."""
    acronyms = {"poc": "POC", "doc": "DOC"}
    return acronyms.get(vv.lower(), vv)

In [None]:
# Build one long table of paired model/observation values (one row per grid
# cell and variable) for the Taylor diagram drawn in the next cell.
if ensemble is not None:
    taylor_frames = []
    for ff in ensemble:
        # The variable name is the second "_"-separated field of the file name.
        variable = os.path.basename(ff).split("_")[1].replace(".nc", "")
        ds_ff = nc.open_data(ff)
        # NOTE(review): this used to be `if True: mask_shelf(...) else: mask_all(...)`,
        # so mask_all was never reached. The always-shelf behaviour is kept;
        # confirm whether the choice should follow the `shelf` flag instead.
        mask_shelf(ds_ff)

        df_ff = ds_ff.to_dataframe().reset_index()
        # Coordinate columns are found by substring because their exact names vary.
        lon_name = [col for col in df_ff.columns if "lon" in col][0]
        lat_name = [col for col in df_ff.columns if "lat" in col][0]
        taylor_frames.append(
            df_ff
            .loc[:, [lon_name, lat_name, "model", "observation"]]
            .dropna()
            .assign(variable=variable)
        )
    df_taylor = pd.concat(taylor_frames).reset_index(drop=True)

    # Tidy the display name first, then restore the POC/DOC acronyms.
    df_taylor = df_taylor.assign(
        variable=lambda x: x["variable"].apply(tidy_name).apply(fix_variable)
    )
    plot_taylor = True
else:
    # Define everything the later %%R cells import with -i, so those cells
    # still run (and do nothing) when there are no results.
    plot_taylor = False
    df_taylor = None
    df_cor = None
    global_grid = None

In [None]:
%%capture --no-display
%%R -i df_taylor -w 600 -h 600 -r 120 -i plot_taylor

# Draw a normalised Taylor diagram with one point per variable using
# plotrix::taylor.diagram. Nothing is drawn when plot_taylor is FALSE.
if(plot_taylor){
library(plotrix, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)

# Variable names ordered by decreasing normalised standard deviation
# (sd(model) / sd(observation)), so the first one drawn has the largest value.

variables <- df_taylor %>%
    group_by(variable) %>%
    summarize(nsd = sd(model)/sd(observation))  %>%
    arrange(desc(nsd)) %>%
    pull(variable) 

# One plotting symbol and one rainbow colour per variable
pch = 1:length(variables)
col = rainbow(length(variables))

# Smallest model/observation correlation across all variables
r_min <- df_taylor %>%
    group_by(variable) %>%
    summarise(r = cor(observation, model, use = "complete.obs")) %>%
    summarize(r = min(r)) %>%
    pull(r)

# Passed as pos.cor below; TRUE unless some variable has a correlation below -0.1.
# NOTE(review): presumably pos.cor = TRUE limits the diagram to positive
# correlations -- confirm against the plotrix documentation.
pos_cor = r_min >= -0.1

i <- 1
for (vv in variables){

    df_vv <- df_taylor %>%
        filter(variable == vv)
    # Normalised standard deviation of the first variable; used only to
    # position the legend after the loop.
    if(i == 1){
        plot_size <- df_vv %>%
        # get the standard deviation of all
        summarize(nsd = sd(model)/sd(observation)) %>%
        pull(nsd)

    }

    # The first call creates the diagram (add = FALSE); later calls add points to it.
    if (i == 1){
        taylor.diagram(df_vv$observation, df_vv$model, pch = pch[i], col = col[i], add = FALSE, normalize = TRUE,
        pos.cor = pos_cor, main = NULL

        )
    } else {
        taylor.diagram(df_vv$observation, df_vv$model, pch = pch[i], col = col[i], add = TRUE, normalize = TRUE,
        pos.cor = pos_cor

        )
    }

i <- i + 1
}

# Legend coordinates are scaled by the first variable's normalised standard deviation
legend( plot_size * 1.3, plot_size * 1.9, legend = variables, pch = pch, col = col, bty = "n")




}

In [None]:
if plot_taylor:
    # Distinct variable names in order of first appearance; reused by the captions.
    variables = df_taylor["variable"].unique()

In [None]:
# Caption for the Taylor diagram; advances the figure counter afterwards.
if plot_taylor:
    md_basic(f"**Figure {chapter}{i_figure}**: Taylor diagram for **sea surface** annual mean of {', '.join(variables)}. This diagram compares climatological annual averages of the model and observations across the model's spatial domain. Standard deviation is normalized by the standard deviation of the observations, and a standard deviation below 1 indicates that the model is less variable than the observations. Note: This figure summarizes the overall ability of the model to reproduce climatological **spatial patterns**, and it does not represent temporal performance.")
    i_figure += 1

In [None]:
# Section heading for the bias table; skipped when plot_taylor is False.
if plot_taylor:
    md("## Model biases based on gridded sea surface data")

In [None]:
# Table of spatial-mean model value, observed value and bias per variable.
if plot_taylor:
    bias_rows = []
    for ff in ensemble:
        # Variable name: second "_"-separated field of the file name, title-cased.
        variable = os.path.basename(ff).split("_")[1].replace(".nc", "").title()
        if variable.lower() == "sst":
            variable = "SST"
        ds_ff = nc.open_data(ff)
        ds_ff.set_precision("F32")
        # NOTE(review): this used to be `if True: mask_shelf(...) else: mask_all(...)`,
        # so mask_all was never reached. The always-shelf behaviour is kept;
        # confirm whether the choice should follow the `shelf` flag instead.
        mask_shelf(ds_ff)
        ds_ff.assign(bias=lambda x: x.model - x.observation)
        ds_ff.spatial_mean()
        # Convert once and read all three values from the same frame.
        df_mean = ds_ff.to_dataframe().reset_index()
        bias_rows.append(
            pd.DataFrame(
                {
                    "Variable": [variable],
                    "Modelled spatial mean": [df_mean.model.values[0]],
                    "Observational spatial mean": [df_mean.observation.values[0]],
                    "Model bias": [df_mean.bias.values[0]],
                    "Unit": [ds_ff.contents.unit[0]],
                }
            )
        )
    df_bias = pd.concat(bias_rows).reset_index(drop=True)
    df_bias = df_bias.assign(percentage_bias=lambda x: x["Model bias"] / x["Observational spatial mean"] * 100)
    # No percentage bias is reported for rows labelled "Temperature".
    df_bias.loc[df_bias.Variable == "Temperature", "percentage_bias"] = np.nan
    df_bias.columns = ["Variable", "Model mean", "Observed mean", "Model bias", "Unit", "Percentage bias"]
    # tidy Variable
    df_bias = df_bias.assign(Variable=lambda x: x["Variable"].apply(tidy_name))
    # Make everything 2 dp, except for 1st column
    df_bias.iloc[:, 1:] = df_bias.iloc[:, 1:].round(2)
    # drop unit
    df_bias = df_bias.drop(columns=["Unit"])
    df_display(df_bias)

In [None]:
# Caption for the bias table; advances the table counter afterwards.
if plot_taylor:
    md(f"**Table {chapter}{i_table}**: Bias of model compared with **sea surface** observations. The bias is calculated as the modelled spatial mean minus the observational spatial mean. The percentage bias is calculated as the model bias divided by the observational spatial mean.")
    i_table += 1

In [None]:
# Section heading for the spatial correlation table; skipped when plot_taylor is False.
if plot_taylor:
    md("## Spatial performance of the model at the sea surface") 

In [None]:
# Table with one spatial model/observation correlation per variable.
if plot_taylor:
    cor_rows = []
    for ff in ensemble:
        # Variable name: second "_"-separated field of the file name, title-cased.
        variable = os.path.basename(ff).split("_")[1].replace(".nc", "").title()
        if variable.lower() == "sst":
            variable = "SST"
        ds_ff = nc.open_data(ff)
        ds_ff.set_precision("F32")
        # NOTE(review): this used to be `if True: mask_shelf(...) else: mask_all(...)`,
        # so mask_all was never reached. The always-shelf behaviour is kept;
        # confirm whether the choice should follow the `shelf` flag instead.
        mask_shelf(ds_ff)
        ds_ff.cor_space("model", "observation")
        # First non-missing value of the resulting "cor" column.
        ff_cor = ds_ff.to_dataframe().dropna()["cor"].values[0]
        cor_rows.append(pd.DataFrame({"Variable": [variable], "Correlation": [ff_cor]}))
    df_cor = pd.concat(cor_rows).reset_index(drop=True)
    df_cor.columns = ["Variable", "Spatial correlation between model and observations"]
    # tidy Variable
    df_cor = df_cor.assign(Variable=lambda x: x["Variable"].apply(tidy_name))
    df_display(df_cor)

In [None]:
# Caption for the spatial correlation table; advances the table counter afterwards.
# The sentences about normalised standard deviation were dropped: they were copied
# from the Taylor diagram caption and do not describe this table.
if plot_taylor:
    md(f"**Table {chapter}{i_table}**: Pearson correlation coefficient between model and observations at the **sea surface** for annual mean of {tidy_name(variables, lower = True)}. This table compares climatological annual averages of the model and observations across the model's spatial domain.")
    i_table += 1

In [None]:
# Section heading for the temporal correlation results; skipped when plot_taylor is False.
if plot_taylor:
    md("## Temporal performance of the model at the sea surface")

In [None]:
# Flag whether any temporal-correlation file spans (almost) pole to pole.
if plot_taylor:
    global_grid = False
    paths = glob.glob("../../results/temporals/*.nc")
    paths = tidy_summary_paths(paths)
    for ff in paths:
        ds_ff = nc.open_data(ff)
        df_ff = ds_ff.to_dataframe().reset_index()
        # The latitude column is found by substring because its exact name varies.
        lat_name = [col for col in df_ff.columns if "lat" in col][0]
        lat_values = df_ff[lat_name].values
        if lat_values.min() < -89 and lat_values.max() > 89:
            global_grid = True
            # One global file settles the answer; no need to read the rest.
            break

In [None]:
# Long table of per-grid-cell correlations for every variable, with uniform
# `lon` / `lat` columns, for the map drawn in the next (R) cell.
if plot_taylor:
    cor_frames = []
    paths = glob.glob("../../results/temporals/*.nc")
    paths = tidy_summary_paths(paths)
    for ff in paths:
        ds_ff = nc.open_data(ff)
        if global_grid:
            # Regrid to a regular 1-degree global grid.
            ds_ff.to_latlon(lon=[-179.5, 179.5], lat=[-89.5, 89.5], res=1)
        # NOTE(review): this used to be `if True: mask_shelf(...) else: mask_all(...)`,
        # so mask_all was never reached. The always-shelf behaviour is kept;
        # confirm whether the choice should follow the `shelf` flag instead.
        mask_shelf(ds_ff)
        df_ff = ds_ff.to_dataframe().reset_index().dropna()
        # Coordinate columns are found by substring because their exact names vary.
        lon_name = [col for col in df_ff.columns if "lon" in col][0]
        lat_name = [col for col in df_ff.columns if "lat" in col][0]
        # Variable name: first "_"-separated field of the file name, title-cased.
        variable = os.path.basename(ff).split("_")[0].replace(".nc", "").title()
        if variable.lower() == "sst":
            variable = "SST"
        cor_frames.append(
            df_ff.assign(lon=df_ff[lon_name], lat=df_ff[lat_name], variable=variable)
        )

    df_cor = pd.concat(cor_frames).reset_index(drop=True)
    # tidy variable name
    df_cor = df_cor.assign(variable=lambda x: x["variable"].apply(tidy_name))
    
    
    

In [None]:
%%capture --no-display
%%R -i df_cor -i global_grid -i plot_taylor -r 120
# Map of the per-grid-cell correlation (`cor`) in df_cor, one panel per variable.
# Nothing is drawn when plot_taylor is FALSE.
if(plot_taylor){
library(ggplot2, warn.conflicts = FALSE)
library(tidyverse, warn.conflicts = FALSE)
world_map <- map_data("world")

# Zoom the panels to the extent of the data
xlim <- c(min(df_cor$lon), max(df_cor$lon))
ylim <- c(min(df_cor$lat), max(df_cor$lat))

min_val <- min(df_cor$cor)
max_val <- max(df_cor$cor)

# Render CO2 with a subscript in the facet labels (markdown/HTML via ggtext)
df_cor <- df_cor %>%
        mutate(variable = gsub("CO2", "CO<sub>2</sub>", variable)) %>%
        mutate(variable = gsub("co2", "CO<sub>2</sub>", variable)) %>%
        mutate(variable = gsub("CO_2", "CO<sub>2</sub>", variable))

# A single theme_bw() call: an earlier theme_bw(base_size = 12) was fully
# replaced by this one, so it has been dropped.
gg <- ggplot(df_cor)+
        geom_raster(aes(x = lon, y = lat, fill = cor))+
        coord_cartesian(xlim = xlim, ylim = ylim)+
        facet_wrap(~variable)+
        labs(fill = "Correlation coefficient", x = NULL, y = NULL)+
        theme_bw(base_family = "Helvetica", base_size = 8) +
        theme(
          legend.position = "bottom", legend.direction = "horizontal", legend.box = "horizontal",
          legend.key.width = unit(3.0, "cm"),
          legend.key.height = unit(0.5, "cm"),
          plot.margin = unit(c(2, 0, 2, 0), "mm"),
          plot.title = element_text(hjust = 0.5),
          # use element_markdown for facet labels
          strip.text = ggtext::element_markdown()
        )

# Colour bar shared by both fill scales
fill_guide <- guide_colorbar(title.position = "bottom", title.hjust = 0.5,
                             title.theme = element_text(angle = 0, size = 10, family = "Helvetica"))

# Diverging scale centred on zero whenever negative correlations occur,
# viridis otherwise. Previously no scale was set when min_val was exactly 0
# or when every value was <= 0, so those cases fell back to the ggplot default.
if (min_val < 0){
        gg <- gg +
                scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0,
                guide = fill_guide,
                breaks = seq(-1, 1, 0.25))
} else {
        gg <- gg +
                scale_fill_viridis_c(guide = fill_guide)
}

# Reuse the automatically chosen axis breaks, but relabel them as coordinates
y_breaks <- as.numeric(na.omit(layer_scales(gg)$y$break_positions()))
x_breaks <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))

# y labels are north-south coordinates. Make them more appropriate
# i.e. 10 should be 10 °N, -10 should be 10 °S
y_labels <- ifelse(y_breaks >= 0, paste0(y_breaks, "°N"), paste0(abs(y_breaks), "°S"))
x_labels <- ifelse(x_breaks >= 0, paste0(x_breaks, "°E"), paste0(abs(x_breaks), "°W"))

# Land polygons are added last so they are drawn on top of the raster
gg <- gg + scale_y_continuous(breaks = y_breaks, labels = y_labels) +
          scale_x_continuous(breaks = x_breaks, labels = x_labels)+
          geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "grey", colour = "grey")
gg


}

In [None]:
# Caption for the correlation map; advances the figure counter afterwards.
# NOTE(review): the caption says "Spatial correlation" and "annual mean" while this
# section is headed "Temporal performance" and the text mentions monthly
# averages -- confirm the intended wording.
if plot_taylor:
    md(f"**Figure {chapter}{i_figure}**: Spatial correlation (Pearson correlation coefficient) between model and observations for annual mean of {tidy_name(variables, lower = True)}. This figure compares climatological monthly averages of the model and observations across the model's spatial domain.")
    i_figure += 1

In [None]:
# Explanatory paragraph introducing the table of spatially averaged correlations.
if plot_taylor:
    md("The overall ability of the model to reproduce the seasonality of each variable was estimated by calculating the spatial mean of the Pearson correlation coefficient between the model and the observations. The spatial mean was calculated by averaging the correlation coefficient of each grid cell.")

In [None]:
# One row per variable: the spatial mean of its per-grid-cell correlation.
if plot_taylor:
    mean_rows = []
    paths = tidy_summary_paths(glob.glob("../../results/temporals/*.nc"))
    for ff in paths:
        ds_ff = nc.open_data(ff, checks=False)
        # NOTE(review): unlike the map cell, no shelf mask is applied before averaging -- confirm.
        ds_ff.spatial_mean()
        # Variable name: first "_"-separated field of the file name, title-cased.
        variable = os.path.basename(ff).split("_")[0].replace(".nc", "").title()
        if variable.lower() == "sst":
            variable = "SST"
        mean_cor = ds_ff.to_dataframe().reset_index().cor.values[0]
        mean_rows.append(pd.DataFrame({"Variable": [variable], "Correlation": [mean_cor]}))

    df_cor = pd.concat(mean_rows).reset_index(drop=True)

    # tidy Variable
    df_cor = df_cor.assign(Variable=df_cor["Variable"].apply(tidy_name))



In [None]:
# Display df_cor.
# NOTE(review): the previous comment here mentioned rounding to 2 dp, but no
# rounding is applied in this cell.
if plot_taylor:
    df_display(df_cor)

In [None]:
# Caption for the table of spatially averaged correlations; advances the table counter afterwards.
if plot_taylor:
    md(f"**Table {chapter}{i_table}**: Spatial average of the temporal correlation (Pearson correlation coefficient) between model and observations for annual mean of {tidy_name(variables, lower = True)}. The correlation is calculated for each grid cell individually using monthly climatological averages. The spatial average is then calculated for each variable.")
    i_table += 1

In [None]:

# Depth summary CSVs; the depth sections below are skipped when none are found.
# NOTE: without recursive=True, "**" in a glob pattern behaves like a single "*".
paths = glob.glob("../../results/**_depth_summary.csv")

In [None]:
# Collect the matchup locations for every variable that has a depth summary,
# each tagged with its depth bin, for the map drawn in the next (R) cell.
if len(paths) > 0:
    md("## Performance of model across depths")
    md("Root mean square deviation (RMSD), bias and correlation between model and observations were calculated for each variable at different depths. The RMSD is calculated as the square root of the mean of the squared differences between the model and observations. The bias is calculated as the modelled spatial mean minus the observational spatial mean. The correlation is calculated for each variable at different depths.")

    # (inclusive upper bound, label) pairs, shallowest first.
    _depth_bin_edges = [
        (10, "0-10m"),
        (30, "10-30m"),
        (60, "30-60m"),
        (100, "60-100m"),
        (150, "100-150m"),
        (300, "150-300m"),
        (600, "300-600m"),
        (1000, "600-1000m"),
    ]

    def bin_depth(x):
        """Return the label of the first bin whose upper bound is >= x.

        Values above 1000, and NaN (for which every comparison is False),
        return NaN so the rows are removed by the dropna() below.
        """
        for upper, label in _depth_bin_edges:
            if x <= upper:
                return label
        return np.nan

    # The spatial limits do not change between variables, so read them once.
    lon_min, lon_max = lon_lim[0], lon_lim[1]
    lat_min, lat_max = lat_lim[0], lat_lim[1]

    map_frames = []
    for ff in paths:
        # Variable name: first "_"-separated field of the summary file name.
        vv = os.path.basename(ff).split("_")[0]
        ff_points = glob.glob(f"../../matched/point/nws/all/{vv}/*all*{vv}.csv")[0]
        vv = vv.title()
        df_ff = pd.read_csv(ff_points).loc[:, ["lon", "lat", "depth"]]
        # filter by lon and lat
        df_ff = df_ff.query(f"lon >= {lon_min} and lon <= {lon_max} and lat >= {lat_min} and lat <= {lat_max}").reset_index(drop=True)
        df_ff = df_ff.assign(depth_bin=df_ff["depth"].apply(bin_depth))
        # add variable
        df_ff = df_ff.assign(variable=vv)
        map_frames.append(df_ff)
    df_map = pd.concat(map_frames).reset_index(drop=True)
    # drop na
    df_map = df_map.dropna()
    plot_map = True
else:
    # Still define both names: the next %%R cell imports them with -i.
    df_map = None
    plot_map = False

In [None]:
%%capture --no-display
%%R -i df_map -i plot_map -w 600 -h 600
# Point map of the matchup locations: one row of panels per variable and one
# column per depth bin. Nothing is drawn when plot_map is FALSE.
if(plot_map){
library(ggplot2, warn.conflicts = FALSE)
library(tidyverse)
world_map <- map_data("world")

# Zoom the panels to the extent of the data
xlim <- c(min(df_map$lon), max(df_map$lon))
ylim <- c(min(df_map$lat), max(df_map$lat))

# Fix the panel order from shallow to deep
bin_levels <- c("0-10m", "10-30m", "30-60m", "60-100m", "100-150m", "150-300m", "300-600m", "600-1000m")
df_map <- df_map %>%
    mutate(depth_bin = factor(depth_bin, levels = bin_levels))

# theme_bw() is applied once: an earlier theme_bw(base_size = 24) was fully
# replaced by this call, so the rendered plot is unchanged.
gg <- ggplot(df_map)+
        geom_point(aes(x = lon, y = lat), size = 0.5)+
        coord_cartesian(xlim = xlim, ylim = ylim)+
        facet_grid(variable~depth_bin)+
        labs(color = "Depth bin", x = NULL, y = NULL)+
        theme_bw(base_family = "Helvetica", base_size = 8) +
        theme(
          legend.position = "bottom", legend.direction = "horizontal", legend.box = "horizontal",
          legend.key.width = unit(3.0, "cm"),
          legend.key.height = unit(0.5, "cm"),
          plot.margin = unit(c(2, 0, 2, 0), "mm"),
          plot.title = element_text(hjust = 0.5)
        )

# Reuse the automatically chosen axis breaks, but relabel them as coordinates
y_breaks <- as.numeric(na.omit(layer_scales(gg)$y$break_positions()))
x_breaks <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))

# y labels are north-south coordinates. Make them more appropriate
# i.e. 10 should be 10 °N, -10 should be 10 °S
y_labels <- ifelse(y_breaks >= 0, paste0(y_breaks, "°N"), paste0(abs(y_breaks), "°S"))
x_labels <- ifelse(x_breaks >= 0, paste0(x_breaks, "°E"), paste0(abs(x_breaks), "°W"))

# Land polygons are added last so they are drawn on top of the points
gg <- gg + scale_y_continuous(breaks = y_breaks, labels = y_labels) +
          scale_x_continuous(breaks = x_breaks, labels = x_labels)+
          geom_polygon(data = world_map, aes(x = long, y = lat, group = group), fill = "grey", colour = "grey")
gg
}


In [None]:
# Caption for the matchup location map; advances the figure counter afterwards.
if plot_map:
    md(f"**Figure {chapter}{i_figure}**: Map of the locations of matchups at each depth range.") 
    i_figure += 1

In [None]:


# RMSD, bias and correlation tables by depth bin (one row per variable),
# each followed by its caption.
if len(paths) > 0:
    # Display order of the depth columns; bins absent from the data are skipped.
    depth_order = ["0-10m", "10-30m", "30-60m", "60-100m", "100-150m", "150-300m", "300-600m", "600-1000m", ">1000m", "0-150m"]

    # Read every summary once: (variable name taken from the file name, table).
    depth_summaries = [
        (os.path.basename(ff).split("_")[0], pd.read_csv(ff)) for ff in paths
    ]

    def _depth_table(metric, with_unit=True):
        """Return a wide table of `metric` with one column per depth bin.

        Each summary is pivoted so the values of its "Depth" column become
        columns. With `with_unit` the "unit" column is kept next to
        "variable"; without it the table is indexed by variable only.
        """
        tables = []
        for vv_ff, df in depth_summaries:
            if with_unit:
                wide = (
                    df
                    .loc[:, ["Depth", metric, "unit"]]
                    .pivot(index="unit", columns="Depth", values=metric)
                    .reset_index()
                    .assign(variable=vv_ff)
                )
            else:
                wide = (
                    df
                    .loc[:, ["Depth", metric]]
                    .assign(variable=vv_ff)
                    .pivot(index="variable", columns="Depth", values=metric)
                    .reset_index()
                )
            tables.append(wide)
        table = pd.concat(tables).reset_index(drop=True)
        leading = ["variable", "unit"] if with_unit else ["variable"]
        # Identifier columns first, then the depth bins in display order.
        return table.loc[:, [x for x in leading + depth_order if x in table.columns]]

    df_depth = _depth_table("RMSD")
    df_display(df_depth)
    md(f"**Table {chapter}{i_table}**: Root mean square deviation (RMSD) of model compared with observations at different depths. The RMSD is calculated as the square root of the mean of the squared differences between the model and observations.")
    i_table += 1

    # now do bias
    df_bias = _depth_table("Bias")
    df_display(df_bias)
    md(f"**Table {chapter}{i_table}**: Bias of model compared with observations at different depths.")
    i_table += 1

    # now do correlation (no unit column)
    df_cor = _depth_table("Correlation", with_unit=False)
    df_display(df_cor)
    md(f"**Table {chapter}{i_table}**: Pearson correlation coefficient between model and observations at different depths. The correlation is calculated for each variable at different depths.")
    i_table += 1

        