# Validation of point_layer template_title using point observations

In [None]:
# bin_value using function from r4ecology's github
import numpy as np
def bin_value(x, bin_res):
    """Snap ``x`` to the centre of the ``bin_res``-wide bin that contains it.

    Bins are aligned on multiples of ``bin_res``; e.g. with ``bin_res = 1``
    every value in [0, 1) maps to 0.5. Because only arithmetic and
    ``np.floor`` are used, ``x`` may be a scalar or a NumPy array.
    """
    half_width = bin_res / 2
    # Index of the bin's upper edge, counted in units of bin_res.
    upper_edge_index = np.floor((x + half_width) / bin_res + 0.5)
    return upper_edge_index * bin_res - half_width


In [None]:
chunk_start

In [None]:
# Variable being validated (lower-cased); the quoted token is filled in when the template is rendered.
variable = "point_variable".lower()
# NOTE(review): `compact` is only assigned for these variables - presumably a default exists elsewhere.
if variable in ("benbio", "carbon", "susfrac", "oxycons"):
    compact = True
# Human-readable names for headings and captions; variables not listed keep their lower-case name.
_display_names = {
    "ph": "pH",
    "doc": "DOC",
    "poc": "POC",
    "benbio": "biomass of macrobenthos",
    "susfrac": "Suspension feeding fraction",
    "oxycons": "benthic oxygen consumption",
    "mesozoo": "mesozooplankton concentration",
}
vv_name = _display_names.get(variable, variable)
# Vertical layer of the matchups; also filled in when the template is rendered.
layer = "point_layer"
# Work out the unit string used in captions and tables. Sources, in order of preference:
#   1. the *_unit.csv file stored alongside the matchups,
#   2. the unit reported for the model variable by the first simulation file that can be read,
#   3. the literal "unknown unit".
# Variable-specific overrides are applied afterwards.
import glob
# Imported here as well so the fallback below does not depend on an earlier cell having done it.
import nctoolkit as nc
df = pd.read_csv("../../matched/mapping.csv")
df = df.query("variable == @variable")
# Keep a separate handle on the mapping row: `df` is rebound to the unit table below,
# and the model variable name must still come from the mapping.
df_mapping = df
pattern = list(df.pattern)[0]
paths = pd.read_csv(glob.glob(f"../../matched/point/**/{layer}/{variable}/paths.csv")[0]).path

unit = None
try:
    ff = glob.glob(f"../../matched/point/**/{layer}/{variable}/*_{variable}_unit.csv")[0]
    df = pd.read_csv(ff)
    csv_unit = df.unit[0]
    # A blank cell is read as NaN; only accept a real string so the fallback can still run.
    if isinstance(csv_unit, str):
        unit = csv_unit
except Exception:
    # Best effort: a missing or malformed unit file just means falling back to the model output.
    pass
if unit is None:
    # Try each simulation file in turn and stop at the first one that can be read.
    for ff in paths:
        try:
            ds = nc.open_data(ff)
            # Summed variables are written as "a+b"; the first component supplies the unit.
            model_variable = list(df_mapping.model_variable)[0].split("+")[0]
            unit = list(ds.contents.query("variable == @model_variable").unit)[0]
            break
        except Exception:
            # Unreadable file or variable not present: move on to the next path.
            continue
if not isinstance(unit, str):
    unit = "unknown unit"

if variable == "carbon":
    unit = "kg m-3"
if variable == "pco2":
    unit = "µatm"

# Spell out per-day units.
if unit.endswith("/d"):
    unit = unit[:-2] + "/day"

if variable == "susfrac":
    unit = "-"
if variable == "chlorophyll":
    # Fixed display form with a space after "mg" (hard-coded; no pattern matching is applied).
    unit = "mg /m^3"



            

## Read in the data

In [None]:
# Heading for the first results section, plus the layer name that section should select.
if layer in ("surface", "all"):
    md(f"## Performance of model {vv_name} at the sea surface")
    layer_select = "surface"
elif layer == "benthic":
    md(f"## Performance of model {vv_name} in the sediment layer")
    # NOTE(review): benthic matchups also select "surface" - confirm this is intended.
    layer_select = "surface"
chunk_point_surface


In [None]:
# Heading for the near-bottom results section, plus the layer name that section should select.
if layer in ["bottom", "all"]:
    # Use the display name (vv_name), as the other section headings do, rather than the raw key.
    md(f"## Performance of model {vv_name} at the near-bottom")
    layer_select = "bottom"
chunk_point_bottom

In [None]:
# Depth-resolved results are only produced when matchups cover the whole water column.
if layer == "all":
    md(f"## Depth-resolved comparisons of modelled and observed {vv_name}")

In [None]:
# Load the matchup table and look up, for every observation position, the coordinates of
# the nearest model grid cell (columns lon_model / lat_model in df_coords).
df_raw =  pd.read_csv(glob.glob(f"../../matched/point/**/{layer}/{variable}/*_{variable}.csv")[0])
# Prefer the model grid file; fall back to the bathymetry file when it does not exist.
ff = "../../matched/model_grid.nc"
if not os.path.exists(ff):  
    ff = "../../matched/model_bathymetry.nc"
import nctoolkit as nc
ds_coords = nc.open_data(ff)
# Only the grid matters here, so the first variable is renamed to a fixed name and used as a carrier.
ds_coords.rename({ds_coords.variables[0]: "e3t"})
# NOTE(review): lon()/lat() are presumably nctoolkit's assign helpers giving each cell's longitude/latitude - confirm.
ds_coords.assign(lon_model = lambda x: lon(x.e3t), lat_model = lambda x: lat(x.e3t))
ds_coords.drop(variables = "e3t")
ds_coords.run()
# Nearest-neighbour regrid onto the unique observation positions.
ds_coords.regrid(df_raw.loc[:,["lon", "lat"]].drop_duplicates(), method = "nearest")
df_coords = ds_coords.to_dataframe().reset_index()

In [None]:
# Build the per-depth-range skill table (df_summary) and the table of matchup positions per
# depth range used for mapping (df_mapped). Only done when matchups cover the whole water column.
if layer == "all":
    # Inclusive upper edge (m) and label of each depth bin, shallowest first.
    # Anything deeper than the last edge is labelled ">1000m".
    depth_bins = [
        (10, "0-10m"),
        (30, "10-30m"),
        (60, "30-60m"),
        (100, "60-100m"),
        (150, "100-150m"),
        (300, "150-300m"),
        (600, "300-600m"),
        (1000, "600-1000m"),
    ]
    depth_labels = [label for _, label in depth_bins] + [">1000m"]

    def bin_depth(x):
        """Return the label of the depth range that depth ``x`` (m) falls in."""
        for upper_edge, label in depth_bins:
            if x <= upper_edge:
                return label
        return ">1000m"

    def skill_stats(x):
        """Summarise one group of matchups.

        Returns a Series with the bias (mean model - mean observation), the
        correlation between model and observation, the root-mean-square
        difference, and the number of rows in the group.
        """
        return pd.Series({
            "bias": x.model.mean() - x.observation.mean(),
            "cor": x.model.corr(x.observation),
            "rmsd": np.sqrt(((x.model - x.observation).pow(2)).mean()),
            "n": len(x),
        })

    # Attach model-grid coordinates, then average matchups that share a model cell,
    # date and depth so each cell/date/depth contributes one value.
    df_ave = df_raw.merge(df_coords)
    selection = [x for x in ["lon_model", "lat_model", "month", "year", "day", "depth", "model", "observation"] if x in df_ave.columns]
    df_ave = df_ave.loc[:, selection]
    grouping = [x for x in ["lon_model", "lat_model", "month", "year", "day", "depth"] if x in df_ave.columns]
    df_ave = df_ave.groupby(grouping).mean().reset_index()
    # Kept as an alias of df_ave for any code that refers to it by this name.
    df_binned = df_ave

    # Unique observation positions per depth range, for the map of matchups.
    df_mapped = (
        df_raw
        .assign(depth=lambda x: x.depth.apply(bin_depth))
        .loc[:, ["lon", "lat", "depth"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Average over time per model cell and depth, then compute statistics per depth range.
    df_summary = (
        df_binned
        .groupby(["lon_model", "lat_model", "depth"])
        .mean()
        .reset_index()
        .assign(depth=lambda x: x.depth.apply(bin_depth))
        .groupby("depth")
        .apply(skill_stats)
        .reset_index()
    )

    # Same statistics over the combined 0-150 m range. The upper edge is inclusive so that
    # this row covers exactly the samples of the five bins up to "100-150m".
    df_summary_shallow = (
        df_ave
        .query("depth <= 150")
        .groupby(["lon_model", "lat_model", "depth"])
        .mean()
        .reset_index()
        .assign(depth="0-150m")
        .groupby("depth")
        .apply(skill_stats)
        .reset_index()
    )

    # Combine, give the columns presentable titles and order rows from shallow to deep,
    # with the combined 0-150 m row first.
    df_summary = pd.concat([df_summary, df_summary_shallow]).reset_index(drop=True)
    df_summary = df_summary.rename(columns={"depth": "Depth", "bias": "Bias", "cor": "Correlation", "rmsd": "RMSD", "n": "Number of observations"})
    df_summary = df_summary.assign(Depth=pd.Categorical(df_summary.Depth, ["0-150m"] + depth_labels))
    df_summary = df_summary.sort_values("Depth")
else:
    # Nothing to map for single-layer matchups; the name must still exist for the plotting cell.
    df_mapped = None

In [None]:
# Flag handed to the plotting cell: the matchup map is only drawn for full water-column matchups.
map_layer = layer == "all"

In [None]:
%%capture --no-display
%%R -i df_mapped -i map_layer
# Map of matchup positions, one panel per depth range. Drawn only when map_layer is TRUE.
# Silence R warnings for this cell.
options(warn=-1)
library(tidyverse)
library(ggtext)

if (map_layer){


# make depth a factor so the panels appear from shallow to deep
df_mapped <- df_mapped %>%
    mutate(depth = factor(depth, levels = c("0-10m", "10-30m", "30-60m", "60-100m", "100-150m", "150-300m", "300-600m", "600-1000m", ">1000m")))

# points only, cropped to the extent of the observations
gg <- df_mapped %>%
    ggplot()+
    geom_point(aes(lon, lat), size = 0.2)+
    theme_gray(base_size = 14)+
    coord_fixed(ratio = 1.5, xlim = c(min(df_mapped$lon), max(df_mapped$lon)), ylim = c(min(df_mapped$lat), max(df_mapped$lat))) +
    facet_wrap(~depth)


# take the default axis breaks from the plot so they can be relabelled below
y_labels <-  as.numeric(na.omit(layer_scales(gg)$y$break_positions()))
x_labels <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))
x_breaks <- x_labels
y_breaks <- y_labels

# y labels are north-south coordinates. Make them more appropriate
# i.e. 10 should be 10 °N, -10 should be 10 °S
# x labels are east-west coordinates and get °E / °W in the same way

y_labels <- ifelse(y_labels >= 0, paste0(y_labels, "°N"), paste0(abs(y_labels), "°S"))
x_labels <- ifelse(x_labels >= 0, paste0(x_labels, "°E"), paste0(abs(x_labels), "°W"))

# world_map is expected to exist in the R session already; the polygons are added after
# the points, so they are drawn on top of them
gg <- gg + scale_x_continuous(breaks = x_breaks, labels = x_labels) + scale_y_continuous(breaks = y_breaks, labels = y_labels)+
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60")

# remove x and y axis names
gg <- gg +
    labs(x = "", y = "") 

# ditch the whitespace around the plot
gg <- gg + theme(plot.margin=unit(c(0,0,0,0),"cm"))
gg


}

In [None]:
# Caption for the matchup map; advances the running figure counter.
if layer == "all":
    md(f"**Figure {chapter}{i_figure}**: The geographic distribution of matchups between the model and observational {variable}. The data has been binned into depth ranges. The depth ranges are 0-10m, 10-30m, 30-60m, 60-100m, 100-150m, 150-300m, 300-600m, 600-1000m, and >1000m. The number of observations in each depth range is shown in the tables below.")
    i_figure += 1   

In [None]:
# Show the per-depth-range summary, write it to the results directory with the unit
# attached as an extra column, and emit the table caption; advances the table counter.
if layer == "all":
    df_display(df_summary)
    # save this to csv in the results directory
    ff_out = f"../../results/{variable}_depth_summary.csv"
    df_summary.assign(unit = unit).to_csv(ff_out, index = False)
    
    md(f"**Table {chapter}{i_table}:** Average bias ({unit}), root-mean square difference (RMSD) and correlation coefficient of modelled and observed {vv_name} for different depth ranges. The bias is calculated as model - observation. The RMSD is the square root of the mean squared difference. The correlation coefficient is the Pearson correlation coefficient between the model and observed values.")
    i_table += 1

In [None]:
chunk_end

## Data Sources

In [None]:
# Citations for the observational datasets listed for particulate organic carbon.
if variable == "poc":
    md_basic("Boss, Emmanuel; Picheral, Marc; Searson, Sarah; Le Goff, Hervé; Reverdin, Gilles; Leeuw, Thomas; Chase, Alison P; Bricaud, Annick; Kolber, Zbigniew S; Taillandier, V; Pesant, Stephane; Tara Oceans Consortium, Coordinators; Tara Oceans Expedition, Participants (2017): Underway surface water data during the Tara Oceans expedition in 2009-2012 [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.873566, In: Boss, Emmanuel; Picheral, Marc; Searson, Sarah; Marec, Claudie; Le Goff, Hervé; Reverdin, Gilles; Leeuw, Thomas; Chase, Alison P; Anderson, Leif G; Gattuso, Jean-Pierre; Pino, Diana Ruiz; Padín, Xose Antonio; Grondin, Pierre-Luc; Matuoka, Atsushi; Babin, Marcel; Bricaud, Annick; Kolber, Zbigniew S; Taillandier, V; Hafez, Mark; Chekalyuk, Alexander; Pesant, Stephane; Météo France; Tara Oceans Consortium, Coordinators (2017): Harmonised data from underway navigation, meteorology and surface water measurements during the Tara Oceans expedition in 2009-2013 [dataset publication series]. PANGAEA, <https://doi.org/10.1594/PANGAEA.873592>")
    # The stray ")" that followed the final link has been removed.
    md_basic("Röttgers, Rüdiger; Bi, Shun; Burmester, Henning; Heymann, Kerstin; Hieronymi, Martin; Krasemann, Hajo; Schönfeld, Wolfgang (2023): Water inherent optical properties and concentrations of water constituents from the German Bight and adjacent regions: concentrations and auxiliary data [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.950767, In: Röttgers, Rüdiger; Bi, Shun; Burmester, Henning; Heymann, Kerstin; Hieronymi, Martin; Krasemann, Hajo; Schönfeld, Wolfgang (2023): Water inherent optical properties and concentrations of water constituents from the German Bight and adjacent regions [dataset bundled publication]. PANGAEA, <https://doi.org/10.1594/PANGAEA.950774>")
    # "in situ POC": the missing space in the dataset title has been restored.
    md_basic("Loisel, Hubert; Duforêt-Gaurier, Lucile; Tran, Trung Kien; Jorge, Daniel S F; Steinmetz, Francois; Mangin, Antoine; Bretagnon, Marine; d'Andon, Odile (2023): Database (DSM) of in situ POC, SPM and Rrs collected between 1997 and 2018 [dataset]. PANGAEA, <https://doi.org/10.1594/PANGAEA.960962>")
    md_basic("Lønborg, Christian; Carreira, Cátia; Abril, Gwenael; Agustí, Susana; Amaral, Valentina; Andersson, Agneta; Arístegui, Javier; Bhadury, Punyasloke; Bif, Mariana B; Borges, Alberto Vieira; Bouillon, Steven; Calleja, Maria Ll; Cotovicz, Luiz C Jr; Cozzi, Stefano; Doval, Maryló; Duarte, Carlos Manuel; Eyre, Bradley D; Fichot, Cedric; García-Martín, Elena; Garzon-Garcia, Alexandra; Giani, Michele; Gonçalves-Araujo, Rafael; Gruber, Renee K; Hansell, Dennis A; Hashihama, Fuminori; He, Ding; Holding, Johnna M; Hunter, William Ross; Ibánhez, J Severino; Ibello, Valeria; Jiang, Shan; Kim, Guebuem; Klun, Katja; Kowalczuk, Piotr; Kubo, Atsushi; Lee, Choon Weng; Lopes, Claudia B; Maggioni, Federica; Magni, Paolo; Marrasé, Celia; Martin, Patrick; McCallister, S Leigh; McCallum, Rosh; M Medeiros, Patricia; G Morán, Xosé Anxelu; Muller-Karger, Frank; Myers-Pigg, Allison; Norli, Marit; Oakes, Joanne M; Osterholz, Helena; Park, Hyekyung; Lund Paulsen, Maria; Rosentreter, Judith A; Ross, Jeff; Rueda-Roa, Digna; Santinelli, Chiara; Shen, Yuan; Teira, Eva; Tinta, Tinkara; Uher, Guenther; Wakita, Masahide; Ward, Nicholas D; Watanabe, Kenta; Xin, Yu; Yamashita, Youhei; Yang, Liyang; Yeo, Jacob; Yuan, Huamao; Zheng, Qiang; Álvarez‐Salgado, Xosé Antón (2023): A global database of dissolved organic matter (DOM) concentration measurements in coastal waters (CoastDOM v.1) [dataset]. PANGAEA, <https://doi.org/10.1594/PANGAEA.964012>")


In [None]:
# Citation shown when the observations come from ICES (source name compared case-insensitively).
if vv_source.lower() == "ices":
    md_basic("ICES Data Portal, Dataset on Ocean HydroChemistry, Extracted March 3, 2023. ICES, Copenhagen")

In [None]:
# Citation shown for the sediment carbon variable.
if variable == "carbon":
    md_basic('Diesing, Markus, Terje Thorsnes, and Lilja Rún Bjarnadóttir. "Organic carbon densities and accumulation rates in surface sediments of the North Sea and Skagerrak." Biogeosciences 18.6 (2021): 2139-2160.')

In [None]:
# Citation shown when the observations come from SOCAT v2023 (source name compared case-insensitively).
if vv_source.lower() == "socat23":
    # The citation is split into two adjacent literals: a plain quoted string cannot
    # contain a raw line break, so the text is joined by implicit concatenation instead.
    md_basic(
        "Bakker, Dorothee C. E.; Alin, Simone R.; Bates, Nicholas; Becker, Meike; Feely, Richard A.; Gkritzalis, Thanos; Jones, Steve D.; Kozyr, Alex; Lauvset, Siv K.; Metzl, Nicolas; Munro, David R.; Nakaoka, Shin-ichiro; Nojiri, Yukihiro; O'Brien, Kevin M.; Olsen, Are; Pierrot, Denis; Rehder, Gregor; Steinhoff, Tobias; Sutton, Adrienne J.; Sweeney, Colm; Tilbrook, Bronte; Wada, Chisato; Wanninkhof, Rik; Akl, John; Barbero, Leticia; Beatty, Cory M.; Berghoff, Carla F.; Bittig, Henry C.; Bott, Randy; Burger, Eugene F.; Cai, Wei-Jun; Castaño-Primo, Rocío; Corredor, Jorge E.; Cronin, Margot; De Carlo, Eric H.; DeGrandpre, Michael D.; Dietrich, Colin; Drennan, William M.; Emerson, Steven R.; Enochs, Ian C.; Enyo, Kazutaka; Epherra, Lucía; Evans, Wiley; Fiedler, Björn; Fontela, Marcos; Frangoulis, Constantin; Gehrung, Martina; Giannoudi, Louisa; Glockzin, Michael; Hales, Burke; Howden, Stephan D.; Ibánhez, J. Severino P.; Kamb, Linus; Körtzinger, Arne; Lefèvre, Nathalie; Lo Monaco, Claire; Lutz, Vivian A.; Macovei, Vlad A.; Maenner Jones, Stacy; Manalang, Dana; Manzello, Derek P.; Metzl, Nicolas; Mickett, John; Millero, Frank J.; Monacci, Natalie M.; Morell, Julio M.; Musielewicz, Sylvia; Neill, Craig; Newberger, Tim; Newton, Jan; Noakes, Scott; Ólafsdóttir, Sólveig Rósa; Ono, Tsuneo; Osborne, John; Padín, Xose A.; Paulsen, Melf; Perivoliotis, Leonidas; Petersen, Wilhelm; Petihakis, George; Plueddemann, Albert J.; Rodriguez, Carmen; Rutgersson, Anna; Sabine, Christopher L.; Salisbury, Joseph E.; Schlitzer, Reiner; Skjelvan, Ingunn; Stamataki, Natalia; Sullivan, Kevin F.; Sutherland, Stewart C.; T'Jampens, Michiel; Tadokoro, Kazuaki; Tanhua, Toste; Telszewski, Maciej; Theetaert, Hannelore; Tomlinson, Michael; Vandemark, Douglas; Velo, Antón; Voynova, Yoana G.; Weller, Robert A.; Whitehead, Chris; Wimart-Rousseau, Cathy (2023). Surface Ocean CO2 Atlas Database Version 2023 (SOCATv2023) (NCEI Accession 0278913). [indicate subset used]. "
        "NOAA National Centers for Environmental Information. Dataset. <https://doi.org/10.25921/r7xa-bt92>. Accessed [25/04/2024]."
    )

In [None]:
# Citations shown for dissolved organic carbon.
# NOTE(review): the first citation still contains the unfilled "[indicate subset used]" and "Accessed [date]" placeholders - confirm and complete.
if variable == "doc":
    md_basic("Hansell, Dennis A.; Carlson, Craig A.; Amon, Rainer M. W.; Álvarez-Salgado, X. Antón; Yamashita, Youhei; Romera-Castillo, Cristina; Bif, Mariana B. (2021). Compilation of dissolved organic matter (DOM) data obtained from global ocean observations from 1994 to 2021. Version 2 (NCEI Accession 0227166). [indicate subset used]. NOAA National Centers for Environmental Information. Dataset. <https://doi.org/10.25921/s4f4-ye35>. Accessed [date].")

    md_basic("Lønborg, C., Carreira, C., Abril, G., Agustí, S., Amaral, V., Andersson, A., ... & Álvarez-Salgado, X. A. (2024). A global database of dissolved organic matter (DOM) concentration measurements in coastal waters (CoastDOM v1). Earth System Science Data, 16(2), 1107-1119.")


In [None]:
# Sources shown for the benthic variables: a URL for biomass / suspension-feeding fraction,
# and a dataset citation for oxygen consumption.
if variable in ["benbio", "susfrac"]:
    md_basic("URL: <https://www.vliz.be/vmdcdata/nsbs/about.php>")
if variable == "oxycons":
    md_basic("Stratmann, Tanja; Soetaert, Karline; Wei, Chih-Lin et al. (2022). Data from: The SCOC database – a large, open and global database with sediment community oxygen consumption rates [Dataset]. Dryad. <https://doi.org/10.5061/dryad.25nd083>")



In [None]:
# Marker line emitted only when test_status is truthy.
if test_status:
    md("This is getting to the end!")