# Validation of sea surface PFTs using point observations

In [None]:
# bin_value using function from r4ecology's github
import numpy as np
def bin_value(x, bin_res):
    """Snap ``x`` to the centre of the ``bin_res``-wide bin that contains it.

    Bin edges sit on integer multiples of ``bin_res`` (lower edge inclusive),
    so with ``bin_res = 0.5`` a value of 0.3 maps to 0.25 and 0.5 maps to 0.75.

    Parameters
    ----------
    x : value(s) to bin; anything supporting NumPy arithmetic and ``np.floor``.
    bin_res : width of a bin, in the same units as ``x``.

    Returns
    -------
    The bin-centre value(s), as produced by ``np.floor`` arithmetic on ``x``.
    """
    half_width = bin_res / 2
    # Index of the upper edge of the bin holding x, counted in units of bin_res
    upper_edge_index = np.floor((x + half_width) / bin_res + 0.5)
    # Step back half a bin from that upper edge to land on the centre
    return upper_edge_index * bin_res - half_width



In [None]:
chunk_start

In [None]:
# Variable validated by this notebook, normalised to lower case.
variable = "pft".lower()
# Display name of the variable for use in the report text.
vv_name = variable
if vv_name.lower() == "ph":
    vv_name = "pH"
if vv_name in ["doc", "poc"]:
    vv_name = vv_name.upper()
if vv_name == "benbio":
    vv_name = "biomass of macrobenthos"
layer = "surface"
# get the units. File inspection could be randomized in case people have put loose files in there...
import glob
df = pd.read_csv("../../matched/mapping.csv")
if variable == "pft":
    # PFT has no row of its own here: the chlorophyll row of the mapping is used instead
    df = df.query("variable == 'chlorophyll'")
else:
    df = df.query("variable == @variable")
pattern = list(df.pattern)[0]
paths = pd.read_csv(glob.glob(f"../../matched/point/**/{layer}/{variable}/paths.csv")[0]).path

# Best-effort unit lookup: try each listed file in turn and stop at the first
# one that can be opened and that contains the model variable.
for ff in paths:
    try:
        # Open the file for THIS iteration (previously always paths[0], so the
        # fallback to later files never actually happened).
        ds = nc.open_data(ff)
        # Only the first component of a "a+b" style model variable is inspected
        model_variable = list(df.model_variable)[0].split("+")[0]
        unit = list(ds.contents.query("variable == @model_variable").unit)[0]
        break
    except Exception:
        # Unreadable or unsuitable file: move on to the next candidate.
        # NOTE(review): if no file succeeds, `unit` stays undefined — confirm
        # whether a fallback value is wanted.
        continue

## Read in the data

In [None]:
# First matched point file for this layer and variable
ff = glob.glob(f"../../matched/point/**/{layer}/{variable}/*_{variable}.csv")[0]
# The part of the file name before the first underscore is taken as the data source
vv_source = os.path.basename(ff).split("_")[0]
vv_source = vv_source.upper()
df = pd.read_csv(ff)
if variable == "ph":
    df = df.query("observation > 4").reset_index(drop = True)
# Danish part is always dubious
df = df.query("lon < 9")
# Disabled: restriction of the observations to the "Shelf" subdomain.
# ds= nc.open_data(f"{data_dir}/amm7_val_subdomains.nc")
# ds.subset(variable = "Shelf")
# ds.as_missing(0)
# ds.regrid(df.loc[:,["lon", "lat"]].drop_duplicates().reset_index(drop = True), "nn")
# df_grid = ds.to_dataframe().reset_index().dropna().drop_duplicates()
# df = df.merge(df_grid)
# Unique observation locations at their original (un-binned) coordinates
df_locs = df.loc[:,["lon", "lat"]].drop_duplicates()
# Keep the filtered, un-binned rows under their own name. The binned frame is
# built with `assign`, which returns a new frame, so `df_raw` is not overwritten.
df_raw = df
# bin to 0.5 degree resolution (bin_value is vectorised, so no per-row apply is needed)
df = df_raw.assign(
    lon = bin_value(df_raw["lon"], 0.5),
    lat = bin_value(df_raw["lat"], 0.5),
)
# Average everything that falls in the same bin (and the same month, when dates exist)
if "year" in df.columns:
    df = df.groupby(["lon", "lat", "year", "month"]).mean().reset_index()
else:
    df = df.groupby(["lon", "lat"]).mean().reset_index()

In [None]:
# Caveat shown only for carbon, where model and observations are not yet in matching units.
# NOTE(review): `md` must already be bound when this cell runs; the only import
# visible in this notebook is in the following cell — confirm it is defined earlier.
carbon_units_note = "**Note**: This is in progress. Model and observation data are yet to be converted to comparable units!"
if variable == "carbon":
    md(carbon_units_note)

In [None]:
from IPython.display import Markdown as md

# Narrative describing where the observations came from. Each `md(...)` call is
# kept as a bare expression statement, exactly as before.

# `vv_source` is upper-cased when it is derived from the file name, so compare
# case-insensitively (the previous `== "ices"` test could never be true).
if vv_source.lower() == "ices":

    if layer == "bottom":
        md(f"Near-bottom values of {vv_name} were extracted from ICES bottle and CTD data.")
    if layer == "surface":
        md("Values from the top 5 m of the water column were extracted from ICES bottle and CTD data.")
    if layer == "benthic":
        md("Benthic values were extracted from existing datasets")


# Layer-specific description of how values were taken from the vertical profiles
if layer == "bottom":
    md("This data was extracted from vertical profiles. The near-bottom value was defined as the value closest to the bottom, that was within 5 m of the bottom. Bathymetry was estimated using GEBCO Bathymetry data.")
if layer == "surface":
    md("This data was extracted from vertical profiles. Values from the top 5 m were extracted from the database. This was compared with the model values from the sea surface level.")
if variable in ["benbio"]:
    md("Biomass data for macrobenthos was downloaded from the North Sea Benthos Survey 1986.")

if variable in ["carbon"]:
    md("Carbon data was compiled from multiple sources")
# NOTE(review): `df` has been binned and averaged by this point, so this is the
# number of aggregated rows, not of individual observations — confirm intent.
md(f"In total there were {len(df)} {layer} values extracted from the observational database.")

if layer == "bottom":
    md("**Note:** this analysis has been restricted to observations on the shelf region.")


# Variable-specific provenance notes
if variable == "poc":
    md("Particulate organic carbon data was compiled from multiple sources")

if variable == "pco2":
    md("The variable pCO2water_SST_wet was extracted from the SOCAT 2023 database.")
    md("Observational values were averaged for each day in the year.")

if variable == "doc":
    md("Dissolved organic carbon data was compiled from multiple sources")

# Full variable mapping table, reloaded unfiltered
df_mapping = pd.read_csv("../../matched/mapping.csv")

In [None]:
%%capture --no-display
%%R -i df_locs -i variable -i unit -w 500 -h 600
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(stringr)

# Country polygons used as the land backdrop
world_map <- map_data("world")

# Crop the map to the extent of the observation locations
xlim <- range(df_locs$lon)
ylim <- range(df_locs$lat)

# Base plot: one point per unique observation location
gg <- ggplot(df_locs) +
    geom_point(aes(lon, lat)) +
    theme_gray(base_size = 16) +
    coord_fixed(xlim = xlim, ylim = ylim, ratio = 1.5)

# Read back the default axis breaks of the base plot (before any scale is
# added) so they can be relabelled below
x_breaks <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))
y_breaks <- as.numeric(na.omit(layer_scales(gg)$y$break_positions()))

# Label the breaks as compass coordinates,
# i.e. 10 becomes 10°N / 10°E and -10 becomes 10°S / 10°W
y_labels <- ifelse(y_breaks >= 0, paste0(y_breaks, "°N"), paste0(abs(y_breaks), "°S"))
x_labels <- ifelse(x_breaks >= 0, paste0(x_breaks, "°E"), paste0(abs(x_breaks), "°W"))

# Apply the relabelled axes, draw the land polygons, and drop the axis titles
# (the degree labels make them redundant)
gg <- gg +
    scale_x_continuous(breaks = x_breaks, labels = x_labels) +
    scale_y_continuous(breaks = y_breaks, labels = y_labels) +
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60") +
    labs(x = NULL, y = NULL)

gg

In [None]:
# Caption for the observation map, followed by advancing the running figure counter
md(f"**Figure {chapter}{i_figure}:** Map of PFT observations from Cefas.")
i_figure += 1

In [None]:
# Reshape from wide to long: one row per key combination and original value column
if variable == "pft":
    melt_columns = [x for x in ["lon", "lat", "year", "month", "day"] if x in df.columns]
    df = df.melt(id_vars = melt_columns, var_name = "measure", value_name = "value")

# Split each original column name into its two parts:
#   source  - the text after the last underscore
#   measure - the name with the "_model" / "_obs" markers removed
df = df.assign(
    source = [name.split("_")[-1] for name in df.measure],
    measure = df.measure.str.replace("_model", "").str.replace("_obs", ""),
)

In [None]:
# Spread the long table back out so that every measure becomes its own column,
# keyed by whichever of the location / time / source columns are present.
# Rows sharing the same keys are combined by pivot_table's default aggregation.
pivot_vars = [x for x in ["lon", "lat", "year", "month", "source"] if x in df.columns]
df = df.pivot_table(index = pivot_vars, columns = "measure", values = "value")
# Turn the keys back into ordinary columns and clear the leftover name on the columns axis
df = df.reset_index().rename_axis(None, axis = 1)
# Relabel the sources for display: "model" -> "Model", "obs" -> "Observation"
df = df.assign(
    source = df.source.str.replace("model", "Model").str.replace("obs", "Observation")
)

In [None]:
%%capture --no-display
%%R -i df -w 1000 -h 500

library(ggtern)
library(tidyverse)

# Ternary diagram of the micro_frac / pico_frac / nano_frac columns,
# drawn as one panel per source
ternary_by_source <- ggtern(data = df, aes(x = micro_frac, y = pico_frac, z = nano_frac)) +
    geom_point() +
    theme_rgbw(base_size = 24) +
    facet_wrap(~source) +
    theme(legend.position = "bottom") +
    # readable axis titles for the three size classes
    labs(x = "Micro", y = "Pico", z = "Nano") +
    percent_custom("%")

ternary_by_source

In [None]:
# Sub-figure numbering restarts at 1 for the ternary plots
i_subplot = 1
md(f"**Figure {chapter}{i_figure}.{i_subplot}**: Ternary plot of the model output for Plankton Functional Types.")
i_subplot = i_subplot + 1

In [None]:
%%capture --no-display
%%R -i df -w 1000 -h 800

library(ggtern)
library(tidyverse)

# Single ternary diagram of the micro_frac / pico_frac / nano_frac columns,
# with points coloured by source
ternary_coloured <- ggtern(data = df, aes(x = micro_frac, y = pico_frac, z = nano_frac, colour = source)) +
    geom_point() +
    theme_rgbw(base_size = 24) +
    # readable axis titles for the three size classes
    labs(x = "Micro", y = "Pico", z = "Nano") +
    # legend goes underneath the plot, without a title
    theme(legend.position = "bottom", legend.title = element_blank()) +
    percent_custom("%")

ternary_coloured

In [None]:
# Caption for the second ternary plot. The figure number now carries the
# `chapter` prefix, matching the other figure captions in this notebook
# (it was previously omitted here).
# NOTE(review): the caption text is identical to the previous sub-figure's —
# confirm whether it should mention the colouring by source.
md(f"**Figure {chapter}{i_figure}.{i_subplot}**: Ternary plot of the model output for Plankton Functional Types.")
i_subplot += 1

## Data Sources

Creach and Forster (2017). North Sea phytoplankton pigments 2010 to 2011. Cefas, UK. V1. doi: https://doi.org/10.14466/CefasDataHub.33.