# Validation of point_layer template_title using point observations

In [None]:
# bin_value using function from r4ecology's github
import numpy as np
def bin_value(x, bin_res):
    """Snap ``x`` onto a regular grid whose cells are ``bin_res`` wide.

    The returned value is ``floor((x + bin_res / 2) / bin_res + 0.5) * bin_res
    - bin_res / 2``, i.e. always a whole multiple of ``bin_res`` shifted down
    by half a cell. Works on scalars and on numpy/pandas arrays because only
    arithmetic and ``np.floor`` are used.

    Parameters
    ----------
    x : number or array-like
        Coordinate(s) to bin.
    bin_res : number
        Grid spacing.
    """
    half_cell = bin_res / 2
    # Index of the grid cell the (shifted) coordinate falls into.
    cell_index = np.floor((x + half_cell) / bin_res + 0.5)
    return cell_index * bin_res - half_cell


In [None]:
chunk_start

In [None]:
# Template parameters: "point_variable" / "point_layer" are substituted when the
# notebook is generated.
variable = "point_variable".lower()
vv_name = variable
# `compact` is read by the scatter-plot cells further down. Make sure it always
# exists, without overriding a value an earlier cell may already have set.
if vv_name in ["benbio", "carbon"]:
    compact = True
elif "compact" not in globals():
    compact = False
# Human-readable variable name used in text and captions.
if vv_name.lower() == "ph":
    vv_name = "pH"
if vv_name in ["doc", "poc"]:
    vv_name = vv_name.upper()
if vv_name == "benbio":
    vv_name = "biomass of macrobenthos"
layer = "point_layer"
# get the units. File inspection could be randomized in case people have put loose files in there...
import glob
df = pd.read_csv("../../matched/mapping.csv")
df = df.query("variable == @variable")
pattern = list(df.pattern)[0]
paths = pd.read_csv(glob.glob(f"../../matched/point/**/{layer}/{variable}/paths.csv")[0]).path

# 1) Preferred source: the *_unit.csv written next to the matched data.
#    It is read into its own frame so that the mapping in `df` stays intact
#    for the fallback below.
unit = None
try:
    ff = glob.glob(f"../../matched/point/**/{layer}/{variable}/*_{variable}_unit.csv")[0]
    df_unit = pd.read_csv(ff)
    unit = df_unit.unit[0]
except Exception:
    # best effort: a missing or malformed unit file just triggers the fallback
    pass

# 2) Fallback: take the unit from the first model file that can be opened.
if unit is None:
    for ff in paths:
        try:
            # open the file of this iteration (previously paths[0] was re-opened every time)
            ds = nc.open_data(ff)
            model_variable = list(df.model_variable)[0].split("+")[0]
            unit = list(ds.contents.query("variable == @model_variable").unit)[0]
            break
        except Exception:
            # unreadable file or variable not present: try the next one
            pass
if unit is None:
    unit = "unknown unit"

# carbon is always reported in kg m-3, whatever the files say
if variable == "carbon":
    unit = "kg m-3"

            

## Read in the data

In [None]:
# Matched-up model/observation file for this layer and variable. The part of
# the file name before the first underscore is used as the data-source label.
ff = glob.glob(f"../../matched/point/**/{layer}/{variable}/*_{variable}.csv")[0]
vv_source = os.path.basename(ff).split("_")[0].upper()
df = pd.read_csv(ff)
if variable == "ph":
    df = df.query("observation > 4").reset_index(drop = True)
# Danish part is always dubious
df = df.query("lon < 9")
# Original (unbinned) observation locations, used for the location map.
df_locs = df.loc[:,["lon", "lat"]].drop_duplicates()
# bin to 0.5 degree resolution (bin_value is vectorised, so no per-row apply is needed)
df = df.assign(lon = bin_value(df["lon"], 0.5), lat = bin_value(df["lat"], 0.5))
# df_raw keeps one row per match-up (not averaged). Its coordinates are the
# binned ones, exactly as before this cell was tidied.
# NOTE(review): the benbio conversion below is not applied to df_raw, so the
# "raw" statistics for benbio use unconverted observations - confirm this is intended.
df_raw = df
if variable == "benbio":
    df = df.assign(observation = lambda x: 1000 * 0.45 * x.observation)
# Average within each grid cell (and within each year/month where available).
# numeric_only=True keeps this working on pandas >= 2 when the file carries
# non-numeric columns; older pandas dropped such columns silently.
if variable not in ["carbon", "benbio"]:
    df = df.groupby(["lon", "lat", "year", "month"]).mean(numeric_only = True).reset_index()
else:
    df = df.groupby(["lon", "lat"]).mean(numeric_only = True).reset_index()

In [None]:
if variable == "carbon":
    # Imported locally because this cell runs before the notebook's own
    # Markdown import. display() is needed because Jupyter only auto-renders the
    # last top-level expression of a cell, never one nested inside an `if`.
    from IPython.display import Markdown, display
    display(Markdown("**Note**: This is in progress. Model and observation data are yet to be converted to comparable units!"))

In [None]:
from IPython.display import Markdown as md, display

# Sentences describing where the observations come from and how they were
# matched to the model; joined into a single paragraph at the end of the cell.
intro = []

if vv_source == "ICES": 

    if layer == "bottom":
        intro.append(f"Near-bottom values of {vv_name} were extracted from ICES bottle and CTD data.")
    if layer == "surface":
        intro.append(f"Values from the **top 5 m** of the water column were extracted from ICES bottle and CTD data.")
    if layer == "benthic":
        intro.append("Benthic values were extracted from existing datasets")
else:
    if layer == "bottom":
        intro.append(f"This data was extracted from vertical profiles. The near-bottom value was defined as the value closest to the bottom, that was within 5 m of the bottom. Bathymetry was estimated using GEBCO Bathymetry data.")
    if layer == "surface":
        intro.append(f"This data was extracted from vertical profiles. Values from the **top 5 m** were extracted from the database. This was compared with the model values from the surface level.")
    if variable in ["benbio"]:
        intro.append("Biomass data for macrobenthos was downloaded from the North Sea Benthos Survey 1986.")

if variable in ["carbon"]:
    intro.append("Carbon data was compiled from multiple sources")

# Jupyter only renders the final expression of a cell, so every paragraph this
# cell is meant to show is passed to display() explicitly.
display(md(f"In total there were {len(df_raw)} {layer} values extracted from the observational database."))

if layer == "bottom":
    intro.append("**Note:** this analysis has been restricted to observations on the shelf region.")


if variable == "poc":
    intro.append("Particulate organic carbon data was compiled from multiple sources")

if variable == "pco2":
    intro.append("The variable pCO2water_SST_wet was extracted from the SOCAT 2023 database.")
    intro.append("Observational values were averaged for each day in the year.")

if variable == "doc":
    intro.append("Dissolved organic carbon data was compiled from multiple sources")

# Name of the model output that was compared with this observed variable.
df_mapping = pd.read_csv("../../matched/mapping.csv")
model_variable = list(df_mapping.query("variable == @variable").model_variable)[0]

# Report the period covered by the match-ups when the data carries a year column.
if "year" in df_raw.columns:
    min_year = df_raw.year.min()
    max_year = df_raw.year.max()
    if min_year == max_year:
        intro.append(f"The model output was matched up with the observational data for the year **{min_year}**.")
    else:
        intro.append(f"The model output was matched up with the observational data for the years **{min_year} to {max_year}**.")

intro.append(f"The following model output was used to compare with observational **{vv_name}**: **{model_variable}**.")

display(md(" ".join(intro).strip().replace("  ", " ")))

display(md(f"The map below shows the locations of the matched up data for {vv_name}."))

In [None]:
# Drop observations below the 0.1st percentile (quantile 0.001) of the binned data.
bot_low = df.observation.quantile(0.001)
# @bot_low passes the threshold by reference instead of pasting its text into
# the query string, so a NaN threshold (e.g. empty data) gives an empty frame
# rather than a parse error.
df = df.query("observation >= @bot_low")

In [None]:
%%capture --no-display
%%R -i df_locs -i variable -i unit -w 500
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(stringr)

# Coastline polygons, drawn on top of the points as a land mask.
world_map <- map_data("world")

# Plot window: the bounding box of the observation locations.
xlim <- range(df_locs$lon)
ylim <- range(df_locs$lat)

# Show temperature units as a proper degree symbol.
if (variable == "temperature" && str_detect(unit, "C")) {
    unit <- "°C"
}

# One point per unique observation location.
gg <- ggplot(df_locs) +
    geom_point(aes(lon, lat)) +
    theme_gray(base_size = 14) +
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60") +
    coord_fixed(xlim = xlim, ylim = ylim, ratio = 1.5)

# When the data reach west of 10°W, use fixed, labelled graticule breaks
# and drop the axis titles.
if (min(df_locs$lon) < -10) {
    lon_axis <- scale_x_continuous(breaks = seq(-10, 5, 5), labels = c("10°W", "5°W", "0°", "5°E"))
    lat_axis <- scale_y_continuous(breaks = seq(45, 60, 5), labels = c("45°N", "50°N", "55°N", "60°N"))
    gg <- gg + lon_axis + lat_axis + labs(x = "", y = "")
}

gg

In [None]:
# Caption for the location map. Jupyter does not render an expression nested
# inside an `if`, so the Markdown object is passed to display() explicitly
# (display is provided by IPython in notebook kernels).
if layer == "surface":
    display(md(f"**Figure {chapter}{i_figure}:** Locations of matchups between simulated and observed {vv_name} in the top 5 m of the water column."))
elif layer == "bottom":
    display(md(f"**Figure {chapter}{i_figure}:** Locations of matchups between simulated and observed {vv_name} near the bottom of the water column."))
elif layer == "all":
    display(md(f"**Figure {chapter}{i_figure}:** Locations of matchups between simulated and observed {vv_name} throughout the water column."))
# The map is always drawn, so the figure counter advances for every layer.
i_figure = i_figure + 1

In [None]:
%%capture --no-display
%%R -i df -i variable -i unit -i layer -i vv_name -w 500 
# Side-by-side maps of the binned model and observation values.
# Only drawn when the data has no "month" column (i.e. it was averaged over
# the whole record rather than per month).
options(warn=-1)
library(tidyverse)
if (("month" %in% colnames(df)) == FALSE){

# long format: one row per grid cell and per source (model / observation)
df_map <- df %>%
    gather(variable, value, model:observation)
    # bare expression inside the block; it has no effect on the plot
    df_map
# calculate the 98th percentile of the pooled model and observation values
p98 = quantile(df_map$value, 0.98)
# cap the values at this, so that a few extremes do not dominate the colour scale
df_map$value = pmin(df_map$value, p98)

# coastline polygons used as a land mask
world_map <- map_data("world")


# show temperature units with a proper degree symbol
if(variable == "temperature"){
    if(str_detect(unit, "C"))
     unit = "°C"
}


# colour-bar title, e.g. "Surface <variable> (<unit>)"
Layer <- str_to_title(layer)
name <- str_glue("{Layer} {vv_name} ({unit})")


gg <- df_map %>%
    ggplot()+
    geom_tile(aes(lon, lat, fill = value))+
    theme_gray(base_size = 14)+
    coord_fixed(ratio = 1.5, xlim = c(min(df$lon), max(df$lon)), ylim = c(min(df$lat), max(df$lat)))+
    labs(color = variable)+
    # NOTE(review): no colour aesthetic is mapped in this plot, so this colour
    # scale (and the labs(color = ...) above) appear to have no effect
    scale_color_viridis_c()+
    theme(legend.position = "bottom")+
    # one panel for the model, one for the observations
    facet_wrap(~variable)+
      scale_fill_viridis_c(
        # use unit for the label
        name = name,
                       guide = guide_colorbar(title.position = "bottom", title.hjust = 0.5, title.theme = element_text(angle = 0, size = 20, family = "Helvetica"))
  )+
    theme(
    legend.position = "bottom", legend.direction = "horizontal", legend.box = "horizontal", legend.key.width = unit(6.0, "cm"),
    legend.key.height = unit(1.0, "cm"))


# Read the axis breaks ggplot picked for the plot built so far; they are
# re-used below with hemisphere labels. This must happen after the plot has
# been assembled and before the scales are replaced.
y_labels <-  as.numeric(na.omit(layer_scales(gg)$y$break_positions()))
x_labels <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))
x_breaks <- x_labels
y_breaks <- y_labels

# y labels are north-south coordinates. Make them more appropriate
# i.e. 10 should be 10 °N, -10 should be 10 °S

y_labels <- ifelse(y_labels >= 0, paste0(y_labels, "°N"), paste0(abs(y_labels), "°S"))
x_labels <- ifelse(x_labels >= 0, paste0(x_labels, "°E"), paste0(abs(x_labels), "°W"))

# apply the labelled breaks and draw the land mask on top of the tiles
gg <- gg + scale_x_continuous(breaks = x_breaks, labels = x_labels) + scale_y_continuous(breaks = y_breaks, labels = y_labels)+
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60")

# remove x and y axis names
gg <- gg +
    labs(x = "", y = "") 

# ditch the whitespace around the plot
gg <- gg + theme(plot.margin=unit(c(0,0,0,0),"cm"))
gg


}


In [None]:
# Caption for the model/observation maps, which are only drawn when the
# binned data has no month column.
if "month" not in df.columns:
    # display() is required: an expression nested in an `if` is never
    # auto-rendered by Jupyter.
    display(md(f"**Figure {chapter}{i_figure}:** Map of average {layer} {vv_name} in the model and observational datasets."))
    i_figure += 1

In [None]:
# Imported unconditionally: later cells use `calendar` even when the data has
# no month column.
import calendar

if "month" in df_raw.columns:
    # number of match-ups in each month
    df_size = df_raw.groupby("month").size().reset_index()
    df_size.columns = ["month", "n"]
    n_min = df_size.n.min()
    n_max = df_size.n.max()
    # if several months tie, the first one (in month order) is reported
    month_min = list(df_size.query("n == @n_min").month.values)[0]
    months_max = list(df_size.query("n == @n_max").month.values)[0] 
    # convert to month names
    month_min = calendar.month_name[int(month_min)]
    months_max = calendar.month_name[int(months_max)] 

    fig_summary = [f"The number of observations in each month ranged from {n_min} in {month_min} to {n_max} in {months_max}."]

    fig_summary.append(f"Figure {chapter}{i_figure} below shows the distribution of observations in each month.")

    # display() is required: an expression nested in an `if` is never
    # auto-rendered by Jupyter.
    display(md(" ".join(fig_summary).strip().replace("  ", " ")))



In [None]:
%%capture --no-display
%%R -i df_raw -i variable -i unit -w 500 
# Bar chart of the number of match-ups per calendar month.
# Skipped entirely when the data has no "month" column.
if ("month" %in% colnames(df_raw)) {

    # One row per (lon, lat, month) holding the number of match-ups there;
    # the bars below stack these rows, giving the total per month.
    df1 <- ungroup(
        summarise(group_by(df_raw, lon, lat, month), observation = n())
    )

    # Month numbers -> abbreviated month names, kept in calendar order.
    df1$month <- factor(df1$month, levels = 1:12, labels = month.abb)

    gg <- ggplot(df1, aes(x = month, y = observation)) +
        theme_gray(base_size = 14) +
        geom_col() +
        labs(y = "Number of observations", x = "")

    gg
}

In [None]:
# The monthly-count text and figure above are produced whenever df_raw has a
# month column, so the caption (and the figure counter) is keyed on df_raw too.
if "month" in df_raw.columns:
    # display() is required: an expression nested in an `if` is never
    # auto-rendered by Jupyter.
    display(md(f"**Figure {chapter}{i_figure}:** Number of {layer} observations per month for {vv_name}."))
    i_figure += 1

In [None]:
# Lead-in paragraph for the bias map: two sentences joined into one paragraph.
bias_text = [
    f"Figure {chapter}{i_figure} below shows the bias between the model and observational data for {vv_name}.",
    "The bias is calculated as the model value minus the observational value, and it is shown for each month of the year.",
]

md(" ".join(bias_text).strip().replace("  ", " "))

In [None]:
%%capture --no-display
%%R -i df -i variable -i unit -i layer -w 1000 -h 1200 -i vv_name
# Map of model bias (model - observation) on the binned grid, one panel per
# month when the data has a month column. The capped bias table is also
# written to ../../results/<layer>/<variable>/.
options(warn=-1)

library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(stringr)
library(tidyverse)
world_map <- map_data("world")

# plot window: bounding box of the binned match-ups
xlim = c(min(df$lon), max(df$lon))
ylim = c(min(df$lat), max(df$lat))


# show temperature units with a proper degree symbol
if(vv_name == "temperature"){
    if(str_detect(unit, "C"))
     unit = "°C"
}


df <- df %>%
    mutate(bias = model - observation) 

# cap the bias symmetrically at +/- the 98th percentile of the absolute bias,
# so that a few extreme values do not dominate the colour scale
df1 <- df %>%
    mutate(bias = abs(bias))
bias_high <- df1$bias %>% quantile(0.98)
df$bias[df$bias > bias_high] <- bias_high
df$bias[df$bias < -bias_high] <- -bias_high


# facet by month only when the data has a month column
plot_month <- FALSE
if("month" %in% colnames(df))
    plot_month <- TRUE

# convert month numbers to abbreviated month names; sorting first keeps the
# factor levels (and therefore the panels) in calendar order
if(plot_month){
    df <- df %>%
        arrange(month)
df$month <- factor(df$month, levels = df$month, labels = month.abb[df$month])
}

title <- str_glue("Bias in {layer} {vv_name} ({unit})")

out = str_glue("../../results/{layer}/{variable}/{layer}_{variable}_bias.csv")

# make sure the output directory exists, then export the capped bias table
if (!dir.exists(dirname(out))){
    dir.create(dirname(out), recursive = TRUE)
}
df %>% write_csv(out)

# the legend title is rendered by ggtext::element_markdown, so "/m^3" is
# rewritten as a superscript; the tag must be closed to be valid markup
title = str_replace(title, "/m\\^3", "m<sup>-3</sup>")


gg <- df %>%
    ggplot()+
    geom_raster(aes(lon, lat, fill = bias))+
    theme_gray(base_size = 24)+
    coord_fixed(xlim = xlim, ylim = ylim, ratio = 1.5) +
    # diverging scale centred on zero; legend title below the colour bar
    scale_fill_gradient2(low = "blue", high = "red",
                       guide = guide_colorbar(title.position = "bottom", title.hjust = 0.5, title.theme = ggtext::element_markdown(angle = 0, size = 20, family = "Helvetica"))
  )+
    # wide horizontal legend underneath the map
    theme(
    legend.position = "bottom", legend.direction = "horizontal", legend.box = "horizontal", legend.key.width = unit(6.0, "cm"),
    legend.key.height = unit(1.0, "cm"))+
    # set the legend title to bias
    labs(fill = title)

if (plot_month){
    gg <- gg + facet_wrap(~month)
}


# Read the axis breaks ggplot picked for the plot built so far; they are
# re-used below with hemisphere labels.
y_labels <-  as.numeric(na.omit(layer_scales(gg)$y$break_positions()))
x_labels <- as.numeric(na.omit(layer_scales(gg)$x$break_positions()))
x_breaks <- x_labels
y_breaks <- y_labels

# y labels are north-south coordinates. Make them more appropriate
# i.e. 10 should be 10 °N, -10 should be 10 °S

y_labels <- ifelse(y_labels >= 0, paste0(y_labels, "°N"), paste0(abs(y_labels), "°S"))
x_labels <- ifelse(x_labels >= 0, paste0(x_labels, "°E"), paste0(abs(x_labels), "°W"))

# apply the labelled breaks and draw the land mask on top of the raster
gg <- gg + scale_x_continuous(breaks = x_breaks, labels = x_labels) + scale_y_continuous(breaks = y_breaks, labels = y_labels)+
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60")

# remove x and y axis names
gg <- gg +
    labs(x = "", y = "")

gg

In [None]:
# The caption is not the last expression of this cell, so it has to be passed
# to display() explicitly (display is provided by IPython in notebook kernels).
display(md(f"**Figure {chapter}{i_figure}**: Bias in {layer} {vv_name}. The bias is calculated as model - observation. The colour scale is from blue (negative bias) to red (positive bias). The colour scale is capped at the 98th percentile of the absolute bias. This is to avoid a few extreme outliers from dominating the colour scale. **Note:** values have been binned and averaged to 0.5 degree resolution."))
i_figure += 1

# Hand the data frames over to the R cells below via feather files.
# create directory if non-existent, recursive
os.makedirs("adhoc/tmp", exist_ok = True)
# Both frames have been filtered with query(), so their index is no longer the
# default 0..n-1 range; pandas documents to_feather as requiring a default
# index, hence the reset (the frames themselves are left untouched).
df_raw.reset_index(drop = True).to_feather("adhoc/tmp/df_raw.feather")
df.reset_index(drop = True).to_feather("adhoc/tmp/df.feather")


In [None]:
# Lead-in for the scatter plot. Exactly one scatter figure follows: the
# per-month panels (which include an "All months" panel) or, in compact mode,
# a single plot. The text therefore refers to one figure only.
scatter_text = [f"Figure {chapter}{i_figure} shows simulated versus observed {layer} {vv_name}."]
if not compact:
    scatter_text.append("This is shown for each month of the year and for all months combined.")

md(" ".join(scatter_text).strip().replace("  ", " "))

In [None]:
%%capture --no-display
%%R -i df -i compact -i vv_name -i unit -w 1000 -h 1200
# Scatter plots of model versus observation, one panel per month plus an
# "All months" panel. Only drawn when the data has a month column and the
# notebook is not in compact mode.
if("month" %in% colnames(df) && compact == FALSE){


library(tidyverse, warn.conflicts = FALSE)

# show temperature units with a proper degree symbol
if(vv_name == "temperature"){
    if(str_detect(unit, "C"))
     unit = "°C"
}

x_lab <- str_glue("Model {vv_name} ({unit})")
y_lab <- str_glue("Observed {vv_name} ({unit})")


# axis titles are rendered as markdown, so "/m^3" becomes a superscript
x_lab <- str_replace(x_lab, "/m\\^3", "m<sup>-3</sup>")
y_lab <- str_replace(y_lab, "/m\\^3", "m<sup>-3</sup>")

df <- df %>%
# convert month number to name, e.g. 1=Jan
# do not use a factor
    mutate(month = month.abb[month]) %>%
    ungroup()

# add a copy of every row labelled "All months" so the pooled data gets its own panel
df <- df %>%
    mutate(month = "All months") %>%
    ungroup() %>%
    bind_rows(df)

# convert month to factor: pooled panel first, then calendar order
df$month <- factor(df$month, levels = c("All months", month.abb))



# No fill aesthetic is used here, so the former labs(fill = title), which
# relied on `title` left behind by the bias-map cell, has been dropped.
gg <- df %>%
    ggplot()+
    geom_point(aes(model, observation))+
    facet_wrap(~month)+
    theme_gray(base_size = 24)+
    # 1:1 line
    geom_abline()+
    geom_smooth(aes(model, observation), method = "gam")+
    labs(x = x_lab, y = y_lab)+
    theme(axis.title.x = ggtext::element_markdown())+
    theme(axis.title.y = ggtext::element_markdown())

gg
}

In [None]:
# Caption for the per-month scatter figure, which is only drawn for monthly
# variables outside compact mode.
if variable not in ["carbon", "benbio"]:
    if not compact:
        # display() is required: an expression nested in an `if` is never
        # auto-rendered by Jupyter.
        if layer == "surface":
            display(md(f"**Figure {chapter}{i_figure}**: Simulated versus observed {vv_name} in the top 5 m of the water column. The blue curve is a generalized additive model fit to the data, and the black line represents 1-1 relationship between the simulation and observations. The data has been binned to 0.5 degree resolution."))
        elif layer == "bottom":
            display(md(f"**Figure {chapter}{i_figure}**: Simulated versus observed {vv_name} near the bottom of the water column. The blue curve is a generalized additive model fit to the data, and the black line represents 1-1 relationship between the simulation and observations. The data has been binned to 0.5 degree resolution."))
        elif layer == "all":
            display(md(f"**Figure {chapter}{i_figure}**: Simulated versus observed {vv_name} throughout the water column. The blue curve is a generalized additive model fit to the data, and the black line represents 1-1 relationship between the simulation and observations. The data has been binned to 0.5 degree resolution."))
        i_figure = i_figure + 1

In [None]:
%%capture --no-display
%%R -i vv_name -i unit -i compact -w 500 

# Single scatter plot of model versus observation, used in compact mode.
if(compact){
library(dplyr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(stringr)


# binned match-ups written by the Python side of the notebook
df <- arrow::read_feather("adhoc/tmp/df.feather")



x_lab <- str_glue("Model {vv_name} ({unit})")
y_lab <- str_glue("Observed {vv_name} ({unit})")
# axis titles are rendered as markdown, so "/m^3" becomes a superscript
x_lab <- str_replace(x_lab, "/m\\^3", "m<sup>-3</sup>")
y_lab <- str_replace(y_lab, "/m\\^3", "m<sup>-3</sup>")


# No fill aesthetic is used here, so the former labs(fill = title), which
# relied on `title` left behind by the bias-map cell, has been dropped.
gg <- df %>%
    ggplot()+
    geom_point(aes(model, observation))+
    theme_gray(base_size = 14)+
    # 1:1 line
    geom_abline()+
    geom_smooth(aes(model, observation), method = "gam")+
    labs(x = x_lab, y = y_lab)+
    theme(axis.title.x = ggtext::element_markdown())+
    theme(axis.title.y = ggtext::element_markdown())

gg
}

In [None]:
# Caption for the compact scatter plot.
if compact:
    # display() is required: an expression nested in an `if` is never
    # auto-rendered by Jupyter.
    display(md(f"**Figure {chapter}{i_figure}**: Model vs observed {vv_name} for {layer} values. The observations are from {vv_source}. The line is a GAM fit to the data. The shaded area is the 95% confidence interval of the GAM fit."))
    i_figure += 1

## Summary statistics

In [None]:
# Three paragraphs of explanatory text. Jupyter only auto-renders the last
# expression of a cell, so each paragraph is passed to display() explicitly
# (display is provided by IPython in notebook kernels).
display(md(f"The overall ability of the model to predict the observed {vv_name} was assessed by calculating the average bias, the root mean square error (RMSE) and the correlation coefficient (R). The bias was calculated as the model value minus the observed value. The RMSE was calculated as the square root of the mean squared error. The correlation coefficient was calculated as the Pearson correlation coefficient between the model and observed values."))
display(md("This was calculated for each month and for the entire dataset. The results are shown in the tables below."))
display(md("This is calculated in two separate ways. First, we use the raw model and observed values. Second, we use data that was averaged into 0.5 by 0.5 degree bins to account for spatial bias."))

In [None]:
# Summary statistics (bias, RMSE, correlation, sample size) from the
# un-averaged match-ups in df_raw. Carbon and benbio only get a single "All"
# row; every other variable also gets one row per month.
monthly = variable not in ["carbon", "benbio"]

# ---- bias: model - observation ----
if monthly:
    df_bias = (
        df_raw
        .assign(bias = lambda x: x.model - x.observation)
        .groupby("month")
        # numeric_only keeps this working on pandas >= 2 when df_raw carries
        # non-numeric columns; older pandas dropped them silently
        .mean(numeric_only = True)
        .reset_index()
        .loc[:,["month", "bias"]]
        # convert month number to name (int() also accepts months stored as floats)
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
    )
    # overall bias goes in as the first row
    annual_bias = df_raw.model.mean() - df_raw.observation.mean() 
    df_bias = pd.concat([pd.DataFrame({"month": ["All"], "bias": [annual_bias]}), df_bias])
else:
    # only want annual
    df_bias = pd.DataFrame({"month": ["All"], "bias": [df_raw.model.mean() - df_raw.observation.mean()]})

# ---- root-mean-square error ----
if monthly:
    df_rmse = (
        df_raw
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
        .groupby("month")
        .apply(lambda x: np.sqrt((x.model - x.observation).pow(2).mean()))
        .reset_index()
        .rename(columns={0: "rmse"})
    )
    # overall RMSE goes in as the first row
    annual_rmse = np.sqrt(((df_raw.model - df_raw.observation).pow(2)).mean())
    df_rmse = pd.concat([pd.DataFrame({"month": ["All"], "rmse": [annual_rmse]}), df_rmse])
else:
    # only want annual
    df_rmse = pd.DataFrame({"month": ["All"], "rmse": [np.sqrt(((df_raw.model - df_raw.observation).pow(2)).mean())]})

# merge on the shared "month" column; the row order of df_bias
# ("All", then calendar order) is kept
df_table = df_bias.merge(df_rmse)
df_table = df_table.round(2)

# ---- Pearson correlation between model and observation ----
if monthly:
    df_corr = (
        df_raw
        .groupby("month")
        .apply(lambda x: x.model.corr(x.observation))
        .reset_index()
        .rename(columns={0: "correlation"})
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
    )
    # overall correlation uses all match-ups and goes in as the first row
    annual_corr = df_raw.model.corr(df_raw.observation)
    df_corr = pd.concat([pd.DataFrame({"month": ["All"], "correlation": [annual_corr]}), df_corr])
else:
    # only want annual
    df_corr = pd.DataFrame({"month": ["All"], "correlation": [df_raw.model.corr(df_raw.observation)]})
df_table = df_table.merge(df_corr)
df_table = df_table.round(2)
df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE", "correlation": "Correlation"})
df_table = df_table[["Month", "Bias", "RMSE", "Correlation"]]
# change Month to Period
df_table = df_table.rename(columns={"Month": "Time period"})

# ---- number of observations ----
annual_number = len(df_raw)
if monthly:
    # non-missing observations per month
    df_number = df_raw.groupby("month").count().reset_index().loc[:,["month", "observation"]]
    # convert month number to name
    df_number["month"] = df_number["month"].apply(lambda x: calendar.month_abbr[int(x)])
    df_number = df_number.rename(columns={"month": "Time period", "observation": "Number of observations"})
    # add total number of observations
    df_number = pd.concat([df_number, pd.DataFrame({"Time period": ["All"], "Number of observations": [annual_number]})])
else:
    df_number = pd.DataFrame({"Time period": ["All"], "Number of observations": [annual_number]})
df_table = df_table.merge(df_number)

# include commas in the number of observations
df_table["Number of observations"] = df_table["Number of observations"].apply(lambda x: "{:,}".format(x))
# convert nan to "N/A"
df_table = df_table.fillna("N/A")

df_table.style.hide(axis="index")

In [None]:
# The caption is not the last expression of this cell, so it has to be passed
# to display() explicitly (display is provided by IPython in notebook kernels).
display(md(f"**Table {chapter}{i_table}:** Average bias and root-mean square error in {layer} {vv_name} for each month using the raw {vv_source} data. The bias is calculated as model - observation. The average bias is calculated as the mean of the monthly biases."))
i_table += 1

In [None]:
# Summary statistics (bias, RMSE, correlation, sample size) from the
# spatially binned and averaged match-ups in df. Carbon and benbio only get a
# single "All" row; every other variable also gets one row per month.
monthly = variable not in ["carbon", "benbio"]

# ---- bias: model - observation ----
if monthly:
    df_bias = (
        df
        .assign(bias = lambda x: x.model - x.observation)
        .groupby("month")
        # numeric_only keeps this working on pandas >= 2 should df carry
        # non-numeric columns; older pandas dropped them silently
        .mean(numeric_only = True)
        .reset_index()
        .loc[:,["month", "bias"]]
        # convert month number to name (int() also accepts months stored as floats)
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
    )
    # overall bias goes in as the first row
    annual_bias = df.model.mean() - df.observation.mean() 
    df_bias = pd.concat([pd.DataFrame({"month": ["All"], "bias": [annual_bias]}), df_bias])
else:
    # only want annual
    df_bias = pd.DataFrame({"month": ["All"], "bias": [df.model.mean() - df.observation.mean()]})

# ---- root-mean-square error ----
if monthly:
    df_rmse = (
        df
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
        .groupby("month")
        .apply(lambda x: np.sqrt((x.model - x.observation).pow(2).mean()))
        .reset_index()
        .rename(columns={0: "rmse"})
    )
    # overall RMSE goes in as the first row
    annual_rmse = np.sqrt(((df.model - df.observation).pow(2)).mean())
    df_rmse = pd.concat([pd.DataFrame({"month": ["All"], "rmse": [annual_rmse]}), df_rmse])
else:
    # only want annual
    df_rmse = pd.DataFrame({"month": ["All"], "rmse": [np.sqrt(((df.model - df.observation).pow(2)).mean())]})

# merge on the shared "month" column; the row order of df_bias
# ("All", then calendar order) is kept
df_table = df_bias.merge(df_rmse)
df_table = df_table.round(2)

# ---- Pearson correlation between model and observation ----
if monthly:
    df_corr = (
        df
        .groupby("month")
        .apply(lambda x: x.model.corr(x.observation))
        .reset_index()
        .rename(columns={0: "correlation"})
        .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[int(y)]))
    )
    # overall correlation uses all binned values and goes in as the first row
    annual_corr = df.model.corr(df.observation)
    df_corr = pd.concat([pd.DataFrame({"month": ["All"], "correlation": [annual_corr]}), df_corr])
else:
    # only want annual
    df_corr = pd.DataFrame({"month": ["All"], "correlation": [df.model.corr(df.observation)]})
df_table = df_table.merge(df_corr)
df_table = df_table.round(2)
df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE", "correlation": "Correlation"})
df_table = df_table[["Month", "Bias", "RMSE", "Correlation"]]
# change Month to Period
df_table = df_table.rename(columns={"Month": "Time period"})

# ---- number of (binned) values ----
annual_number = len(df)
if monthly:
    # non-missing binned values per month
    df_number = df.groupby("month").count().reset_index().loc[:,["month", "observation"]]
    # convert month number to name
    df_number["month"] = df_number["month"].apply(lambda x: calendar.month_abbr[int(x)])
    df_number = df_number.rename(columns={"month": "Time period", "observation": "Number of observations"})
    # add total number of observations
    df_number = pd.concat([df_number, pd.DataFrame({"Time period": ["All"], "Number of observations": [annual_number]})])
else:
    df_number = pd.DataFrame({"Time period": ["All"], "Number of observations": [annual_number]})
df_table = df_table.merge(df_number)

# include commas in the number of observations
df_table["Number of observations"] = df_table["Number of observations"].apply(lambda x: "{:,}".format(x))
# convert nan to "N/A"
df_table = df_table.fillna("N/A")

df_table.style.hide(axis="index")

In [None]:
# The caption is not the last expression of this cell, so it has to be passed
# to display() explicitly (display is provided by IPython in notebook kernels).
display(md(f"**Table {chapter}{i_table}:** Average bias and root-mean square error of simulated {layer} {vv_name} for each month using geographically binned {vv_source} data. Data was averaged in each 0.5 by 0.5 degree cell in each year and month. The bias is calculated as model - observation. The average bias is calculated as the mean of the monthly biases."))
i_table += 1

## Linear regression analysis of model vs observed point_variable

In [None]:
# Two paragraphs of text. Jupyter only auto-renders the last expression of a
# cell, so each paragraph is passed to display() explicitly (display is
# provided by IPython in notebook kernels).
display(md(f"A linear regression analysis of modelled and observed {vv_name} was performed. The modelled {vv_name} was used as the independent variable and the observed {vv_name} was used as the dependent variable. The results are shown in the table below."))

display(md("The regression was carried out using the Python package statsmodels."))

In [None]:

# Ordinary least-squares regression of observed on modelled values, for the
# whole binned data set and, where possible, for each month.
import calendar
import statsmodels.api as sm


def _fit_line(frame):
    """Fit observation = intercept + slope * model by OLS on ``frame``.

    Returns a dict with the slope, the intercept, R squared and the p-value
    of the regression F-test. Raises whatever statsmodels (or the tuple
    unpacking) raises when the data cannot support a two-parameter fit.
    """
    design = sm.add_constant(frame.model.values)
    fit = sm.OLS(frame.observation.values, design).fit()
    # parameter order follows the design matrix: constant first, then slope
    intercept, slope = fit.params
    return {"Slope": slope, "Intercept": intercept, "R2": fit.rsquared, "P": fit.f_pvalue}


# Collect one record per period and build the frame once at the end.
records = [{**_fit_line(df), "Period": "All"}]

if "month" in df.columns:
    for month in range(1, 13):
        try:
            records.append({**_fit_line(df.query("month == @month")), "Period": calendar.month_abbr[month]})
        except Exception:
            # months with no (or too little) data cannot be fitted and are left out of the table
            pass

df_stats = pd.DataFrame(records)
# Period is stored as a categorical with the months in calendar order followed
# by "All"; the rows themselves stay in insertion order ("All" first).
df_stats["Period"] = pd.Categorical(df_stats["Period"], [calendar.month_abbr[x] for x in range(1, 13)] + ["All"])
# round p-value to 5 dp
df_stats["P"] = df_stats["P"].round(5)
# change P to p-value
df_stats = df_stats.rename(columns={"P": "p-value"})
# put Period first
df_stats = df_stats[["Period", "Slope", "Intercept", "R2", "p-value"]]

df_stats.style.hide(axis="index")

In [None]:
# Caption for the regression table, then advance the running table counter.
regression_table_caption = f"**Table {chapter}{i_table}:** Linear regression analysis of modelled and observed {vv_name}. The modelled {vv_name} was used as the independent variable and the observed {vv_name} was used as the dependent variable. The slope and intercept of the regression line are shown, along with the R<sup>2</sup> value and the p-value of the slope. The p-value is a measure of the significance of the slope. A p-value less than 0.05 is considered statistically significant. Note: only months with sufficient values for a regression are shown."
md(regression_table_caption)
i_table = i_table + 1

In [None]:
chunk_end

## Data Sources

In [None]:
# Data-source citations shown when the validated variable is POC.
if variable == "poc":
    # Tara Oceans underway surface water data
    md("Boss, Emmanuel; Picheral, Marc; Searson, Sarah; Le Goff, Hervé; Reverdin, Gilles; Leeuw, Thomas; Chase, Alison P; Bricaud, Annick; Kolber, Zbigniew S; Taillandier, V; Pesant, Stephane; Tara Oceans Consortium, Coordinators; Tara Oceans Expedition, Participants (2017): Underway surface water data during the Tara Oceans expedition in 2009-2012 [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.873566, In: Boss, Emmanuel; Picheral, Marc; Searson, Sarah; Marec, Claudie; Le Goff, Hervé; Reverdin, Gilles; Leeuw, Thomas; Chase, Alison P; Anderson, Leif G; Gattuso, Jean-Pierre; Pino, Diana Ruiz; Padín, Xose Antonio; Grondin, Pierre-Luc; Matuoka, Atsushi; Babin, Marcel; Bricaud, Annick; Kolber, Zbigniew S; Taillandier, V; Hafez, Mark; Chekalyuk, Alexander; Pesant, Stephane; Météo France; Tara Oceans Consortium, Coordinators (2017): Harmonised data from underway navigation, meteorology and surface water measurements during the Tara Oceans expedition in 2009-2013 [dataset publication series]. PANGAEA, https://doi.org/10.1594/PANGAEA.873592")
    # German Bight optical properties / water constituents (no stray ")" after the final DOI)
    md("Röttgers, Rüdiger; Bi, Shun; Burmester, Henning; Heymann, Kerstin; Hieronymi, Martin; Krasemann, Hajo; Schönfeld, Wolfgang (2023): Water inherent optical properties and concentrations of water constituents from the German Bight and adjacent regions: concentrations and auxiliary data [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.950767, In: Röttgers, Rüdiger; Bi, Shun; Burmester, Henning; Heymann, Kerstin; Hieronymi, Martin; Krasemann, Hajo; Schönfeld, Wolfgang (2023): Water inherent optical properties and concentrations of water constituents from the German Bight and adjacent regions [dataset bundled publication]. PANGAEA, https://doi.org/10.1594/PANGAEA.950774")
    # DSM database of in situ POC, SPM and Rrs
    md("Loisel, Hubert; Duforêt-Gaurier, Lucile; Tran, Trung Kien; Jorge, Daniel S F; Steinmetz, Francois; Mangin, Antoine; Bretagnon, Marine; d'Andon, Odile (2023): Database (DSM) of in situ POC, SPM and Rrs collected between 1997 and 2018 [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.960962")
    # CoastDOM v.1
    md("Lønborg, Christian; Carreira, Cátia; Abril, Gwenael; Agustí, Susana; Amaral, Valentina; Andersson, Agneta; Arístegui, Javier; Bhadury, Punyasloke; Bif, Mariana B; Borges, Alberto Vieira; Bouillon, Steven; Calleja, Maria Ll; Cotovicz, Luiz C Jr; Cozzi, Stefano; Doval, Maryló; Duarte, Carlos Manuel; Eyre, Bradley D; Fichot, Cedric; García-Martín, Elena; Garzon-Garcia, Alexandra; Giani, Michele; Gonçalves-Araujo, Rafael; Gruber, Renee K; Hansell, Dennis A; Hashihama, Fuminori; He, Ding; Holding, Johnna M; Hunter, William Ross; Ibánhez, J Severino; Ibello, Valeria; Jiang, Shan; Kim, Guebuem; Klun, Katja; Kowalczuk, Piotr; Kubo, Atsushi; Lee, Choon Weng; Lopes, Claudia B; Maggioni, Federica; Magni, Paolo; Marrasé, Celia; Martin, Patrick; McCallister, S Leigh; McCallum, Rosh; M Medeiros, Patricia; G Morán, Xosé Anxelu; Muller-Karger, Frank; Myers-Pigg, Allison; Norli, Marit; Oakes, Joanne M; Osterholz, Helena; Park, Hyekyung; Lund Paulsen, Maria; Rosentreter, Judith A; Ross, Jeff; Rueda-Roa, Digna; Santinelli, Chiara; Shen, Yuan; Teira, Eva; Tinta, Tinkara; Uher, Guenther; Wakita, Masahide; Ward, Nicholas D; Watanabe, Kenta; Xin, Yu; Yamashita, Youhei; Yang, Liyang; Yeo, Jacob; Yuan, Huamao; Zheng, Qiang; Álvarez‐Salgado, Xosé Antón (2023): A global database of dissolved organic matter (DOM) concentration measurements in coastal waters (CoastDOM v.1) [dataset]. PANGAEA, https://doi.org/10.1594/PANGAEA.964012")


In [None]:
# Cite the ICES data portal when the observation source is ICES.
if vv_source.lower() == "ices":
    ices_citation = "ICES Data Portal, Dataset on Ocean HydroChemistry, Extracted March 3, 2023. ICES, Copenhagen"
    md(ices_citation)

In [None]:
# Cite the sediment organic carbon study when the validated variable is carbon.
if variable == "carbon":
    carbon_citation = 'Diesing, Markus, Terje Thorsnes, and Lilja Rún Bjarnadóttir. "Organic carbon densities and accumulation rates in surface sediments of the North Sea and Skagerrak." Biogeosciences 18.6 (2021): 2139-2160.'
    md(carbon_citation)

In [None]:
# Cite SOCAT v2023 when the observation source is SOCAT23.
if vv_source.lower() == "socat23":
    # The citation is one string. The line break before "NOAA" is written as an
    # explicit \n escape, because a double-quoted literal cannot span source lines.
    md(
        "Bakker, Dorothee C. E.; Alin, Simone R.; Bates, Nicholas; Becker, Meike; Feely, Richard A.; Gkritzalis, Thanos; Jones, Steve D.; Kozyr, Alex; Lauvset, Siv K.; Metzl, Nicolas; Munro, David R.; Nakaoka, Shin-ichiro; Nojiri, Yukihiro; O'Brien, Kevin M.; Olsen, Are; Pierrot, Denis; Rehder, Gregor; Steinhoff, Tobias; Sutton, Adrienne J.; Sweeney, Colm; Tilbrook, Bronte; Wada, Chisato; Wanninkhof, Rik; Akl, John; Barbero, Leticia; Beatty, Cory M.; Berghoff, Carla F.; Bittig, Henry C.; Bott, Randy; Burger, Eugene F.; Cai, Wei-Jun; Castaño-Primo, Rocío; Corredor, Jorge E.; Cronin, Margot; De Carlo, Eric H.; DeGrandpre, Michael D.; Dietrich, Colin; Drennan, William M.; Emerson, Steven R.; Enochs, Ian C.; Enyo, Kazutaka; Epherra, Lucía; Evans, Wiley; Fiedler, Björn; Fontela, Marcos; Frangoulis, Constantin; Gehrung, Martina; Giannoudi, Louisa; Glockzin, Michael; Hales, Burke; Howden, Stephan D.; Ibánhez, J. Severino P.; Kamb, Linus; Körtzinger, Arne; Lefèvre, Nathalie; Lo Monaco, Claire; Lutz, Vivian A.; Macovei, Vlad A.; Maenner Jones, Stacy; Manalang, Dana; Manzello, Derek P.; Metzl, Nicolas; Mickett, John; Millero, Frank J.; Monacci, Natalie M.; Morell, Julio M.; Musielewicz, Sylvia; Neill, Craig; Newberger, Tim; Newton, Jan; Noakes, Scott; Ólafsdóttir, Sólveig Rósa; Ono, Tsuneo; Osborne, John; Padín, Xose A.; Paulsen, Melf; Perivoliotis, Leonidas; Petersen, Wilhelm; Petihakis, George; Plueddemann, Albert J.; Rodriguez, Carmen; Rutgersson, Anna; Sabine, Christopher L.; Salisbury, Joseph E.; Schlitzer, Reiner; Skjelvan, Ingunn; Stamataki, Natalia; Sullivan, Kevin F.; Sutherland, Stewart C.; T'Jampens, Michiel; Tadokoro, Kazuaki; Tanhua, Toste; Telszewski, Maciej; Theetaert, Hannelore; Tomlinson, Michael; Vandemark, Douglas; Velo, Antón; Voynova, Yoana G.; Weller, Robert A.; Whitehead, Chris; Wimart-Rousseau, Cathy (2023). Surface Ocean CO2 Atlas Database Version 2023 (SOCATv2023) (NCEI Accession 0278913). [indicate subset used]. \n"
        "NOAA National Centers for Environmental Information. Dataset. https://doi.org/10.25921/r7xa-bt92. Accessed [25/04/2024]."
    )

In [None]:
# Data-source citations shown when the validated variable is DOC.
if variable == "doc":
    doc_citations = (
        "Hansell, Dennis A.; Carlson, Craig A.; Amon, Rainer M. W.; Álvarez-Salgado, X. Antón; Yamashita, Youhei; Romera-Castillo, Cristina; Bif, Mariana B. (2021). Compilation of dissolved organic matter (DOM) data obtained from global ocean observations from 1994 to 2021. Version 2 (NCEI Accession 0227166). [indicate subset used]. NOAA National Centers for Environmental Information. Dataset. https://doi.org/10.25921/s4f4-ye35. Accessed [date].",
        "Lønborg, C., Carreira, C., Abril, G., Agustí, S., Amaral, V., Andersson, A., ... & Álvarez-Salgado, X. A. (2024). A global database of dissolved organic matter (DOM) concentration measurements in coastal waters (CoastDOM v1). Earth System Science Data, 16(2), 1107-1119.",
    )
    # One markdown paragraph per citation, in the order listed.
    for citation in doc_citations:
        md(citation)


In [None]:
# Point to the data source page when the validated variable is benbio.
if variable == "benbio":
    benbio_source = "URL: https://www.vliz.be/vmdcdata/nsbs/about.php"
    md(benbio_source)

In [None]:
if test_status:
    md("This is getting to the end!")