# Surface template_title validation using gridded observations

In [None]:
chunk_start

In [None]:
# Create an empty marker file named after this nctoolkit session's stamp
# inside the .trackers/ directory.
stamp = nc.session_info["stamp"]
out = ".trackers/" + stamp + ".txt"
# exist_ok=True avoids the check-then-create race when several notebooks
# start at the same time and all find the directory missing.
os.makedirs(".trackers", exist_ok=True)
# Opening in write mode creates (or truncates) the file; nothing is written.
with open(out, "w"):
    pass

In [None]:
import numpy as np

# Read the model grid table from the matched/ directory
grid = pd.read_csv("../../matched/model_grid.csv")

# Any column whose name contains "lon" / "lat" contributes coordinate values
lon_columns = [name for name in grid.columns if "lon" in name]
lat_columns = [name for name in grid.columns if "lat" in name]

# np.unique flattens its input and returns the distinct values in ascending
# order, so no further sorting or de-duplication is needed
lon = np.unique(grid[lon_columns].to_numpy())
lat = np.unique(grid[lat_columns].to_numpy())

# Grid spacing taken from the first two distinct coordinates on each axis
lon_res = lon[1] - lon[0]
lat_res = lat[1] - lat[0]

In [None]:
# "template_variable" is a placeholder token; presumably it is substituted
# with the real variable name when the notebook is generated.
variable = "template_variable"
Variable = variable.title()
# Name used for the variable in the report text
vv_name = "air-sea CO~2~ fluxes" if variable == "co2flux" else variable

# The observational source is the first "_"-separated token of the matched
# file's name.
matched_files = glob.glob(f"../../matched/gridded/**/**/**_{variable}*.nc")
if not matched_files:
    # Fail with a readable message instead of a bare IndexError
    raise ValueError(f"No matched gridded file found for variable '{variable}'")
# os.path.basename handles the platform's path separator, unlike split("/")
source = os.path.basename(matched_files[0]).split("_")[0]

In [None]:

# Describe the observational dataset used for this variable, then the
# matchup method, then the model output that was compared.
if source == "nsbc":
    md("We used version 1.1 of the **North Sea Biogeochemical Climatology** (NSBC) to validate **sea surface template_variable**. NSBC is a monthly climatology that covers the region 47°-65°N and 15°W-15°E. The data is made up of observations over the period 1960-2014. For validation purposes we only used the level 3 data, which is a gridded monthly climatology at a spatial resolution of 1/4°.  The data can be downloaded from [NSBC](https://www.cen.uni-hamburg.de/en/icdc/data/ocean/nsbc.html).")
else:
    if variable == "poc":
        md("Model and observational POC were compared using data from the National Centre for Earth Observation (NCEO).")
        md("*Summary from NCEO*")
        md("The National Centre for Earth Observation (NCEO): Monthly global Particulate Organic Carbon (POC) dataset contains POC concentrations gridded on both sinusoidal (SIN) and geographic (GEO) grid projections at 4 km spatial resolution for 1997-2020. The POC dataset has been produced using the Ocean Colour Climate Change Initiative Remote Sensing Reflectance (Rrs) products, Version 4.2. The dataset includes the Rrs at 443 nm and 555 nm with pixel-by-pixel uncertainty estimates for each wavelength.")
        md("For more details on the algorithm and its validation, please see papers by Stramski et al. (2008) and Evers-King et al. (2017). Please note that the validation of the POC algorithm is a continuing process. To increase the accuracy of POC algorithms, further in situ POC data need to be collected with high spatial and temporal resolution.")

    if variable == "doc":
        md("## Data Source: NCEO BICEP project DOC")
        md("Modelled DOC does not include the refractory component, which is typically 40 uM. This was added to the model data to make it comparable to the NCEO data.")

    if source == "ostia":
        md("Temperature was validated using the OSTIA sea surface temperature dataset. The validation was performed by comparing the modelled temperature with the OSTIA data for the same time and location. The OSTIA data was downloaded from the Copernicus Marine Environment Monitoring Service (CMEMS) catalogue. A description of the dataset is available [here](https://data.marine.copernicus.eu/product/SST_GLO_SST_L4_REP_OBSERVATIONS_010_011/description).")

    if source == "cobe2":
        md("Temperature was validated using the COBE2 sea surface temperature dataset. The validation was performed by comparing the modelled temperature with the COBE2 data for the same time and location. The COBE2 data was downloaded from https://psl.noaa.gov/data/gridded/data.cobe2.html.")
        md("Observational temperature is a monthly time series from 1850 with a spatial resolution of 1°x1°.")

md("The model and observations were matched up as follows. First, the model dataset was cropped by a small amount to make sure cells close to the boundary were removed.")

md("The model was then regridded to the observational grid if the observational grid was coarser using nearest neighbour. Only grid cells with model and observational data were maintained.")

# Look up which model output corresponds to this validation variable
df_mapping = pd.read_csv("../../matched/mapping.csv")
model_variables = list(df_mapping.query("variable == @variable").model_variable)
if not model_variables:
    # Fail with a readable message instead of a bare IndexError
    raise ValueError(f"No entry for variable '{variable}' in ../../matched/mapping.csv")
model_variable = model_variables[0]

md(f"The following model output was used to compare with observational **{variable}**: **{model_variable}**.")

In [None]:
# Build the model climatology (ds_model) and its annual mean (ds_annual) from
# the single matched file for this variable.
# NOTE: glob is called without recursive=True, so "**" behaves like "*" and
# only files exactly two directories below gridded/ are matched.
ff = glob.glob(f"../../matched/gridded/**/**/**_{variable}*.nc")
if len(ff) != 1:
    raise ValueError("Something is wrong with the file")
# The layer label is the last "_"-separated token of the file name, minus ".nc"
layer = os.path.basename(ff[0]).split("_")[-1].replace(".nc", "")
ds_model = nc.open_data(ff)
ds_model.set_precision("F32")
# Keep only the "model" variable from the matched file
ds_model.subset(variable = "model")
# presumably averages by calendar month (a monthly climatology) — confirm
# against the nctoolkit tmean documentation
ds_model.tmean("month")
# Relabel the dataset's time steps with the earliest year in the data
ds_year = min(ds_model.years)
ds_model.set_year(ds_year)
ds_times = ds_model.times
# Count time steps per year; `years` lists the years with more than one step
df_times = pd.DataFrame({"year":[x.year for x in ds_times]}).groupby("year").size().reset_index()
df_times.columns = ["year", "count"]
years = list(df_times.query("count > 1").year)
# presumably turns zeros into missing values — TODO confirm
ds_model.as_missing(0)
# NOTE(review): the original reminder here said "if variable is doc, add 40",
# but nothing in this cell adds it — confirm whether it is applied elsewhere
ds_model.run()
# Annual mean: time-average a copy of the monthly dataset
ds_annual = ds_model.copy()
ds_annual.tmean()
# ds_annual.set_longnames({ds_annual.variables[0]: Variable})

### Baseline climatologies of template_title

In [None]:
# Introductory sentence for the climatology figures that follow
md(f"Climatologies of model and observational {layer} {vv_name} are shown in the two figures below.")

In [None]:
# Build the observational counterpart (ds_obs) from the same matched file(s).
# NOTE(review): unlike the model cell, the number of matched files is not
# checked here.
ff = glob.glob(f"../../matched/gridded/**/**/**_{variable}*.nc")
ds_obs = nc.open_data(ff)
# Keep only the "observation" variable from the matched file
ds_obs.subset(variable = "observation")
ds_obs.set_precision("F32")
# presumably averages by calendar month, mirroring the model dataset — confirm
ds_obs.tmean("month")
ds_obs.run()

In [None]:
## fix the units and names

# Variables reported as concentrations.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# unchanged in case other cells of the generated notebook refer to it.
vars = [
        "ammonium",
        "chlorophyll",
        "nitrate",
        "phosphate",
        "oxygen",
        "silicate",
        "poc",
        "doc",
    ]
if variable in vars:
    # set the units of obs to match model
    ds_obs.set_units({ds_obs.variables[0]: ds_model.contents.unit[0]})
    # POC and DOC are acronyms, so they are upper-cased in the long names
    label = variable.upper() if variable in ["poc", "doc"] else variable
    ds_obs.set_longnames({ds_obs.variables[0]: f"Observed surface {label} concentration"})
    ds_model.set_longnames({ds_model.variables[0]: f"Modelled surface {label} concentration"})
    ds_annual.set_longnames({ds_annual.variables[0]: f"Modelled annual mean surface {label} concentration"})

# "temperature" is not in `vars`, so this check must sit outside the block
# above; nested inside it (as before) it could never run.
if variable == "temperature":
    ds_obs.set_longnames({ds_obs.variables[0]: "Observed sea surface temperature"})
    ds_model.set_longnames({ds_model.variables[0]: "Modelled sea surface temperature"})
    ds_annual.set_longnames({ds_annual.variables[0]: "Modelled annual mean sea surface temperature"})

ds_obs.run()
ds_model.run()


In [None]:
chunk_clim

In [None]:
chunk_bias

In [None]:
# Section heading and method description for the seasonality assessment
md(f"## Can the model reproduce seasonality of {layer} {vv_name}?")

md(f"The ability of the model to reproduce seasonality of {layer} {vv_name} was assessed by comparing the modelled and observed seasonal cycle of {vv_name}. First, we derive a monthly climatology for the model data. Then, we calculate the Pearson correlation coefficient between the modelled and observed {vv_name} at each grid cell. The Pearson correlation coefficient is a measure of the linear correlation between two variables. It has a value between -1 and 1, where 1 indicates a perfect positive linear correlation, 0 indicates no linear correlation, and -1 indicates a perfect negative linear correlation.")

# The original note repeated "the ability of the model to reproduce" twice
md("Note: we are only assessing the ability of the model to reproduce seasonal changes, not long-term trends.")


In [None]:
chunk_seasonal

In [None]:
# Section heading for the regional assessment.
# NOTE(review): emitted even when `regional` is false, while the cells below
# are guarded by `if regional:` — confirm this is intended.
md(f"## Regional assessment of model performance for {layer} {vv_name}")

We assessed the regional performance of the model by comparing the model with observations from the following regions: Southern North Sea, Central North Sea, Northern North Sea, Channel, Skagerrak, Norwegian Trench, Shetland, Irish Shelf, Irish Sea, Celtic Sea, Armorican, Northern North East Atlantic, Southern North East Atlantic, Shelf, Ocean, Rosa, Locate Shelf, Deep Ocean.

The regions considered are mapped below.

In [None]:
# Map of the validation regions, one facet per region.
if regional:

    def fix_name(x):
        """Return a shortened region name for use as a facet label."""
        x = x.replace("North East", "NE")
        x = x.replace("North ", "N ")
        if x == "Channel":
            x = "English Channel"
        return x

    # Element-wise version so it can be applied to a whole column at once
    fix_name = np.vectorize(fix_name)

    # Regions left out of the map
    bad = ["Rosa", "Locate Shelf"]

    # One row per (lon, lat, region) cell, labelled with the region long name.
    # Built as a single chain so the renamed column is set on a fresh frame
    # rather than on the result of a filter (avoids SettingWithCopyWarning).
    df_mapped = (
        ds_regions
        .to_dataframe()
        .reset_index()
        .melt(id_vars = ["lon", "lat"])
        .dropna()
        .merge(regions_contents.loc[:,["variable", "long_name"]])
        .drop(columns = [ "value"])
        .query("long_name not in @bad")
        .assign(long_name = lambda d: fix_name(d.long_name))
    )

    # Plot extent is the bounding box of the retained regions
    xlim = np.array([df_mapped.lon.min(), df_mapped.lon.max()])
    ylim = np.array([df_mapped.lat.min(), df_mapped.lat.max()])
    shape = gpd.read_file(f"{data_dir}/mapping/TM_WORLD_BORDERS-0.3.shp")

    gg = (
        ggplot( df_mapped)+
         geom_tile(aes(x  = "lon",y =   "lat"))+
        geom_map(shape, aes("LON", "LAT"), fill = "grey", colour = "grey")+
        coord_cartesian(xlim = xlim, ylim = ylim)+
        scale_x_continuous(breaks = [-20, -10, 0, 10], labels = ["20°W", "10°W", "0°", "10°E"])+
        scale_y_continuous(breaks = [40, 50, 60], labels = ["40°N", "50°N", "60°N"])+
        theme_bw(base_size = 10)+
        facet_wrap("~long_name")+

        theme(axis_title_x=element_blank(),
                axis_title_y=element_blank())
    )

    gg = gg.draw()
    gg


In [None]:
# Caption for the region map. The figure counter only advances when the
# figure is actually shown, so numbering stays contiguous for non-regional runs.
if regional:
    md(f"**Figure {i_figure}**: Regions used for validation of {layer} {vv_name}.")
    i_figure += 1

In [None]:
# Method description for the regional time series (regional runs only)
if regional:
    md(f"Time series were constructed comparing the monthly mean of the spatial average {layer} {vv_name} in each region. The spatial average was calculated using the mean of all grid cells within each region, accounting for grid cell area.")

In [None]:
# Build df_all: monthly spatial-mean model and observation values per region.
# NOTE(review): ds_ts is not created in any earlier visible cell; presumably
# it comes from a template chunk — confirm.
if regional:
    # The time coordinate name does not change between regions, so it is
    # looked up once instead of on every iteration
    time_name = [x for x in list(ds_ts.to_xarray().coords) if "time" in x][0]
    df_all = []
    for vv in ds_regions.variables:
        # Single-region mask
        ds_rr = ds_regions.copy()
        ds_rr.subset(variable = vv)
        ds_rr.run()
        # Apply the mask to a copy of the matched data, then average spatially.
        # The bare multiplication is kept as written; it is relied on to
        # modify ds_vv.
        ds_vv = ds_ts.copy()
        ds_vv * ds_rr
        ds_vv.spatial_mean()
        df_vv = (
            ds_vv
            .to_dataframe()
            .reset_index()
            .rename(columns = {time_name: "time"})
            .loc[:,["time", "model", "observation"]]
            .melt("time")
            .assign(month = lambda x: x.time.dt.month)
            .assign(region = vv)
        )
        df_all.append(df_vv)
        # The per-region time-mean frame built here before was never used,
        # so that extra dataset copy and conversion have been dropped.
        del ds_rr, ds_vv
    df_all = pd.concat(df_all).dropna()

    # Attach the display name of each region; regions missing from df_mapped
    # drop out of this (inner) merge
    df_all = (
        df_all
        .merge(df_mapped.loc[:,["long_name", "variable"]].drop_duplicates().rename(columns = {"variable": "region"}))
    )

In [None]:
# Seasonal cycle per region: model and observations as coloured lines.
if regional:
    unit_label = nc.static_plot.fix_label(ds_ts.contents.unit[0])
    ylab = "Spatial average " + vv_name + " (" + unit_label + ")"

    gg = (
        ggplot(df_all)
        + geom_line(aes("month", "value", colour = "variable"))
        + facet_wrap("long_name")
        + scale_x_continuous(breaks = [1,4, 7, 10], labels = ["Jan", "Apr", "Jul", "Oct"])
        + scale_color_manual(values = ["red", "blue"])
        + labs(x = "Month", y = ylab, colour = "")
        # theme_bw is a complete theme, so the legend position is set after it
        + theme_bw(base_size = 10)
        + theme(legend_position = "top")
    )

    gg = gg.draw()
    gg


In [None]:
# Caption for the regional seasonal-cycle figure; advances the figure counter
if regional:
    md(f"**Figure {i_figure}**: Seasonal cycle of {layer} {vv_name} for model and observations for each region. The spatial average is taken over the region.") 
    i_figure += 1

In [None]:
chunk_results

In [None]:
# Section heading and method description for the spatial-pattern assessment.
# The title is emitted as a level-2 heading, like the other section titles.
md(f"## Can the model reproduce spatial patterns of {layer} {vv_name}?")

md(f"The ability of the model to reproduce spatial patterns of {layer} {vv_name} was assessed by comparing the modelled and observed {vv_name} at each grid cell. We calculated the Pearson correlation coefficient between the modelled and observed {vv_name} at each grid cell. The Pearson correlation coefficient is a measure of the linear correlation between two variables. It has a value between -1 and 1, where 1 indicates a perfect positive linear correlation, 0 indicates no linear correlation, and -1 indicates a perfect negative linear correlation.")
md("This was carried out monthly and using the annual mean in each grid cell.")

In [None]:
# Table of spatial correlations between model and observation: one row for
# the annual mean followed by one row per month.
ff = glob.glob(f"../../matched/gridded/**/**/**_{variable}*.nc")
if len(ff) != 1:
    raise ValueError("Something is wrong with the file")
layer = os.path.basename(ff[0]).split("_")[-1].replace(".nc", "")

# --- monthly correlations ---
ds_cor = nc.open_data(ff)
ds_cor.set_precision("F32")
ds_cor.tmean("month")
ds_cor.cor_space("model", "observation")
monthly_raw = ds_cor.to_dataframe().reset_index().dropna()
time_name = [x for x in list(ds_cor.to_xarray().coords) if "time" in x][0]
ds_cor_df = (
    monthly_raw
    .rename(columns = {time_name: "time"})
    .assign(month = lambda d: d.time.dt.month)
    .loc[:, ["month", "cor"]]
    .drop_duplicates()
    # month number -> abbreviated month name
    .assign(month = lambda d: d["month"].apply(lambda m: calendar.month_abbr[m]))
)

# --- annual-mean correlation ---
ds_cor = nc.open_data(ff)
ds_cor.set_precision("F32")
ds_cor.tmean("month")
ds_cor.tmean()
ds_cor.cor_space("model", "observation")
annual_raw = ds_cor.to_dataframe().reset_index().dropna()
time_name = [x for x in list(ds_cor.to_xarray().coords) if "time" in x][0]
ds_cor_df_annual = (
    annual_raw
    .rename(columns = {time_name: "time"})
    .assign(month = lambda d: d.time.dt.month)
    .loc[:, ["month", "cor"]]
    .drop_duplicates()
    # the period label replaces the month number for the annual row
    .assign(month = "Annual mean")
)

# Annual row first, then the months, with reader-friendly column names
ds_cor_df = (
    pd.concat([ds_cor_df_annual, ds_cor_df])
    .rename(columns = {"month": "Time period", "cor": "Correlation coefficient"})
)
ds_cor_df.style.hide(axis="index")

In [None]:
# Caption for the correlation table; advances the table counter
md(f"**Table {i_table}**: Pearson correlation coefficient between modelled and observed {layer} {vv_name} at each grid cell. The correlation was calculated monthly and using the annual mean in each grid cell.")
i_table += 1

## Cumulative distribution function

In [None]:
# Method description for the cumulative distribution comparison
md(f"The ability of the model to reproduce the broad-scale statistical distribution of {layer} {vv_name} is assessed by comparing the cumulative distribution function (CDF) of the modelled and observed {vv_name}. The CDF is a function that maps the probability that a random variable is less than or equal to a given value. The CDF is calculated by counting the number of values less than or equal to a given value and dividing by the total number of values. The CDF is a non-parametric measure of the statistical distribution of a random variable. It is a more robust measure of the statistical distribution than the mean and standard deviation, which are sensitive to outliers.")

In [None]:
# Long-format table of monthly model/observation values for the CDF plot,
# restricted to cells inside the "Shelf" subdomain.
ds = nc.open_data(f"../../results/monthly_mean/monthlymean_{variable}.nc")

# The shelf mask gets its own name: reusing `ds_regions` here would replace
# the multi-region dataset that the regional time-series cell loops over
# further down, leaving it with only "Shelf".
ds_shelf = nc.open_data(f"{data_dir}/amm7_val_subdomains.nc")
ds_shelf.subset(variable = "Shelf")
ds_shelf.as_missing(0)
ds_shelf.regrid(ds)
# The bare multiplication is kept as written; it is relied on to modify ds.
ds * ds_shelf

df = (
    ds
    .to_dataframe()
    .dropna()
    .reset_index()
    )
# The time column's name is whichever column name contains "time"
time_name = [x for x in df.columns if "time" in x][0]
df = (
    df
    .rename(columns = {time_name: "time"})
    .assign(month = lambda x: x.time.dt.month)
    .loc[:,["lon", "lat", "model", "observation", "month"]]
    .drop_duplicates()
    .melt(["lon", "lat", "month"])
    )
units = ds.contents.unit[0]

In [None]:
%%capture --no-display
%%R -i df -i units -i variable -w 10 -h 10 --units in -r 100
# Empirical CDFs of model and observation values, one panel per month.
# `df`, `units` and `variable` are passed in from the Python session via -i.
# NOTE: inside group_by() and aes(), `variable` refers to the column of df;
# in the axis label it is presumably the imported string — confirm.
library(tidyverse)
library(ggridges)
library(ggthemes)

# Trim the upper tail: within each variable/month group, keep only values
# strictly below that group's 98th percentile.
df <- df %>%
    group_by(variable, month) %>%
    # calculate 98th percentile of value
    summarize(limit = quantile(value, 0.98)) %>%
    # no `by` is given, so the join uses the columns the two tables share
    inner_join(df) %>%
    # filter to values below 98th percentile
    filter(value < limit) %>%
    # remove limit column
    select(-limit)


# ggplot(iris, aes(x = Sepal.Length, y = Species, group = Species)) + 
#   geom_density_ridges(fill = "#00AFBB")

# df <- df %>%
#     mutate(month = as.factor(month))
# convert month to month name
df$month <- month.name[df$month]
# convert month to factor
df$month <- as.factor(df$month)
# ensure month is ordered (calendar order rather than alphabetical)
df$month <- factor(df$month, levels = month.name)


# edf plot: one empirical CDF line per variable, faceted by month

ggplot(df, aes(x = value, colour = variable)) +
    stat_ecdf()+
    facet_wrap(~month)+
    labs(y = "Cumulative probability", x = str_glue("{str_to_title(variable)} ({units})"))+
    theme_bw(base_size = 14)+
    theme(legend.position = "top")+
    labs(colour = NULL)+
    scale_color_fivethirtyeight()

In [None]:
# Caption for the CDF figure. The counter is advanced here, as after the other
# figure captions, so the next figure does not reuse this number.
md(f"**Figure {i_figure}**: Empirical distribution function of {layer} {vv_name} for model and observations for each month. This compares the distributions on the shelf across the entire domain using grid cells with model-observation matchups.")
i_figure += 1

In [None]:
# Decide whether a multi-year comparison is possible. It needs a regional run
# and more than one year of matched data; if so, prepare annual means in ds_ts.
time_series = False
if regional:
    ff = glob.glob(f"../../matched/gridded/**/**/**_{variable}*.nc")
    ds_ts = nc.open_data(ff)
    years = ds_ts.years
    # Label used in the figure caption
    year_range = f"{min(years)}-{max(years)}"
    has_multiple_years = len(years) > 1
    if has_multiple_years:
        mask_all(ds_ts)
        ds_ts.tmean("year")
        ds_ts.run()
        time_series = True

In [None]:
# Method description for the multi-year comparison (shown only when annual
# time series are available)
if time_series:
    md(f"The ability of the model to reproduce multi-year trends in {layer} {vv_name} was assessed by comparing the modelled and observed time series of annual {vv_name} across each region.")
    md(f"The figure below shows the average {vv_name} in each region.")

In [None]:
# Build df_all: annual spatial-mean model and observation values per region.
if time_series:
    # The time coordinate name does not change between regions, so it is
    # looked up once instead of on every iteration
    time_name = [x for x in list(ds_ts.to_xarray().coords) if "time" in x][0]
    df_all = []
    for vv in ds_regions.variables:
        # Single-region mask
        ds_rr = ds_regions.copy()
        ds_rr.subset(variable = vv)
        ds_rr.run()
        # Apply the mask to a copy of the annual data, then average spatially.
        # The bare multiplication is kept as written; it is relied on to
        # modify ds_vv.
        ds_vv = ds_ts.copy()
        ds_vv * ds_rr
        ds_vv.spatial_mean()
        df_vv = (
            ds_vv
            .to_dataframe()
            .reset_index()
            .rename(columns = {time_name: "time"})
            .loc[:,["time", "model", "observation"]]
            .melt("time")
            .assign(year = lambda x: x.time.dt.year)
            .assign(region = vv)
        )
        df_all.append(df_vv)
        # The per-region time-mean frame built here before was never used,
        # so that extra dataset copy and conversion have been dropped.
        del ds_rr, ds_vv
    df_all = pd.concat(df_all).dropna()

    # Attach the display name of each region; regions missing from df_mapped
    # drop out of this (inner) merge
    df_all = (
        df_all
        .merge(df_mapped.loc[:,["long_name", "variable"]].drop_duplicates().rename(columns = {"variable": "region"}))
    )

In [None]:
# Annual series per region: model and observations as coloured lines.
if time_series:
    unit_label = nc.static_plot.fix_label(ds_ts.contents.unit[0])
    ylab = "Spatial average " + variable + " (" + unit_label + ")"

    gg = (
        ggplot(df_all)
        + geom_line(aes("year", "value", colour = "variable"))
        + facet_wrap("long_name")
        + scale_color_manual(values = ["red", "blue"])
        + labs(x = "Year", y = ylab, colour = "")
        # theme_bw is a complete theme, so the legend position is set after it
        + theme_bw(base_size = 10)
        + theme(legend_position = "top")
    )

    gg = gg.draw()
    gg


In [None]:
# Caption for the annual time-series figure; advances the figure counter
if time_series:
    md(f"**Figure {i_figure}**: Changes in {layer} {vv_name} for model and observations for each region for the period {year_range}. The spatial average is taken over the region.") 
    i_figure += 1

In [None]:
chunk_end

## Data citation

In [None]:
# Citations for the observational dataset(s) used in this report.
# Each entry pairs a condition with the lines to print when it holds; the
# entries are processed in order.
citations = [
    (
        source == "nsbc",
        [
            "Hinrichs,Iris; Gouretski,Viktor; Paetsch,Johannes; Emeis, Kay; Stammer, Detlef (2017). North Sea Biogeochemical Climatology (Version 1.1).",
            "URL: https://www.cen.uni-hamburg.de/en/icdc/data/ocean/nsbc.html",
        ],
    ),
    (
        variable == "poc",
        [
            "Sathyendranath, S.; Kong, C.; Jackson, T. (2021): NCEO: Monthly global Particulate Organic Carbon (POC) (produced from the Ocean Colour Climate Change Initiative, Version 4.2 dataset). Centre for Environmental Data Analysis, 07 January 2021. doi:10.5285/ef09d81517a84979ac60329e4859f449. https://dx.doi.org/10.5285/ef09d81517a84979ac60329e4859f449",
            "URL: https://catalogue.ceda.ac.uk/uuid/ef09d81517a84979ac60329e4859f449",
        ],
    ),
    (
        source == "ostia",
        [
            "Good, S.; Fiedler, E.; Mao, C.; Martin, M.J.; Maycock, A.; Reid, R.; Roberts-Jones, J.; Searle, T.; Waters, J.; While, J.; Worsfold, M. The Current Configuration of the OSTIA System for Operational Production of Foundation Sea Surface Temperature and Ice Concentration Analyses. Remote Sens. 2020, 12, 720, doi:10.3390/rs12040720",
            "URL: https://data.marine.copernicus.eu/product/SST_GLO_SST_L4_REP_OBSERVATIONS_010_011/description",
        ],
    ),
    (
        source == "cobe2",
        [
            "COBE-SST 2 and Sea Ice data provided by the NOAA PSL, Boulder, Colorado, USA, from their website at https://psl.noaa.gov.",
        ],
    ),
]
for applies, citation_lines in citations:
    if applies:
        for citation_line in citation_lines:
            md(citation_line)

