# Validation of near-bottom ices_variable 

In [None]:
# bin_value using function from r4ecology's github
import numpy as np
def bin_value(x, bin_res):
    """Snap ``x`` to the centre of the ``bin_res``-wide bin that contains it.

    Bins are aligned so that their edges fall on integer multiples of
    ``bin_res``. Works element-wise on anything ``np.floor`` accepts
    (scalars, arrays, pandas Series).
    """
    half_width = bin_res / 2
    # 1-based index of the bin holding x
    bin_index = np.floor((x + half_width) / bin_res + 0.5)
    # upper edge of that bin, shifted back by half a bin to reach the centre
    return bin_index * bin_res - half_width


In [None]:
chunk_start

In [None]:
variable = "ices_variable"
# Get the units of the model variable by opening one model file that matches
# the pattern recorded for this variable in mapping.csv.
# File inspection could be randomized in case people have put loose files in there...
import glob
df = pd.read_csv("../../matched/mapping.csv")
df = df.query("variable == @variable")
pattern = list(df.pattern)[0]
# Resolve every "**" path segment, left to right, to one concrete entry on disk.
# Looping on membership means a pattern without any "**" segment is used as-is.
patterns = pattern.split("/")
while "**" in patterns:
    i = patterns.index("**")
    # Without recursive=True, glob treats "**" like "*", i.e. this lists the
    # entries directly below the already-resolved prefix. Sorting makes the
    # choice of entry reproducible between runs.
    candidates = sorted(glob.glob("/".join(patterns[0:i]) + "/" + "**"))
    if len(candidates) == 0:
        raise FileNotFoundError(
            f"Nothing found below '{'/'.join(patterns[0:i])}' while resolving pattern '{pattern}'"
        )
    patterns[i] = candidates[-1].split("/")[-1]
pattern = "/".join(patterns)
paths = glob.glob(pattern)
if len(paths) == 0:
    raise FileNotFoundError(f"No files match the resolved pattern '{pattern}'")
ds = nc.open_data(paths[0])
model_variable = list(df.model_variable)[0]
# the unit is read from the dataset's contents table for the mapped model variable
unit = list(ds.contents.query("variable == @model_variable").unit)[0]

## Read in the data

In [None]:
df = pd.read_csv(f"../../matched/ices/bottom/ices_bottom_{variable}.csv")
# Danish part is always dubious
df = df.query("lon < 9")
# Sample the "Shelf" field of the subdomain file at the observation locations;
# the merge below then keeps only observations that have a non-missing value there.
ds = nc.open_data(f"{data_dir}/amm7_val_subdomains.nc")
ds.subset(variable = "Shelf")
ds.as_missing(0)
ds.regrid(df.loc[:,["lon", "lat"]], "nn")
df_grid = ds.to_dataframe().reset_index().dropna().drop_duplicates()
df = df.merge(df_grid)
# unique observation locations (un-binned), used for the location map
df_locs = df.loc[:,["lon", "lat"]].drop_duplicates()
# Keep an independent copy of the un-binned data. A plain `df_raw = df` would
# only alias the frame, so the in-place binning below would also overwrite the
# coordinates of the "raw" data.
df_raw = df.copy()
# bin to 0.5 degree resolution and average within each bin, year and month
df["lon"] = bin_value(df["lon"], 0.5)
df["lat"] = bin_value(df["lat"], 0.5)
df = df.groupby(["lon", "lat", "year", "month"]).mean().reset_index()

In [None]:
md(f"Near-bottom values of {variable} were extracted from ICES bottle and CTD data.")
md(f"This data was extracted from vertical profiles. The near-bottom value was defined as the value closest to the bottom, that was within 5 m of the bottom. Bathymetry was estimated using GEBCO Bathymetry data.")

# Report the number of individual extracted values (df_raw); df holds the
# binned monthly means at this point, so len(df) would count bins instead.
md(f"In total there were {len(df_raw)} near-bottom values extracted from the  ICES database.")

md("**Note:** this analysis has been restricted to observations on the shelf region.")

md(f"Data for {variable} was downloaded from the ICES website on 3rd March 2023. The data is available from [ICES](https://data.ices.dk/view-map).") 



In [None]:
# Drop the lowest 0.1% of the binned observations (everything below the 0.001 quantile).
bot_low = df.observation.quantile(0.001)
# Reference the threshold with @ instead of formatting it into the query string:
# this matches the other queries in this notebook and does not break if the
# threshold is NaN.
df = df.query("observation >= @bot_low")

In [None]:
%%capture --no-display
%%R -i df_locs -i variable -i unit -w 1000 -h 1200
library(tidyverse, warn.conflicts = FALSE)

# World map polygons, drawn in grey as the land mask
world_map <- map_data("world")

# Plot extent: bounding box of the observation locations
xlim <- range(df_locs$lon)
ylim <- range(df_locs$lat)

# One point per observation location; the land polygons are drawn on top
gg <- ggplot(df_locs) +
    geom_point(aes(lon, lat)) +
    theme_gray(base_size = 24) +
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60") +
    coord_fixed(xlim = xlim, ylim = ylim, ratio = 1.5)

# Degree-style axis labels are only applied when the data extend west of 10°W
if (min(df_locs$lon) < -10) {
    gg <- gg +
        scale_x_continuous(breaks = seq(-10, 5, 5), labels = c("10°W", "5°W", "0°", "5°E")) +
        scale_y_continuous(breaks = seq(45, 60, 5), labels = c("45°N", "50°N", "55°N", "60°N")) +
        labs(x = "", y = "")
}

gg

In [None]:
md(f"**Figure {i_figure}:** Map of near-bottom {variable} observations from ICES.")
# advance the counter so the next figure does not reuse this number
i_figure += 1

In [None]:
# calculate number of observations per month
import calendar
# Count once per month. Grouping by lon/lat as well would produce one row per
# grid cell and month, leaving the bar chart to stack them all back together.
df1 = df.groupby("month", as_index=False)["observation"].count()
# bar chart of the monthly counts, with month numbers shown as Jan, Feb, ...
gg = (
    ggplot(df1, aes(x="month", y="observation")) +
    geom_bar(stat="identity") +
    scale_x_continuous(breaks=list(range(1, 13)), labels=list(calendar.month_abbr[1:])) +
    labs(y = "Number of observations", x= "")
)
gg = gg.draw()
gg


In [None]:
# Caption for the monthly observation-count bar chart, then advance the figure counter.
md(
    f"**Figure {i_figure}:** "
    f"Number of near-bottom observations per month for {variable}."
)
i_figure = i_figure + 1

In [None]:
%%capture --no-display
%%R -i df -i variable -i unit -w 1000 -h 1200

library(tidyverse, warn.conflicts = FALSE)

# World map polygons, drawn in grey as the land mask
world_map <- map_data("world")

# Plot extent: bounding box of the binned data
xlim <- range(df$lon)
ylim <- range(df$lat)

# Signed bias of the model relative to the observations
df <- mutate(df, bias = model - observation)

# Cap the bias at +/- the 98th percentile of its absolute value
bias_high <- quantile(abs(df$bias), 0.98)
df$bias[df$bias > bias_high] <- bias_high
df$bias[df$bias < -bias_high] <- -bias_high

# Sort by month number, then relabel the months with their abbreviated names
df <- arrange(df, month)
df$month <- factor(df$month, levels = df$month, labels = month.abb[df$month])

# Legend title
title <- str_glue("Bias in bottom {variable} ({unit})")

# One raster panel per month, land drawn on top, horizontal legend underneath
gg <- ggplot(df) +
    geom_raster(aes(lon, lat, fill = bias)) +
    facet_wrap(~month) +
    theme_gray(base_size = 24) +
    geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60") +
    coord_fixed(xlim = xlim, ylim = ylim, ratio = 1.5) +
    scale_fill_gradient2(
        low = "blue",
        high = "red",
        guide = guide_colorbar(
            title.position = "bottom",
            title.hjust = 0.5,
            title.theme = element_text(angle = 0, size = 20, family = "Helvetica")
        )
    ) +
    theme(
        legend.position = "bottom",
        legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.key.width = unit(6.0, "cm"),
        legend.key.height = unit(1.0, "cm")
    ) +
    labs(fill = title)

# Degree-style axis labels are only applied when the data extend west of 10°W
if (min(df$lon) < -10) {
    gg <- gg +
        scale_x_continuous(breaks = seq(-10, 5, 5), labels = c("10°W", "5°W", "0°", "5°E")) +
        scale_y_continuous(breaks = seq(45, 60, 5), labels = c("45°N", "50°N", "55°N", "60°N")) +
        labs(x = "", y = "")
}

gg

In [None]:
# Caption for the monthly bias maps, then advance the figure counter.
md(
    f"**Figure {i_figure}**: Bias in near-bottom {variable}. "
    "The bias is calculated as model - observation. "
    "The colour scale is from blue (negative bias) to red (positive bias). "
    "The colour scale is capped at the 98th percentile of the absolute bias. "
    "This is to avoid a few extreme outliers from dominating the colour scale. "
    "**Note:** values have been binned and averaged to 0.5 degree resolution."
)
i_figure = i_figure + 1

In [None]:
%%capture --no-display
%%R -i df_raw -i variable -i unit -w 1000 -h 1200

library(tidyverse, warn.conflicts = FALSE)

# Sort by month number, then relabel the months with their abbreviated names
df <- df_raw %>%
    arrange(month)
df$month <- factor(df$month, levels = df$month, labels = month.abb[df$month])

x_lab <- str_glue("Model {variable} ({unit})")
y_lab <- str_glue("Observed {variable} ({unit})")

# Scatter of model against observation, one panel per month.
# geom_abline() with its defaults draws the 1:1 line; geom_smooth adds a GAM fit.
# No fill legend title is set here: the plot has no fill aesthetic, and the
# title object would only exist if the bias-map cell had been run first.
gg <- df %>%
    ggplot() +
    geom_point(aes(model, observation)) +
    facet_wrap(~month) +
    theme_gray(base_size = 24) +
    geom_abline() +
    geom_smooth(aes(model, observation), method = "gam") +
    labs(x = x_lab, y = y_lab)

gg

In [None]:
# Caption for the model-vs-observation scatter plot. That plot uses the raw,
# un-binned data, so the caption makes no statement about a bin resolution.
md(f"**Figure {i_figure}**: Model vs observed {variable} for near-bottom values. The observations are from ICES bottle and CTD data. The line is a GAM fit to the data. The shaded area is the 95% confidence interval of the GAM fit.")
i_figure += 1

## Summary statistics

In [None]:
# Explanatory text for the summary-statistics tables that follow.
md(f"The overall ability of the model to predict the observed {variable} was assessed by calculating the average bias, the root mean square error (RMSE) and the correlation coefficient (R). The bias was calculated as the model value minus the observed value. The RMSE was calculated as the square root of the mean squared error. The correlation coefficient was calculated as the Pearson correlation coefficient between the model and observed values.") 
md(f"This was calculated for each month and for the entire dataset. The results are shown in the tables below.")
md(f"This is calculated in two separate ways. First, we use the raw model and observed values. Second, we use data that was averaged to 0.5 by 0.5 degree bins to account for spatial bias.")

In [None]:
df_bias = (
    df_raw
    .assign(bias = lambda x: x.model - x.observation)
    .groupby("month")
    .mean()
    .reset_index()
    .loc[:,["month", "bias"]]
    # convert month number to name
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
)
# add average bias to df_bias as a separate row
annual_bias = df_raw.model.mean() - df_raw.observation.mean() 
df_bias = df_bias.append({"month": "All", "bias": annual_bias}, ignore_index=True)

# move the final row to the top
df_bias = df_bias.iloc[[-1]].append(df_bias.iloc[:-1])
# now create an rmse dataframe
df_rmse = (
    df_raw
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
    .groupby("month")
    .apply(lambda x: np.sqrt((x.model - x.observation).pow(2).mean()))
    .reset_index()
    .rename(columns={0: "rmse"})
)
# add average rmse to df_rmse as a separate row
annual_rmse = np.sqrt(((df_raw.model - df_raw.observation).pow(2)).mean())
df_rmse = df_rmse.append({"month": "All", "rmse": annual_rmse}, ignore_index=True)
# move the final row to the top
df_rmse = df_rmse.iloc[[-1]].append(df_rmse.iloc[:-1])
# rename the month column to Month
# merge the two dataframes
df_table = df_bias.merge(df_rmse)
df_table = df_table.round(2)
# df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE"})
# df_table = df_table[["Month", "Bias", "RMSE"]]
# create df_corr
df_corr = (
    df_raw
    .groupby("month")
    .apply(lambda x: x.model.corr(x.observation))
    .reset_index()
    .rename(columns={0: "correlation"})
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
)
# add average correlation to df_corr as a separate row
# calculate annual correlation using all data
annual_corr = df_raw.model.corr(df_raw.observation)
df_corr = df_corr.append({"month": "All", "correlation": annual_corr}, ignore_index=True)

# move the final row to the top
df_corr = df_corr.iloc[[-1]].append(df_corr.iloc[:-1])
df_table = df_table.merge(df_corr)
df_table = df_table.round(2)
df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE", "correlation": "Correlation"})
df_table = df_table[["Month", "Bias", "RMSE", "Correlation"]]
# change Month to Period
df_table = df_table.rename(columns={"Month": "Time period"})

df_number = df_raw.groupby("month").count().reset_index().loc[:,["month", "observation"]]
# convert month number to name
df_number["month"] = df_number["month"].apply(lambda x: calendar.month_abbr[x])
df_number = df_number.rename(columns={"month": "Time period", "observation": "Number of observations"})
# add total number of observations
annual_number = len(df_raw)
df_number = df_number.append({"Time period": "All", "Number of observations": annual_number}, ignore_index=True)
df_table = df_table.merge(df_number)

# include commas in the number of observations
df_table["Number of observations"] = df_table["Number of observations"].apply(lambda x: "{:,}".format(x))

df_table.style.hide(axis="index")

In [None]:
md(f"**Table {i_table}:** Average bias and root-mean square error in near-bottom {variable} for each month using the raw ICES data. The bias is calculated as model - observation. The average bias is calculated as the mean of the monthly biases.")
i_table += 1

In [None]:
df_bias = (
    df
    .assign(bias = lambda x: x.model - x.observation)
    .groupby("month")
    .mean()
    .reset_index()
    .loc[:,["month", "bias"]]
    # convert month number to name
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
)
# add average bias to df_bias as a separate row
annual_bias = df.model.mean() - df.observation.mean() 
df_bias = df_bias.append({"month": "All", "bias": annual_bias}, ignore_index=True)

# move the final row to the top
df_bias = df_bias.iloc[[-1]].append(df_bias.iloc[:-1])
# now create an rmse dataframe
df_rmse = (
    df
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
    .groupby("month")
    .apply(lambda x: np.sqrt((x.model - x.observation).pow(2).mean()))
    .reset_index()
    .rename(columns={0: "rmse"})
)
# add average rmse to df_rmse as a separate row
annual_rmse = np.sqrt(((df.model - df.observation).pow(2)).mean())
df_rmse = df_rmse.append({"month": "All", "rmse": annual_rmse}, ignore_index=True)
# move the final row to the top
df_rmse = df_rmse.iloc[[-1]].append(df_rmse.iloc[:-1])
# rename the month column to Month
# merge the two dataframes
df_table = df_bias.merge(df_rmse)
df_table = df_table.round(2)
# df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE"})
# df_table = df_table[["Month", "Bias", "RMSE"]]
# create df_corr
df_corr = (
    df
    .groupby("month")
    .apply(lambda x: x.model.corr(x.observation))
    .reset_index()
    .rename(columns={0: "correlation"})
    .assign(month = lambda x: x.month.apply(lambda y: calendar.month_abbr[y]))
)
# add average correlation to df_corr as a separate row
# calculate annual correlation using all data
annual_corr = df.model.corr(df.observation)
df_corr = df_corr.append({"month": "All", "correlation": annual_corr}, ignore_index=True)

# move the final row to the top
df_corr = df_corr.iloc[[-1]].append(df_corr.iloc[:-1])
df_table = df_table.merge(df_corr)
df_table = df_table.round(2)
df_table = df_table.rename(columns={"month": "Month", "bias": "Bias", "rmse": "RMSE", "correlation": "Correlation"})
df_table = df_table[["Month", "Bias", "RMSE", "Correlation"]]
# change Month to Period
df_table = df_table.rename(columns={"Month": "Time period"})

df_number = df_raw.groupby("month").count().reset_index().loc[:,["month", "observation"]]
# convert month number to name
df_number["month"] = df_number["month"].apply(lambda x: calendar.month_abbr[x])
df_number = df_number.rename(columns={"month": "Time period", "observation": "Number of observations"})
# add total number of observations
annual_number = len(df_raw)
df_number = df_number.append({"Time period": "All", "Number of observations": annual_number}, ignore_index=True)
df_table = df_table.merge(df_number)

# include commas in the number of observations
df_table["Number of observations"] = df_table["Number of observations"].apply(lambda x: "{:,}".format(x))

df_table.style.hide(axis="index")

In [None]:
# Caption for the binned-data table. The "All" row is computed from the pooled
# data, not by averaging the monthly values, and the caption now says so.
md(f"**Table {i_table}:** Average bias and root-mean square error in near-bottom {variable} for each month using binned ICES data. ICES data was averaged in each 0.5 by 0.5 degree cell in each year and month. The bias is calculated as model - observation. The values in the 'All' row are calculated from all of the data pooled across months.")
i_table += 1

In [None]:
chunk_end