# Validation of point_layer template_title using point observations from point_obs_source

In [None]:
import numpy as np
# Key used below to index the loaded definitions object and to build the
# matchup file paths.
# NOTE(review): the value looks like a template token that is substituted
# before the notebook is executed -- confirm against the template renderer.
variable = "point_variable"


In [None]:
chunk_start

In [None]:
layer = "point_layer"
point_source = "point_obs_source"
import glob
import dill

# Locate the pickled definitions for this layer/variable/source. The matches
# are sorted so the chosen file is deterministic even if stray files that fit
# the pattern have been dropped into the directory.
definitions_pattern = f"data_dir_value/oceanval_matchups/point/{layer}/{variable}/{point_source}/*definitions*.pkl"
definitions_files = sorted(glob.glob(definitions_pattern))
if not definitions_files:
    raise FileNotFoundError(f"No definitions file found matching {definitions_pattern}")

# NOTE(review): dill.load can execute arbitrary code from the pickle; this is
# only safe for files produced by this workflow itself.
with open(definitions_files[0], "rb") as definitions_file:
    definitions = dill.load(definitions_file)
bin_res = definitions[variable].binning

# Human-readable variable name used in headings and captions.
try:
    vv_name = definitions[variable].long_name
except Exception:
    # Keep any vv_name set by an earlier cell; otherwise fall back to the
    # variable key so later headings do not fail with a NameError.
    if "vv_name" not in globals():
        vv_name = variable

# Unit shown in captions; fall back to a placeholder when none is available.
try:
    unit = definitions[variable].model_unit
except Exception:
    unit = None
if unit is None:
    unit = "unknown unit"

# Spell out a trailing "/d" as "/day".
if unit.endswith("/d"):
    unit = unit[:-2] + "/day"




            

## Read in the data

In [None]:
# Emit the heading for the surface performance section and record that the
# surface layer is the one being analysed.
if layer in ("surface", "all"):
    n_levels = definitions[variable].n_levels
    # "sea surface" is only mentioned when n_levels is greater than one
    # (presumably the number of vertical levels -- confirm).
    heading = (
        f"## Performance of model sea surface {vv_name}"
        if n_levels > 1
        else f"## Performance of model {vv_name}"
    )
    md(heading)
    layer_select = "surface"
#if layer in ["benthic"]:
chunk_point_surface


In [None]:
# Introductory text for the depth-resolved section; only written when every
# layer is being analysed.
if layer == "all":
    md(f"## Depth-resolved comparisons of modelled and observed {vv_name}")

    # The binning note is only relevant when a binning resolution is set.
    # bin_res[0] is reported as the longitude and bin_res[1] as the latitude
    # resolution.
    if bin_res is not None:
        md("The ability of the model to reproduce observed vertical profiles was assessed by comparing model and observational data at depth bins where observations were available.")
        md_markdown(f"**Note**: the observational and model data were binned to a resolution of {bin_res[0]}° longitude by {bin_res[1]}° latitude and climatological monthly averages were calculated before analysis. This was carried out to reduce the influence of spatial bias on the validation statistics.")


In [None]:
# Read the matchup table for this layer/variable/source. Matches are sorted so
# the chosen file is deterministic, and a missing file is reported explicitly
# instead of surfacing as an IndexError.
matchup_pattern = f"data_dir_value/oceanval_matchups/point/{layer}/{variable}/{point_source}/*_{variable}.csv"
matchup_files = sorted(glob.glob(matchup_pattern))
if not matchup_files:
    raise FileNotFoundError(f"No matchup file found matching {matchup_pattern}")
df_raw = pd.read_csv(matchup_files[0])

In [None]:
# Build the per-depth-band summary table (df_summary) and the table of unique
# matchup locations per depth band (df_mapped) used by the map cell.
if layer == "all":

    def bin_depth(x):
        """Return the depth-band label for the depth value ``x``.

        Bands are closed at their upper bound (``x <= 10`` gives "0-10m",
        ``x <= 30`` gives "10-30m", and so on). Any value for which every
        comparison is false -- values above 1000, and NaN for float input --
        is labelled ">1000m".
        """
        depth_bands = (
            (10, "0-10m"),
            (30, "10-30m"),
            (60, "30-60m"),
            (100, "60-100m"),
            (150, "100-150m"),
            (300, "150-300m"),
            (600, "300-600m"),
            (1000, "600-1000m"),
        )
        for upper_bound, label in depth_bands:
            if x <= upper_bound:
                return label
        return ">1000m"

    def _depth_stats(group):
        """Return bias, correlation, RMSD and row count for one depth band."""
        difference = group.model - group.observation
        return pd.Series({
            # Bias is the difference of the means (model minus observation).
            "bias": group.model.mean() - group.observation.mean(),
            "cor": group.model.corr(group.observation),
            "rmsd": np.sqrt(difference.pow(2).mean()),
            "n": len(group),
        })

    # Display order of the depth labels. "0-150m" is kept from the original
    # category list although bin_depth never produces it.
    depth_order = ["0-150m", "0-10m", "10-30m", "30-60m", "60-100m", "100-150m", "150-300m", "300-600m", "600-1000m", ">1000m"]

    # Unique lon/lat positions per depth band, taken before any lon/lat binning.
    df_mapped = (
        df_raw
        .assign(depth=lambda x: x.depth.apply(bin_depth))
        .loc[:, ["lon", "lat", "depth"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    if bin_res is not None:
        grouping = ["lon", "lat", "depth", "year", "month"]
        # NOTE(review): bin_value, lon_res and lat_res are not defined in this
        # cell; presumably an earlier cell supplies them -- confirm that they
        # agree with bin_res.
        df_raw = df_raw.assign(
            lon=lambda x: bin_value(x.lon, lon_res),
            lat=lambda x: bin_value(x.lat, lat_res),
        )
    else:
        grouping = ["lon", "lat", "depth", "year", "month", "day"]
    # Only group by columns that are actually present in the matchup table.
    grouping = [x for x in grouping if x in df_raw.columns]

    df_summary = df_raw.assign(depth=lambda x: x.depth.apply(bin_depth))
    if bin_res is not None:
        # Average within each spatial/temporal bin before computing statistics.
        df_summary = df_summary.groupby(grouping).mean().reset_index()

    df_summary = (
        df_summary
        .groupby("depth")
        .apply(_depth_stats)
        .reset_index()
        # make the titles better
        .rename(columns={"depth": "Depth", "bias": "Bias", "cor": "Correlation", "rmsd": "RMSD", "n": "Number of observations"})
    )
    # The per-band statistics come back as floats; counts are whole numbers.
    df_summary = df_summary.assign(
        **{"Number of observations": df_summary["Number of observations"].astype(int)}
    )

    # now, calculate the total (raw) number of observations per depth bin (not binned in lon/lat)
    if bin_res is not None:
        n_obs = (
            df_raw
            .assign(depth=lambda x: x.depth.apply(bin_depth))
            .groupby("depth")
            .size()
            .reset_index(name="Total_number")
            .rename(columns={"depth": "Depth"})
        )
        df_summary = df_summary.merge(n_obs, on="Depth")
        # Show "binned count (raw count)", both with thousands separators.
        df_summary = df_summary.assign(**{
            "Number of observations": (
                df_summary["Number of observations"].map("{:,}".format).astype(str)
                + " ("
                + df_summary["Total_number"].map("{:,}".format).astype(str)
                + ")"
            )
        }).drop(columns="Total_number")

    # Convert Depth to a categorical only AFTER the merge: merging a
    # categorical key with a plain string key yields a plain string column,
    # which would make the sort below alphabetical ("100-150m" before
    # "30-60m") instead of shallow-to-deep.
    df_summary = df_summary.assign(Depth=pd.Categorical(df_summary.Depth, depth_order))
    df_summary = df_summary.sort_values("Depth")
else:
    df_mapped = None

In [None]:
# Flag passed to the R cell: the matchup map is only drawn when every layer
# is being analysed.
map_layer = layer == "all"

In [None]:
%%capture --no-display
%%R -i df_mapped -i map_layer
# Silence R warnings for this cell.
options(warn = -1)
library(dplyr)
library(ggplot2)
library(ggtext)

# Draw the matchup locations, one panel per depth band, when requested.
if (map_layer) {

    world_map <- map_data("world")

    # Order the panels from the shallowest to the deepest band.
    depth_levels <- c("0-10m", "10-30m", "30-60m", "60-100m", "100-150m",
                      "150-300m", "300-600m", "600-1000m", ">1000m")
    df_mapped <- df_mapped %>%
        mutate(depth = factor(depth, levels = depth_levels))

    gg <- ggplot(df_mapped) +
        geom_point(aes(lon, lat), size = 0.2) +
        theme_gray(base_size = 14) +
        # Crop the map to the extent of the matchup locations.
        coord_fixed(ratio = 1.5, xlim = range(df_mapped$lon), ylim = range(df_mapped$lat), expand = FALSE) +
        facet_wrap(~depth) +
        # Land polygons are added after the points, so they are drawn on top.
        geom_polygon(data = world_map, aes(long, lat, group = group), fill = "grey60") +
        labs(x = "", y = "") +
        # No outer margin, and no axis text, ticks or titles.
        theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
              axis.text.x = element_blank(), axis.text.y = element_blank(),
              axis.ticks.x = element_blank(), axis.ticks.y = element_blank(),
              axis.title.x = element_blank(), axis.title.y = element_blank())

    # Last expression of the block: the plot object itself.
    gg

}

In [None]:
# Caption for the matchup map; the figure counter is advanced afterwards so
# the next figure gets the following number.
if layer == "all":
    depth_map_caption = f"**Figure {i_figure}**: The geographic distribution of matchups between the model and observational {variable}. The data has been binned into depth ranges. The depth ranges are 0-10m, 10-30m, 30-60m, 60-100m, 100-150m, 150-300m, 300-600m, 600-1000m, and >1000m. The number of observations in each depth range is shown in the tables below."
    md(depth_map_caption)
    i_figure = i_figure + 1

In [None]:
# Show the per-depth summary table, save it as CSV (with the unit added as a
# column) and write the table caption.
if layer == "all":
    df_display(df_summary)
    # save this to csv in the results directory
    ff_out = f"../../oceanval_results/{variable}_depth_summary.csv"
    # create directory if it does not exist
    os.makedirs(os.path.dirname(ff_out), exist_ok=True)
    df_summary.assign(unit=unit).to_csv(ff_out, index=False)

    # Only mention the bracketed totals when the data were binned, because
    # that is the only case in which the table contains them.
    if bin_res is not None:
        final_text = " Numbers in brackets indicate the total unbinned observations used."
    else:
        final_text = ""
    md(f"**Table {i_table}:** Average bias ({unit}), root-mean square difference (RMSD) and correlation coefficient of modelled and observed {vv_name} for different depth ranges. The bias is calculated as model - observation. The RMSD is the square root of the mean squared difference. The correlation coefficient is the Pearson correlation coefficient between the model and observed values.{final_text}")
    i_table += 1

In [None]:
chunk_end

In [None]:
# Heading for the data-sources section, followed by the source entry for
# this variable.
md(f"## Data Sources for validation of {vv_name}")
# NOTE(review): vv_source_raw is not defined in this cell; presumably an
# earlier cell sets it -- confirm.
variable_sources = definitions[variable].sources
md_basic(variable_sources[vv_source_raw])

In [None]:
# Emit a marker line when test_status is truthy.
# NOTE(review): test_status is not defined in this cell; presumably a
# test-run flag set by an earlier cell -- confirm.
if test_status:
    md("This is getting to the end!")