# Surface template_title validation using NSBC

In [None]:
chunk_start

In [None]:
# Write an empty marker file, named after the current session stamp, into
# the ".trackers" directory.
stamp = nc.session_info["stamp"]
out = ".trackers/" + stamp + ".txt"
# exist_ok=True replaces the separate existence check: it cannot fail if the
# directory appears between the check and the creation (e.g. when several
# notebooks are executed at the same time).
os.makedirs(".trackers", exist_ok=True)
# Opening in "w" mode creates the file, or truncates it if it already exists.
with open(out, "w") as f:
    f.write("")

In [None]:
import numpy as np

# Derive the longitude/latitude spacing of the model grid from the grid file.
grid = pd.read_csv("/data/proteus1/scratch/rwi/evaldata/matched/model_grid.csv")
# Pool the values of every column whose name contains "lon" / "lat".
# np.unique flattens its input and returns the distinct values sorted in
# ascending order, so no extra sort or second de-duplication pass is needed.
lon = np.unique(grid.loc[:, [x for x in grid.columns if "lon" in x]].values)
lat = np.unique(grid.loc[:, [x for x in grid.columns if "lat" in x]].values)
# Resolution = gap between the two smallest distinct coordinates.
# NOTE(review): assumes a regularly spaced grid with at least two distinct
# values per axis; a single-valued axis would raise IndexError here.
lon_res = lon[1] - lon[0]
lat_res = lat[1] - lat[0]

In [None]:
# Name of the variable being validated, plus a title-cased copy of it.
variable = "template_variable"
Variable = str.title(variable)

We used version 1.1 of the **North Sea Biogeochemical Climatology** (NSBC) to validate **sea surface template_variable**. NSBC is a monthly climatology that covers the region 47°-65°N and 15°W-15°E. The data is made up of observations over the period 1960-2014. For validation purposes we only used the level 3 data, which is a gridded monthly climatology at a spatial resolution of 1/4°. The data can be downloaded from [NSBC](https://www.cen.uni-hamburg.de/en/icdc/data/ocean/nsbc.html).

In [None]:
# Prepare the model side of the comparison: a per-month mean of the selected
# variable (ds_model) and a time-averaged copy of it (ds_annual).
ds_model = nc.open_data("../../matched/gridded/nsbc/nsbc_model.nc")
ds_model.set_precision("F32")
# NOTE(review): set_year is called with the earliest year before the times are
# read back and counted per year below — confirm this ordering is intended.
ds_year = min(ds_model.years)
ds_model.set_year(ds_year)
ds_times = ds_model.times
# Number of time steps per year; only years with more than one step are kept.
df_times = pd.DataFrame({"year":[x.year for x in ds_times]}).groupby("year").size().reset_index()
df_times.columns = ["year", "count"]
years = list(df_times.query("count > 1").year)
# mask_all is defined outside this cell; presumably it applies the shared
# evaluation masks to the dataset — verify against its definition.
mask_all(ds_model)
ds_model.subset(variable = variable)
ds_model.subset(years = years)
# Mean for each month (the monthly climatology referred to in the text below).
ds_model.tmean("month")
# presumably marks cells equal to 0 as missing — TODO confirm with nctoolkit docs
ds_model.as_missing(0)
ds_model.run()
# Time mean of the monthly means, labelled with the title-cased variable name.
ds_annual = ds_model.copy()
ds_annual.tmean()
ds_annual.set_longnames({variable: Variable})

In [None]:
# Find the NSBC level 3 file whose variable name is closest to the model
# variable, regrid it to the model grid, and restrict both datasets to the
# cells where model and observations are both available.
paths = nc.create_ensemble('/data/proteus1/scratch/rwi/evaldata/data/nsbc/level_3')

# The NSBC variable name is taken to be the third "_"-separated token of each
# file name.
variables = [os.path.basename(x).split("_")[2] for x in paths]
nsbc_ensemble = pd.DataFrame({"path":paths, "variable":variables})
# Edit distance between each NSBC variable name and the model variable name.
nsbc_ensemble["match"]= [jellyfish.levenshtein_distance(x, variable) for x in variables ]
match_min = nsbc_ensemble.match.min()
nsbc_ensemble = nsbc_ensemble.query("match == @match_min")
# Variables containing "chlo" skip the strictness checks and simply take the
# closest file name.
if "chlo" not in variable:
    if match_min > 2:
        # No NSBC file name is close enough to be trusted as the same variable.
        raise ValueError(
            "Unable to match variable in ERSEM and NSBC: "
            f"no NSBC variable name is within an edit distance of 2 of '{variable}'"
        )
    if len(nsbc_ensemble) > 1:
        # Several files tie for the best match, so the choice would be arbitrary.
        candidates = ", ".join(sorted(set(nsbc_ensemble.variable)))
        raise ValueError(
            "Unable to match variable in ERSEM and NSBC: "
            f"'{variable}' matches several NSBC variables equally well ({candidates})"
        )
nsbc_path = str(nsbc_ensemble.path.values[0])
ds_obs = nc.open_data(nsbc_path, checks = False)
ds_obs.set_precision("F32")
# Keep only the variables whose name ends in "_mean".
ds_obs.subset(variable = "*_mean")
ds_obs.run()
model_units = ds_model.contents.unit.values[0]
obs_units = ds_obs.contents.unit.values[0]
# Unit strings that differ by more than 4 edits are treated as a possible
# mismatch. This is deliberately a soft failure: `matched` is only recorded
# here so that a later cell can warn the reader instead of aborting.
matched = jellyfish.levenshtein_distance(model_units, obs_units) <= 4
ds_obs.regrid(ds_model)
ds_obs.run()
# presumably selects the top (surface) level — TODO confirm with nctoolkit docs
ds_obs.top()
ds_obs.run()
# NOTE(review): the bare comparison and multiplication statements below rely
# on nctoolkit's operator overloads modifying the left-hand dataset in place
# (presumably producing masks of valid cells) — confirm against the library.
obs_mask = ds_obs.copy()
obs_mask > -1e20
mod_mask = ds_model.copy()
mod_mask > -1e20
# Combine the two masks, then apply the combined mask to both datasets.
mod_mask * obs_mask
mod_mask.run()
ds_model * mod_mask
ds_obs * mod_mask

In [None]:
# Caution shown when the model and observation unit strings were judged too
# different to be the same unit.
units_warning = "Units may not match. Be careful!"
if not matched:
    md(units_warning)

In [None]:
chunk_clim

In [None]:
chunk_bias

## Can the model reproduce seasonality of template_variable?

The ability of the model to reproduce seasonality of template_variable is assessed by comparing the modelled and observed seasonal cycle of template_variable. First, we derive a monthly climatology for the model data. Then, we calculate the Pearson correlation coefficient between the modelled and observed template_variable at each grid cell. The Pearson correlation coefficient is a measure of the linear correlation between two variables. It has a value between -1 and 1, where 1 indicates a perfect positive linear correlation, 0 indicates no linear correlation, and -1 indicates a perfect negative linear correlation. 

Note: we are only assessing the ability of the model to reproduce seasonal changes, not long-term trends.


In [None]:
chunk_seasonal

## Regional assessment of model performance for template_variable

We assessed the regional performance of the model by comparing the model with observations from the following regions: Southern North Sea, Central North Sea, Northern North Sea, Channel, Skagerrak, Norwegian Trench, Shetland, Irish Shelf, Irish Sea, Celtic Sea, Armorican, Northern North East Atlantic, Southern North East Atlantic, Shelf, Ocean, Rosa, Locate Shelf, Deep Ocean.

The regions considered are mapped below.

In [None]:
if regional:
    # Regions that are excluded from the map.
    bad = ["Rosa", "Locate Shelf"]

    def fix_name(x):
        """Return a shortened version of a region's long name.

        "North East" becomes "NE", any remaining "North " becomes "N ", and
        the bare name "Channel" is expanded to "English Channel".
        """
        x = x.replace("North East", "NE")
        x = x.replace("North ", "N ")
        if x == "Channel":
            x = "English Channel"
        return x

    # The name is rebound to the element-wise version, as before, so that it
    # can be applied to a whole column at once.
    fix_name = np.vectorize(fix_name)

    # One row per (lon, lat, region variable) with a non-missing value,
    # labelled with the region's long name.
    df_mapped = (
        ds_regions
        .to_dataframe()
        .reset_index()
        .melt(id_vars = ["lon", "lat"])
        .dropna()
        .merge(regions_contents.loc[:,["variable", "long_name"]])
        .drop(columns = [ "value"])
        .query("long_name not in @bad")
        # assign builds a new frame, so the renaming never writes into a
        # filtered slice (which can trigger pandas' SettingWithCopyWarning).
        .assign(long_name = lambda d: fix_name(d.long_name))
    )

    # Plot extent: the bounding box of the mapped cells.
    xlim = np.array([df_mapped.lon.min(), df_mapped.lon.max()])
    ylim = np.array([df_mapped.lat.min(), df_mapped.lat.max()])
    shape = gpd.read_file("/data/proteus1/scratch/rwi/evaldata//data/mapping/TM_WORLD_BORDERS-0.3.shp")

    # One facet per region: the region's cells as tiles, with the country
    # shapes drawn in grey on top.
    gg = (
        ggplot( df_mapped)+
         geom_tile(aes(x  = "lon",y =   "lat"))+
        geom_map(shape, aes("LON", "LAT"), fill = "grey", colour = "grey")+
        coord_cartesian(xlim = xlim, ylim = ylim)+
        scale_x_continuous(breaks = [-20, -10, 0, 10], labels = ["20°W", "10°W", "0°", "10°E"])+
        scale_y_continuous(breaks = [40, 50, 60], labels = ["40°N", "50°N", "60°N"])+
        theme_bw(base_size = 10)+
        facet_wrap("~long_name")+

        theme(axis_title_x=element_blank(),
                axis_title_y=element_blank())
    )

    gg = gg.draw()
    gg


In [None]:
if regional:
    md(f"**Figure {i_figure}**: Regions used for validation.")
    # Only consume a figure number when the figure is actually shown, as the
    # seasonal-cycle caption cell further down does; otherwise later figures
    # would skip a number whenever `regional` is false.
    i_figure += 1

In [None]:
# Method description for the regional time series; only emitted when the
# regional assessment is enabled.
regional_method_text = "Time series were constructed comparing the monthly mean of the spatial average template_variable in each region. The spatial average was calculated using the mean of all grid cells within each region, accounting for grid cell area."
if regional:
    md(regional_method_text)

In [None]:
if regional:
    # The name of the time coordinate does not depend on the region, so it is
    # looked up once here rather than converting a fresh copy of ds_ts to
    # xarray on every loop iteration.
    time_name = [x for x in list(ds_ts.copy().to_xarray().coords) if "time" in x][0]

    # One long-format frame (time, variable, value, month, region) per region.
    region_frames = []
    for vv in ds_regions.variables:
        # Isolate the current region variable.
        ds_rr = ds_regions.copy()
        ds_rr.subset(variable = vv)
        ds_rr.run()
        # NOTE(review): the bare multiplication relies on nctoolkit's operator
        # overload modifying ds_vv in place (restricting it to the region) —
        # confirm against the library.
        ds_vv = ds_ts.copy()
        ds_vv * ds_rr
        ds_region = ds_vv.copy()
        ds_vv.spatial_mean()
        # NOTE(review): `region` is not used again in this cell.
        region = list(regions_contents.query("variable == @vv").long_name)[0]
        df_vv = (
            ds_vv
            .to_dataframe()
            .reset_index()
            .rename(columns = {time_name: "time"})
            .loc[:,["time", "model", "observation"]]
            .melt("time")
            .assign(month = lambda x: x.time.dt.month)
            .assign(region = vv)
        )
        region_frames.append(df_vv)
        # NOTE(review): `df_region` is rebuilt for every region and not used
        # again in this cell; kept in case a later cell reads it.
        ds_region.tmean()
        df_region = (
            ds_region
            .to_dataframe()
            .dropna()
            .reset_index()
            .loc[:,["model", "observation"]]
            .drop_duplicates()
        )

        del ds_rr, ds_vv, ds_region

    # Stack all regions, then attach the display name of each region. The
    # merge also drops regions that are absent from df_mapped.
    df_all = (
        pd.concat(region_frames)
        .dropna()
        .merge(df_mapped.loc[:,["long_name", "variable"]].drop_duplicates().rename(columns = {"variable": "region"}))
    )

In [None]:
if regional:
    # Axis label: variable name followed by its unit in brackets.
    ylab = "Spatial average " + variable + " (" + nc.static_plot.fix_label(ds_ts.contents.unit[0]) + ")"
    # One panel per region, with model and observation drawn as separate lines.
    gg = (
        ggplot(df_all)+
        geom_line(aes("month", "value", colour = "variable"))+
        facet_wrap("long_name")+
        scale_color_manual(values = ["red", "blue"])+
        scale_x_continuous(breaks = [1,4, 7, 10], labels = ["Jan", "Apr", "Jul", "Oct"])+
        labs(x = "Month", y = ylab, colour = "")+
        # theme_bw is a complete theme and replaces any theme settings added
        # before it, so the legend position is set once, after it.
        theme_bw(base_size = 10)+
        theme(legend_position = "top")
    )

    gg = gg.draw()
    gg


In [None]:
if regional:
    # Caption for the regional seasonal-cycle figure; the figure counter is
    # advanced only when the caption is emitted.
    caption = f"**Figure {i_figure}**: Seasonal cycle of {variable} for model and observations for each region. The spatial average is taken over the region."
    md(caption)
    i_figure += 1

In [None]:
chunk_results

In [None]:
chunk_end