# Spatial summaries of differences between simulations

In [None]:
# First identify what is available....

import glob
import os
from IPython.display import Markdown as md
# ability to open pickle
import warnings
warnings.filterwarnings('ignore')
import pickle
#pkg_resources
import pkg_resources

import pandas as pd
import numpy as np
import nctoolkit as nc
from plotnine import *
import os
import molmass
def get_molar_mass(element):
    """Return the ``mass`` attribute of the molmass ``Formula`` built from *element*.

    *element* is passed to ``molmass.Formula`` unchanged, so it is whatever
    formula string that constructor accepts (presumably e.g. "C" or "O2").
    """
    # Imported locally, as in the original, so the name ``Formula`` does not
    # leak into the notebook namespace.
    from molmass import Formula

    return Formula(element).mass

# Index every climatology file. The measure and the variable are taken from
# the 1st and 3rd "-"-separated fields of the file name.
paths = glob.glob("../../data/climatologies/**/**/*.nc")
measures = [os.path.basename(x).split("-")[0] for x in paths]
variables = [os.path.basename(x).split("-")[2] for x in paths]

# Running counters used to number tables and figures in the report.
i_table = 1
i_figure = 1

# One row per climatology file; later cells filter this by measure/variable.
df_options = pd.DataFrame({"measure": measures, "variable": variables, "path": paths})

# Display names for the two simulations. The defaults are overridden when a
# pickled name dictionary is present.
ff = "../../sim_dict.pkl"
sim_0_name = "simulation 1"
sim_1_name = "simulation 2"
if os.path.exists(ff):
    # NOTE: unpickling can execute arbitrary code, so this file must come from
    # a trusted source. The context manager makes sure the handle is closed
    # (the original left it open).
    with open(ff, "rb") as sim_dict_file:
        sim_dict = pickle.load(sim_dict_file)
    sim_0_name = sim_dict["sim0"]
    sim_1_name = sim_dict["sim1"]

# Keep only the measures that have files, in a fixed presentation order.
measures = [x for x in ["vertical_integration", "top", "vertical_mean", "bottom"] if x in measures]



107 files were created by nctoolkit in prior or current sessions. Consider running deep_clean!
nctoolkit is using Climate Data Operators version 2.4.4


In [None]:
# Locate the AMM7 validation-subdomain file that ships inside the ecoval
# package and open it with nctoolkit.
data_path = pkg_resources.resource_filename(
    "ecoval",
    "data/amm7_val_subdomains.nc",
)
ds_shelf = nc.open_data(data_path)

# Keep only the "Shelf" variable and turn its zeros into missing values, so
# the dataset can serve as a shelf mask (presumably non-shelf cells are 0).
ds_shelf.subset(variable="Shelf")
ds_shelf.as_missing(0)

# NOTE(review): IPython only auto-renders the last expression of a cell, so
# the bare plot()/md(...) expressions below may not show up unless the
# notebook is post-processed. Confirm they appear in the rendered report.
ds_shelf.plot()
md(f"**Figure {i_figure}:** The AMM7 shelf subdomain")
i_figure += 1
md("All spatial averages were calculated for the shelf subdomain shown above.")

# Build one report section per measure. For every variable of that measure,
# reduce both simulations to a monthly spatial summary, plot the two
# simulations against each other, and tabulate the percentage difference of
# their annual means.
#
# NOTE(review): IPython only auto-renders the last expression of a cell, so
# the bare ``md(...)`` and ``gg`` expressions inside this loop may not be
# displayed unless the notebook is post-processed. Confirm against the
# rendered report.
import textwrap

# Section heading emitted for each measure.
_SECTION_HEADINGS = {
    "vertical_integration": "## Spatially aggregated values across the entire water column",
    "top": "## Surface",
    "bottom": "## Near-bottom values",
    "vertical_mean": "## Vertical mean values",
}

# One-sentence description of how each measure was calculated.
_SECTION_METHODS = {
    "vertical_integration": "Total integrated values were calculated for the northwest European Shelf by first vertically integrating the values in each grid cell, and then taking the spatial sum, accounting for grid cell area.",
    "bottom": "Near-bottom values were calculated by taking the deepest grid cell with a non-missing value in each grid cell and then taking a spatial average.",
    "vertical_mean": "Vertical mean values were calculated by calculating the vertical average per grid cell, accounting for cell thickness, and then taking the spatial average.",
    "top": "Surface values were calculated by taking the top value in each grid cell and then taking the spatial average",
}

# Leading phrase of each figure caption.
_CAPTION_LEADS = {
    "top": "Average sea surface values on the shelf",
    "bottom": "Average near-bottom values on the shelf",
    "vertical_mean": "Average vertical mean values on the shelf",
    "vertical_integration": "Total integrated values over the shelf",
}


def _mass_unit_and_factor(max_mg):
    """Choose a readable mass unit for totals expressed in mg.

    Returns ``(unit, factor)``, where multiplying an mg value by ``factor``
    converts it to ``unit``. Tonnes is the default, so totals between 1 and
    1000 tonnes are converted too (the original thresholds left that range
    in raw mg under the raw unit label).
    """
    max_tonnes = max_mg * 1e-9
    unit, factor = "tonnes", 1e-9
    for candidate, threshold, scale in (
        ("kt", 1e3, 1e-12),
        ("Mt", 1e6, 1e-15),
        ("Gt", 1e9, 1e-18),
    ):
        if max_tonnes > threshold:
            unit, factor = candidate, scale
    return unit, factor


def _tidy_label(label):
    """Turn a raw variable label into a display label for facet titles."""
    label = label.replace("_", " ")
    # fix O_2
    label = label.replace(" O 2", " O$_2$")
    # capitalise the first letter (slicing keeps an empty label from raising)
    label = label[:1].upper() + label[1:]
    # The order of these replacements matters: later ones rely on earlier ones.
    for old, new in (
        ("PH ", "pH "),
        ("biomass ", " biomass"),
        ("  ", " "),
        ("Poc ", "POC "),
        ("Doc ", "DOC "),
        ("Co2 ", "CO2 "),
        ("CO2", "CO$_2$"),
        ("co2", "CO$_2$"),
        ("/m^3", "m$^{-3}$"),
        ("/m^2", "m$^{-2}$"),
        ("/d", "d$^{-1}$"),
    ):
        label = label.replace(old, new)
    return label


# dict.fromkeys de-duplicates while keeping the order of ``measures``, so the
# section order and figure numbering are the same on every run (iterating a
# set made them vary).
for mm in dict.fromkeys(measures):

    if mm in _SECTION_HEADINGS:
        md(_SECTION_HEADINGS[mm])

    # Sorted so that the processing order is deterministic.
    mm_variables = sorted(set(df_options[df_options["measure"] == mm]["variable"].values))
    df_all = []
    if mm in _SECTION_METHODS:
        md(_SECTION_METHODS[mm])
    rr_plot = True  # not read anywhere in the visible part of the notebook
    unit_dict = dict()
    for vv in mm_variables:
        # Best effort: a variable that cannot be processed (missing file,
        # unexpected structure, ...) is skipped rather than aborting the report.
        try:
            vv_paths = df_options[(df_options["measure"] == mm) & (df_options["variable"] == vv)]["path"].values
            # one climatology file per simulation
            ff1 = [x for x in vv_paths if x.endswith("sim_0.nc")][0]
            ff2 = [x for x in vv_paths if x.endswith("sim_1.nc")][0]
            ds1 = nc.open_data(ff1)
            ds2 = nc.open_data(ff2)
            ds1.tmean("month")
            ds2.tmean("month")
            # fix_amm7_grid may be unavailable or not applicable; ignore failures
            try:
                ds1.fix_amm7_grid()
            except Exception:
                pass
            try:
                ds2.fix_amm7_grid()
            except Exception:
                pass
            # NOTE(review): the results of these multiplications are discarded.
            # If nctoolkit's ``*`` returns a new dataset instead of modifying
            # ds1/ds2 in place, the shelf mask is never applied and
            # ``ds1.multiply(ds_shelf)`` is what is needed -- confirm.
            ds1 * ds_shelf
            ds2 * ds_shelf
            # subset to sensible bounds
            ds1.subset(lon = [-19, 9], lat = [42, 63])
            ds2.subset(lon = [-19, 9], lat = [42, 63])
            ds1.run()

            if mm == "vertical_integration":
                ds1.regrid(ds_shelf)
                ds2.regrid(ds_shelf)
                ds1.spatial_sum(by_area = True)
                ds2.spatial_sum(by_area = True)
            else:
                ds1.spatial_mean()
                ds2.spatial_mean()
            variable = ds1.variables[0]
            ds1.run()
            ds2.run()
            df1 = ds1.to_dataframe().reset_index()
            time_name = [x for x in df1.columns if "time" in x and "bnds" not in x][0]
            df1["simulation"] = sim_0_name
            df1["month"] = df1[time_name].dt.month
            df2 = ds2.to_dataframe().reset_index()
            df2["month"] = df2[time_name].dt.month
            df2["simulation"] = sim_1_name
            # bind them
            df = pd.concat([df1, df2])
            # change variable to value
            df = df.rename(columns={variable: "value"})
            df["variable"] = vv
            unit = ds1.contents.unit[0]
            if "mg" in unit and mm == "vertical_integration":
                # Spatial totals in mg are huge numbers; rescale them to
                # tonnes/kt/Mt/Gt and put the unit in the label.
                unit, factor = _mass_unit_and_factor(df.value.max())
                df["value"] = df["value"] * factor
                df["new_variable"] = df["variable"] + f" ({unit})"
            else:
                df["new_variable"] = df["variable"]

            unit_dict[vv] = unit
            df_all.append(df)
        except Exception:
            continue

    # Best effort again: if nothing could be processed for this measure
    # (pd.concat raises on an empty list) or plotting/tabulating fails, the
    # rest of the section is skipped.
    try:
        df_all = pd.concat(df_all)
        # Facet-column estimate; not used by the plot below.
        n_cols = len(set(df_all["new_variable"].values))
        n_cols = int(np.ceil(n_cols ** 0.5))
        n_cols = min(n_cols, 5)

        df_all["new_variable"] = [_tidy_label(x) for x in df_all["new_variable"]]

        # Split the labels into roughly even chunks of at most 12 facets per
        # figure. Sorted so the split is the same on every run.
        new_variables = sorted(set(df_all["new_variable"].values))
        if len(new_variables) > 12:
            n_chunks = int(np.ceil(len(new_variables) / 12))
            new_variables = np.array_split(new_variables, n_chunks)
        else:
            new_variables = [new_variables]

        text = "The units are as follows: " + ", ".join(
            f"{name}: {unit_dict[name]}" for name in unit_dict
        ) + "."
        # Integrated totals already carry their unit in the facet label.
        if mm != "vertical_integration":
            md(text)

        for i, chunk in enumerate(new_variables, start=1):
            gg = (
                ggplot(df_all.query("new_variable in @chunk"))+
                geom_line(aes(x="month",y="value",color="simulation"))+
                facet_wrap("~new_variable", scales="free", labeller = labeller(new_variable = lambda x: textwrap.fill(x, width=10)))+
                expand_limits(y=0)+
                theme_bw()+
                theme(legend_position="top")+
                # better labels for months
                scale_x_continuous(breaks = [2, 4, 6, 8, 10], labels = ["Feb", "Apr", "Jun", "Aug", "Oct"])+
                # rotate a little
                theme(axis_text_x = element_text(angle = 45))+
                labs(x = "")+
                # small, rotated facet labels
                theme(strip_text_x = element_text(size = 6, angle = 45))+
                # remove the legend title
                theme(legend_title=element_blank())+
                labs(y = "")
            )

            # display the plot (see the NOTE(review) at the top of this block)
            gg

            # Sub-number the figure only when the measure needs several plots.
            if len(new_variables) > 1:
                i_part = f".{i}"
            else:
                i_part = ""

            if mm in _CAPTION_LEADS:
                md(f"**Figure {i_figure}{i_part}**: {_CAPTION_LEADS[mm]} for {sim_0_name} and {sim_1_name} for each variable")

        i_figure = i_figure + 1

        # Annual mean per variable and simulation. Only "value" is averaged,
        # so time or coordinate columns cannot interfere with the mean.
        df_summary = df_all.groupby(["variable", "simulation"])["value"].mean().reset_index()
        df_summary = df_summary.pivot(index="variable", columns="simulation", values="value").reset_index()
        # percentage difference of simulation 0 relative to simulation 1
        df_summary["ratio"] = df_summary[sim_0_name] / df_summary[sim_1_name]
        df_summary["ratio"] = (df_summary["ratio"] - 1) * 100
        df_summary["ratio"] = df_summary["ratio"].round(2)
        # largest absolute differences first
        df_summary = df_summary.sort_values("ratio", key = lambda x: abs(x), ascending = False)
        # put sim_0 before sim_1 and give the columns display names
        df_summary = df_summary[["variable", sim_0_name, sim_1_name, "ratio"]]
        df_summary.columns = ["Variable", sim_0_name, sim_1_name, "Percentage difference"]

        # NOTE(review): df_display is not defined in the visible source. If it
        # is missing at run time, the NameError is swallowed below and the
        # table and its caption are silently dropped -- confirm it exists.
        df_display(df_summary)
        md(f"**Table {i_table}**: Percentage difference between {sim_0_name} to {sim_1_name} for each variable and {mm}.")
        i_table = i_table + 1

    except Exception:
        pass
