## OBS:
This notebook is only slightly edited from Zebedee Nicholls' original notebook; see [here](https://gitlab.com/rcmip/rcmip/-/blob/master/notebooks/results/phase-1/database-generation.ipynb).

In [1]:
from ar6_ch6_rcmipfigs.constants import INPUT_DATA_DIR

# Bookkeeping lists naming this notebook's inputs and outputs
# (presumably read by an external build/pipeline tool -- confirm).
# NOTE(review): these targets live under "<INPUT_DATA_DIR>/data/...", whereas
# OUTPUT_DATABASE_PATH / OBS_DATABASE_PATH further down omit the "data"
# component -- confirm which location is intended.
__depends__ = []
__dest__ = [
    f"{INPUT_DATA_DIR}/data/database-results/phase-1/timestamp.txt",
    f"{INPUT_DATA_DIR}/data/database-observations/timestamp.txt",
]

/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs
/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in


In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Database generation


In this notebook we process the data into a database we can later query to make plots/do analysis etc.

## Imports

In [3]:
import logging
import os.path
import re
from pathlib import Path
from distutils.util import strtobool

import pandas as pd
import tqdm
from scmdata import ScmDataFrame, df_append

<IPython.core.display.Javascript object>

pyam - INFO: Running in a notebook, setting `pyam` logging level to `logging.INFO` and adding stderr handler


<IPython.core.display.Javascript object>

from ar6_ch6_rcmipfigs.utils.database_generation import check_all_variables_and_units_as_in_protocol, \
    check_all_scenarios_as_in_protocol, unify_units, save_into_database, mce_get_quantile, hector_get_quantile

def env_flag(name, default=False):
    """Interpret the environment variable ``name`` as a boolean switch.

    Parameters
    ----------
    name : str
        Name of the environment variable to read.
    default : bool
        Value returned when the variable is not set at all.

    Returns
    -------
    bool
        ``True`` when the variable holds one of ``y``, ``yes``, ``t``,
        ``true``, ``on`` or ``1`` (case-insensitive, surrounding whitespace
        ignored); ``False`` for any other value, including the empty string.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() in {"y", "yes", "t", "true", "on", "1"}


# Reduced test run when executing under continuous integration (CI=true).
# Parsed locally rather than with distutils' strtobool so that the result is a
# real bool, unexpected values do not raise ValueError, and the flag does not
# depend on distutils (removed from the standard library in Python 3.12).
TEST_RUN = env_flag("CI")
TEST_RUN

In [5]:
# Root logger (getLogger() without a name returns the root logger).
logger = logging.getLogger()

<IPython.core.display.Javascript object>

## Constants

In [6]:
from ar6_ch6_rcmipfigs.constants import  INPUT_DATA_DIR
# Directory that receives the processed phase-1 results database
# (passed to save_into_database at the end of the notebook).
# NOTE(review): the trailing "/" is part of the literal; presumably make_folders
# relies on it to treat the path as a directory -- confirm before changing.
OUTPUT_DATABASE_PATH = os.path.join(INPUT_DATA_DIR, "database-results", "phase-1/")

# Directory for the observations database; in this notebook it is only created,
# nothing is written into it here.
OBS_DATABASE_PATH = os.path.join(INPUT_DATA_DIR, "database-observations/")

<IPython.core.display.Javascript object>

In [7]:
from ar6_ch6_rcmipfigs.utils.misc_func import make_folders

# Create each database directory that does not exist yet.
for database_dir in (OUTPUT_DATABASE_PATH, OBS_DATABASE_PATH):
    if os.path.isdir(database_dir):
        continue
    make_folders(database_dir)

<IPython.core.display.Javascript object>

## Protocol

In [8]:
# Path of the protocol emissions file (annual means) shipped with the input data.
SCENARIO_PROTOCOL = os.path.join(INPUT_DATA_DIR, "data", "protocol", "rcmip-emissions-annual-means.csv"
)

<IPython.core.display.Javascript object>

In [9]:
# Load the protocol emissions CSV into an ScmDataFrame and preview the first rows.
protocol_db = ScmDataFrame(SCENARIO_PROTOCOL)
protocol_db.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,time,1750-01-01 00:00:00,1751-01-01 00:00:00,1752-01-01 00:00:00,1753-01-01 00:00:00,1754-01-01 00:00:00,1755-01-01 00:00:00,1756-01-01 00:00:00,1757-01-01 00:00:00,1758-01-01 00:00:00,1759-01-01 00:00:00,...,2491-01-01 00:00:00,2492-01-01 00:00:00,2493-01-01 00:00:00,2494-01-01 00:00:00,2495-01-01 00:00:00,2496-01-01 00:00:00,2497-01-01 00:00:00,2498-01-01 00:00:00,2499-01-01 00:00:00,2500-01-01 00:00:00
model,scenario,region,variable,unit,activity_id,mip_era,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
AIM,rcp60,World,Emissions|BC,Mt BC/yr,not_applicable,CMIP5,,,,,,,,,,,...,4.3615,4.3615,4.3615,4.3615,4.3615,4.3615,4.3615,4.3615,4.3615,4.3615
AIM,rcp60,World,Emissions|CH4,Mt CH4/yr,not_applicable,CMIP5,,,,,,,,,,,...,254.56784,254.57763,254.58741,254.5972,254.60698,254.61676,254.62655,254.63633,254.64612,254.6559
AIM,rcp60,World,Emissions|CO,Mt CO/yr,not_applicable,CMIP5,,,,,,,,,,,...,792.2257,792.2257,792.2257,792.2257,792.2257,792.2257,792.2257,792.2257,792.2257,792.2257
AIM,rcp60,World,Emissions|CO2,Mt CO2/yr,not_applicable,CMIP5,,,,,,,,,,,...,3165.917333,3162.074667,3158.232,3154.389333,3150.546667,3146.704,3142.861333,3139.018667,3135.176,3131.333333
AIM,rcp60,World,Emissions|CO2|MAGICC AFOLU,Mt CO2/yr,not_applicable,CMIP5,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<IPython.core.display.Javascript object>

In [10]:
# List the distinct scenario names present in the protocol emissions data.
protocol_db["scenario"].unique()

array(['rcp60', 'ssp370', 'ssp370-lowNTCF-aerchemmip',
       'ssp370-lowNTCF-gidden', 'historical', 'ssp434', 'ssp460', 'rcp26',
       'ssp119', 'ssp126', 'rcp85', 'ssp245', 'rcp45', 'ssp534-over',
       'ssp585', 'esm-bell-1000PgC', 'esm-bell-2000PgC',
       'esm-bell-750PgC', 'esm-pi-CO2pulse', 'esm-pi-cdr-pulse',
       'esm-piControl', 'historical-cmip5'], dtype=object)

<IPython.core.display.Javascript object>

In [11]:
# Path of the data submission template workbook; its "variable_definitions" and
# "scenario_info" sheets are read in the cells below.
DATA_PROTOCOL = os.path.join(INPUT_DATA_DIR,
    "data",
    "submission-template",
    "rcmip-data-submission-template.xlsx",
)

<IPython.core.display.Javascript object>

In [12]:
# Read the "variable_definitions" sheet of the submission template.
protocol_variables = pd.read_excel(DATA_PROTOCOL, sheet_name="variable_definitions")
# Lower-case the column labels (presumably the labels the unit/variable checks
# below look up -- confirm in utils.database_generation).
protocol_variables.columns = protocol_variables.columns.str.lower()
protocol_variables.head()

Unnamed: 0,idx,category,variable,unit,definition,tier
0,1,Atmospheric Concentrations,Atmospheric Concentrations|CH4,ppb,atmospheric concentrations of CH4,1
1,2,Atmospheric Concentrations,Atmospheric Concentrations|CO2,ppm,atmospheric concentrations of CO2,1
2,3,Atmospheric Concentrations,Atmospheric Concentrations|F-Gases,ppm,equivalent species atmospheric concentrations ...,3
3,4,Atmospheric Concentrations,Atmospheric Concentrations|F-Gases|HFC,ppm,equivalent species atmospheric concentrations ...,3
4,5,Atmospheric Concentrations,Atmospheric Concentrations|F-Gases|HFC|HFC125,ppt,atmospheric concentrations of HFC125,2


<IPython.core.display.Javascript object>

In [13]:
# Read the "scenario_info" sheet of the submission template.
#
# The original call passed `skip_rows=2`, which is not a `read_excel` parameter
# (the real one is `skiprows`). Older pandas versions silently ignored the
# unknown keyword -- the preview still contains the "Instructions/Details" and
# "Answers" rows -- while current pandas raises TypeError. The keyword is
# dropped so the frame stays exactly as previously loaded, with the header
# taken from the first sheet row (which provides the "scenario" column).
protocol_scenarios = pd.read_excel(DATA_PROTOCOL, sheet_name="scenario_info")
# Lower-case the column labels, as done for the variable definitions.
protocol_scenarios.columns = protocol_scenarios.columns.str.lower()
protocol_scenarios.head()

Unnamed: 0,unnamed: 0,scenario,description of scenario,detailed scenario description,priority
0,Instructions/Details,Scenario identifier used in the CMIP6 modeling...,Brief description of scenario,Documentation on es-doc + some other clarifyin...,"Top priority is Tier 1, additional runs are Ti..."
1,Answers,# Scenario ID,# Scenario Description,#Scenario Specification,# Tier in RCMP
2,,piControl,pre-industrial control simulation (i.e. consta...,groups should run as long as is sensible/they ...,1
3,,esm-piControl,pre-industrial control simulation with zero an...,should be identical to piControl except CO2 is...,1
4,,esm-piControl-allGHG,pre-industrial control simulation with zero an...,should be identical to piControl except all GH...,2


<IPython.core.display.Javascript object>

## Model output

In [14]:
# Root directory that is searched recursively for phase-1 model submissions.
RESULTS_PATH = os.path.join(INPUT_DATA_DIR, "data", "results", "phase-1")

<IPython.core.display.Javascript object>

In [15]:
# Collect every submission file below RESULTS_PATH: all CSV files first,
# followed by all Excel workbooks.
_results_files = [
    found
    for glob_pattern in ("*.csv", "*.xlsx")
    for found in Path(RESULTS_PATH).rglob(glob_pattern)
]
print(len(_results_files))
sorted(_results_files)

453


[PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/acc2/rcmip_phase-1_acc2_v1-0-0.xlsx'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/acc2/rcmip_phase-1_acc2_v1-0-1.xlsx'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/acc2/rcmip_phase-1_acc2_v2-0-0.xlsx'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/acc2/rcmip_phase-1_acc2_v2-0-1.xlsx'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/ar5ir/ar5ir-phase-1-results-v1-0-0.csv'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/ar5ir/ar5ir-phase-1-results-v1-0-1.csv'),
 PosixPath('/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/ar5ir/ar5ir

<IPython.core.display.Javascript object>

In [17]:
# Regular expressions selecting the submissions to load; they are matched
# against the full file path in the next cell.
#
# Patterns that are currently disabled, kept for reference:
#   ".*acc2.*v2-0-1.*", ".*escimo.*v2-0-1.*", ".*rcmip_phase-1_gir.*",
#   ".*greb.*v2-0-0.*", ".*hector.*v2-0-0.*",
#   ".*MAGICC7.1.0aX-rcmip-phase-1.*", ".*MAGICC7.1.0aX.*",
#   ".*mce.*v2-0-1.*", ".*oscar-v3-0*v1-0-1.*", ".*wasp.*v1-0-1.*"
model_of_interest = [
    ".*rcmip_phase-1_cicero-scm.*v5-0-0.*",
    ".*fair-1.5-default.*v1-0-1.csv",
    ".*rcmip_phase-1_magicc7.1.0.beta_v1-0-0.*",
    ".*oscar-v3-0.*v1-0-1.*",
]

# Additional submissions appended to the selection above.
extra = [".*escimo-phase-1-v2-0-1.*"]
model_of_interest = [*model_of_interest, *extra]

<IPython.core.display.Javascript object>

In [18]:
# For a test run, replace the selection with a smaller set of patterns.
if TEST_RUN:
    model_of_interest = [
        ".*escimo-phase-1-v2-0-1.*",
        ".*greb.*",
        ".*rcmip_phase-1_cicero-scm.*v5-0-0.*",
    ]


def _is_selected(path_str):
    """Return True when ``path_str`` matches a selected pattern and has no "$".

    Paths containing "$" are skipped (presumably temporary/lock files written
    next to open workbooks -- confirm).
    """
    if "$" in path_str:
        return False
    return any(re.match(pattern, path_str) for pattern in model_of_interest)


# Keep the selected submissions, as plain strings.
results_files = [str(p) for p in _results_files if _is_selected(str(p))]
print(len(results_files))
sorted(results_files)

81


['/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/cicero-scm/rcmip_phase-1_cicero-scm-ecs3_v5-0-0.csv',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/cicero-scm/rcmip_phase-1_cicero-scm_v5-0-0.csv',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/escimo/escimo-phase-1-v2-0-1.xlsx',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/fair/rcmip_phase-1_fair-1.5-default-1pctCO2-4xext_v1-0-1.csv',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/fair/rcmip_phase-1_fair-1.5-default-1pctCO2-bgc_v1-0-1.csv',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/fair/rcmip_phase-1_fair-1.5-default-1pctCO2-cdr_v1-0-1.csv',
 '/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase

<IPython.core.display.Javascript object>

In [19]:
# Sanity check: show which of the selected files belong to MAGICC.
[path_str for path_str in map(str, results_files) if "magicc" in path_str]

['/home/sarambl/PHD/IPCC/public/AR6_CH6_RCMIPFIGS/ar6_ch6_rcmipfigs/data_in/data/results/phase-1/magicc7/rcmip_phase-1_magicc7.1.0.beta_v1-0-0.csv']

<IPython.core.display.Javascript object>

In [20]:
def _lowercase_dimensionless(unit):
    """Spell "Dimensionless" in lower case; non-string units pass through unchanged."""
    if isinstance(unit, str):
        return unit.replace("Dimensionless", "dimensionless")
    return unit


# Load every selected submission; anything that is not a CSV is read from its
# "your_data" sheet.
db = []
for rf in tqdm.tqdm_notebook(results_files):
    reader_kwargs = {} if rf.endswith(".csv") else {"sheet_name": "your_data"}
    db.append(ScmDataFrame(rf, **reader_kwargs))

# Merge the submissions, normalise the unit spelling on the flat table and wrap
# the result back into an ScmDataFrame.
db = df_append(db).timeseries().reset_index()
db["unit"] = db["unit"].apply(_lowercase_dimensionless)
db = ScmDataFrame(db)
db.head()

HBox(children=(IntProgress(value=0, max=81), HTML(value='')))




Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,1750-01-01 00:00:00,1751-01-01 00:00:00,1752-01-01 00:00:00,1753-01-01 00:00:00,1754-01-01 00:00:00,1755-01-01 00:00:00,1756-01-01 00:00:00,1757-01-01 00:00:00,1758-01-01 00:00:00,1759-01-01 00:00:00,...,3121-01-01 00:00:00,3122-01-01 00:00:00,3123-01-01 00:00:00,3124-01-01 00:00:00,3125-01-01 00:00:00,3126-01-01 00:00:00,3127-01-01 00:00:00,3128-01-01 00:00:00,3129-01-01 00:00:00,3130-01-01 00:00:00
model,scenario,region,variable,unit,climatemodel,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
unspecified,1pctCO2,World,Airborne Fraction|CO2,dimensionless,FaIR-1.5-DEFAULT,,,,,,,,,,,...,,,,,,,,,,
unspecified,1pctCO2,World,Airborne Fraction|CO2,dimensionless,OSCARv3.0,,,,,,,,,,,...,,,,,,,,,,
unspecified,1pctCO2,World,Airborne Fraction|CO2|16th quantile,dimensionless,OSCARv3.0,,,,,,,,,,,...,,,,,,,,,,
unspecified,1pctCO2,World,Airborne Fraction|CO2|84th quantile,dimensionless,OSCARv3.0,,,,,,,,,,,...,,,,,,,,,,
unspecified,1pctCO2,World,Emissions|CO2|MAGICC AFOLU,Gt C / yr,MAGICC7.1.0.beta-rcmip-phase-1,,,,,,,,,,,...,,,,,,,,,,


<IPython.core.display.Javascript object>

In [21]:
# List the distinct climate model labels that were loaded.
db["climatemodel"].unique()

array(['FaIR-1.5-DEFAULT', 'OSCARv3.0', 'MAGICC7.1.0.beta-rcmip-phase-1',
       'Cicero-SCM', 'Cicero-SCM-ECS3', 'ESCIMO,rcmip,base'], dtype=object)

<IPython.core.display.Javascript object>

### Minor quick fixes

We relabel all the ssp370-lowNTCF data to remove ambiguity.

In [22]:
# Ambiguous low-NTCF scenario names and the explicit "gidden" labels replacing them.
_LOW_NTCF_RELABELS = {
    "ssp370-lowNTCF": "ssp370-lowNTCF-gidden",
    "esm-ssp370-lowNTCF": "esm-ssp370-lowNTCF-gidden",
    "esm-ssp370-lowNTCF-allGHG": "esm-ssp370-lowNTCF-gidden-allGHG",
}

# Relabel on the flat table, then rebuild the ScmDataFrame. Names that are not
# in the mapping are left untouched.
db = db.timeseries().reset_index()
db["scenario"] = db["scenario"].apply(
    lambda name: _LOW_NTCF_RELABELS.get(name, name)
)
db = ScmDataFrame(db)

<IPython.core.display.Javascript object>

In [23]:
# None of the ambiguous low-NTCF names may remain after relabelling.
scenarios_present = db["scenario"].unique().tolist()
for ambiguous_name in (
    "ssp370-lowNTCF",
    "esm-ssp370-lowNTCF",
    "esm-ssp370-lowNTCF-allGHG",
):
    assert ambiguous_name not in scenarios_present

<IPython.core.display.Javascript object>

The Hector and MCE data are mislabelled, so we apply a quick fix here. I have also changed my mind about how to format the quantiles, so we tweak the FaIR and WASP data too.

In [24]:
# Probabilistic MCE submissions carry the quantile in the climate model label;
# move it into the variable name instead.
mce_prob_data = db.filter(climatemodel="MCE*PROB*")
# Expression result is discarded (only a bare last line of a cell is displayed).
mce_prob_data["climatemodel"].unique()
# Only rewrite when matching rows exist (none do for the current selection).
if not mce_prob_data.timeseries().empty:
    mce_prob_data = mce_prob_data.timeseries().reset_index()

    # Must run before "climatemodel" is overwritten below: the suffix is derived
    # from the original label (mce_get_quantile presumably extracts the quantile
    # number from it -- confirm in utils.database_generation).
    mce_prob_data["variable"] = (
        mce_prob_data["variable"]
        + "|"
        + mce_prob_data["climatemodel"].apply(mce_get_quantile)
        + "th quantile"
    )

    # Drop the last "-"-separated component of the climate model label.
    mce_prob_data["climatemodel"] = mce_prob_data["climatemodel"].apply(
        lambda x: "-".join(x.split("-")[:-1])
    )

    # Swap the original MCE rows for the relabelled ones.
    db = db.filter(climatemodel="MCE*PROB*", keep=False).append(mce_prob_data)

# Preview rows whose label matches "MCE*PROB" (no trailing wildcard).
db.filter(climatemodel="MCE*PROB").head(10)

Filtered ScmDataFrame is empty!
Filtered ScmDataFrame is empty!


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time
model,scenario,region,variable,unit,climatemodel


<IPython.core.display.Javascript object>

In [25]:
# Hector HISTCALIB submissions: move information from the climate model label
# into the variable name.
hector_prob_data = db.filter(climatemodel="hector*HISTCALIB*")
# Only rewrite when matching rows exist (none do for the current selection).
if not hector_prob_data.timeseries().empty:
    hector_prob_data = hector_prob_data.timeseries().reset_index()

    # Must run before "climatemodel" is overwritten below: the suffix is derived
    # from the original label (hector_get_quantile presumably returns the full
    # quantile suffix -- confirm in utils.database_generation).
    hector_prob_data["variable"] = (
        hector_prob_data["variable"]
        + "|"
        + hector_prob_data["climatemodel"].apply(hector_get_quantile)
    )

    # Keep only the part of the label before the first "-".
    hector_prob_data["climatemodel"] = hector_prob_data["climatemodel"].apply(
        lambda x: x.split("-")[0]
    )

    # Swap the original Hector rows for the relabelled ones.
    db = db.filter(climatemodel="hector*HISTCALIB*", keep=False).append(
        hector_prob_data
    )

# Preview rows whose label matches "hector*HISTCALIB" (no trailing wildcard).
db.filter(climatemodel="hector*HISTCALIB").head(10)

Filtered ScmDataFrame is empty!
Filtered ScmDataFrame is empty!


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time
model,scenario,region,variable,unit,climatemodel


<IPython.core.display.Javascript object>

In [26]:
def _unpad_fair_quantile(variable):
    """Rewrite zero-padded quantile labels: "|00th" -> "|0th", "|05th" -> "|5th"."""
    return variable.replace("|00th", "|0th").replace("|05th", "|5th")


# Normalise the quantile labels of the FaIR submissions.
fair_prob_data = db.filter(climatemodel="*FaIR*")
fair_has_rows = not fair_prob_data.timeseries().empty
if fair_has_rows:
    fair_prob_data = fair_prob_data.timeseries().reset_index()
    fair_prob_data["variable"] = fair_prob_data["variable"].apply(
        _unpad_fair_quantile
    )

    # Swap the original FaIR rows for the relabelled ones.
    db_without_fair = db.filter(climatemodel="*FaIR*", keep=False)
    db = db_without_fair.append(ScmDataFrame(fair_prob_data))

db.filter(climatemodel="*FaIR*").head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time,1850-01-01 00:00:00,1851-01-01 00:00:00,1852-01-01 00:00:00,1853-01-01 00:00:00,1854-01-01 00:00:00,1855-01-01 00:00:00,1856-01-01 00:00:00,1857-01-01 00:00:00,1858-01-01 00:00:00,1859-01-01 00:00:00,...,2491-01-01 00:00:00,2492-01-01 00:00:00,2493-01-01 00:00:00,2494-01-01 00:00:00,2495-01-01 00:00:00,2496-01-01 00:00:00,2497-01-01 00:00:00,2498-01-01 00:00:00,2499-01-01 00:00:00,2500-01-01 00:00:00
model,scenario,region,variable,unit,climatemodel,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
unspecified,1pctCO2,World,Effective Climate Sensitivity,K,FaIR-1.5-DEFAULT,,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
unspecified,1pctCO2,World,Surface Air Temperature Change,K,FaIR-1.5-DEFAULT,0.0,0.005087526,0.01417913,0.02642579,0.04116217,0.05786686,0.07613122,0.095635,0.1161272,0.137411,...,23.09019,23.1319,23.17361,23.21533,23.25705,23.29878,23.34051,23.38226,23.424,23.46575
unspecified,1pctCO2,World,Instantaneous TCRE,K/MtCO2,FaIR-1.5-DEFAULT,,2.293737e-07,2.820009e-07,3.218446e-07,3.52946e-07,3.774602e-07,3.968285e-07,4.121225e-07,4.241734e-07,4.336351e-07,...,1.670973e-08,1.657436e-08,1.644003e-08,1.630675e-08,1.617449e-08,1.604326e-08,1.591304e-08,1.578383e-08,1.565562e-08,1.55284e-08
unspecified,1pctCO2,World,Carbon Pool|Atmosphere,Mt CO2,FaIR-1.5-DEFAULT,2218008.0,2240188.0,2262590.0,2285216.0,2308068.0,2331148.0,2354460.0,2378005.0,2401785.0,2425802.0,...,1306062000.0,1319122000.0,1332313000.0,1345637000.0,1359093000.0,1372684000.0,1386411000.0,1400275000.0,1414278000.0,1428420000.0
unspecified,1pctCO2,World,Cumulative Emissions|CO2,Mt CO2,FaIR-1.5-DEFAULT,0.0,22180.08,50280.44,82107.3,116624.6,153305.8,191849.2,232054.8,273772.9,316881.7,...,1381841000.0,1395643000.0,1409584000.0,1423664000.0,1437885000.0,1452248000.0,1466754000.0,1481406000.0,1496204000.0,1511151000.0
unspecified,1pctCO2,World,Effective Radiative Forcing,W/m^2,FaIR-1.5-DEFAULT,0.0,0.05756472,0.1151294,0.1726942,0.2302589,0.2878236,0.3453883,0.4029531,0.4605178,0.5180825,...,36.89899,36.95655,37.01412,37.07168,37.12925,37.18681,37.24438,37.30194,37.35951,37.41707
unspecified,1pctCO2,World,Effective Radiative Forcing|Anthropogenic,W/m^2,FaIR-1.5-DEFAULT,0.0,0.05756472,0.1151294,0.1726942,0.2302589,0.2878236,0.3453883,0.4029531,0.4605178,0.5180825,...,36.89899,36.95655,37.01412,37.07168,37.12925,37.18681,37.24438,37.30194,37.35951,37.41707
unspecified,1pctCO2,World,Effective Radiative Forcing|Anthropogenic|Aerosols,W/m^2,FaIR-1.5-DEFAULT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unspecified,1pctCO2,World,Effective Radiative Forcing|Anthropogenic|Albedo Change,W/m^2,FaIR-1.5-DEFAULT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
unspecified,1pctCO2,World,Effective Radiative Forcing|Anthropogenic|CH4,W/m^2,FaIR-1.5-DEFAULT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<IPython.core.display.Javascript object>

In [27]:
# Normalise the quantile labels of the WASP submissions
# ("|00th" -> "|0th", "|05th" -> "|5th").
wasp_prob_data = db.filter(climatemodel="*WASP*")
if not wasp_prob_data.timeseries().empty:
    wasp_prob_data = wasp_prob_data.timeseries().reset_index()

    wasp_prob_data["variable"] = [
        label.replace("|00th", "|0th").replace("|05th", "|5th")
        for label in wasp_prob_data["variable"]
    ]

    # Swap the original WASP rows for the relabelled ones.
    relabelled_wasp = ScmDataFrame(wasp_prob_data)
    db = db.filter(climatemodel="*WASP*", keep=False).append(relabelled_wasp)

db.filter(climatemodel="*WASP*").head(10)

Filtered ScmDataFrame is empty!
Filtered ScmDataFrame is empty!


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,time
model,scenario,region,variable,unit,climatemodel


<IPython.core.display.Javascript object>

## Unify units and check names

Here we loop over the submissions and unify their units as well as checking their naming matches what we expect.

In [28]:
# Per climate model: unify units, then check scenario and variable/unit naming
# against the protocol. Models that pass are collected in `clean_db`; if any
# model fails, the cell raises once all models have been reported.
base_df = db.timeseries()
any_failures = False

clean_db = []
for climatemodel, cdf in tqdm.tqdm_notebook(
    base_df.groupby("climatemodel"), desc="Climate model"
):
    # Header for this model's section of the printed report.
    print(climatemodel)
    print("-" * len(climatemodel))

    any_failures_climatemodel = False

    cdf = ScmDataFrame(cdf)
    # Runs outside the try block, so an error raised here stops the whole cell.
    cdf_converted_units = unify_units(cdf, protocol_variables)
    try:
        check_all_scenarios_as_in_protocol(cdf_converted_units, protocol_scenarios)
        check_all_variables_and_units_as_in_protocol(
            cdf_converted_units, protocol_variables
        )
    except AssertionError as e:
        # Report the failed check and carry on with the remaining models.
        print(e)
        any_failures_climatemodel = True
    #     # currently not possible as groups weren't told to obey variable hierarchy,
    #     # add this in phase 2
    #     for v_top in cdf_converted_units.filter(level=0)["variable"].unique():
    #         print(v_top)
    #         cdf_pyam = cdf_converted_units.filter(variable="{}*".format(v_top)).timeseries()
    #         cdf_pyam.columns = cdf_pyam.columns.map(lambda x: x.year)

    #         cdf_consistency_checker = pyam.IamDataFrame(cdf_pyam)
    #         if cdf_consistency_checker.check_internal_consistency() is not None:
    #             print("Failed for {}".format(v_top))
    #             any_failures_climatemodel = True
    #             failing_set = cdf_consistency_checker.copy()

    print()
    if not any_failures_climatemodel:
        # Only models that passed every check go into the clean database.
        clean_db.append(cdf_converted_units)
        print("All clear for {}".format(climatemodel))
    else:
        print("Failed {}".format(climatemodel))
        print("X" * len("Failed"))
        any_failures = True

    print()
    print()

# Refuse to build a database from a partially valid set of submissions.
if any_failures:
    raise AssertionError("database isn't ready yet")
else:
    clean_db = df_append(clean_db)
    # Not displayed: only a bare expression on the last top-level line of a cell
    # is shown; the next cell repeats the preview.
    clean_db.head()

HBox(children=(IntProgress(value=0, description='Climate model', max=6, style=ProgressStyle(description_width=…

Cicero-SCM
----------


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))


All clear for Cicero-SCM


Cicero-SCM-ECS3
---------------


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))


All clear for Cicero-SCM-ECS3


ESCIMO,rcmip,base
-----------------


HBox(children=(IntProgress(value=0, max=11), HTML(value='')))


All clear for ESCIMO,rcmip,base


FaIR-1.5-DEFAULT
----------------


HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


All clear for FaIR-1.5-DEFAULT


MAGICC7.1.0.beta-rcmip-phase-1
------------------------------


HBox(children=(IntProgress(value=0, max=226), HTML(value='')))


All clear for MAGICC7.1.0.beta-rcmip-phase-1


OSCARv3.0
---------


HBox(children=(IntProgress(value=0, max=546), HTML(value='')))


All clear for OSCARv3.0





<IPython.core.display.Javascript object>

In [29]:
# Preview the first rows of the cleaned, unit-unified database.
clean_db.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,time,1750-01-01 00:00:00,1751-01-01 00:00:00,1752-01-01 00:00:00,1753-01-01 00:00:00,1754-01-01 00:00:00,1755-01-01 00:00:00,1756-01-01 00:00:00,1757-01-01 00:00:00,1758-01-01 00:00:00,1759-01-01 00:00:00,...,3121-01-01 00:00:00,3122-01-01 00:00:00,3123-01-01 00:00:00,3124-01-01 00:00:00,3125-01-01 00:00:00,3126-01-01 00:00:00,3127-01-01 00:00:00,3128-01-01 00:00:00,3129-01-01 00:00:00,3130-01-01 00:00:00
model,scenario,region,variable,unit,climatemodel,unit_context,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
unspecified,1pctCO2,World,Surface Air Ocean Blended Temperature Change,K,Cicero-SCM,not_required,0.0,0.0,0.0,-1.09868e-07,-3.75629e-07,-5.73132e-07,-7.23049e-07,-8.40909e-07,-9.36405e-07,-1.01574e-06,...,,,,,,,,,,
unspecified,1pctCO2,World,Surface Air Temperature Change,K,Cicero-SCM,not_required,0.0,0.0,0.0,-2.13313e-07,-5.4928e-07,-7.2698e-07,-8.61866e-07,-9.67909e-07,-1.05383e-06,-1.12521e-06,...,,,,,,,,,,
unspecified,1pctCO2,World,Surface Ocean Temperature Change,K,Cicero-SCM,not_required,0.0,0.0,0.0,-6.52088e-08,-3.00961e-07,-5.07225e-07,-6.63699e-07,-7.86666e-07,-8.86275e-07,-9.6902e-07,...,,,,,,,,,,
unspecified,1pctCO2,World,Surface Air Ocean Blended Temperature Change,K,Cicero-SCM-ECS3,not_required,0.0,0.0,0.0,-1.5686e-07,-5.16675e-07,-7.70094e-07,-9.5865e-07,-1.10597e-06,-1.22584e-06,-1.32656e-06,...,,,,,,,,,,
unspecified,1pctCO2,World,Surface Air Temperature Change,K,Cicero-SCM-ECS3,not_required,0.0,0.0,0.0,-3.17086e-07,-7.89e-07,-1.01622e-06,-1.18528e-06,-1.31737e-06,-1.42485e-06,-1.51516e-06,...,,,,,,,,,,


<IPython.core.display.Javascript object>

Notes whilst doing this:

- I wasn't clear that the variable hierarchy needs to be obeyed, hence doing internal consistency checks isn't going to work

For phase 2:

- checking internal consistency is super slow; it is worth looping over top-level variables when doing this to speed up filtering
- need to decide what a sensible tolerance is
- might have to go back to model notes to work out why there are inconsistencies
- will have to implement a custom hack to deal with the double counting in the direct aerosol forcing hierarchy

## Creating a database

In [30]:
# Write the cleaned database below OUTPUT_DATABASE_PATH under the key
# "rcmip-phase-1". NOTE(review): the progress bars suggest the data is split by
# climate model, region and variable -- confirm the on-disk layout in
# utils.database_generation.save_into_database.
save_into_database(clean_db, OUTPUT_DATABASE_PATH, "rcmip-phase-1")

HBox(children=(IntProgress(value=0, description='Climate models', max=6, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=165, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=165, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=11, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=57, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=226, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Regions', max=1, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='Variables', max=546, style=ProgressStyle(description_width='i…



<IPython.core.display.Javascript object>

In [31]:
# Display the cleaned database object (only its default repr is shown).
clean_db

<scmdata.dataframe.ScmDataFrame at 0x7f633af54b10>

<IPython.core.display.Javascript object>