# Prepare input data for ClimateBench

In [1]:
import xarray as xr
import os.path
import pandas as pd
from glob import glob
import xesmf as xe

import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Data paths on JASMIN: where the raw input4MIPs files live and where the
# processed ClimateBench files are kept.
input_path = '/gws/nopw/j04/impala/shared/input4MIPS/'
output_path = '/home/users/dwatsonparris/ClimateBench/'

# Glob templates, filled in later with str.format(species=..., scenario=...).
# The netCDF and CSV templates differ only in their file extension.
_path_stem = input_path + '/{species}*-{scenario}-1-1_gn*'
path_f = _path_stem + '.nc'
path_f_csv = _path_stem + '.csv'

In [3]:
# ScenarioMIP experiments for which emission inputs are built
experiments = ['ssp126', 'ssp245', 'ssp370', 'ssp370-lowNTCF', 'ssp585']

In [4]:
# All units are in kg m-2 s-1
# Convert to GtC -> multiply by area of earth, period I've summed, and divide by weight of carbon
AREA_OF_EARTH = 5.101e+14  # m2
SECONDS_IN_YEAR = 60*60*24*365  # s (365-day year)
MASS_OF_CARBON = 1.9944235e-26  # kg
# NOTE(review): presumably the carbon share of CO2 (6 / (6 + 8 + 8) == 12 / 44) - confirm
FRACTION_OF_CARBON_MASS = (6) / (6 + 8 + 8)


def convert(x):
    """Scale a flux to a global, annual total in Gt.

    Multiplies ``x`` by ``AREA_OF_EARTH`` and ``SECONDS_IN_YEAR`` and then by
    1e-12 (kg -> Gt). Works on anything that supports multiplication by a float.
    """
    return x * AREA_OF_EARTH * SECONDS_IN_YEAR * 1e-12  # kg -> Gt

def global_mean(ds):
    """Return the cos(latitude)-weighted mean of ``ds`` over "lon" and "lat".

    If ``ds`` has a 'plev' coordinate it is summed over 'plev' first.
    The weights are built from ``ds.lat``, which is taken to be in degrees.
    """
    collapsed = ds.sum('plev') if 'plev' in ds.coords else ds

    lat_weights = np.cos(np.deg2rad(ds.lat))
    lat_weights.name = "weights"

    return collapsed.weighted(lat_weights).mean(("lon", "lat"))

def global_sum(ds):
    """Return the global total of a per-unit-area field.

    If ``ds`` has a 'plev' coordinate it is summed over 'plev' first. The field
    is then averaged over "lon" and "lat" with cos(latitude) weights and
    multiplied by ``AREA_OF_EARTH``, i.e. a quantity per m2 becomes a global total.

    The earlier version used ``weighted(...).sum(...)``. xarray does not
    normalise the weights in a weighted sum, so multiplying that by the area of
    the Earth overcounted the total by (roughly) the sum of the grid-cell weights.
    """
    if 'plev' in ds.coords:
        res = ds.sum('plev')
    else:
        res = ds

    weights = np.cos(np.deg2rad(ds.lat))
    weights.name = "weights"
    # Area-weighted mean x total area == area integral of the field.
    # NOTE(review): the weighted mean skips NaN cells rather than counting them as zero.
    gl_total = res.weighted(weights).mean(("lon", "lat")) * AREA_OF_EARTH

    return gl_total

# Setup regridding

In [5]:
# Example NorESM grid: a processed NorESM2-LM file opened from output_path,
# used as the target grid of the regridder
noresm_grid = xr.open_dataset(output_path+'/NorESM2-LM_ssp245-GHG_r1i1p1f1.nc')
# Example input4MIPS grid: the ssp585 CO2 files, summed over the 'sector' dimension,
# used as the source grid of the regridder
example_input4MIPs_ds = xr.open_mfdataset(path_f.format(species="CO2", scenario='ssp585'), combine='by_coords').sum('sector')
# Conservative, periodic regridder from the input4MIPs grid to the NorESM grid.
# The NorESM 'lat'/'lon' are renamed to 'latitude'/'longitude' before it is built,
# so regridded output carries 'latitude'/'longitude' coordinates.
regridder = xe.Regridder(example_input4MIPs_ds, noresm_grid.rename({'lat': 'latitude', 'lon': 'longitude'}), 'conservative', periodic=True)

FileNotFoundError: [Errno 2] No such file or directory: '/home/users/dwatsonparris/ClimateBench/NorESM2-LM_ssp245-GHG_r1i1p1f1.nc'

In [None]:
# Regrid the example emissions dataset to check the regridder is working
coarse_emissions = regridder(example_input4MIPs_ds)

# Print the total and map the January 2015 anthropogenic CO2 emissions on the coarse grid
coarse_co2_jan2015 = coarse_emissions.sel(time='2015-01')['CO2_em_anthro']
print(coarse_co2_jan2015.sum().compute())
coarse_co2_jan2015.squeeze().plot.pcolormesh(vmin=1e-7, vmax=2e-7)

In [None]:
# Same total and map on the native input4MIPs grid, for comparison with the regridded field
native_co2_jan2015 = example_input4MIPs_ds.sel(time='2015-01')['CO2_em_anthro']
print(native_co2_jan2015.sum().compute())
native_co2_jan2015.squeeze().plot.pcolormesh(vmin=1e-7, vmax=2e-7)

# Get input4MIPS

In [None]:
# Deal with CMIP experiments

# Baseline CO2 mass (kg): the mean of 'co2mass' over the piControl files
baseline_co2_mass = xr.open_mfdataset(input_path+'/co2mass_Amon_NorESM2-LM_piControl*.nc')['co2mass'].mean()

# 1pctCO2 and abrupt-4xCO2 are processed in exactly the same way
for cmip_exp in ('1pctCO2', 'abrupt-4xCO2'):
    annual_co2_mass = xr.open_mfdataset(f'{input_path}/co2mass_Amon_NorESM2-LM_{cmip_exp}*.nc')['co2mass'].groupby('time.year').mean()

    # Annual-mean CO2 mass in excess of the piControl baseline
    co2_ts = (annual_co2_mass - baseline_co2_mass).rename('CO2')  * 1e-12 # kg -> Gt

    # The other species are all zero in these experiments
    so2_ts = xr.zeros_like(co2_ts).rename('SO2')
    ch4_ts = xr.zeros_like(co2_ts).rename('CH4')
    bc_ts = xr.zeros_like(co2_ts).rename('BC')

    co2_ts.plot()

    ds = xr.merge([co2_ts, so2_ts, ch4_ts, bc_ts])
    ds.rename({'year': 'time'}).to_netcdf(f'inputs_{cmip_exp}.nc')



In [None]:
# Use the global checksum CSVs: sum every (month, sector) row into one total per year

# Aircraft CO2 emissions
all_files = glob(input_path+'/CO2-em-AIR-anthro_input4MIPs_emissions_CMIP_CEDS-2017-05-18_gn_*.csv')
co2_air = pd.concat((pd.read_csv(f, index_col=['year', 'month', 'sector']) for f in all_files))
co2_air = co2_air.groupby(level=['year']).sum() / 1e6  # kt -> Gt

# All other anthropogenic CO2 emissions
all_files = glob(input_path+'/CO2-em-anthro_input4MIPs_emissions_CMIP_CEDS-2017-05-18_gn_*.csv')
co2 = pd.concat((pd.read_csv(f, index_col=['year', 'month', 'sector']) for f in all_files))
co2 = co2.groupby(level=['year']).sum() / 1e6  # kt -> Gt

# Keep 1850 onwards. Select by year label rather than by row position ([100:]),
# so the result does not depend on which year the checksum files start in.
co2_tot = (co2_air + co2).loc[1850:]
co2_tot

In [None]:
# Use the global checksums
all_files = glob(input_path+'/CH4-em-anthro_input4MIPs_emissions_CMIP_CEDS-*.csv')
# Read every checksum CSV, renaming its 'value' column to 'global_total'
ch4=pd.concat((pd.read_csv(f, index_col=['year', 'month', 'sector']).rename(columns={'value': 'global_total'}) for f in all_files)) 
# Fill in the data up to 1970
# Sum to one total per year, reindex onto 1850-2014 and linearly interpolate the gaps.
# NOTE(review): a default (forward) interpolate may leave years before the first
# available record as NaN rather than filling them - confirm this is intended.
ch4 = ch4.groupby(level=['year']).sum().reindex(range(1850, 2015)).interpolate('linear') / 1e6 # kt -> Gt
# Note the AIR component for methane is zero over the historical period
ch4

In [None]:
# historical

# Global time series from the checksum tables: cumulative CO2 and annual CH4
hist_co2_ts = xr.DataArray(co2_tot.cumsum()['global_total'], name='CO2')
hist_ch4_ts = xr.DataArray(ch4['global_total'], name='CH4')

# Gridded aerosol precursors: sum the sectors, take annual means, keep 1850 onwards, then regrid
so2_hist_da = xr.open_mfdataset(input_path+'/SO2-em-anthro_input4MIPs_emissions_CMIP_CEDS-*.nc')['SO2_em_anthro']
hist_so2_ts = regridder(so2_hist_da.sum('sector').groupby('time.year').mean().sel(year=slice(1850, None)).rename('SO2'))

bc_hist_da = xr.open_mfdataset(input_path+'/BC-em-anthro_input4MIPs_emissions_CMIP_CEDS-*.nc')['BC_em_anthro']
hist_bc_ts = regridder(bc_hist_da.sum('sector').groupby('time.year').mean().sel(year=slice(1850, None)).rename('BC'))

# Set a fill value to set the methane to zero before 1970..
historical_vars = [hist_co2_ts, hist_so2_ts, hist_ch4_ts, hist_bc_ts]
ds = xr.merge(historical_vars, fill_value=0.)
# Fix some metadata to help iris
ds.latitude.attrs['units'] = 'degrees'
ds.rename({'year': 'time'}).to_netcdf('inputs_historical.nc')

In [None]:
# Deal with DAMIP experiments
# Candidates: 'hist-GHG', 'hist-aer', 'hist-nat', 'hist-piAer', 'hist-piNTCF'
# (only hist-GHG and hist-aer are built here)

# hist-GHG
# Time varying global annual mean concentrations for CO2 and other long-lived greenhouse-gases 
# https://view.es-doc.org/?renderMethod=name&project=cmip6&type=cim.2.designing.NumericalExperiment&client=esdoc-url-rewrite&name=hist-ghg

# Historical CO2 and CH4, with the aerosol species (BC, SO2) zeroed out
ds = xr.merge([hist_co2_ts,
               hist_ch4_ts,
               xr.zeros_like(hist_bc_ts).rename('BC'),
               xr.zeros_like(hist_so2_ts).rename('SO2'),
              ])
# Fix some metadata to help iris (the zeroed BC/SO2 fields are still gridded,
# so this file needs it just like the other gridded outputs)
ds.latitude.attrs['units'] = 'degrees'
print(ds)
ds.rename({'year': 'time'}).to_netcdf('inputs_hist-GHG.nc')

# hist-aer
# forced by changes in anthropogenic aerosol forcing only 
# https://view.es-doc.org/?renderMethod=name&project=cmip6&type=cim.2.designing.NumericalExperiment&client=esdoc-url-rewrite&name=hist-aer

# Historical BC and SO2, with the greenhouse gases (CO2, CH4) zeroed out
ds = xr.merge([xr.zeros_like(hist_co2_ts).rename('CO2'),
               xr.zeros_like(hist_ch4_ts).rename('CH4'),
               hist_bc_ts,
               hist_so2_ts,
              ])
# Fix some metadata to help iris
ds.latitude.attrs['units'] = 'degrees'
print(ds)
ds.rename({'year': 'time'}).to_netcdf('inputs_hist-aer.nc')



In [None]:

for exp in experiments:
    print(exp)

    # According to email from Dirk on 14th December the NorESM2 simulations on ESGF
    #  used the ssp370 GHG emissions for ssp370-lowNTCF (as per the AerChemMIP spec, but not the input4MIPS files)
    if exp == 'ssp370-lowNTCF':
        GHG_exp = 'ssp370'
    else:
        GHG_exp = exp

    # Get the CO2 emissions: annual totals from the global checksum CSVs
    co2_files = glob(path_f_csv.format(species="CO2", scenario=GHG_exp))
    co2 = pd.concat((pd.read_csv(f, index_col=['year', 'month', 'sector']) for f in co2_files)).groupby(level=['year']).sum() / 1e6  # presumably kt -> Gt, as for the historical checksums
    co2_ts = xr.DataArray(co2['global_total'], name='CO2')

    # Get the current SO2 emissions: sector sum, annual mean, regridded
    so2_f = path_f.format(species="SO2", scenario=exp)
    so2_da = xr.open_mfdataset(so2_f, combine='by_coords')['SO2_em_anthro']
    so2_ts = regridder(so2_da.sum('sector').groupby('time.year').mean())

    # Get the current CH4 emissions: annual totals from the global checksum CSVs
    ch4_files = glob(path_f_csv.format(species="CH4", scenario=GHG_exp))
    ch4 = pd.concat((pd.read_csv(f, index_col=['year', 'month', 'sector']) for f in ch4_files)).groupby(level=['year']).sum() / 1e3  # The SSP data is in Mt
    ch4_ts = xr.DataArray(ch4['value'], name='CH4')

    # Get the current BC emissions: sector sum, annual mean, regridded.
    # Only the template fields are passed to format(); str.format ignores any
    # other keyword argument, so nothing else belongs in this call.
    bc_f = path_f.format(species="BC", scenario=exp)
    bc_da = xr.open_mfdataset(bc_f, combine='by_coords')['BC_em_anthro']
    bc_ts = regridder(bc_da.sum('sector').groupby('time.year').mean())

    # These come in monthly means every 5 years, so take the annual mean then interpolate to every year
    years = list(range(2015, 2101))

    ds = xr.merge([co2_ts.rename('CO2'), so2_ts.rename('SO2'), ch4_ts.rename('CH4'), bc_ts.rename('BC')]).interp(year=years, method='linear')

    # Convert the CO2 emissions to *cumulative* CO2 mass in atmosphere. Do this *after* interpolating on to yearly to get the right totals.
    ds['CO2'] = ds['CO2'].cumsum() + hist_co2_ts[-1]  # Be sure to add the current total in 2015
    ds['CH4'].plot(label=exp)

    # Fix some metadata to help iris
    ds.latitude.attrs['units'] = 'degrees'
    ds = ds.rename({'year': 'time'})

    if exp == "ssp245":
        # This is a bit nasty but I want to keep the ssp245 in memory for later
        ssp245_ts = ds
    ds.to_netcdf(f"inputs_{exp}.nc")

plt.legend()

In [None]:
# ssp245-GHG
# Time varying global annual mean concentrations for CO2 and other long-lived greenhouse-gases 
# https://view.es-doc.org/?renderMethod=name&project=cmip6&type=cim.2.designing.NumericalExperiment&client=esdoc-url-rewrite&name=hist-ghg

# ssp245 CO2 and CH4, with BC and SO2 replaced by zeros
ghg_only_vars = [
    ssp245_ts['CO2'],
    ssp245_ts['CH4'],
    xr.zeros_like(ssp245_ts['BC']),
    xr.zeros_like(ssp245_ts['SO2']),
]
ds = xr.merge(ghg_only_vars)
print(ds)
ds.to_netcdf('inputs_ssp245-GHG.nc')

# ssp245-aer
# forced by changes in anthropogenic aerosol forcing only 
# https://view.es-doc.org/?renderMethod=name&project=cmip6&type=cim.2.designing.NumericalExperiment&client=esdoc-url-rewrite&name=hist-aer

# ssp245 BC and SO2, with CO2 and CH4 replaced by zeros
aerosol_only_vars = [
    xr.zeros_like(ssp245_ts['CO2']),
    xr.zeros_like(ssp245_ts['CH4']),
    ssp245_ts['BC'],
    ssp245_ts['SO2'],
]
ds = xr.merge(aerosol_only_vars)
# Fix some metadata to help iris
ds.latitude.attrs['units'] = 'degrees'
print(ds)
ds.to_netcdf('inputs_ssp245-aer.nc')


# Pull down the timeslice ERF values in case they're useful

In [None]:
# Download the RFMIP tier-2 ERF table, indexed by its first column
erf_url = 'https://raw.githubusercontent.com/chrisroadmap/aerosol-history/main/data_input/RFMIP-ERF-tier2.csv'
erf_df = pd.read_csv(erf_url, index_col=0)

# Keep rows up to label 2014 and only the columns whose name starts with 'NorESM2-LM'
is_noresm_column = erf_df.columns.str.startswith('NorESM2-LM')
noresm_erf_df = erf_df.loc[:2014, is_noresm_column]

# Drop the model-name prefix from the column labels
noresm_erf_df.columns = noresm_erf_df.columns.str.replace("NorESM2-LM ", "")
print(noresm_erf_df)
noresm_erf_df.to_csv('inputs_NorESM2_ERF.csv')

In [None]:
# These might also be useful: https://github.com/njleach/GIR/tree/master/GIR/Parameter_Sets

# Merge ensemble members

In [None]:
# Experiments whose ensemble members are merged below, grouped by MIP
cmip_experiments = ['1pctCO2', 'abrupt-4xCO2', 'historical', 'piControl']
damip_experiments = ['hist-GHG', 'hist-aer']
scenariomip_experiments = ['ssp126', 'ssp245', 'ssp370', 'ssp370-lowNTCF', 'ssp585']
experiments = cmip_experiments + damip_experiments + scenariomip_experiments

# experiments = ['ssp245-aer', 'ssp245-GHG']

def preprocess(ds):
    """
    Promote the ensemble member attribute to a coord.

    The label is the third '_'-separated field of ``ds.attrs['source']`` and is
    attached as a length-one "member" coordinate; the new object is returned.
    """
    member_label = ds.attrs['source'].split('_')[2]
    return ds.assign_coords(member=("member", [member_label]))

# Subtract baseline values
## Be sure to not use the first ~100 years, which have a weird diurnal temperature range
piControl_file = "/gws/nopw/j04/impala/public/dwatsonparris/ClimateBench/NorESM2-LM_piControl_r1i1p1f1.nc"
# Baseline = piControl averaged over 'year', using only years from 1800 onwards
baseline = xr.open_dataset(piControl_file).sel(year=slice(1800, None)).mean('year')


for exp in experiments:
    # glob returns matches in arbitrary order; sort them so the order of the
    # members in the written file is reproducible between runs.
    files = sorted(glob(f"NorESM2-LM_{exp}_*"))
    ens_ds = []
    for f in files:
        ds = xr.open_dataset(f)
        # Label the run with the second character of the third '_'-separated field
        # of the file name (presumably the realisation digit of an r<N>i... label;
        # only single-digit members are handled). Assigning the list is what
        # provides the 'member' dimension that the runs are concatenated along.
        ds['member'] = [int(f.split('_')[2][1])]
        ens_ds.append(ds)
    en_ds = xr.concat(ens_ds, dim='member')
    # Express everything as an anomaly from the piControl baseline (except piControl itself)
    if exp != "piControl":
        en_ds = en_ds - baseline
    # Fix some metadata to help iris
    en_ds.lat.attrs['units'] = 'degrees'
    en_ds = en_ds.rename({'year': 'time'})
    print(en_ds)
    # For some reason these runs go until 2020
    if exp.startswith("hist-"):
        en_ds = en_ds.sel(time=slice(None, 2014))

    if exp == '1pctCO2':
        print(en_ds['diurnal_temperature_range'])
    en_ds.to_netcdf(f'outputs_{exp}.nc')

# Create the tar-balls

In [None]:
import tarfile
import os.path

def make_tarfile(output_filename, files):
    """Write a gzip-compressed tar archive containing the given paths.

    Parameters:
        output_filename: path of the archive to create (opened in "w:gz" mode,
            so an existing file is overwritten).
        files: iterable of paths; each is added to the archive under the name it is given as.
    """
    with tarfile.open(output_filename, "w:gz") as archive:
        for member_path in files:
            archive.add(member_path)

# Training/validation archive: everything except ssp245.
# piControl contributes outputs only (it is excluded from the inputs list).
held_out_exp = "ssp245"
train_val_inputs = [f"inputs_{exp}.nc" for exp in experiments if exp not in [held_out_exp, "piControl"]]
train_val_outputs = [f"outputs_{exp}.nc" for exp in experiments if exp != held_out_exp]
train_val_files = train_val_inputs + train_val_outputs
make_tarfile('train_val.tar.gz', train_val_files)

# Test archive: the ssp245 inputs and outputs only
test_files = ["inputs_ssp245.nc", "outputs_ssp245.nc"]
make_tarfile('test.tar.gz', test_files)