
Compute the RPSS and plot the achieved RPSS on a map and as a time series.
Also compute the RPSS for each variable and lead time separately.


In [None]:
import tensorflow.keras as keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv2D, MaxPooling2D, Dropout, Reshape, Dot, Add, Activation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


import xarray as xr
xr.set_options(display_style='text')

import xskillscore as xs



from scripts import skill_by_year, add_year_week_coords
from helper_ml_data import load_data, get_basis, rm_annualcycle, rm_tercile_edges, rm_tercile_edges1, DataGenerator1, single_prediction, skill_by_year_single


import warnings
warnings.simplefilter("ignore")

from scripts import assert_predictions_2020

In [None]:
# Value handed to load_data(..., path=path_data) when reading observations.
path_data = 'server'

# Default variable name and lead-time index used by the grid-cell-wise cells.
v = 't2m'
lead_output = 0

In [None]:
def gridcellwise_rpss(fct_p, obs_p, v):
    """Return 1 - RPS(forecast) / RPS(climatology) without reducing any dimension.

    Parameters
    ----------
    fct_p
        Forecast category probabilities, passed to ``xs.rps`` as the forecast.
    obs_p
        Observed category probabilities, passed to ``xs.rps`` as the observation.
    v
        Variable name used to pick the climatology; must be ``'tp'`` or
        ``'t2m'`` (any other key fails on the dataset lookup).

    Returns
    -------
    The un-averaged skill score ``1 - rps_ML / rps_clim`` (``dim=[]`` is passed
    to ``xs.rps``, so no averaging is requested there).
    """
    # Climatological reference: probability 1/3 for each of the three categories.
    tercile_names = ['below normal', 'near normal', 'above normal']
    uniform = xr.DataArray([1/3, 1/3, 1/3], dims='category', coords={'category': tercile_names})
    clim_by_var = uniform.to_dataset(name='tp')
    clim_by_var['t2m'] = clim_by_var['tp']
    clim_p = clim_by_var[v]

    # Ranked probability score of the forecast and of the climatology.
    rps_ML = xs.rps(
        obs_p, fct_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()
    rps_clim = xs.rps(
        obs_p, clim_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()

    return 1 - (rps_ML / rps_clim)



In [None]:
def skill_by_year_byvar_andlead(preds, cache_path = '../../../../Data/s2s_ai/data', adapt=False):
    """Return a pd.DataFrame of RPSS per year, kept separate per variable and lead time.

    Similar to verification_RPSS.ipynb / the scorer bot, but one score is
    returned for each year, and the final average over ``lead_time`` and
    ``variable`` is not taken.

    Parameters
    ----------
    preds
        Forecast probabilities. Its ``forecast_time`` decides which observation
        file is opened and is used to select the matching observations.
    cache_path : str
        Directory containing the terciled observation files.
    adapt : bool
        If true, observations are subset to the longitude, latitude,
        forecast_time and lead_time of ``preds`` and to its data variables.
        Otherwise ``assert_predictions_2020`` is run on both observations and
        predictions.

    Returns
    -------
    pandas.DataFrame
        ``to_dataframe('RPSS')`` of the area-averaged, yearly RPSS.
    """
    import xarray as xr
    import xskillscore as xs
    import pandas as pd
    import numpy as np
    xr.set_options(keep_attrs=True)

    # Observation files are pulled from root with:
    #   renku storage pull data/forecast-like-observations_2020_biweekly_terciled.nc
    #   renku storage pull data/hindcast-like-observations_2000-2019_biweekly_terciled.nc
    if 2020 in preds.forecast_time.dt.year:
        obs_all = xr.open_dataset(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc')
    else:
        obs_all = xr.open_dataset(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_terciled.zarr', engine='zarr')
    obs_p = obs_all.sel(forecast_time=preds.forecast_time)

    # ML probabilities
    fct_p = preds

    # Climatology: 1/3 per tercile category, identical for tp and t2m.
    uniform = xr.DataArray(
        [1/3, 1/3, 1/3],
        dims='category',
        coords={'category': ['below normal', 'near normal', 'above normal']},
    )
    clim_p = uniform.to_dataset(name='tp')
    clim_p['t2m'] = clim_p['tp']

    if not adapt:
        # check inputs
        assert_predictions_2020(obs_p)
        assert_predictions_2020(fct_p)
    else:
        # keep only the observations for which forecasts are provided
        for coord_name in ['longitude', 'latitude', 'forecast_time', 'lead_time']:
            obs_p = obs_p.sel({coord_name: fct_p[coord_name]})
        obs_p = obs_p[list(fct_p.data_vars)]
        clim_p = clim_p[list(fct_p.data_vars)]

    # RPS of the ML forecast and of the climatology, no dimension reduced.
    rps_ML = xs.rps(
        obs_p, fct_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()
    rps_clim = xs.rps(
        obs_p, clim_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()

    # Penalty, see
    # https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/issues/7
    # `expect` keeps the category sum only where it lies strictly between
    # 0.98 and 1.02 (should be True if not all NaN).
    category_total = obs_p.sum('category')
    expect = category_total.where(category_total > 0.98).where(category_total < 1.02)

    # https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/issues/50
    # Intended to assign RPS=2 where a value was expected but NaN found.
    rps_ML = rps_ML.where(expect, other=2)

    # Ratio of yearly means, following Weigel 2007: https://doi.org/10.1175/MWR3280.1
    yearly_ML = rps_ML.groupby('forecast_time.year').mean()
    yearly_clim = rps_clim.groupby('forecast_time.year').mean()
    rpss = (1 - (yearly_ML / yearly_clim)).clip(-10, 1)

    # Cosine-of-latitude weighted mean over latitude, then plain mean over
    # longitude; only latitude labels selected by slice(None, -60) are used.
    weights = np.cos(np.deg2rad(np.abs(rpss.latitude)))
    lat_subset = rpss.sel(latitude=slice(None, -60))
    scores = lat_subset.weighted(weights).mean('latitude').mean('longitude')

    # Variables become a dimension; lead_time and variable are NOT averaged.
    return scores.to_array().to_dataframe('RPSS')

In [None]:
def skill_by_year_old(preds, cache_path = '../../../../Data/s2s_ai/data',adapt=False):
    """Return a pd.DataFrame with one RPSS value per year (older scoring variant).

    Similar to verification_RPSS.ipynb / the scorer bot, but returns a score
    for each year. Here the RPSS is formed per forecast first and averaged per
    year afterwards, then averaged over ``lead_time`` and ``variable``.

    Parameters
    ----------
    preds
        Forecast probabilities. Its ``forecast_time`` decides which observation
        file is opened and is used to select the matching observations.
    cache_path : str
        Directory containing the terciled observation files.
    adapt : bool
        If true, observations are subset to the longitude, latitude,
        forecast_time and lead_time of ``preds`` and to its data variables.
        Otherwise ``assert_predictions_2020`` is run on both observations and
        predictions.

    Returns
    -------
    pandas.DataFrame
        ``to_dataframe('RPSS')`` of the yearly, area-averaged RPSS.
    """
    import xarray as xr
    import xskillscore as xs
    import pandas as pd
    import numpy as np
    xr.set_options(keep_attrs=True)

    # Observation files are pulled from root with:
    #   renku storage pull data/forecast-like-observations_2020_biweekly_terciled.nc
    #   renku storage pull data/hindcast-like-observations_2000-2019_biweekly_terciled.nc
    if 2020 in preds.forecast_time.dt.year:
        obs_all = xr.open_dataset(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc')
    else:
        obs_all = xr.open_dataset(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_terciled.zarr', engine='zarr')
    obs_p = obs_all.sel(forecast_time=preds.forecast_time)

    # ML probabilities
    fct_p = preds

    # Climatology: 1/3 per tercile category, identical for tp and t2m.
    uniform = xr.DataArray(
        [1/3, 1/3, 1/3],
        dims='category',
        coords={'category': ['below normal', 'near normal', 'above normal']},
    )
    clim_p = uniform.to_dataset(name='tp')
    clim_p['t2m'] = clim_p['tp']

    if not adapt:
        # check inputs
        assert_predictions_2020(obs_p)
        assert_predictions_2020(fct_p)
    else:
        # keep only the observations for which forecasts are provided
        for coord_name in ['longitude', 'latitude', 'forecast_time', 'lead_time']:
            obs_p = obs_p.sel({coord_name: fct_p[coord_name]})
        obs_p = obs_p[list(fct_p.data_vars)]
        clim_p = clim_p[list(fct_p.data_vars)]

    # RPS of the ML forecast and of the climatology, no dimension reduced.
    rps_ML = xs.rps(
        obs_p, fct_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()
    rps_clim = xs.rps(
        obs_p, clim_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()

    # Skill score per individual forecast.
    rpss = 1 - (rps_ML / rps_clim)

    # Penalty, see
    # https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/issues/7
    # RPSS is set to -10 wherever the category mean of the masked observations is 0.
    penalize = obs_p.where(fct_p!=1, other=-10).mean('category')
    rpss = rpss.where(penalize!=0, other=-10)

    # Clip, then average over all forecasts of each year.
    rpss = rpss.clip(-10, 1)
    rpss = rpss.groupby('forecast_time.year').mean()

    # Cosine-of-latitude weighted mean over latitude, then plain mean over
    # longitude; only latitude labels selected by slice(None, -60) are used.
    weights = np.cos(np.deg2rad(np.abs(rpss.latitude)))
    lat_subset = rpss.sel(latitude=slice(None, -60))
    scores = lat_subset.weighted(weights).mean('latitude').mean('longitude')

    # Collapse lead times and variables to one single value per year.
    scores = scores.to_array().mean(['lead_time', 'variable'])
    return scores.to_dataframe('RPSS')


In [None]:
def skill_by_year_old_single(preds, cache_path = '../../../../Data/s2s_ai/data',adapt=False):
    """Return a pd.DataFrame of yearly RPSS per variable and lead time (older variant).

    Similar to verification_RPSS.ipynb / the scorer bot, but returns a score
    for each year. The RPSS is formed per forecast first and averaged per year
    afterwards; the final average over ``lead_time`` and ``variable`` is not
    taken.

    Parameters
    ----------
    preds
        Forecast probabilities. Its ``forecast_time`` decides which observation
        file is opened and is used to select the matching observations.
    cache_path : str
        Directory containing the terciled observation files.
    adapt : bool
        If true, observations are subset to the longitude, latitude,
        forecast_time and lead_time of ``preds`` and to its data variables.
        Otherwise ``assert_predictions_2020`` is run on both observations and
        predictions.

    Returns
    -------
    pandas.DataFrame
        ``to_dataframe('RPSS')`` of the yearly, area-averaged RPSS.
    """
    import xarray as xr
    import xskillscore as xs
    import pandas as pd
    import numpy as np
    xr.set_options(keep_attrs=True)

    # Observation files are pulled from root with:
    #   renku storage pull data/forecast-like-observations_2020_biweekly_terciled.nc
    #   renku storage pull data/hindcast-like-observations_2000-2019_biweekly_terciled.nc
    if 2020 in preds.forecast_time.dt.year:
        obs_all = xr.open_dataset(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc')
    else:
        obs_all = xr.open_dataset(f'{cache_path}/hindcast-like-observations_2000-2019_biweekly_terciled.zarr', engine='zarr')
    obs_p = obs_all.sel(forecast_time=preds.forecast_time)

    # ML probabilities
    fct_p = preds

    # Climatology: 1/3 per tercile category, identical for tp and t2m.
    uniform = xr.DataArray(
        [1/3, 1/3, 1/3],
        dims='category',
        coords={'category': ['below normal', 'near normal', 'above normal']},
    )
    clim_p = uniform.to_dataset(name='tp')
    clim_p['t2m'] = clim_p['tp']

    if not adapt:
        # check inputs
        assert_predictions_2020(obs_p)
        assert_predictions_2020(fct_p)
    else:
        # keep only the observations for which forecasts are provided
        for coord_name in ['longitude', 'latitude', 'forecast_time', 'lead_time']:
            obs_p = obs_p.sel({coord_name: fct_p[coord_name]})
        obs_p = obs_p[list(fct_p.data_vars)]
        clim_p = clim_p[list(fct_p.data_vars)]

    # RPS of the ML forecast and of the climatology, no dimension reduced.
    rps_ML = xs.rps(
        obs_p, fct_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()
    rps_clim = xs.rps(
        obs_p, clim_p, category_edges=None, dim=[], input_distributions='p'
    ).compute()

    # Skill score per individual forecast.
    rpss = 1 - (rps_ML / rps_clim)

    # Penalty, see
    # https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/issues/7
    # RPSS is set to -10 wherever the category mean of the masked observations is 0.
    penalize = obs_p.where(fct_p!=1, other=-10).mean('category')
    rpss = rpss.where(penalize!=0, other=-10)

    # Clip, then average over all forecasts of each year.
    rpss = rpss.clip(-10, 1)
    rpss = rpss.groupby('forecast_time.year').mean()

    # Cosine-of-latitude weighted mean over latitude, then plain mean over
    # longitude; only latitude labels selected by slice(None, -60) are used.
    weights = np.cos(np.deg2rad(np.abs(rpss.latitude)))
    lat_subset = rpss.sel(latitude=slice(None, -60))
    scores = lat_subset.weighted(weights).mean('latitude').mean('longitude')

    # Variables become a dimension; lead_time and variable are NOT averaged.
    return scores.to_array().to_dataframe('RPSS')

### 2020

In [None]:
# read the saved predictions for the requested years tag ('2020' test year or e.g. 'allyears')
def load_pred(pred_folder, years):
    """Load the smoothed global predictions of one run as a merged dataset.

    For each of the variables ``t2m`` and ``tp`` the ``lead0`` and ``lead1``
    files under ``../submissions/{pred_folder}/`` are opened, concatenated
    along ``lead_time`` and finally merged into one object.

    Parameters
    ----------
    pred_folder : str
        Sub-folder of ``../submissions`` holding the prediction files.
    years : str
        Years tag in the file name. For ``'2020'`` the tag precedes
        ``_smooth``; for any other value it follows ``_smooth_``.

    Returns
    -------
    The result of ``xr.merge`` over the per-variable arrays.
    """
    tag_before_smooth = years == '2020'
    per_variable = []
    for var_name in ['t2m', 'tp']:
        per_lead = []
        for lead in (0, 1):
            if tag_before_smooth:
                file_name = f'global_prediction_{var_name}_lead{lead}_{years}_smooth.nc'
            else:
                file_name = f'global_prediction_{var_name}_lead{lead}_smooth_{years}.nc'
            per_lead.append(xr.open_dataset(f'../submissions/{pred_folder}/{file_name}')[var_name])
        per_variable.append(xr.concat(per_lead, dim = 'lead_time'))
    return xr.merge(per_variable)

# Years tag of the prediction files to load; the alternative is 'allyears'.
years = '2020'

# One merged prediction dataset per model run.
ds_24_10_2 = load_pred('24_10_2', years)
ds_24_10_1 = load_pred('24_10_1', years)
ds_25_10_1 = load_pred('25_10_1', years)

In [None]:
# Skill for the test year: score each run with skill_by_year and print the result.
skill_24_10_2 = skill_by_year(ds_24_10_2, cache_path='../../../../Data/s2s_ai/data')
print(skill_24_10_2)

skill_24_10_1 = skill_by_year(ds_24_10_1, cache_path='../../../../Data/s2s_ai/data')
print(skill_24_10_1)

skill_25_10_1 = skill_by_year(ds_25_10_1, cache_path='../../../../Data/s2s_ai/data')
print(skill_25_10_1)

In [None]:
# Ensemble mean of the three runs: each run gets a length-1 'pred' dimension,
# the runs are concatenated along it, and the mean over 'pred' is taken.
# NOTE: this is too big if executed on training data on a local computer.
member_chunks = {'forecast_time': 'auto', 'longitude': 24, 'latitude': 'auto', 'category': 1}
average_pred_2020 = xr.concat(
    [
        # label fixed: this member is run 24_10_1 (was mislabelled '24_10_11')
        ds_24_10_1.expand_dims(dim={'pred': ['24_10_1']}).chunk(member_chunks),
        ds_24_10_2.expand_dims(dim={'pred': ['24_10_2']}).chunk(member_chunks),
        ds_25_10_1.expand_dims(dim={'pred': ['25_10_1']}).chunk(member_chunks),
    ],
    'pred',
).mean('pred')

In [None]:
# Score the ensemble-mean prediction for 2020 and show the result.
skill_average_2020 = skill_by_year(
    average_pred_2020,
    cache_path='../../../../Data/s2s_ai/data',
)
print(skill_average_2020)

In [None]:
# Same ensemble mean scored with the older variant (RPSS per forecast, then yearly mean).
skill_average_2020_old = skill_by_year_old(
    average_pred_2020, cache_path='../../../../Data/s2s_ai/data'
) if False else skill_by_year_old(average_pred_2020, cache_path='../../../../Data/s2s_ai/data')
print(skill_average_2020_old)

In [None]:
# Grid-cell-wise RPSS of the 2020 ensemble mean for t2m at lead index 0.
v = 't2m'
lead_output = 0

fct_p = average_pred_2020.isel(lead_time=lead_output)

# Pick the observation set matching the forecast years.
if 2020 in fct_p.forecast_time.dt.year:
    obs_all = load_data(data='obs_terciled_2020', aggregation='biweekly', path=path_data)
else:
    obs_all = load_data(data='obs_terciled_2000-2019', aggregation='biweekly', path=path_data)
obs_p = obs_all.isel(lead_time=lead_output)[v]

rpss = gridcellwise_rpss(fct_p[v], obs_p, v)

In [None]:
# Mean of the grid-cell-wise RPSS over all of its dimensions.
print(rpss.mean())

In [None]:
# Map: RPSS averaged over forecast_time.
plt.figure()
rpss.mean('forecast_time').plot()

# Time series: RPSS averaged over latitude and longitude, with a zero line
# drawn from the first to the last forecast_time.
plt.figure()
rpss.mean(('latitude', 'longitude')).plot()
first_time = rpss.isel(forecast_time=0).forecast_time.values
last_time = rpss.isel(forecast_time=-1).forecast_time.values
plt.hlines(y=0, xmin=first_time, xmax=last_time, color='black')

In [None]:
# Grid-cell-wise RPSS of the 2020 ensemble mean for t2m at lead index 1.
v = 't2m'
lead_output = 1

fct_p = average_pred_2020.isel(lead_time=lead_output)

# Pick the observation set matching the forecast years.
if 2020 in fct_p.forecast_time.dt.year:
    obs_all = load_data(data='obs_terciled_2020', aggregation='biweekly', path=path_data)
else:
    obs_all = load_data(data='obs_terciled_2000-2019', aggregation='biweekly', path=path_data)
obs_p = obs_all.isel(lead_time=lead_output)[v]

rpss = gridcellwise_rpss(fct_p[v], obs_p, v)

In [None]:
# Mean of the grid-cell-wise RPSS over all of its dimensions.
print(rpss.mean())

In [None]:
# Map: RPSS averaged over forecast_time.
plt.figure()
rpss.mean('forecast_time').plot()

# Time series: RPSS averaged over latitude and longitude, with a zero line
# drawn from the first to the last forecast_time.
plt.figure()
rpss.mean(('latitude', 'longitude')).plot()
first_time = rpss.isel(forecast_time=0).forecast_time.values
last_time = rpss.isel(forecast_time=-1).forecast_time.values
plt.hlines(y=0, xmin=first_time, xmax=last_time, color='black')

In [None]:
# Yearly RPSS of the 2020 ensemble mean, kept separate per variable and lead time.
# adapt=False: observations and predictions are checked with assert_predictions_2020.
rpss_single = skill_by_year_byvar_andlead(average_pred_2020, adapt=False)

In [None]:
# Show the full RPSS table.
print(rpss_single)

In [None]:
# Drop index level 1 and unstack the last remaining level into columns.
# NOTE(review): level 1 is presumably the year (single year for 2020) — confirm.
rpss_single.droplevel(1).unstack()

In [None]:
# Same per-variable / per-lead table, computed with the older scoring variant.
rpss_single_old = skill_by_year_old_single(average_pred_2020)

In [None]:
# Drop index level 1 and unstack the last remaining level into columns.
# NOTE(review): level 1 is presumably the year (single year for 2020) — confirm.
rpss_single_old.droplevel(1).unstack()

In [None]:
# Plain (unweighted) mean over all rows of the older-variant table.
rpss_single_old.mean()

### training data

In [None]:
# Switch to the training-period files and reload every run.
# This overwrites the 2020 datasets held in the same variable names.
years = 'allyears'

ds_24_10_2 = load_pred('24_10_2', years)
ds_24_10_1 = load_pred('24_10_1', years)
ds_25_10_1 = load_pred('25_10_1', years)

In [None]:
# Ensemble mean of the three runs: each run gets a length-1 'pred' dimension,
# the runs are concatenated along it, and the mean over 'pred' is taken.
# NOTE: this is too big if executed on training data on a local computer.
member_chunks = {'forecast_time': 'auto', 'longitude': 24, 'latitude': 'auto', 'category': 1}
average_pred = xr.concat(
    [
        # label fixed: this member is run 24_10_1 (was mislabelled '24_10_11')
        ds_24_10_1.expand_dims(dim={'pred': ['24_10_1']}).chunk(member_chunks),
        ds_24_10_2.expand_dims(dim={'pred': ['24_10_2']}).chunk(member_chunks),
        ds_25_10_1.expand_dims(dim={'pred': ['25_10_1']}).chunk(member_chunks),
    ],
    'pred',
).mean('pred')

In [None]:
# Skill for the training years: score each run with skill_by_year and print the result.
skill_24_10_2 = skill_by_year(ds_24_10_2, cache_path='../../../../Data/s2s_ai/data')
print(skill_24_10_2)

skill_24_10_1 = skill_by_year(ds_24_10_1, cache_path='../../../../Data/s2s_ai/data')
print(skill_24_10_1)

skill_25_10_1 = skill_by_year(ds_25_10_1, cache_path='../../../../Data/s2s_ai/data')
print(skill_25_10_1)

In [None]:
# Score the training-period ensemble mean.
# NOTE(review): adapt=True presumably subsets the observations to the forecast
# coordinates, as in the local skill_by_year_* variants — confirm in scripts.
skill_average = skill_by_year(
    average_pred,
    cache_path='../../../../Data/s2s_ai/data',
    adapt=True,
)
print(skill_average)

In [None]:
# Grid-cell-wise RPSS of the training-period ensemble mean for t2m at lead index 0.
# The variable and lead index are set here so that forecast and observations
# use the same lead time. Previously the forecast used lead 0 while the
# observations used whatever `lead_output` an earlier cell had left behind.
v = 't2m'
lead_output = 0

fct_p = average_pred.isel(lead_time=lead_output)

# Pick the observation set matching the forecast years.
if 2020 in fct_p.forecast_time.dt.year:
    obs_p = load_data(data='obs_terciled_2020', aggregation='biweekly', path=path_data).isel(lead_time=lead_output)[v]
else:
    obs_p = load_data(data='obs_terciled_2000-2019', aggregation='biweekly', path=path_data).isel(lead_time=lead_output)[v]

rpss = gridcellwise_rpss(fct_p[v], obs_p, v)

In [None]:
# Map: RPSS averaged over forecast_time.
plt.figure()
rpss.mean('forecast_time').plot()



In [None]:
# Time series: RPSS averaged over latitude and longitude.
plt.figure()
rpss.mean(('latitude','longitude')).plot()