# Assessing the Skill of a QRF Model with the Continuous Ranked Probability Score (CRPS)
Version 22 January 2024, Selina Kiefer

### Input: csv-files
predictions of the Quantile Random Forest (QRF) models as continuous timeseries of temperature in csv-format, and a continuous timeseries of ground truth temperature in csv-format
### Output: csv-file, png-files
continuous timeseries of daily CRPS values in csv-format and plotted in png-format as well as the prediction of the QRF plotted together with the ground truth in png-format

#### Set the paths to the defined functions and to the configuration file and set the name of the configuration file

In [None]:
# Set the path to the directory containing the defined functions (helper modules).
# This directory is inserted into sys.path further below so that the modules
# read_in_csv_data, truncate_data_by_date and create_auxiliary_date can be imported.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Set the path and name of the configuration file.
# Path and file name are joined by plain string concatenation when the file is
# opened, so the path has to end with a path separator.
PATH_configurations = './Configurations/'
ifile_configurations = 'Configurations_Skill_Assessment_QRF_with_CRPS.yaml'

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import calendar
from datetime import datetime, timedelta

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import properscoring as ps
import xarray as xr
import yaml

In [None]:
# Import the necessary defined functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *
from truncate_data_by_date import*
from create_auxiliary_date import *

#### Read in the configuration file and the data specified in it

In [None]:
# Load the YAML configuration file into the dictionary `config`.
configuration_file = PATH_configurations + ifile_configurations

with open(configuration_file) as configuration_stream:
    config = yaml.safe_load(configuration_stream)

In [None]:
# Load the predictions and discard the bookkeeping columns: every column whose
# name starts with 'Unnamed' as well as the column called 'index'.
df_predictions = read_in_csv_data(config['PATH_predictions'], config['ifile_predictions'])
is_unnamed_column = df_predictions.columns.str.contains('^Unnamed')
df_predictions = df_predictions.loc[:, ~is_unnamed_column].drop(columns=['index'])

In [None]:
# Set the name of the columns containing the time and the variables of the predictions.
# The first column is taken as the time column, all remaining columns are taken as
# prediction columns (presumably one column per member/quantile of the QRF
# prediction - TODO confirm against the prediction file).
time_column_name_predictions = df_predictions.columns[0]
var_column_name_predictions = df_predictions.columns[1:]

In [None]:
# Visual check of the selection: print the chosen column names and show the
# first rows of the prediction dataframe.
print('Names of predictions done by the ML model: ', var_column_name_predictions, sep='\n')
print('Name of the column containing the time: ', time_column_name_predictions, sep='\n')
print('Dataframe containing the predictions: ')
df_predictions.head()

In [None]:
# Load the ground truth and discard the bookkeeping columns: every column whose
# name starts with 'Unnamed' as well as the columns called 'index' and 'level_0'.
df_ground_truth = read_in_csv_data(config['PATH_ground_truth'], config['ifile_ground_truth'])
is_unnamed_column = df_ground_truth.columns.str.contains('^Unnamed')
df_ground_truth = df_ground_truth.loc[:, ~is_unnamed_column].drop(columns=['index', 'level_0'])

In [None]:
# Set the name of the columns containing the time and the variables of the ground truth.
# The first column is taken as the time column; only the second column is used as
# the (single) ground truth variable.
time_column_name_ground_truth = df_ground_truth.columns[0]
var_column_name_ground_truth = df_ground_truth.columns[1]

In [None]:
# Visual check of the selection: print the chosen column names and show the
# first rows of the ground truth dataframe.
print('Ground truth to compare the predictions with: ', var_column_name_ground_truth, sep='\n')
print('Name of the column containing the time: ', time_column_name_ground_truth, sep='\n')
print('Dataframe containing the ground truth: ')
df_ground_truth.head()

#### Select only the dates from the ground truth which are present in the S2S reforecast ensemble

In [None]:
# Restrict the ground truth to the evaluation period. The period starts with the
# first day of the first winter and ends with the last day of the last winter,
# which lies in the calendar year after that winter's start year.
start_evaluation_period = datetime(
    year=config['start_year_of_first_winter'],
    month=config['start_month_winter'],
    day=config['start_day_winter'],
)
end_evaluation_period = datetime(
    year=config['start_year_of_last_winter'] + 1,
    month=config['end_month_winter'],
    day=config['end_day_winter'],
)

# The boundaries are handed over as strings in the format year_month_day.
date_format = '%Y_%m_%d'
df_ground_truth = truncate_data_by_date(
    df_ground_truth,
    time_column_name_ground_truth,
    start_evaluation_period.strftime(date_format),
    end_evaluation_period.strftime(date_format),
)

In [None]:
# Find the dates which are present in both the ground truth data and the predictions
# (according to the original note, the prediction dates are those of the S2S
# reforecast ensemble). Both time columns are walked through in parallel: a ground
# truth date is kept if it equals the prediction date at the current position,
# otherwise NaN is stored as a placeholder. The position in the predictions is only
# advanced after a match and starts again at 0 after the last prediction.
ground_truth_times = df_ground_truth[time_column_name_ground_truth]
prediction_times = df_predictions[time_column_name_predictions]
number_of_predictions = len(prediction_times)

joint_dates = []
prediction_position = 0

for ground_truth_time in ground_truth_times:
    # The prediction dates are compared as strings in the format year-month-day.
    if ground_truth_time.strftime('%Y-%m-%d') != prediction_times.iloc[prediction_position]:
        joint_dates.append(np.nan)
        continue

    joint_dates.append(ground_truth_time)
    prediction_position = prediction_position + 1
    if prediction_position >= number_of_predictions:
        prediction_position = 0

In [None]:
# Attach the matched dates to the ground truth as a temporary column, drop every
# row containing a missing value (this removes the dates without a matching
# prediction) and remove the temporary column again.
df_ground_truth = (
    df_ground_truth
    .assign(joint_dates=joint_dates)
    .dropna()
    .drop(columns=['joint_dates'])
)

#### Prepare the QRF predictions and the ground truth for the skill assessment 

In [None]:
# An array with all the start years of the winters in the evaluation period is created.
# The stop value is the start year of the last winter + 1 because np.arange excludes
# its stop value.
start_years_of_winter = np.arange(config['start_year_of_first_winter'], config['start_year_of_last_winter']+1)

In [None]:
# For every winter of the evaluation period, the predictions, the ground truth and
# the forecast dates are cut out and collected in three lists (one entry per winter).
predictions = []
ground_truth = []
forecast_dates = []

# The boundaries of a winter are handed over as strings in the format year_month_day.
date_format = '%Y_%m_%d'

for start_year_of_winter in start_years_of_winter:

    # A winter starts in its start year and ends in the following calendar year.
    winter_start = datetime(start_year_of_winter, config['start_month_winter'], config['start_day_winter']).strftime(date_format)
    winter_end = datetime(start_year_of_winter+1, config['end_month_winter'], config['end_day_winter']).strftime(date_format)

    winter_ground_truth = truncate_data_by_date(df_ground_truth, time_column_name_ground_truth, winter_start, winter_end)
    winter_predictions = truncate_data_by_date(df_predictions, time_column_name_predictions, winter_start, winter_end)

    # Only the prediction values are kept (the time column is removed) and
    # converted to a numpy array.
    prediction_values = winter_predictions.drop(columns=[time_column_name_predictions])
    predictions.append(np.array(np.squeeze(prediction_values)))

    ground_truth.append(winter_ground_truth[var_column_name_ground_truth])
    forecast_dates.append(pd.to_datetime(winter_ground_truth[time_column_name_ground_truth]))

#### Calculation of the CRPS between the ground truth and the QRF predictions

In [None]:
# Compute the CRPS between the ground truth and the QRF forecasts winter by winter
# (ps.crps_ensemble). The scores are collected twice: as one continuous list over
# all winters (crps) and as one array per winter (crps_winterwise). In addition, a
# continuous list of all forecast dates is built (forecast_time).
crps = []
crps_winterwise = []
forecast_time = []

for winter_index in range(len(start_years_of_winter)):

    winter_scores = ps.crps_ensemble(ground_truth[winter_index], predictions[winter_index])
    # The scores are passed through a dataframe and converted back to a
    # two-dimensional numpy array.
    winter_scores = np.array(pd.DataFrame(winter_scores))

    crps.extend(winter_scores)
    crps_winterwise.append(winter_scores)
    forecast_time.extend(forecast_dates[winter_index])

In [None]:
# Round the CRPS values to two decimals for a nicer representation and combine
# them with the respective forecast dates in a new dataframe.
crps = np.round(crps, 2)

df_skill_measure_crps = pd.DataFrame()
df_skill_measure_crps['time'] = forecast_time
df_skill_measure_crps['CRPS'] = crps

#### Save the CRPS values in csv-format

In [None]:
# Save the dataframe containing the CRPS values in csv-format. The file name is
# assembled from the model name, the ground truth, the input data, the lead time
# and the first and last year of the evaluation period.
ofile_crps = ''.join([
    config['PATH_statistics'],
    config['model_name'],
    '_CRPS_',
    config['ground_truth'],
    '_',
    config['location_ground_truth'],
    '_input_',
    config['input_data'],
    '_',
    config['location_input'],
    '_lead_',
    str(config['lead_time']),
    'd_',
    str(config['start_year_of_first_winter']),
    '_',
    str(config['start_year_of_last_winter']+1),
    '.csv',
])

df_skill_measure_crps.to_csv(ofile_crps)

#### Visualizing the QRF predictions together with the ground truth and the CRPS for a plausibility check

In [None]:
# The description of the input data shown in the plot titles is taken from the
# configuration file. Every '|' in it marks a line break and is replaced by a
# newline character.
str_input_info_for_plot_titles = config['input_data_title'].replace('|', '\n')

In [None]:
# For illustration purposes, the median and the two percentiles defined in the
# configuration file are plotted together with the ground truth, one figure per
# winter. This gives a first impression of the model's forecast skill. Every figure
# is saved in png-format.
lower_quantile = config['lower_quantile']
upper_quantile = config['upper_quantile']

# Parts of the labels, titles and file names which are identical for all winters.
quantile_label = str(lower_quantile)+'-'+str(upper_quantile)+' Quantiles of Predictions'
prediction_ylabel = var_column_name_ground_truth+' in '+config['unit_of_ground_truth_and_predictions']
prediction_title = config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles
prediction_file_stem = config['PATH_plots']+config['model_name']+'_predictions_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_'

for k, start_year in enumerate(start_years_of_winter):
    # plt.subplots() returns a (figure, axes) pair; both are unpacked and used
    # explicitly instead of relying on pyplot's implicit current figure.
    fig, ax = plt.subplots()

    # Each row of predictions[k] belongs to one forecast date, hence axis=1.
    winter_median = np.median(predictions[k], axis=1)
    upper_percentile = np.percentile(predictions[k], upper_quantile*100, axis=1)
    lower_percentile = np.percentile(predictions[k], lower_quantile*100, axis=1)

    ax.plot(forecast_dates[k], winter_median, color='b', label='Median of Predictions')
    ax.fill_between(x=forecast_dates[k], y1=upper_percentile, y2=lower_percentile, color='b', alpha=0.25, label=quantile_label)
    ax.plot(forecast_dates[k], np.array(np.squeeze(ground_truth[k])), color='k', linestyle='--', label='Ground Truth')

    # The legend is placed below the axes so that it does not cover the data.
    ax.legend(bbox_to_anchor=(0, -0.15), loc='upper left')
    ax.set_xlabel(time_column_name_ground_truth)
    ax.set_ylabel(prediction_ylabel)
    ax.set_title(prediction_title)

    fig.savefig(prediction_file_stem+str(start_year)+'_'+str(start_year+1)+'.png', bbox_inches='tight')

In [None]:
# The CRPS values of each winter are plotted in a separate figure together with the
# mean CRPS of that winter. In combination with the plot of the predictions, a first
# plausibility check is possible: the lower the CRPS value, the more similar the
# prediction of the QRF and the ground truth have to be. Every figure is saved in
# png-format.

# Parts of the labels, titles and file names which are identical for all winters.
crps_ylabel = 'CRPS in '+config['unit_of_ground_truth_and_predictions']
crps_title = 'Daily CRPS of '+config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles
crps_file_stem = config['PATH_plots']+config['model_name']+'_CRPS_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_'

for m, start_year in enumerate(start_years_of_winter):
    # plt.subplots() returns a (figure, axes) pair; both are unpacked and used
    # explicitly instead of relying on pyplot's implicit current figure.
    fig, ax = plt.subplots()

    ax.plot(forecast_dates[m], crps_winterwise[m], color='b', marker='o', markersize=4, linestyle='--')
    # Horizontal line at the mean CRPS of the winter (missing values are ignored).
    ax.axhline(y=np.nanmean(crps_winterwise[m]), color='grey', linestyle='-', label='Wintermean')

    # The legend is placed below the axes so that it does not cover the data.
    ax.legend(bbox_to_anchor=(0, -0.15), loc='upper left')
    ax.set_xlabel(time_column_name_ground_truth)
    ax.set_ylabel(crps_ylabel)
    ax.set_title(crps_title)

    fig.savefig(crps_file_stem+str(start_year)+'_'+str(start_year+1)+'.png', bbox_inches='tight')

#### Visualizing the CRPS for all winters in the evaluation period for a quick overview of the forecasting performance of the QRF model

In [None]:
# The timeseries of the daily CRPS values is plotted for the whole evaluation period
# and saved in png-format.
# A new figure is created explicitly. Without it, the timeseries would be drawn into
# whatever figure is currently active, e.g. the last per-winter CRPS figure when the
# code is run outside of a notebook with inline plotting.
fig, ax = plt.subplots()

ax.plot(forecast_time, crps, marker='s', linestyle='', markersize=2, color='b')
ax.set_xlabel(time_column_name_ground_truth)
ax.set_ylabel('CRPS in '+config['unit_of_ground_truth_and_predictions'])
ax.set_title('Daily CRPS of '+config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles)

fig.savefig(config['PATH_plots']+config['model_name']+'_timeseries_CRPS_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_'+str(config['start_year_of_first_winter'])+'_'+str(config['start_year_of_last_winter']+1)+'.png', bbox_inches='tight')

In [None]:
# End of Program