# Assessing the Skill of an RFC Model with the Brier Score (BS)
Version 22 January 2024, Selina Kiefer 

### Input: csv-files
predictions of the Random Forest Classifier Models as binary timeseries of cold wave days in csv-format, ground truth as binary timeseries of cold wave days in csv-format
### Output: csv-file, png-files
continuous timeseries of daily BS values in csv-format and plotted in png-format as well as the prediction of the RFC plotted together with the ground truth in png-format

#### Set the paths to the defined functions and to the configuration file and set the name of the configuration file

In [None]:
# Set the path to the defined functions.
# This directory is added to sys.path further below so that the helper modules
# stored in it can be imported.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Set the path and name of the configuration file.
# Both strings are concatenated without a separator when the file is opened, so
# the path has to end with a path separator.
PATH_configurations = './Configurations/'
ifile_configurations = 'Configurations_Skill_Assessment_RFC_with_BS.yaml'

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import calendar
from datetime import datetime, timedelta

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import yaml

In [None]:
# Import the necessary defined functions.
import sys
# Make the directory containing the helper modules importable.
sys.path.insert(1, PATH_defined_functions)
# The star imports bring every public name of the helper modules into the
# notebook namespace. The functions read_in_csv_data and truncate_data_by_date
# called below are presumably provided by the modules of the same name.
from read_in_csv_data import *
from truncate_data_by_date import*
from create_auxiliary_date import *
from apply_cold_wave_definition_smid_et_al_2019 import *

#### Read in the configuration file and the data specified in it

In [None]:
# Load the settings stored in the YAML configuration file into `config`.
configuration_file = PATH_configurations + ifile_configurations
with open(configuration_file) as yaml_stream:
    config = yaml.safe_load(yaml_stream)

In [None]:
# Load the predictions and keep only the data columns, i.e. discard all columns
# whose name starts with 'Unnamed' as well as the column called 'index'.
df_predictions = read_in_csv_data(config['PATH_predictions'], config['ifile_predictions'])
is_unnamed_prediction_column = df_predictions.columns.str.contains('^Unnamed')
df_predictions = df_predictions.loc[:, ~is_unnamed_prediction_column].drop(['index'], axis=1)

In [None]:
# The first column of the predictions holds the time, the second column holds
# the predicted variable.
time_column_name_predictions, var_column_name_predictions = (
    df_predictions.columns[0],
    df_predictions.columns[1],
)

In [None]:
# Print the selected column names for a visual check and display the first rows
# of the predictions.
print(
    'Names of predictions done by the ML model: ',
    var_column_name_predictions,
    'Name of the column containing the time: ',
    time_column_name_predictions,
    'Dataframe containing the predictions: ',
    sep='\n',
)
df_predictions.head()

In [None]:
# Load the ground truth and keep only the data columns, i.e. discard all columns
# whose name starts with 'Unnamed' as well as the column called 'index'.
df_ground_truth = read_in_csv_data(config['PATH_ground_truth'], config['ifile_ground_truth'])
is_unnamed_ground_truth_column = df_ground_truth.columns.str.contains('^Unnamed')
df_ground_truth = df_ground_truth.loc[:, ~is_unnamed_ground_truth_column].drop(['index'], axis=1)

In [None]:
# The first column of the ground truth holds the time, the second column holds
# the variable the predictions are compared with.
time_column_name_ground_truth, var_column_name_ground_truth = (
    df_ground_truth.columns[0],
    df_ground_truth.columns[1],
)

In [None]:
# Print the selected column names for a visual check and display the first rows
# of the ground truth.
print(
    'Ground truth to compare the predictions with: ',
    var_column_name_ground_truth,
    'Name of the column containing the time: ',
    time_column_name_ground_truth,
    'Dataframe containing the ground truth: ',
    sep='\n',
)
df_ground_truth.head()

#### Select only the dates from the ground truth which are present in the S2S reforecast ensemble

In [None]:
# Restrict the ground truth to the evaluation period. The period starts with the
# first day of the first winter and ends with the last day of the last winter,
# which lies in the year after the start year of that winter.
start_evaluation_period = datetime(
    config['start_year_of_first_winter'],
    config['start_month_winter'],
    config['start_day_winter'],
)
end_evaluation_period = datetime(
    config['start_year_of_last_winter'] + 1,
    config['end_month_winter'],
    config['end_day_winter'],
)

# The dates are handed over as strings with underscores as separators.
evaluation_date_format = '%Y_%m_%d'
df_ground_truth = truncate_data_by_date(
    df_ground_truth,
    time_column_name_ground_truth,
    start_evaluation_period.strftime(evaluation_date_format),
    end_evaluation_period.strftime(evaluation_date_format),
)

In [None]:
# Find the dates which are present in both the predictions and the ground truth data.
# A ground truth date is kept if its 'YYYY-MM-DD' string occurs anywhere in the time
# column of the predictions. All other dates are marked with NaN so that the
# corresponding rows can be dropped afterwards.
#
# The lookup uses a set instead of a running position in the predictions. A running
# position gets stuck as soon as the predictions contain a date which is missing in
# the ground truth (e.g. a date before the evaluation period), which would mark all
# remaining dates as NaN. The set lookup is independent of the order of the dates
# and also works for an empty prediction file.
prediction_dates = set(df_predictions[time_column_name_predictions])

joint_dates = [
    date if date.strftime('%Y-%m-%d') in prediction_dates else np.nan
    for date in df_ground_truth[time_column_name_ground_truth]
]

In [None]:
# Attach the joint dates as a temporary column, discard every row containing a
# missing value (this removes the dates marked with NaN) and remove the
# temporary column again.
df_ground_truth = (
    df_ground_truth
    .assign(joint_dates=joint_dates)
    .dropna()
    .drop(['joint_dates'], axis=1)
)

#### Prepare the RFC predictions and the ground truth for the skill assessment 

In [None]:
# Build an array holding the start year of every winter in the evaluation
# period, the start year of the last winter included.
first_start_year = config['start_year_of_first_winter']
last_start_year = config['start_year_of_last_winter']
start_years_of_winter = np.arange(first_start_year, last_start_year + 1)

In [None]:
# Split the ground truth and the predictions into the individual winters. For
# every winter, the ground truth values, the predicted values and the forecast
# dates (taken from the time column of the ground truth) are appended to one
# list each.
all_winters_list_cold_waves_ground_truth = []
all_winters_list_cold_waves_predictions = []
forecast_dates = []

winter_date_format = '%Y_%m_%d'

for start_year_of_winter in start_years_of_winter:

    # A winter starts in one year and ends in the following year.
    start_winter = datetime(start_year_of_winter, config['start_month_winter'], config['start_day_winter'])
    end_winter = datetime(start_year_of_winter+1, config['end_month_winter'], config['end_day_winter'])
    winter_bounds = (start_winter.strftime(winter_date_format), end_winter.strftime(winter_date_format))

    df_ground_truth_respective_winter = truncate_data_by_date(df_ground_truth, time_column_name_ground_truth, *winter_bounds)
    df_predictions_respective_winter = truncate_data_by_date(df_predictions, time_column_name_predictions, *winter_bounds)

    ground_truth_respective_winter = df_ground_truth_respective_winter[var_column_name_ground_truth]
    predictions_respective_winter = df_predictions_respective_winter[var_column_name_predictions]
    dates_respective_winter = pd.to_datetime(df_ground_truth_respective_winter[time_column_name_ground_truth])

    all_winters_list_cold_waves_ground_truth.append(ground_truth_respective_winter)
    all_winters_list_cold_waves_predictions.append(predictions_respective_winter)
    forecast_dates.append(dates_respective_winter)

#### Calculation of the BS between the ground truth and the RFC predictions

In [None]:
# Now, the daily BS is computed as the squared difference between the prediction and
# the ground truth. The values are stored once in continuous form covering all winters
# (bs) and once as one array per winter (bs_winterwise). Furthermore, the forecast time
# is saved in continuous form to a list.
#
# The values of each winter are converted to NumPy arrays before they are subtracted
# so that prediction and ground truth are paired by position. Indexing the series with
# an integer would pair them by index label instead, which fails or mixes up days as
# soon as the index of a series does not run from zero upwards without gaps.
forecast_time = []
bs = []
bs_winterwise = []

for start_year, dates_one_winter, predictions_one_winter, ground_truth_one_winter in zip(
    start_years_of_winter,
    forecast_dates,
    all_winters_list_cold_waves_predictions,
    all_winters_list_cold_waves_ground_truth,
):
    predictions_one_winter = np.asarray(predictions_one_winter)
    ground_truth_one_winter = np.asarray(ground_truth_one_winter)

    # Every prediction needs exactly one ground truth value of the same day. A
    # mismatch is reported here instead of surfacing later as an obscure error.
    if len(predictions_one_winter) != len(ground_truth_one_winter):
        raise ValueError(
            'Number of predictions ('+str(len(predictions_one_winter))+') and number of ground truth values ('
            +str(len(ground_truth_one_winter))+') differ for the winter starting in '+str(start_year)+'.'
        )

    bs_daily_one_year = (predictions_one_winter-ground_truth_one_winter)**2

    forecast_time.extend(dates_one_winter)
    bs.extend(bs_daily_one_year)
    bs_winterwise.append(bs_daily_one_year)

In [None]:
# Collect the forecast times and the corresponding daily BS values in a
# dataframe with the columns 'time' and 'BS'.
df_skill_measure_bs = pd.DataFrame({'time': forecast_time, 'BS': np.array(bs)})

#### Save the BS values in csv-format

In [None]:
# Write the dataframe with the BS values to a csv-file. The file name is composed
# of the model name, the ground truth and its location, the input data and its
# location, the lead time and the first and last year of the evaluation period.
ofile_statistics = ''.join([
    config['PATH_statistics'],
    config['model_name'],
    '_BS_ground_truth_',
    config['ground_truth'],
    '_',
    config['location_ground_truth'],
    '_input_',
    config['input_data'],
    '_',
    config['location_input'],
    '_lead_',
    str(config['lead_time']),
    'd_',
    str(config['start_year_of_first_winter']),
    '_',
    str(config['start_year_of_last_winter']+1),
    '.csv',
])
df_skill_measure_bs.to_csv(ofile_statistics)

#### Visualizing the RFC predictions together with the ground truth and the BS for a plausibility check

In [None]:
# The plot titles show information about the input data. In the configuration
# file, the desired line breaks are marked with '|'; these markers are replaced
# by real line breaks here.
str_input_info_for_plot_titles = config['input_data_title'].replace('|', '\n')

In [None]:
# For illustration purposes, the RFC predictions of every winter are plotted together
# with the ground truth. This gives a first impression about the model's forecast
# skill. One figure per winter is created and saved in png-format.
for start_year, dates_one_winter, ground_truth_one_winter, predictions_one_winter in zip(
    start_years_of_winter,
    forecast_dates,
    all_winters_list_cold_waves_ground_truth,
    all_winters_list_cold_waves_predictions,
):
    figure, axes = plt.subplots()
    axes.plot(dates_one_winter, ground_truth_one_winter, color='k', linestyle='--', label='Ground Truth')
    axes.plot(dates_one_winter, predictions_one_winter, marker='o', linestyle='', color='b', alpha=0.6, label='Predictions')
    # The legend is placed below the axes so that it does not cover any data.
    axes.legend(bbox_to_anchor=(0, -0.15), loc='upper left')
    axes.set_xlabel(time_column_name_ground_truth)
    axes.set_ylabel(var_column_name_predictions)
    axes.set_title(config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles)

    ofile_plot = ''.join([
        config['PATH_plots'],
        config['model_name'],
        '_',
        config['ground_truth'],
        '_',
        config['location_ground_truth'],
        '_input_',
        config['input_data'],
        '_',
        config['location_input'],
        '_lead_',
        str(config['lead_time']),
        'd_',
        str(start_year),
        '_',
        str(start_year+1),
        '.png',
    ])
    figure.savefig(ofile_plot, bbox_inches='tight')

In [None]:
# The daily BS values are plotted for each winter separately, together with a
# horizontal line marking the mean BS of the winter (missing values are ignored in
# the mean). In combination with the plot above a first plausibility check is
# possible: the lower the BS value, the more similar the prediction of the RFC and
# the ground truth have to be.
for start_year, dates_one_winter, bs_one_winter in zip(start_years_of_winter, forecast_dates, bs_winterwise):
    figure, axes = plt.subplots()
    axes.plot(dates_one_winter, bs_one_winter, color='b', marker='o', markersize=4, linestyle='--')
    axes.axhline(y=np.nanmean(bs_one_winter), color='grey', linestyle='--', label='Wintermean')
    # The legend is placed below the axes so that it does not cover any data.
    axes.legend(bbox_to_anchor=(0, -0.15), loc='upper left')
    axes.set_xlabel(time_column_name_ground_truth)
    axes.set_ylabel('BS')
    axes.set_title('Daily BS of '+config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles)

    ofile_plot = ''.join([
        config['PATH_plots'],
        config['model_name'],
        '_BS_',
        config['ground_truth'],
        '_',
        config['location_ground_truth'],
        '_input_',
        config['input_data'],
        '_',
        config['location_input'],
        '_lead_',
        str(config['lead_time']),
        'd_',
        str(start_year),
        '_',
        str(start_year+1),
        '.png',
    ])
    figure.savefig(ofile_plot, bbox_inches='tight')

#### Visualizing the BS for all winters in the evaluation period for a quick overview of the forecasting performance of the RFC

In [None]:
# The timeseries of the daily BS values is plotted for all winters in the evaluation period.
# A new figure is created explicitly. Otherwise the data would be drawn into the figure
# which is current at this point, i.e. into the BS plot of the last winter whenever the
# figures created before are still open (e.g. when the code is run as a script).
plt.figure()
plt.plot(forecast_time, bs, marker='s', linestyle='', markersize=2, color='b')
plt.xlabel(time_column_name_ground_truth)
plt.ylabel('BS')
plt.title('Daily BS of '+config['model_name']+' Predictions, Lead Time '+str(config['lead_time'])+'d, \n Input: '+str_input_info_for_plot_titles)
# The file name carries the first and last year of the whole evaluation period.
plt.savefig(config['PATH_plots']+config['model_name']+'_BS_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_'+str(config['start_year_of_first_winter'])+'_'+str(config['start_year_of_last_winter']+1)+'.png', bbox_inches='tight')

In [None]:
# End of Program