# Calculating a Lead Time Dependent Mean Bias Correction for the S2S Reforecasts Ensemble
Version 6 May 2024, Selina Kiefer

### Input: csv-files
ensemble of S2S reforecasts of absolute temperature for a winter in csv-format, continuous timeseries of ground truth temperature in csv-format
### Output: csv-file
time series of lead-time-dependent, mean-bias-corrected S2S reforecasts in csv-format

#### Set the paths to the defined functions and to the configuration file, and set the name of the configuration file

In [None]:
# Set the path to the defined functions. This directory is added to the module search path
# further below so that the helper modules stored in it can be imported.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Set the path and name of the configuration file. Both strings are concatenated directly when
# the file is opened, so the path has to end with a path separator.
PATH_configurations = './Configurations/'
ifile_configurations = 'Configurations_Lead_Time_Dependent_Mean_Bias_Correction_S2S_Reforecasts_Ensemble.yaml'

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import calendar
from datetime import datetime, timedelta

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import properscoring as ps
import xarray as xr
import yaml

In [None]:
# Import the necessary defined functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *
from truncate_data_by_date import*
from create_auxiliary_date import *

#### Read in the configuration file and the data specified in it

In [None]:
# Open the configuration file and parse its YAML content into `config`.
configuration_file = PATH_configurations + ifile_configurations
with open(configuration_file) as configuration_stream:
    config = yaml.safe_load(configuration_stream)

In [None]:
# Load the S2S reforecasts ensemble from the csv-file named in the configuration, then discard
# every column whose name starts with 'Unnamed' as well as the column called 'index'.
df_s2s_reforecasts_ensemble = read_in_csv_data(
    config['PATH_s2s_reforecasts_ensemble'],
    config['ifile_s2s_reforecasts_ensemble'],
)
is_unnamed_column_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble.columns.str.contains('^Unnamed')
df_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble.loc[:, ~is_unnamed_column_s2s_reforecasts_ensemble]
df_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble.drop(columns=['index'])

In [None]:
# The first column is taken as the time column; all following columns are taken as the
# variables (ensemble members) of the S2S reforecasts ensemble.
columns_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble.columns
time_column_name_s2s_reforecasts_ensemble = columns_s2s_reforecasts_ensemble[0]
var_column_name_s2s_reforecasts_ensemble = columns_s2s_reforecasts_ensemble[1:]

In [None]:
# Print the selected column names for a visual check and show the first rows of the dataframe.
print('Names of ensemble members of the S2S reforecasts ensemble: ', var_column_name_s2s_reforecasts_ensemble, sep='\n')
print('Name of the column containing the time: ', time_column_name_s2s_reforecasts_ensemble, sep='\n')
print('Dataframe containing the S2S reforecasts ensemble: ')
df_s2s_reforecasts_ensemble.head()

In [None]:
# Load the ground truth from the csv-file named in the configuration, then discard every column
# whose name starts with 'Unnamed' as well as the columns called 'index' and 'level_0'.
df_ground_truth = read_in_csv_data(
    config['PATH_ground_truth'],
    config['ifile_ground_truth'],
)
is_unnamed_column_ground_truth = df_ground_truth.columns.str.contains('^Unnamed')
df_ground_truth = df_ground_truth.loc[:, ~is_unnamed_column_ground_truth]
df_ground_truth = df_ground_truth.drop(columns=['index', 'level_0'])

In [None]:
# The first column is taken as the time column and the second one as the ground truth variable.
columns_ground_truth = df_ground_truth.columns
time_column_name_ground_truth = columns_ground_truth[0]
var_column_name_ground_truth = columns_ground_truth[1]

In [None]:
# Print the selected column names for a visual check and show the first rows of the dataframe.
print('Ground truth to compare the s2s_reforecasts ensemble with: ', var_column_name_ground_truth, sep='\n')
print('Name of the column containing the time: ', time_column_name_ground_truth, sep='\n')
print('Dataframe containing the ground truth: ')
df_ground_truth.head()

#### Select only the dates from the ground truth which are present in the S2S reforecast ensemble

In [None]:
# An array with the start years of all winters in the evaluation period is created. One is added
# to the last start year because np.arange excludes its end value.
first_start_year_of_winter = config['start_year_of_first_winter']
last_start_year_of_winter = config['start_year_of_last_winter']
start_years_of_winter = np.arange(first_start_year_of_winter, last_start_year_of_winter + 1)

In [None]:
# The evaluation period runs from the start of the first winter to the end of the last winter,
# which lies in the year after the last start year. The ground truth is cut to this period.
start_evaluation_period = datetime(
    config['start_year_of_first_winter'],
    config['start_month_winter'],
    config['start_day_winter'],
)
end_evaluation_period = datetime(
    config['start_year_of_last_winter'] + 1,
    config['end_month_winter'],
    config['end_day_winter'],
)

evaluation_period_date_format = '%Y_%m_%d'
df_ground_truth = truncate_data_by_date(
    df_ground_truth,
    time_column_name_ground_truth,
    start_evaluation_period.strftime(evaluation_period_date_format),
    end_evaluation_period.strftime(evaluation_period_date_format),
)

In [None]:
# Walk through the ground truth dates and compare each one with the S2S reforecast date at the
# current position. On a match the date is kept and the position moves on (wrapping back to the
# first S2S date after the last one); otherwise NaN is stored as a placeholder for that row.
joint_dates = []
ground_truth_dates = df_ground_truth[time_column_name_ground_truth]
s2s_reforecast_dates = df_s2s_reforecasts_ensemble[time_column_name_s2s_reforecasts_ensemble]
s2s_position = 0

for ground_truth_date in ground_truth_dates:
    # The S2S dates are compared as 'YYYY-MM-DD' strings.
    if ground_truth_date.strftime('%Y-%m-%d') == s2s_reforecast_dates.iloc[s2s_position]:
        joint_dates.append(ground_truth_date)
        s2s_position = (s2s_position + 1) % len(s2s_reforecast_dates)
    else:
        joint_dates.append(np.nan)

In [None]:
# Attach the joint dates as a temporary column, drop every row containing a missing value
# (this removes the rows without a matching S2S date) and remove the temporary column again.
df_ground_truth['joint_dates'] = joint_dates
df_ground_truth = df_ground_truth.dropna().drop(columns=['joint_dates'])

In [None]:
# Convert the time columns of both dataframes to datetimes and use them as the index.
ground_truth_times = pd.to_datetime(df_ground_truth[time_column_name_ground_truth])
df_ground_truth[time_column_name_ground_truth] = ground_truth_times
df_ground_truth = df_ground_truth.set_index(time_column_name_ground_truth)

s2s_reforecasts_ensemble_times = pd.to_datetime(df_s2s_reforecasts_ensemble[time_column_name_s2s_reforecasts_ensemble])
df_s2s_reforecasts_ensemble[time_column_name_s2s_reforecasts_ensemble] = s2s_reforecasts_ensemble_times
df_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble.set_index(time_column_name_s2s_reforecasts_ensemble)

In [None]:
# Now, the mean bias is calculated with a leave-one-winter-out approach. For every winter, the data of
# that winter (starting one month before the start of the winter) is left out. For all remaining dates,
# the mean ensemble prediction per day is calculated and the ground truth value of the same day is
# subtracted, which gives the mean bias of that day. Then, the temporal mean over all days is taken.
# The result is one mean bias per winter, stored in the order of start_years_of_winter.
mean_bias_for_correction = []

for start_year in start_years_of_winter:

    # The left-out period starts one calendar month before the start of the winter. Stepping back from
    # January wraps to December of the previous year, and the day is limited to the length of that
    # month so that the date always exists (e.g. a start on the 31st after a month with 30 days).
    winter_start_year = int(start_year)
    if config['start_month_winter'] == 1:
        year_of_month_before, month_before = winter_start_year - 1, 12
    else:
        year_of_month_before, month_before = winter_start_year, config['start_month_winter'] - 1
    last_day_of_month_before = calendar.monthrange(year_of_month_before, month_before)[1]
    day_of_month_before = min(config['start_day_winter'], last_day_of_month_before)

    month_before_start_winter = datetime(year_of_month_before, month_before, day_of_month_before)
    end_winter = datetime(winter_start_year + 1, config['end_month_winter'], config['end_day_winter'])

    # Keep only the ground truth dates outside of the left-out period.
    ground_truth_outside_winter = (df_ground_truth.index < month_before_start_winter) | (df_ground_truth.index > end_winter)
    df_ground_truth_without_respective_winter = df_ground_truth.loc[ground_truth_outside_winter].reset_index()
    ground_truth = np.array(df_ground_truth_without_respective_winter[var_column_name_ground_truth])

    # Keep only the S2S reforecast dates outside of the left-out period and convert the ensemble
    # members to a plain array without the time column.
    s2s_outside_winter = (df_s2s_reforecasts_ensemble.index < month_before_start_winter) | (df_s2s_reforecasts_ensemble.index > end_winter)
    df_s2s_reforecasts_ensemble_without_respective_winter = df_s2s_reforecasts_ensemble.loc[s2s_outside_winter].reset_index()
    s2s_reforecasts_ensemble_without_respective_winter = df_s2s_reforecasts_ensemble_without_respective_winter.drop([time_column_name_s2s_reforecasts_ensemble], axis=1)
    s2s_reforecasts_ensemble_without_respective_winter = np.array(np.squeeze(s2s_reforecasts_ensemble_without_respective_winter))

    # NOTE(review): ensemble rows and ground truth values are paired by position, which assumes that
    # both selections contain the same dates in the same order - confirm.
    mean_bias_all_dates = [
        np.mean(ensemble_one_date) - ground_truth[m]
        for m, ensemble_one_date in enumerate(s2s_reforecasts_ensemble_without_respective_winter)
    ]

    mean_bias_for_correction.append(np.mean(mean_bias_all_dates))

In [None]:
# Turn the datetime index of both dataframes back into an ordinary column.
df_ground_truth, df_s2s_reforecasts_ensemble = (
    df_ground_truth.reset_index(),
    df_s2s_reforecasts_ensemble.reset_index(),
)

In [None]:
# Now, the S2S reforecasts ensemble of each winter is extracted. From this, the mean bias belonging to
# that winter is subtracted and the mean bias corrected S2S reforecast ensemble is added to a list.
s2s_reforecasts_ensemble = []
mean_bias_corrected_s2s_reforecasts_ensemble = []

for n, start_year in enumerate(start_years_of_winter):

    start_winter = datetime(start_year, config['start_month_winter'], config['start_day_winter'])
    end_winter = datetime(start_year+1, config['end_month_winter'], config['end_day_winter'])

    # Only the S2S reforecasts are needed here; the ground truth has already entered the mean bias.
    df_s2s_reforecasts_ensemble_respective_winter = truncate_data_by_date(df_s2s_reforecasts_ensemble, time_column_name_s2s_reforecasts_ensemble, start_winter.strftime('%Y_%m_%d'), end_winter.strftime('%Y_%m_%d'))

    # Remove the time column so that only the ensemble members remain and convert them to an array.
    s2s_reforecasts_ensemble_respective_winter = df_s2s_reforecasts_ensemble_respective_winter.drop([time_column_name_s2s_reforecasts_ensemble], axis=1)
    s2s_reforecasts_ensemble_respective_winter = np.array(np.squeeze(s2s_reforecasts_ensemble_respective_winter))

    s2s_reforecasts_ensemble.append(s2s_reforecasts_ensemble_respective_winter)

    # mean_bias_for_correction holds one value per winter in the same order as start_years_of_winter.
    # extend() adds the corrected array element by element along its first axis, so the winters end
    # up concatenated in one flat list.
    mean_bias_corrected_s2s_reforecasts_ensemble.extend(s2s_reforecasts_ensemble_respective_winter-mean_bias_for_correction[n])

In [None]:
# In a next step, the mean bias corrected S2S reforecast ensemble is converted into a pandas dataframe
# again and the time column of the S2S reforecasts is inserted as its first column.
# NOTE(review): the dates are attached by index alignment, which presumably requires that the corrected
# rows appear in the same order and number as the rows of df_s2s_reforecasts_ensemble - confirm.
df_mean_bias_corrected_s2s_reforecasts_ensemble = pd.DataFrame(mean_bias_corrected_s2s_reforecasts_ensemble)
dates_s2s_reforecasts_ensemble = df_s2s_reforecasts_ensemble[time_column_name_s2s_reforecasts_ensemble]
df_mean_bias_corrected_s2s_reforecasts_ensemble.insert(
    0,
    time_column_name_s2s_reforecasts_ensemble,
    dates_s2s_reforecasts_ensemble,
)

In [None]:
# Then, the mean bias corrected S2S reforecast ensemble is saved in csv-format. The lead time is
# converted to a string first, so that the file name can also be built when the configuration file
# stores the lead time as a number.
# NOTE(review): the year range '_2000_2020' in the file name is fixed and does not follow the start
# years set in the configuration - confirm that this is intended.
ofile_mean_bias_corrected_s2s_reforecasts_ensemble = 'S2S_Reforecast_Ensemble_Mean_Bias_Corrected_Lead_Time_'+str(config['lead_time'])+'_2000_2020.csv'
df_mean_bias_corrected_s2s_reforecasts_ensemble.to_csv(config['PATH_output_file']+ofile_mean_bias_corrected_s2s_reforecasts_ensemble)

In [None]:
# End of Program