# Forecasting with Quantile Random Forest (QRF) Models

Version 19 January 2024, Selina Kiefer

### Input: csv-file, pt-file
continuous timeseries of input data (e.g. statistics of meteorological predictor fields), Quantile Random Forest models in pt-format
### Output: csv-file, png-file
predictions of the Quantile Random Forest models as continuous timeseries of temperature in csv-format and, as an example, plotted for one winter in png-format

#### Set the paths to the defined functions and to the configuration file, and set the name of the configuration file

In [None]:
# Directory that holds the project's helper modules; it is put on the module search path
# further below so that these modules can be imported.
PATH_defined_functions = "./Defined_Functions/"

In [None]:
# Directory and file name of the YAML configuration; both are concatenated when the
# configuration is read in, so the directory has to end with a path separator.
PATH_configurations = "./Configurations/"
ifile_configurations = "Configurations_QRF_Forecast.yaml"

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import yaml
import calendar
from collections import defaultdict
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
import matplotlib.pyplot as plt
import torch
from skranger.ensemble import RangerForestRegressor

In [None]:
# Import the necessary python packages and functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *
from truncate_data_by_date import *

#### Read in the configuration file and the data specified in it

In [None]:
# Load the YAML configuration into the dictionary `config` (nothing needs to be changed here).
# `yaml.safe_load` is used, so only plain YAML data types are constructed.
with open(PATH_configurations + ifile_configurations) as configuration_file:
    config = yaml.safe_load(configuration_file)

In [None]:
# If applicable, load the reanalysis (ERA5) data and discard the helper columns, i.e. every
# column whose name matches '^Unnamed' as well as the column called 'index'.
# NOTE(review): read_in_csv_data is a project helper; it is presumed to return a pandas
# DataFrame - confirm against its definition.
if config['use_reanalysis_data']:
    df_input_data_era5 = (
        read_in_csv_data(config['PATH_input_data_era5'], config['ifile_input_data_era5'])
        .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
        .drop(columns=['index'])
    )

In [None]:
# If applicable, split the column labels of the reanalysis data: the first column is taken as
# the time column, all remaining columns are taken as the predictor variables.
if config['use_reanalysis_data']:
    time_column_name_input_data_era5, var_column_name_input_data_era5 = (
        df_input_data_era5.columns[0],
        df_input_data_era5.columns[1:],
    )

In [None]:
# Check that everything is selected correctly for the reanalysis data if used: print the
# predictor columns, the name of the time column and the first rows of the dataframe.
if config['use_reanalysis_data']:
    print('Predictors used for training the ML model: ')
    print(var_column_name_input_data_era5)
    print('Name of the column containing the time: ')
    print(time_column_name_input_data_era5)
    print('Dataframe containing the predictors: ')
    # A bare expression inside an if-block is never displayed, so the preview is printed
    # explicitly.
    print(df_input_data_era5.head())

In [None]:
# Load the S2S input data and discard the helper columns, i.e. every column whose name
# matches '^Unnamed' as well as the column called 'index'.
# NOTE(review): read_in_csv_data is a project helper; it is presumed to return a pandas
# DataFrame - confirm against its definition.
df_input_data_s2s = (
    read_in_csv_data(config['PATH_input_data_s2s'], config['ifile_input_data_s2s'])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['index'])
)

In [None]:
# Split the column labels of the S2S input data: the first column is taken as the time
# column, all remaining columns are taken as the predictor variables.
time_column_name_input_data_s2s, var_column_name_input_data_s2s = (
    df_input_data_s2s.columns[0],
    df_input_data_s2s.columns[1:],
)

In [None]:
# Check that everything is selected correctly: print the predictor columns and the name of
# the time column, then show the first rows of the dataframe (displayed as the value of the
# last expression of the notebook cell).
print('Predictors used for training the ML model: ')
print(var_column_name_input_data_s2s)
print('Name of the column containing the time: ')
print(time_column_name_input_data_s2s)
print('Dataframe containing the predictors: ')
df_input_data_s2s.head()

#### If reanalysis data is used, select only the dates which are present in both the ERA5 input and the S2S reforecasts

In [None]:
# If reanalysis data is used, define the evaluation period: it starts with the first day of
# the first winter and ends with the last day of the last winter (which lies in the year
# after that winter's start year).
if config['use_reanalysis_data']:
    start_evaluation_period = datetime(
        year=config['start_year_of_first_winter'],
        month=config['start_month_winter'],
        day=config['start_day_winter'],
    )
    end_evaluation_period = datetime(
        year=config['start_year_of_last_winter'] + 1,
        month=config['end_month_winter'],
        day=config['end_day_winter'],
    )


In [None]:
# Parse the time column of the reforecast data and, if used, of the reanalysis data into
# datetime objects so that the dates can be compared and shifted later on.
df_input_data_s2s[time_column_name_input_data_s2s] = (
    df_input_data_s2s[time_column_name_input_data_s2s].pipe(pd.to_datetime)
)
if config['use_reanalysis_data']:
    df_input_data_era5[time_column_name_input_data_era5] = (
        df_input_data_era5[time_column_name_input_data_era5].pipe(pd.to_datetime)
    )


In [None]:
# Find the dates which are present in the S2S reforecasts ensemble and the reanalysis data. Consider thereby the
# lead time since for the S2S reforecasts only the valid date is given and not the initial date.
#
# The ERA5 dates are walked through once while a pointer runs through the S2S dates. For every
# ERA5 date, the current S2S valid date is shifted back by the lead time:
#   * shifted date == ERA5 date          -> joint date found, move on to the next S2S date
#   * shifted date is another ERA5 date  -> no joint date here, keep waiting for this S2S date
#   * shifted date is not in ERA5 at all -> no joint date here, skip this S2S date
# `joint_dates` gets exactly one entry per ERA5 row (the joint date or NaN).
# NOTE(review): the walk presumably relies on both time columns being in chronological
# order - confirm for the input files.
if config['use_reanalysis_data']:
    series_dates_era5 = df_input_data_era5[time_column_name_input_data_era5]
    series_dates_s2s = df_input_data_s2s[time_column_name_input_data_s2s]
    number_of_dates_s2s = len(series_dates_s2s)
    lead_time = timedelta(days=config['lead_time'])

    dates_era5 = list(series_dates_era5)
    # Set for constant-time membership tests instead of scanning the list for every ERA5 date.
    lookup_dates_era5 = set(dates_era5)

    joint_dates = []
    index_s2s = 0

    for date_era5 in dates_era5:
        date_with_lead_time_considered = series_dates_s2s.iloc[index_s2s] - lead_time
        is_joint_date = date_era5 == date_with_lead_time_considered

        joint_dates.append(date_with_lead_time_considered if is_joint_date else np.nan)

        # Move on to the next S2S date after a match or if the shifted date can never match;
        # after the last S2S date, the pointer wraps around to the first one.
        if is_joint_date or date_with_lead_time_considered not in lookup_dates_era5:
            index_s2s = (index_s2s + 1) % number_of_dates_s2s

In [None]:
# Keep only the reanalysis rows for which a joint date was found: the joint dates are attached
# as a temporary column, rows containing missing values are dropped and the temporary column
# is removed again.
# NOTE(review): dropna() without a subset also drops rows with missing values in any
# predictor column - confirm that this is intended.
if config['use_reanalysis_data']:
    df_input_data_era5 = (
        df_input_data_era5
        .assign(joint_dates=joint_dates)
        .dropna()
        .drop(columns=['joint_dates'])
    )

In [None]:
# Build the dataframe with the input data and set the name of its time column. If reanalysis
# data is used, the ERA5 predictors are attached column by column (by position, not by index)
# to the S2S predictors; otherwise the S2S predictors alone are the input data. A copy of the
# S2S dataframe is used so that the S2S dataframe itself is not modified.
if config['use_reanalysis_data']:
    df_input_data_era5 = df_input_data_era5.drop(time_column_name_input_data_era5, axis=1)

    columns_era5 = df_input_data_era5.columns

    # The columns are combined by position, so both dataframes need the same number of rows.
    if len(df_input_data_era5) != len(df_input_data_s2s):
        raise ValueError(
            'The ERA5 data (' + str(len(df_input_data_era5)) + ' rows) and the S2S data ('
            + str(len(df_input_data_s2s)) + ' rows) do not have the same number of dates.'
        )

    df_input_data = df_input_data_s2s.copy()

    for k in columns_era5:
        df_input_data[k] = df_input_data_era5[k].to_numpy()
else:
    df_input_data = df_input_data_s2s.copy()

# In both cases, the time column of the S2S data is the time column of the input data.
time_column_name_input_data = time_column_name_input_data_s2s

#### Preparing the input data for forecasting

In [None]:
# Array with the start year of every winter to be predicted, from the first up to and
# including the last configured winter (the stop value of np.arange is exclusive, hence +1).
start_years_of_winter = np.arange(
    config['start_year_of_first_winter'],
    config['start_year_of_last_winter'] + 1,
)

In [None]:
# The winters to be predicted are selected by date later on. Therefore, the time column is
# parsed into datetime objects first and then used as the index of the input dataframe.
df_input_data[time_column_name_input_data] = (
    df_input_data[time_column_name_input_data].pipe(pd.to_datetime)
)
df_input_data = df_input_data.set_index(keys=time_column_name_input_data)

#### Defining the quantiles used for predicting by the QRF models

In [None]:
# List of quantiles to be predicted by the QRF models: either the configured number of
# quantiles spread evenly over [0, 1] (rounded to two decimals) or the list given explicitly
# in the configuration file.
list_quantiles_qrf = (
    list(np.round(np.linspace(0, 1, num=config['number_of_quantiles']), decimals=2))
    if config['distributed_evenly']
    else config['list_quantiles_qrf']
)

#### Forecasting with the QRF-models
For every winter to be predicted, the respective QRF model trained with the leave-one(-winter)-out cross-validation is used.

In [None]:
# Here, the forecasting with the QRFs takes place. For every winter in the evaluation period:
#   1. The model trained for this winter (leave-one-winter-out) is loaded.
#   2. The selection window is determined. It opens one calendar month before the start of the
#      winter (to take the lead time into account) and closes at the end of the winter; both
#      bounds are exclusive.
#   3. The rows inside this window are extracted from the input data (.loc[]). Their dates are
#      kept as forecast dates, so that the predictions can be assigned to a date later, and the
#      time column is removed from the predictors.
#   4. The loaded QRF model predicts the desired quantiles; the predictions are saved to a list.
predictions = []
forecast_dates = []

for i, start_year_of_winter in enumerate(start_years_of_winter):
    start_year_of_winter = int(start_year_of_winter)

    # NOTE(review): torch.load unpickles the file and can thereby execute arbitrary code, so
    # only model files from a trusted source must be loaded here. Recent PyTorch versions
    # restrict unpickling by default (weights_only) - see the torch.load documentation if
    # loading the model objects fails.
    quantile_regresssion_forest = torch.load(config['PATH_model']+config['list_file_name_model'][i])

    start_winter = datetime(start_year_of_winter, config['start_month_winter'], config['start_day_winter'])
    # Calendar-aware subtraction: also works if the winter starts in January (the month before
    # lies in the previous year) or on a day which does not exist in the previous month (the
    # day is then clipped to the last day of that month).
    month_before_start_winter = pd.Timestamp(start_winter) - pd.DateOffset(months=1)
    end_winter = datetime(start_year_of_winter+1, config['end_month_winter'], config['end_day_winter'])

    is_in_window = (df_input_data.index > month_before_start_winter) & (df_input_data.index < end_winter)
    df_X_val = df_input_data.loc[is_in_window].reset_index()

    forecast_dates_winter = df_X_val[time_column_name_input_data]

    X_val = df_X_val.drop([time_column_name_input_data], axis=1)
    predictions_qrf = quantile_regresssion_forest.predict_quantiles(X_val, quantiles=list_quantiles_qrf)
    print('Predictions for Winter '+str(start_year_of_winter)+'/'+str(start_year_of_winter+1)+' Made.')

    predictions.append(predictions_qrf)
    forecast_dates.extend(forecast_dates_winter)

#### Bringing the forecasts into a nice representation

In [None]:
# For a nice representation of the predictions in a pandas dataframe, the predictions of all
# winters are stacked below each other: one row per forecast date (in the same order as the
# list of forecast dates) and one column per predicted quantile. All forecasts of every winter
# are used, independent of the number of forecasts per winter. The dataframe is built in one
# step since DataFrame.append is not available anymore in pandas >= 2.0.
if predictions:
    df_predictions = pd.DataFrame(
        np.concatenate([np.asarray(predictions_single_winter) for predictions_single_winter in predictions], axis=0)
    )
else:
    # No winter was predicted: keep an empty dataframe.
    df_predictions = pd.DataFrame()


In [None]:
# The forecast dates are added as column 'time'. For a good overview, this column is then
# taken out of the dataframe again and re-inserted as the very first column.
df_predictions['time'] = forecast_dates
df_predictions.insert(0, 'time', df_predictions.pop('time'))

#### Saving the predictions in csv-format

In [None]:
# Save the dataframe in csv-format. The file name is composed of the ground truth and its
# location, the input data and its location, the lead time and the predicted period.
df_predictions.to_csv(''.join([
    config['PATH_predictions'],
    'QRF_predictions_',
    config['ground_truth'],
    '_',
    config['location_ground_truth'],
    '_input_',
    config['input_data'],
    '_',
    config['location_input'],
    '_lead_',
    str(config['lead_time']),
    'd_winter_',
    str(config['start_year_of_first_winter']),
    '_',
    str(config['start_year_of_last_winter']+1),
    '.csv',
]))

#### Visualizing the predictions of one winter for a plausibility check

In [None]:
# For simplicity, the first predicted winter is plotted for a plausibility check. The rows of
# the first winter are the first rows of the dataframe; their number is taken from the
# predictions of the first winter instead of being hard-coded. Every quantile column is drawn
# as markers over the forecast dates and the figure is saved in png-format next to the csv-file.
number_of_forecasts_first_winter = len(predictions[0]) if len(predictions) > 0 else 0

df_predictions.iloc[0:number_of_forecasts_first_winter].plot(x='time', legend=False, marker='o', linestyle='')
plt.xlabel('Date')
plt.ylabel(config['ground_truth']+' in '+config['unit_of_ground_truth_and_prediction'])
plt.title('QRF Predictions for the Winter '+str(config['start_year_of_first_winter'])+'/'+str(config['start_year_of_first_winter']+1))
plt.savefig(''.join([
    config['PATH_predictions'],
    'QRF_predictions_',
    config['ground_truth'],
    '_',
    config['location_ground_truth'],
    '_input_',
    config['input_data'],
    '_',
    config['location_input'],
    '_lead_',
    str(config['lead_time']),
    'd_winter_',
    str(config['start_year_of_first_winter']),
    '_',
    str(config['start_year_of_first_winter']+1),
    '.png',
]), bbox_inches='tight')

In [None]:
# End of Program