# Forecasting with Random Forest Classifier (RFC) Models

Version 19 December 2022, Selina Kiefer

### Input: csv-file, pt-file
continuous timeseries of input data (e.g. statistics of meteorological predictor fields), Random Forest Classifier models in pt-format
### Output: csv-file, png-file
predictions of the Random Forest Classifier models as a timeseries of cold wave days in csv-format and, as an example, a plot of the predictions for one winter in png-format

#### Set the paths to the defined functions and to the configuration file, and set the name of the configuration file

In [None]:
# Set the path to the defined functions. This directory is inserted into sys.path further
# below so that the helper module read_in_csv_data can be imported from it.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Set the path and name of the configuration file. Both strings are joined without a
# separator when the file is opened, so the path has to end with a slash.
PATH_configurations = './Configuration_Files/'
ifile_configurations = 'Configurations_RFC_Forecast.yaml'

#### Import the necessary python packages and functions
Nothing needs to be changed here.

In [None]:
# Import the necessary python packages.
import yaml
import calendar
from collections import defaultdict
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
import matplotlib.pyplot as plt
import torch
from skranger.ensemble import RangerForestClassifier

In [None]:
# Import the necessary python packages and functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *

#### Read in the configuration file and the data specified in it

In [None]:
# Parse the YAML configuration file and keep the result in `config` (nothing needs to be
# changed here).
with open(PATH_configurations + ifile_configurations) as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Read in the input data, then discard every column whose name starts with 'Unnamed' as
# well as the column called 'index' (nothing needs to be changed here).
df_input_data = (
    read_in_csv_data(config['PATH_input_data'], config['ifile_input_data'])
    .loc[:, lambda df: ~df.columns.str.startswith('Unnamed')]
    .drop(columns=['index'])
)

In [None]:
# Split the column labels of the input data: the first column holds the time, all
# remaining columns hold the variables.
input_data_columns = df_input_data.columns
time_column_name_input_data = input_data_columns[0]
var_column_name_input_data = input_data_columns[1:]

In [None]:
# Print the selected variable columns and the time column and show the first rows of the
# input data, so that the selection can be checked (nothing needs to be changed here).
print('Predictors used for training the ML model: ', var_column_name_input_data, sep='\n')
print('Name of the column containing the time: ', time_column_name_input_data, sep='\n')
print('Dataframe containing the predictors: ')
df_input_data.head()

#### Preparing the input data for forecasting
From here on, nothing needs to be changed.

In [None]:
# A NumPy array with the start years of all winters to be predicted is created. The start
# year of the last winter is included.
first_start_year = config['start_year_of_first_winter']
last_start_year = config['start_year_of_last_winter']
start_years_of_winter = np.arange(first_start_year, last_start_year + 1)

In [None]:
# The time column is converted into datetime objects and then becomes the index of the
# dataframe containing the input data (it is no longer a regular column afterwards). This
# allows the different winters to be extracted by date.
time_stamps = pd.to_datetime(df_input_data[time_column_name_input_data])
df_input_data = df_input_data.drop(columns=[time_column_name_input_data]).set_index(time_stamps)

#### Forecasting with the RFC-models
For every winter to be predicted, the respective RFC model trained with the leave-one(-winter)-out cross-validation is used.

In [None]:
# Here, the forecasting with the RFCs takes place. For every winter in the evaluation
# period, the following steps are carried out separately:
#   1. The model used for forecasting the respective winter is loaded. The i-th entry of
#      config['list_file_name_model'] is used for the i-th winter.
#   2. The start- and end-date of this winter are determined, as well as the date one
#      calendar month before the start of the winter.
#   3. The number of days of the winter (start and end date inclusive) is computed in order
#      to create a list of forecast dates. These are needed to assign the predictions of
#      the RFCs to a date later.
#   4. The rows of the input data lying strictly between the date one month before the
#      start of the winter and the end date of the winter are extracted (.loc[]) and the
#      time column is removed.
#   5. The loaded RFC model predicts the class probabilities for these rows. The second
#      column of predict_proba (presumably the class "cold wave day" - confirm against the
#      training notebook) is saved to the list of predictions.
#
# NOTE(review): the extracted input rows start one month before the winter, whereas the
# forecast dates start with the first day of the winter, so there are more predictions than
# forecast dates per winter. Presumably this accounts for the lead time - confirm that the
# offset of one month matches config['lead_time'].
# NOTE: torch.load relies on pickle, so only model files from a trusted source should be
# loaded. Newer PyTorch versions may additionally require weights_only=False here.
predictions = []
forecast_dates = []

for i, start_year in enumerate(start_years_of_winter):

    random_forest_classifier = torch.load(config['PATH_model'] + config['list_file_name_model'][i])

    start_winter = datetime(start_year, config['start_month_winter'], config['start_day_winter'])
    end_winter = datetime(start_year + 1, config['end_month_winter'], config['end_day_winter'])

    # A calendar-aware offset is used instead of "start month - 1", which fails for a
    # winter starting in January (month 0) or on a day which does not exist in the month
    # before (e.g. 31 December -> 30 November).
    month_before_start_winter = (pd.Timestamp(start_winter) - pd.DateOffset(months=1)).to_pydatetime()

    days_of_winter = (end_winter - start_winter).days + 1
    forecast_dates_winter = [start_winter + timedelta(days=x) for x in range(days_of_winter)]

    # Both boundaries are exclusive.
    is_in_time_window = (df_input_data.index > month_before_start_winter) & (df_input_data.index < end_winter)
    df_X_val = df_input_data.loc[is_in_time_window].reset_index()

    X_val = df_X_val.drop([time_column_name_input_data], axis=1)
    predictions_rfc = random_forest_classifier.predict_proba(X_val)
    predictions.append(predictions_rfc[:, 1])
    forecast_dates.extend(forecast_dates_winter)

#### Bringing the forecasts into a nice representation

In [None]:
# For a nice representation of the predictions in a pandas dataframe, the predictions of
# all winters are collected in one column (labelled 0) with one row per day. For every
# winter, the number of days (start and end date inclusive, so leap years are handled
# automatically) is computed from the configured start and end date, and as many
# predictions are taken from the beginning of the predictions of this winter. The pieces
# are concatenated once at the end, since DataFrame.append is no longer available in
# pandas >= 2.0 and appending row by row is slow.
winter_start_month_day = (config['start_month_winter'], config['start_day_winter'])
winter_end_month_day = (config['end_month_winter'], config['end_day_winter'])

predictions_all_winters = []

for start_year, predictions_single_winter in zip(start_years_of_winter, predictions):
    first_day = datetime(start_year, *winter_start_month_day)
    last_day = datetime(start_year + 1, *winter_end_month_day)
    n_days = (last_day - first_day).days + 1

    predictions_single_winter = np.asarray(predictions_single_winter)
    # Fail early with a clear message instead of producing a dataframe which does not
    # match the forecast dates.
    if len(predictions_single_winter) < n_days:
        raise ValueError(
            'Winter ' + str(start_year) + '/' + str(start_year + 1) + ' has '
            + str(len(predictions_single_winter)) + ' predictions, but '
            + str(n_days) + ' days.'
        )
    predictions_all_winters.append(predictions_single_winter[:n_days])

if predictions_all_winters:
    df_predictions = pd.DataFrame({0: np.concatenate(predictions_all_winters)})
else:
    df_predictions = pd.DataFrame({0: np.array([], dtype=float)})

In [None]:
# The prediction column gets a descriptive name. Then, the forecast dates are added as the
# column 'time', which is placed in front of all other columns for a good overview of the
# predictions.
prediction_column_name = 'Fraction of Ensembles Members \n Predicting Cold Wave Day'
df_predictions = df_predictions.rename(columns={0: prediction_column_name})
df_predictions = df_predictions.assign(time=forecast_dates)
remaining_columns = [column for column in df_predictions.columns if column != 'time']
df_predictions = df_predictions[['time'] + remaining_columns]

#### Saving the predictions in csv-format

In [None]:
# Then, the dataframe is saved in csv-format. The file name is assembled from the settings
# in the configuration file: ground truth and its location, input data and its location,
# lead time, and the first and last year covered by the predicted winters.
csv_file_name_parts = [
    'RFC_predictions_', config['ground_truth'],
    '_', config['location_ground_truth'],
    '_input_', config['input_data'],
    '_', config['location_input'],
    '_lead_', str(config['lead_time']),
    'd_winter_', str(config['start_year_of_first_winter']),
    '_', str(config['start_year_of_last_winter']+1),
    '.csv',
]
df_predictions.to_csv(config['PATH_predictions'] + ''.join(csv_file_name_parts))

#### Visualizing the predictions of one winter for a plausibility check

In [None]:
# For simplicity, the first predicted winter is plotted for a plausibility check. The
# first winter occupies the first rows of the dataframe containing the predictions, one
# row per day from the start date to the end date of the winter (both inclusive).
fig, ax = plt.subplots()

# The end year is derived from the start year of the first winter, so that this also works
# if only a single winter is predicted.
first_start_year = start_years_of_winter[0]
start_winter = datetime(first_start_year, config['start_month_winter'], config['start_day_winter'])
end_winter = datetime(first_start_year + 1, config['end_month_winter'], config['end_day_winter'])
days_of_winter = (end_winter - start_winter).days + 1

# Dates and predictions are taken from the same rows, so both always have the same length.
df_first_winter = df_predictions.iloc[:days_of_winter]

ax.plot(
    df_first_winter['time'],
    df_first_winter['Fraction of Ensembles Members \n Predicting Cold Wave Day'],
    marker='o', linestyle='', color='k',
)
ax.set_xlabel('Date')
ax.set_ylabel('Fraction of ensemble members \n predicting a cold wave day')
ax.set_title('RFC Predictions for the Winter '+str(config['start_year_of_first_winter'])+'/'+str(config['start_year_of_first_winter']+1))
fig.savefig(config['PATH_predictions']+'RFC_predictions_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_winter_'+str(config['start_year_of_first_winter'])+'_'+str(config['start_year_of_first_winter']+1)+'.png', bbox_inches='tight')

In [None]:
# End of Program