# Creating and Training Quantile Random Forest (QRF) Models

Version 19 December 2022, Selina Kiefer

### Input: csv-files
continuous timeseries of input data (e.g. statistics of meteorological predictor fields), continuous timeseries of ground truth temperature in csv-format
### Output: pt-file and txt-file
Quantile Random Forest models in pt-format, plus a file with the metadata of the models in txt-format

#### Set the paths to the defined functions and to the configuration file, and set the name of the configuration file

In [None]:
# Set the path to the directory containing the user-defined helper modules. This directory is
# added to the module search path (sys.path) before the helper functions are imported.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Set the path and name of the YAML configuration file. The two strings are concatenated
# directly when the file is opened, so the path has to end with a path separator.
PATH_configurations = './Configuration_Files/'
ifile_configurations = 'Configurations_QRF_Model.yaml'

#### Import the necessary python packages and functions
Nothing needs to be changed here.

In [None]:
# Import the necessary python packages.
import yaml
import calendar
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
import matplotlib.pyplot as plt
import torch
from skranger.ensemble import RangerForestRegressor

In [None]:
# Import the needed functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *

#### Read in the configuration file and the data specified in it

In [None]:
# Load the YAML configuration into the dictionary `config` (nothing needs to be changed here).
# The file location is the configuration path immediately followed by the file name.
with open(f'{PATH_configurations}{ifile_configurations}') as f:
    config = yaml.safe_load(f)

In [None]:
# Load the input data from the csv-file named in the configuration, then discard every column
# whose name starts with 'Unnamed' as well as the 'index' column (nothing needs to be changed
# here).
df_input_data = read_in_csv_data(config['PATH_input_data'], config['ifile_input_data'])
df_input_data = (
    df_input_data
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['index'])
)

In [None]:
# Set the name of the columns containing the time and the variables of the input data.
# The first remaining column is taken as the time column; all columns after it are taken
# as the predictor variables.
time_column_name_input_data = df_input_data.columns[0]
var_column_name_input_data = df_input_data.columns[1:]

In [None]:
# Sanity check of the selection made above (nothing needs to be changed here): show the
# predictor column names, the time column name and the first rows of the predictor dataframe.
print(
    'Predictors used for training the ML model: ',
    var_column_name_input_data,
    'Name of the column containing the time: ',
    time_column_name_input_data,
    'Dataframe containing the predictors: ',
    sep='\n',
)
df_input_data.head()

In [None]:
# Load the ground truth from the csv-file named in the configuration, then discard every
# column whose name starts with 'Unnamed' as well as the 'index' column (nothing needs to be
# changed here).
df_ground_truth = read_in_csv_data(config['PATH_ground_truth'], config['ifile_ground_truth'])
df_ground_truth = (
    df_ground_truth
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
    .drop(columns=['index'])
)

In [None]:
# Set the name of the columns containing the time and the variable of the ground truth.
# The first remaining column is taken as the time column and the second one as the predictand.
# NOTE(review): only the second column is named here, but the training step uses every
# non-time column of the ground truth - confirm the ground truth file has a single variable.
time_column_name_ground_truth = df_ground_truth.columns[0]
var_column_name_ground_truth = df_ground_truth.columns[1]

In [None]:
# Sanity check of the selection made above (nothing needs to be changed here): show the
# predictand column name, the time column name and the first rows of the ground truth dataframe.
print(
    'Predictand used for training the ML model: ',
    var_column_name_ground_truth,
    'Name of the column containing the time: ',
    time_column_name_ground_truth,
    'Dataframe containing the predictand: ',
    sep='\n',
)
df_ground_truth.head()

#### Setting the input features' names and the winters to be evaluated 
From here on, nothing needs to be changed.

In [None]:
# A NumPy array with all the start years of the winters in the evaluation period is created.
# The +1 makes the start year of the last winter inclusive, since np.arange excludes its stop
# value.
start_years_of_winter = np.arange(config['start_year_of_first_winter'], config['start_year_of_last_winter']+1)

In [None]:
# Collect the names of the input features: all columns of the input data except the time
# column, returned as a NumPy array of column names.
df_input_features = df_input_data.drop(columns=[time_column_name_input_data])
input_features = df_input_features.columns.to_numpy()

#### Preparing the input data for a leave-one(-winter)-out cross-validation

In [None]:
# In a leave-one-out cross-validation approach, the training data differs for every left-out
# winter of the validation period. To make it easy to remove the winter to be left out, the
# time column of both the input data and the ground truth is parsed into datetime values and
# used as the index of the respective dataframe (replacing the original time column).
df_input_data = (
    df_input_data
    .drop(columns=[time_column_name_input_data])
    .set_index(pd.to_datetime(df_input_data[time_column_name_input_data]))
)

df_ground_truth = (
    df_ground_truth
    .drop(columns=[time_column_name_ground_truth])
    .set_index(pd.to_datetime(df_ground_truth[time_column_name_ground_truth]))
)

#### Training of the QRF-models with a leave-one(-winter)-out cross validation
For each of these winters, a separate QRF model is trained and then saved.

In [None]:
# Here, the actual training takes place. To perform a leave-one(-winter)-out cross-validation,
# the respective winter has to be cut out of the training data timeseries (.loc[]): every row
# from one month before the start of that winter up to and including its end date is removed.
# The remaining rows of the input data and of the ground truth are converted to NumPy arrays.
# Then, the Quantile (quantiles=True) Random Forest (RangerForestRegressor) is trained (fit())
# and saved (torch.save) for further use. This is done for every winter in the evaluation
# period separately.

# The tree-details switch and the constant part of the model file name do not depend on the
# winter, so they are evaluated once before the loop.
with_tree_details = bool(config['obtain_additional_details_of_trees'])
model_file_prefix = (
    f"{config['PATH_model']}QRF_{config['ground_truth']}_{config['location_ground_truth']}"
    f"_input_{config['input_data']}_{config['location_input']}"
    f"_lead_{config['lead_time']}d_without_"
)
model_file_suffix = '_with_tree_details.pt' if with_tree_details else '.pt'

for start_year in start_years_of_winter:
    # np.arange yields NumPy integers; a plain int is used for the date arithmetic below.
    first_year = int(start_year)

    # Subtracting a calendar month (instead of passing month-1 to the constructor) also works
    # for a winter starting in January and for start days that do not exist in the previous
    # month (the day is then clipped to the last day of that month).
    start_winter = pd.Timestamp(year=first_year, month=config['start_month_winter'], day=config['start_day_winter'])
    month_before_start_winter = start_winter - pd.DateOffset(months=1)
    end_winter = pd.Timestamp(year=first_year+1, month=config['end_month_winter'], day=config['end_day_winter'])

    # Keep only the rows outside of the left-out winter (including the month before it).
    keep_input_rows = (df_input_data.index < month_before_start_winter) | (df_input_data.index > end_winter)
    keep_ground_truth_rows = (df_ground_truth.index < month_before_start_winter) | (df_ground_truth.index > end_winter)

    # The time index is not a feature, so it is discarded before converting to arrays.
    df_X_train = df_input_data.loc[keep_input_rows].reset_index(drop=True)
    df_y_train = df_ground_truth.loc[keep_ground_truth_rows].reset_index(drop=True)

    X_train = df_X_train.to_numpy()
    y_train = df_y_train.to_numpy()

    # The keyword for the minimal node size is `min_node_size`.
    quantile_regresssion_forest = RangerForestRegressor(
        n_estimators=1000,
        min_node_size=5,
        quantiles=True,
        enable_tree_details=with_tree_details,
    )
    # The ground truth is a single-column 2D array; squeeze it to the 1D target fit() is given.
    quantile_regresssion_forest = quantile_regresssion_forest.fit(X_train, np.squeeze(y_train))
    torch.save(quantile_regresssion_forest, model_file_prefix+str(first_year)+'_'+str(first_year+1)+model_file_suffix)
    print('Model without Winter '+str(first_year)+'/'+str(first_year+1)+' Trained.')

#### Creation of the metadata as specified in the configuration file for all trained QRF models

In [None]:
# In order to combine every relevant information about the QRF model and the training process,
# everything which cannot be inferred from the code is written in a list of 'key: value'
# strings. This information has to be given manually in the configuration file. Every value is
# converted with str(), so entries that YAML parses as a number, boolean or date (e.g. an
# unquoted year) do not break the string concatenation.
additional_info_on_variables = [
    'dataset_input : '+str(config['dataset_input_data']),
    'dataset_ground_truth: '+str(config['dataset_ground_truth']),
    'type_input_data: '+str(config['type_input_data']),
    'type_ground_truth: '+str(config['type_ground_truth']),
    'unit_of_ground_truth_and_prediction : '+str(config['unit_of_ground_truth_and_prediction']),
    'training_period: '+str(config['training_period']),
    'start_month_winter: '+str(config['start_month_winter']),
    'start_day_winter: '+str(config['start_day_winter']),
    'end_month_winter: '+str(config['end_month_winter']),
    'end_day_winter: '+str(config['end_day_winter']),
    'lead_time_in_days: '+str(config['lead_time']),
    'training_type: '+str(config['training_type']),
]

#### Save the metadata and the model parameters for all trained QRF models in one combined txt-file

In [None]:
# All the relevant information about the QRF model and its training is combined and saved to a
# txt-file, one entry per line. This is done only once for the whole evaluation period; the
# hyperparameters are read from the model that was trained last.
qrf_hyperparameters = quantile_regresssion_forest.get_params()
winter_left_out = 'validation_period_winters_left_out_one_at_a_time_: '+str(config['start_year_of_first_winter'])+'_'+str(config['start_year_of_last_winter']+1)

# A new list is built instead of appending to additional_info_on_variables, so re-running this
# cell does not add the same entries to that list a second time.
metadata_model = [
    *additional_info_on_variables,
    winter_left_out,
    'QRF_hyperparameters: '+str(qrf_hyperparameters),
]

# The file name only differs in its ending depending on whether tree details were requested.
metadata_file_suffix = '_with_tree_details.txt' if config['obtain_additional_details_of_trees'] else '.txt'
metadata_file_name = (
    f"{config['PATH_model']}QRF_{config['ground_truth']}_{config['location_ground_truth']}"
    f"_input_{config['input_data']}_{config['location_input']}"
    f"_lead_{config['lead_time']}d_validation_"
    f"{config['start_year_of_first_winter']}_{config['start_year_of_last_winter']+1}"
    f"{metadata_file_suffix}"
)

# The context manager closes the file even if writing fails.
with open(metadata_file_name, 'w') as file:
    file.write('\n'.join(metadata_model))

In [None]:
# End of Program