# Creating and Training Random Forest Classifier (RFC) Models

Version 19 January 2024, Selina Kiefer

### Input: csv-files
continuous timeseries of input data (e.g. statistics of meteorological predictor fields), binary timeseries of cold wave days in csv-format
### Output: pt-file and txt-file
Random Forest Classifier models in pt-format, file with metadata of the models in txt-format

#### Set the paths to the defined functions and to the configuration file, and set the name of the configuration file

In [None]:
# Directory holding the project's helper modules; it is added to sys.path
# further below so that the helper functions can be imported.
PATH_defined_functions = './Defined_Functions/'

In [None]:
# Directory and file name of the YAML configuration; both strings are
# concatenated when the configuration file is opened.
ifile_configurations = 'Configurations_RFC_Model.yaml'
PATH_configurations = './Configurations/'

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import yaml
import calendar
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import xarray as xr
import matplotlib.pyplot as plt
import torch
from skranger.ensemble import RangerForestClassifier

In [None]:
# Import the needed functions.
import sys
sys.path.insert(1, PATH_defined_functions)
from read_in_csv_data import *
from truncate_data_by_date import *

#### Read in the configuration file and the data specified in it

In [None]:
# Load the YAML configuration into `config` (nothing needs to be changed here).
# The file location is the configuration directory followed by the file name.
config_file_path = PATH_configurations + ifile_configurations
with open(config_file_path) as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Only when reanalysis data is requested in the configuration: load the ERA5 csv-file,
# discard every column whose name starts with 'Unnamed' and remove the 'index' column.
if config['use_reanalysis_data']:
    df_input_data_era5 = read_in_csv_data(
        config['PATH_input_data_era5'],
        config['ifile_input_data_era5'],
    )
    is_named_column_era5 = ~df_input_data_era5.columns.str.contains('^Unnamed')
    df_input_data_era5 = df_input_data_era5.loc[:, is_named_column_era5]
    df_input_data_era5 = df_input_data_era5.drop(columns=['index'])

In [None]:
# Only when reanalysis data is used: the first column of the ERA5 dataframe is taken as
# the time column, all remaining columns are taken as the predictor variables.
if config['use_reanalysis_data']:
    columns_input_data_era5 = df_input_data_era5.columns
    time_column_name_input_data_era5 = columns_input_data_era5[0]
    var_column_name_input_data_era5 = columns_input_data_era5[1:]

In [None]:
# Check that everything is selected correctly for the reanalysis data if used.
if config['use_reanalysis_data']:
    print('Predictors used for training the ML model: ')
    print(var_column_name_input_data_era5)
    print('Name of the column containing the time: ')
    print(time_column_name_input_data_era5)
    print('Dataframe containing the predictors: ')
    # A notebook only auto-displays a bare expression at the top level of a cell.
    # Inside this if-block, the preview has to be printed explicitly to be shown.
    print(df_input_data_era5.head())

In [None]:
# Load the S2S input csv-file, discard every column whose name starts with 'Unnamed'
# and remove the 'index' column.
df_input_data_s2s = read_in_csv_data(
    config['PATH_input_data_s2s'],
    config['ifile_input_data_s2s'],
)
is_named_column_s2s = ~df_input_data_s2s.columns.str.contains('^Unnamed')
df_input_data_s2s = df_input_data_s2s.loc[:, is_named_column_s2s]
df_input_data_s2s = df_input_data_s2s.drop(columns=['index'])

In [None]:
# The first column of the S2S dataframe is taken as the time column, all remaining
# columns are taken as the predictor variables.
columns_input_data_s2s = df_input_data_s2s.columns
time_column_name_input_data_s2s = columns_input_data_s2s[0]
var_column_name_input_data_s2s = columns_input_data_s2s[1:]

In [None]:
# Sanity check: show the selected predictor columns, the name of the time column and
# the first rows of the S2S input data (the last expression is displayed by the notebook).
print('Predictors used for training the ML model: ', var_column_name_input_data_s2s, sep='\n')
print('Name of the column containing the time: ', time_column_name_input_data_s2s, sep='\n')
print('Dataframe containing the predictors: ')
df_input_data_s2s.head()

In [None]:
# Load the ground truth csv-file, discard every column whose name starts with 'Unnamed'
# and remove the 'index' column.
df_ground_truth = read_in_csv_data(
    config['PATH_ground_truth'],
    config['ifile_ground_truth'],
)
is_named_column_ground_truth = ~df_ground_truth.columns.str.contains('^Unnamed')
df_ground_truth = df_ground_truth.loc[:, is_named_column_ground_truth]
df_ground_truth = df_ground_truth.drop(columns=['index'])

In [None]:
# The first column of the ground truth dataframe is taken as the time column, the
# second column as the predictand.
columns_ground_truth = df_ground_truth.columns
time_column_name_ground_truth = columns_ground_truth[0]
var_column_name_ground_truth = columns_ground_truth[1]

In [None]:
# Sanity check: show the selected predictand column, the name of the time column and
# the first rows of the ground truth (the last expression is displayed by the notebook).
print('Predictand used for training the ML model: ', var_column_name_ground_truth, sep='\n')
print('Name of the column containing the time: ', time_column_name_ground_truth, sep='\n')
print('Dataframe containing the predictand: ')
df_ground_truth.head()

#### If reanalysis data is used, select only the dates which are present in both the ERA5 input and the S2S reforecasts

In [None]:
# Only when reanalysis data is used: define the evaluation period. It starts on the
# configured start day of the first winter and ends on the configured end day in the
# year after the start year of the last winter.
if config['use_reanalysis_data']:
    start_evaluation_period = datetime(
        year=config['start_year_of_first_winter'],
        month=config['start_month_winter'],
        day=config['start_day_winter'],
    )
    end_evaluation_period = datetime(
        year=config['start_year_of_last_winter']+1,
        month=config['end_month_winter'],
        day=config['end_day_winter'],
    )


In [None]:
# Parse the time columns into datetime objects: always for the reforecast (S2S) data and,
# if reanalysis data is used, for the ERA5 data as well.
s2s_time_as_datetime = pd.to_datetime(df_input_data_s2s[time_column_name_input_data_s2s])
df_input_data_s2s[time_column_name_input_data_s2s] = s2s_time_as_datetime
if config['use_reanalysis_data']:
    era5_time_as_datetime = pd.to_datetime(df_input_data_era5[time_column_name_input_data_era5])
    df_input_data_era5[time_column_name_input_data_era5] = era5_time_as_datetime


In [None]:
# Find the dates which are present in the S2S reforecasts ensemble and the reanalysis data. Consider thereby the
# lead time since for the S2S reforecasts only the valid date is given and not the initial date.
#
# Result: `joint_dates` holds one entry per ERA5 row. The entry is the matched date (S2S valid date
# minus the lead time) if the ERA5 row matches the S2S row currently pointed to by `l`, else NaN.
if config['use_reanalysis_data']:
    era5_dates = df_input_data_era5[time_column_name_input_data_era5]
    s2s_valid_dates = df_input_data_s2s[time_column_name_input_data_s2s]
    number_of_s2s_dates = len(s2s_valid_dates)
    lead_time = timedelta(days=config['lead_time'])

    # A set gives constant-time membership tests inside the loop (a list made the loop quadratic).
    dates_era5 = set(era5_dates)

    joint_dates = []
    l = 0  # Position of the S2S row that is currently looked for in the ERA5 data.

    for era5_date in era5_dates:
        date_with_lead_time_considered = s2s_valid_dates.iloc[l] - lead_time

        if era5_date == date_with_lead_time_considered:
            # Match: keep this ERA5 row and continue with the next S2S row.
            joint_dates.append(date_with_lead_time_considered)
            move_to_next_s2s_date = True
        else:
            # No match: this ERA5 row is discarded. The S2S pointer only stays in place if the
            # date looked for exists somewhere in the ERA5 data; otherwise that S2S row is skipped.
            # NOTE(review): when an S2S row is skipped, the current ERA5 row is not compared
            # against the next S2S row - this is kept as in the original logic.
            joint_dates.append(np.nan)
            move_to_next_s2s_date = date_with_lead_time_considered not in dates_era5

        if move_to_next_s2s_date:
            # Wrap around to the first S2S row after the last one has been used.
            l = (l + 1) % number_of_s2s_dates

In [None]:
# Only when reanalysis data is used: attach the joint dates as a helper column, remove every
# row that contains a missing value (this includes the rows without a joint date) and
# remove the helper column again.
if config['use_reanalysis_data']:
    df_input_data_era5['joint_dates'] = joint_dates
    df_input_data_era5 = df_input_data_era5.dropna().drop(columns=['joint_dates'])

In [None]:
# Build the dataframe of predictors (`df_input_data`) and set the name of its time column.
# With reanalysis data, the ERA5 predictors are appended column-wise to the S2S predictors;
# without it, the S2S predictors are used on their own. Defining both names in every case
# is required because all following cells rely on them.
# A copy is used so that the S2S dataframe itself is not modified when columns are added.
df_input_data = df_input_data_s2s.copy()
time_column_name_input_data = time_column_name_input_data_s2s

if config['use_reanalysis_data']:
    df_input_data_era5 = df_input_data_era5.drop(time_column_name_input_data_era5, axis=1)
    columns_era5 = df_input_data_era5.columns

    # The ERA5 values are assigned by row position (not by index label), so both
    # dataframes must have the same number of rows.
    if len(df_input_data_era5) != len(df_input_data):
        raise ValueError('ERA5 and S2S input data differ in length ('
                         +str(len(df_input_data_era5))+' vs. '+str(len(df_input_data))
                         +' rows) and cannot be combined.')

    for k in columns_era5:
        df_input_data[k] = np.array(df_input_data_era5[k])

#### Select only the dates which are present in both the ground truth data and the S2S reforecasts

In [None]:
# Restrict the ground truth to the evaluation period. The period starts on the configured
# start day of the first winter and ends on the configured end day in the year after the
# start year of the last winter; both limits are passed on as 'YYYY_MM_DD' strings.
start_evaluation_period = datetime(
    year=config['start_year_of_first_winter'],
    month=config['start_month_winter'],
    day=config['start_day_winter'],
)
end_evaluation_period = datetime(
    year=config['start_year_of_last_winter']+1,
    month=config['end_month_winter'],
    day=config['end_day_winter'],
)

start_evaluation_period_string = start_evaluation_period.strftime('%Y_%m_%d')
end_evaluation_period_string = end_evaluation_period.strftime('%Y_%m_%d')
df_ground_truth = truncate_data_by_date(
    df_ground_truth,
    time_column_name_ground_truth,
    start_evaluation_period_string,
    end_evaluation_period_string,
)

In [None]:
# Mark the ground truth dates which also occur in the input data. Both time columns are
# walked through in order: `l` points at the input date currently looked for and is moved
# on (wrapping around to 0 after the last input date) whenever a ground truth date matches
# it on the calendar day. Ground truth dates without a match get NaN.
ground_truth_dates = df_ground_truth[time_column_name_ground_truth]
input_dates = df_input_data[time_column_name_input_data]
number_of_input_dates = len(input_dates)

joint_dates = []
l = 0

for ground_truth_date in ground_truth_dates:
    day_ground_truth = ground_truth_date.strftime('%Y-%m-%d')
    day_input = input_dates.iloc[l].strftime('%Y-%m-%d')

    if day_ground_truth != day_input:
        joint_dates.append(np.nan)
        continue

    joint_dates.append(ground_truth_date)
    l = (l + 1) % number_of_input_dates


In [None]:
# Attach the joint dates as a helper column, remove every row that contains a missing
# value (this includes the rows without a joint date) and remove the helper column again.
df_ground_truth['joint_dates'] = joint_dates
df_ground_truth = df_ground_truth.dropna().drop(columns=['joint_dates'])

#### Setting the winters to be evaluated 

In [None]:
# Array with the start years of all winters in the evaluation period (first and last
# start year included). A leave-one-out cross-validation over these winters is used
# later to increase the amount of training data.
first_start_year = config['start_year_of_first_winter']
last_start_year = config['start_year_of_last_winter']
start_years_of_winter = np.arange(first_start_year, last_start_year+1)

#### Preparing the input data for the leave-one(-winter)-out cross-validation

In [None]:
# When performing a leave-one-out cross-validation approach, the training data needs to be
# different for every left-out winter of the evaluation period. To make cutting out a
# winter easy, the time columns of the ground truth and of the input data are converted
# to datetime objects and then used as the index of the respective dataframe.
ground_truth_time_as_datetime = pd.to_datetime(df_ground_truth[time_column_name_ground_truth])
df_ground_truth[time_column_name_ground_truth] = ground_truth_time_as_datetime
df_ground_truth = df_ground_truth.set_index(time_column_name_ground_truth)

input_data_time_as_datetime = pd.to_datetime(df_input_data[time_column_name_input_data])
df_input_data[time_column_name_input_data] = input_data_time_as_datetime
df_input_data = df_input_data.set_index(time_column_name_input_data)

#### Training of the RFC-models with a leave-one(-winter)-out cross validation
For each of these winters, a separate RFC model is trained; saving the model via `torch.save` is currently commented out in the code cell below.

In [None]:
# Here, the actual training takes place. To perform a leave-one-out cross-validation, the
# respective winter (plus the month before its start) is cut out of the training data
# timeseries (.loc[]). The remaining rows of the input data and the ground truth are
# converted to arrays and the Random Forest Classifier (RangerForestClassifier) is trained
# on them (fit()). This is done for every winter in the evaluation period separately.
# The impurity feature importances of every trained model are collected in
# `feature_importances`. Saving the models (torch.save) is currently disabled.
feature_importances = []

for start_year in start_years_of_winter:
    # Limits of the period to be left out. The month is shifted with a DateOffset instead of
    # subtracting 1 from the month number: this also works for winters starting in January
    # and for start days which do not exist in the previous month (e.g. the 31st).
    start_winter = pd.Timestamp(year=int(start_year), month=config['start_month_winter'], day=config['start_day_winter'])
    month_before_start_winter = start_winter - pd.DateOffset(months=1)
    end_winter = pd.Timestamp(year=int(start_year)+1, month=config['end_month_winter'], day=config['end_day_winter'])

    # Keep only the dates strictly before or strictly after the left-out period.
    is_training_date_X = (df_input_data.index < month_before_start_winter) | (df_input_data.index > end_winter)
    is_training_date_y = (df_ground_truth.index < month_before_start_winter) | (df_ground_truth.index > end_winter)

    # The time index is dropped so that only the variable columns remain. `df_X_train` is
    # used again after the loop to look up the predictor names.
    df_X_train = df_input_data.loc[is_training_date_X].reset_index(drop=True)
    df_y_train = df_ground_truth.loc[is_training_date_y].reset_index(drop=True)

    X_train = df_X_train.to_numpy()
    y_train = df_y_train.to_numpy()

    random_forest_classifier = RangerForestClassifier(n_estimators=1000, min_node_size=5, importance='impurity')
    # The ground truth array has a trailing axis of length 1 if there is one predictand
    # column; np.squeeze removes it before fitting.
    random_forest_classifier = random_forest_classifier.fit(X_train, np.squeeze(y_train))

    # Saving of the trained model is disabled here; uncomment the next line to re-enable it.
    #torch.save(random_forest_classifier, config['PATH_model']+'RFC_'+config['ground_truth']+'_'+config['location_ground_truth']+'_input_'+config['input_data']+'_'+config['location_input']+'_lead_'+str(config['lead_time'])+'d_without_'+str(start_year)+'_'+str(start_year+1)+'.pt')
    print('Model without Winter '+str(start_year)+'/'+str(start_year+1)+' Trained.')
    feature_importances.append(random_forest_classifier.feature_importances_)

In [None]:
# Show the shape of the collected feature importances (one entry per trained model).
np.asarray(feature_importances).shape

In [None]:
# Average the feature importances over all trained models (first axis) and show the
# shape of the result.
mean_feature_importance = np.asarray(feature_importances).mean(axis=0)
mean_feature_importance.shape

In [None]:
# Sort the mean feature importances in ascending order and keep the positions of the
# last 10 entries, i.e. of the 10 largest values (the largest one comes last).
ascending_order_of_importances = np.argsort(mean_feature_importance, axis=0)
indices_of_largest_feature_importances = ascending_order_of_importances[-10:]
indices_of_largest_feature_importances

In [None]:
# Pick the values and the column names belonging to the 10 largest feature importances.
# The names are taken from the columns of the last training dataframe of the loop above.
list_names_most_important_features = df_X_train.columns[indices_of_largest_feature_importances]
list_values_most_important_features = mean_feature_importance[indices_of_largest_feature_importances]

In [None]:
# Choose hatching and color of every bar from the name of its predictor:
#   - names containing 't2m'           -> hatch '/', color 'thistle'
#   - other names with more than one _ -> hatch '/', color 'cadetblue'
#   - all remaining names              -> hatch '.', color 'cadetblue'
# Presumably the number of underscores separates reanalysis from reforecast predictors -
# verify against the column naming of the input files.
list_hatching = []
list_colors = []

for feature_name in list_names_most_important_features:
    if 't2m' in feature_name:
        hatching, color = '/', 'thistle'
    elif feature_name.count('_') > 1:
        hatching, color = '/', 'cadetblue'
    else:
        hatching, color = '.', 'cadetblue'

    list_hatching.append(hatching)
    list_colors.append(color)

In [None]:
# Plot the most important feature importances in a bar plot and save it as a png-file.
import os

# The output directory can be set with the optional configuration entry 'PATH_plots'.
# If the entry is missing, the previously hard-coded directory is used as a fallback.
PATH_plots = config.get('PATH_plots', '/home/my6406/Desktop/RFs_with_ERA5_and_S2S_Reforecasts_as_Input/Data_and_Plots/Plots/')
ofile_plot = 'RFC_stat_all_s2s_ens_era5_'+str(config['lead_time'])+'d.png'

plt.bar(list_names_most_important_features, list_values_most_important_features, color=list_colors, hatch=list_hatching, edgecolor='k')
plt.xticks(rotation=45, ha='right')
# The features are sorted in ascending order; inverting the x-axis puts the most
# important feature on the left.
plt.gca().invert_xaxis()
plt.ylabel('Impurity Feature Importance Value')
plt.title('RFC_stat_all_s2s_ens_era5, '+str(config['lead_time'])+' d lead')
# Save before plt.show(), as in the original cell order.
plt.savefig(os.path.join(PATH_plots, ofile_plot), bbox_inches='tight')
plt.show()

In [None]:
# End of Program