# Creating "Statistics of field" from the Meteorological Predictor Fields as Input for RF-based ML-Models
Version 19 January 2024, Selina Kiefer

### Input: csv-files
continuous timeseries of S2S reforecasts' meteorological predictors as 2d-fields in csv-format
### Output: csv-files
two continuous time series per date in csv-format: (1) the minimum, mean, maximum and variance of every meteorological predictor field for each ensemble member separately, and (2) the overall ensemble information condensed from these per-member statistics

#### Define the path and file names

In [None]:
# --- Configuration: locations and names of all files used by this notebook ---

# Directory that holds the user-defined helper modules (added to sys.path further below).
PATH_defined_functions = './Defined_Functions/'

# Directory and names of the predictor files. The first file additionally provides the
# time, ensemble member number and grid coordinates; every further file contributes
# exactly one predictor.
PATH_input_data = './Data_in_csv_Format/'
ifiles_input_data = [
    's2s_reforecasts_u10_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_z100_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_z300_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_z500_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_z850_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_t850_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_H850_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_u300_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
    's2s_reforecasts_msl_60W_60E_20N_80N_2000_2020_lead_time_14d.csv',
]

# The 2-meter temperature of the S2S reforecasts (= target variable of the forecast)
# is added as a further predictor; it is stored in its own file.
PATH_target_variable = './Data_in_csv_Format/'
ifile_target_variable = 'S2S_Reforecast_Ensemble_Lead_Time_14d_2000_2020.csv'

# Two output files are written: one with the statistics of every separate ensemble
# member and one with the condensed ensemble information.
PATH_output_file = './Data_in_csv_Format/'
file_name_output_file_separate_fields = 's2s_reforecasts_statistics_separate_fields_u10_z100_z300_z500_z850_t850_H850_u300_msl_t2m_60W_60E_20N_80N_2000_2020_lead_time_14d.csv'
file_name_output_file_ensemble_information = 's2s_reforecasts_statistics_ensemble_info_u10_z100_z300_z500_z850_t850_H850_u300_msl_t2m_60W_60E_20N_80N_2000_2020_lead_time_14d.csv'

#### Import the necessary packages and functions

In [None]:
# Import the necessary python packages.
import yaml
import pandas as pd
import numpy as np

In [None]:
# Import the necessary functions.
import sys
sys.path.insert(1,PATH_defined_functions)
from read_in_csv_data import *

#### Read in the input data (one file as an example)

In [None]:
# Load the first predictor file as an example, remove its 'index' and 'Unnamed: 0'
# columns and renumber the rows (reset_index() stores the previous index in a fresh
# 'index' column). The bare name on the last line makes the notebook display the result.
df_input_data = (
    read_in_csv_data(PATH_input_data, ifiles_input_data[0])
    .drop(columns=['index', 'Unnamed: 0'])
    .reset_index()
)
df_input_data

#### Read in the target variable

In [None]:
# Load the file with the target variable, remove its 'index' and 'Unnamed: 0' columns
# and renumber the rows (reset_index() stores the previous index in a fresh 'index'
# column, which is dropped again further below). The bare name on the last line makes
# the notebook display the result.
df_target_variable = (
    read_in_csv_data(PATH_target_variable, ifile_target_variable)
    .drop(columns=['index', 'Unnamed: 0'])
    .reset_index()
)
df_target_variable

#### List the predictors to be combined

In [None]:
# Name of the target variable and of the time column in the combined dataframe.
target_variable = 't2m'
time_column_name = 'time'

# The first "number_of_predictors_in_first_dataframe" entries below are taken from the
# first input file; every further entry is taken from the next file listed in
# "ifiles_input_data" (one predictor per file - if more are needed, list that input file
# multiple times). The time has to be part of the predictors because the data is
# grouped by date later.
number_of_predictors_in_first_dataframe = 5

# Each pair is (column name in the input file, column name in the combined dataframe).
_predictor_columns = [
    ('valid_time', 'time'),
    ('number', 'number'),
    ('latitude', 'latitude'),
    ('longitude', 'longitude'),
    ('u', 'u10'),
    ('gh', 'z100'),
    ('gh', 'z300'),
    ('gh', 'z500'),
    ('gh', 'z850'),
    ('t', 't850'),
    ('q', 'H850'),
    ('u', 'u300'),
    ('msl', 'msl'),
]
desired_predictors = [source_name for source_name, _ in _predictor_columns]
desired_predictor_names = [output_name for _, output_name in _predictor_columns]

#### Combine all predictors into one dataframe

In [None]:
# Start the combined dataframe with the predictors that come from the first input file:
# each selected column is copied under its new name.
df_input_data = read_in_csv_data(PATH_input_data, ifiles_input_data[0])
df_combined_input_data = pd.DataFrame()
# NOTE: the loop variable has to stay "i" - the following cell reads its final value
# to continue at the right position in the predictor lists.
for i in range(number_of_predictors_in_first_dataframe):
    source_column = desired_predictors[i]
    output_column = desired_predictor_names[i]
    df_combined_input_data[output_column] = df_input_data[source_column]

In [None]:
# From all other dataframes but the one with the target variable, the specified predictor
# is added to the combined dataframe: the k-th additional file provides the predictor at
# position "number_of_predictors_in_first_dataframe + k" of the predictor lists.
# The position is derived from the configuration value instead of a loop variable left
# over from another cell, so this cell gives the same result when it is re-run on its own.
for file_offset, ifile_input_data in enumerate(ifiles_input_data[1:]):
    predictor_position = number_of_predictors_in_first_dataframe + file_offset
    df_input_data = read_in_csv_data(PATH_input_data, ifile_input_data)
    df_combined_input_data[desired_predictor_names[predictor_position]] = df_input_data[desired_predictors[predictor_position]]

#### Calculate the statistics (minimum, mean, maximum and variance) of the predictor fields for each ensemble member separately

In [None]:
# For every ensemble member separately, the data is grouped by date (year, month, day) and
# the minimum, mean, maximum and variance of each predictor field are calculated per date.
# The results are collected in "df_statistics" with one column per combination, named
# '<statistic>_<predictor>_<member>' (e.g. 'min_u10_0').
number_of_ensemble_members = 11

# Statistics are only calculated for the predictor fields. The time is the grouping key,
# and statistics of the member number and the grid coordinates are not meaningful.
columns_without_statistics = (time_column_name, 'number', 'latitude', 'longitude')
field_names = [name for name in desired_predictor_names if name not in columns_without_statistics]

statistic_columns = {}
for member in range(number_of_ensemble_members):
    # Take every ensemble member separately. The member number is read from the combined
    # dataframe itself (not from whichever input file happened to be read last), and
    # rows containing missing values are discarded.
    is_member = df_combined_input_data['number'] == member
    df_combined_input_data_one_member = df_combined_input_data[is_member].dropna()

    # Group the predictor fields by calendar day. The grouping is set up once and reused
    # for all four statistics; as_index=False gives every result a plain 0..n-1 row index.
    member_dates = pd.DatetimeIndex(pd.to_datetime(df_combined_input_data_one_member[time_column_name]))
    grouped_by_date = df_combined_input_data_one_member[field_names].groupby(
        [member_dates.year, member_dates.month, member_dates.day], as_index=False)
    statistics_of_member = {
        'min': grouped_by_date.min(),
        'mean': grouped_by_date.mean(),
        'max': grouped_by_date.max(),
        'var': grouped_by_date.var(),
    }

    # Collect the results: per predictor the columns are ordered min, mean, max, var.
    for field_name in field_names:
        for statistic_name, statistic_values in statistics_of_member.items():
            statistic_columns[statistic_name+'_'+field_name+'_'+str(member)] = statistic_values[field_name]

# Build the dataframe in one step; adding several hundred columns one at a time
# fragments the dataframe and is slow.
df_statistics = pd.concat(statistic_columns, axis=1)

#### Add the month to the predictors

In [None]:
# Convert the valid time of the target variable to datetimes and keep it as "time".
valid_time_as_datetime = pd.to_datetime(df_target_variable['valid_time'])
df_target_variable['valid_time'] = valid_time_as_datetime
time = df_target_variable['valid_time']

# Read the month of every row from a datetime index, then turn the index back into a
# regular column (reset_index() puts 'valid_time' back as the first column).
df_target_variable_by_time = df_target_variable.set_index('valid_time')
month = df_target_variable_by_time.index.month
df_target_variable = df_target_variable_by_time.reset_index()

In [None]:
# Append the month of each date as an additional predictor column. "month" is an index
# object, so its values are assigned row by row in order.
month_column_name = 'month'
df_statistics[month_column_name] = month

#### Add every ensemble member of the target variable to the predictors

In [None]:
# Copy the target variable of each of the 11 ensemble members into the dataframe with the
# field statistics. In the target variable file the member columns are named '0.0',
# '1.0', ...; here they become '<target_variable>_0', '<target_variable>_1', ...
for member in range(11):
    source_column = str(member)+'.0'
    output_column = target_variable+'_'+str(member)
    df_statistics[output_column] = df_target_variable[source_column]

#### Add the time information again to the reshaped data

In [None]:
# Put the time in front of all statistics columns (nothing needs to be changed here).
df_statistics.insert(loc=0, column='time', value=time)

#### Double-check the representation of the data

In [None]:
# Display the first five rows to check that everything is reshaped and sorted correctly.
df_statistics.head(n=5)

In [None]:
# Display the last five rows to check that everything is sorted, renamed or removed
# correctly at the end of the dataframe as well.
df_statistics.tail(n=5)

#### Save the statistics of every separate ensemble member in csv format

In [None]:
# Write the statistics of every separate ensemble member to a csv-file in the output
# directory.
output_file_separate_fields = PATH_output_file+file_name_output_file_separate_fields
df_statistics.to_csv(output_file_separate_fields)

#### Calculate the ensemble information of the statistics of the predictor fields

In [None]:
# Condense the information of the ensemble by calculating the minimum and variance of the
# minima of all ensemble members, the mean and variance of the means and the variances of
# all ensemble members and the maximum and the variance of the maxima of all ensemble
# members.
def _select_member_columns(statistic_name, field_name):
    """Return the columns of df_statistics named '<statistic_name>_<field_name>_<member>'.

    The columns are matched by their exact prefix followed by the member number. A regex
    such as 'min_u10*' is not used because '*' is a quantifier there (not a wildcard) and
    the search is unanchored, so it would also select columns of other predictors whose
    names merely share a common beginning.
    """
    prefix = statistic_name+'_'+field_name+'_'
    member_columns = [column for column in df_statistics.columns
                      if str(column).startswith(prefix) and str(column)[len(prefix):].isdigit()]
    return df_statistics[member_columns]


df_statistics_ensemble = pd.DataFrame()

# The first four predictor names (time, number, latitude, longitude) have no statistics.
for field_name in desired_predictor_names[4:]:
    selection = _select_member_columns('min', field_name)
    df_statistics_ensemble['min_min_'+field_name] = selection.min(axis=1)
    df_statistics_ensemble['var_min_'+field_name] = selection.var(axis=1)

    selection = _select_member_columns('mean', field_name)
    df_statistics_ensemble['mean_mean_'+field_name] = selection.mean(axis=1)
    df_statistics_ensemble['var_mean_'+field_name] = selection.var(axis=1)

    selection = _select_member_columns('max', field_name)
    df_statistics_ensemble['max_max_'+field_name] = selection.max(axis=1)
    df_statistics_ensemble['var_max_'+field_name] = selection.var(axis=1)

    selection = _select_member_columns('var', field_name)
    df_statistics_ensemble['mean_var_'+field_name] = selection.mean(axis=1)
    df_statistics_ensemble['var_var_'+field_name] = selection.var(axis=1)


#### Add the month

In [None]:
# Append the month of each date as an additional predictor column. "month" is an index
# object, so its values are assigned row by row in order.
month_column_name = 'month'
df_statistics_ensemble[month_column_name] = month

#### Add the statistics of the target variable

In [None]:
# Remove the bookkeeping columns so that only the remaining columns of the target
# variable enter the row-wise statistics.
df_target_variable = df_target_variable.drop(columns=['index', 'valid_time'])

# Calculate minimum, mean, maximum and variance across the columns of every row and add
# them to the dataframe containing the ensemble information of the fields.
target_variable_statistics = {
    'min_': df_target_variable.min(axis=1),
    'mean_': df_target_variable.mean(axis=1),
    'max_': df_target_variable.max(axis=1),
    'var_': df_target_variable.var(axis=1),
}
for statistic_prefix, statistic_values in target_variable_statistics.items():
    df_statistics_ensemble[statistic_prefix+target_variable] = statistic_values

#### Add the time information again to the data

In [None]:
# Put the time in front of all columns of the ensemble information.
df_statistics_ensemble.insert(loc=0, column='time', value=time)

#### Double-check the representation of the data

In [None]:
# Display the first five rows to check that everything is reshaped and sorted correctly.
df_statistics_ensemble.head(n=5)

In [None]:
# Display the last five rows to check that everything is sorted, renamed or removed
# correctly at the end of the dataframe as well.
df_statistics_ensemble.tail(n=5)

#### Save the ensemble information of the statistics of every separate ensemble member in csv format

In [None]:
# Write the ensemble information of the statistics to a csv-file in the output directory.
output_file_ensemble_information = PATH_output_file+file_name_output_file_ensemble_information
df_statistics_ensemble.to_csv(output_file_ensemble_information)

In [None]:
# End of Program