# Creating "PCA of field" from the Meteorological Predictor Fields as Input for RF-based ML-Models
Version 19 January 2024, Selina Kiefer

### Input: csv-files
continuous timeseries of S2S reforecasts' meteorological predictors as 2d-fields in csv-format
### Output: csv-files
continuous time series of the first 10 principal components of the meteorological predictors, per date in csv-format: one file with the principal components of the separate ensemble members and one file with the ensemble mean and variance of these

#### Define the paths' and files' names 

In [None]:
# Directory that holds the project's own helper modules.
PATH_defined_functions = './Defined_Functions/'

# Predictor fields: one csv-file per field. All file names share the same domain,
# period and lead time, so they are assembled from the field tags below.
PATH_input_data = './Data_in_csv_Format/'
input_field_tags = ('u10', 'z100', 'z300', 'z500', 'z850', 't850', 'H850', 'u300', 'msl')
domain_period_lead_tag = '60W_60E_20N_80N_2000_2020_lead_time_14d'
ifiles_input_data = [
    's2s_reforecasts_' + field_tag + '_' + domain_period_lead_tag + '.csv'
    for field_tag in input_field_tags
]

# The 2-meter temperature (= target variable of the forecast) of the S2S reforecasts
# is added as a predictor as well.
PATH_target_variable = './Data_in_csv_Format/'
ifile_target_variable = 'S2S_Reforecast_Ensemble_Lead_Time_14d_2000_2020.csv'

# Output: one file for the separate ensemble members and one file for the condensed
# ensemble information. Both names list all fields followed by the target variable.
PATH_output_file = './Data_in_csv_Format/'
all_field_tags = '_'.join(input_field_tags + ('t2m',))
file_name_output_file_separate_fields = (
    's2s_reforecasts_pca_n10_separate_fields_' + all_field_tags + '_' + domain_period_lead_tag + '.csv'
)
file_name_output_file_ensemble_information = (
    's2s_reforecasts_pca_n10_ensemble_info_' + all_field_tags + '_' + domain_period_lead_tag + '.csv'
)

#### Import the necessary packages and functions

In [None]:
# Import the necessary python packages.
import yaml
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA 

In [None]:
# Import the necessary functions.
import sys
# Make the directory with the project's own modules importable. Inserting at
# position 1 leaves the first entry of sys.path in front of it.
sys.path.insert(1,PATH_defined_functions)
# The star import is expected to provide read_in_csv_data(), which the following
# cells call with a directory and a file name.
from read_in_csv_data import *

#### Read in the input data (one file as an example)

In [None]:
# Load the first predictor file as an example. The columns 'index' and 'Unnamed: 0'
# are removed and the rows are renumbered; reset_index() keeps the previous row
# labels in a new 'index' column. The last line displays the dataframe.
df_input_data = (
    read_in_csv_data(PATH_input_data, ifiles_input_data[0])
    .drop(columns=['index', 'Unnamed: 0'])
    .reset_index()
)
df_input_data

#### Read in the target variable

In [None]:
# Load the target variable file. The columns 'index' and 'Unnamed: 0' are removed
# and the rows are renumbered; reset_index() keeps the previous row labels in a new
# 'index' column. The last line displays the dataframe.
df_target_variable = (
    read_in_csv_data(PATH_target_variable, ifile_target_variable)
    .drop(columns=['index', 'Unnamed: 0'])
    .reset_index()
)
df_target_variable

####  List the predictors to be combined

In [None]:
# Name of the target variable and of the time column in the csv-files. The time is
# needed because the data is grouped by date later.
target_variable = 't2m'
time_column_name = 'valid_time'

# Every entry pairs the column name used in the csv-files with the name the predictor
# gets in the output. The first entries (time, member number, latitude, longitude and
# the first field) come from the first dataframe; from every other dataframe only
# 1 predictor is taken (if more are needed, list these input files multiple times in
# "ifiles_input_data").
predictor_columns_and_names = [
    ('valid_time', 'time'),
    ('number', 'number'),
    ('latitude', 'latitude'),
    ('longitude', 'longitude'),
    ('u', 'u10'),
    ('gh', 'z100'),
    ('gh', 'z300'),
    ('gh', 'z500'),
    ('gh', 'z850'),
    ('t', 't850'),
    ('q', 'H850'),
    ('u', 'u300'),
    ('msl', 'msl'),
]
desired_predictors = [column for column, _ in predictor_columns_and_names]
desired_predictor_names = [name for _, name in predictor_columns_and_names]
number_of_predictors_in_first_dataframe = 5

# Size of one predictor field (grid points per latitude and longitude direction).
number_of_latitudes = 41
number_of_longitudes = 81

#### Decide how many principal components should be calculated

In [None]:
# Set how many components should be used by the Principal Components Analysis (PCA). 
number_of_principle_components = 10
# One PCA estimator is created here and re-used: the loop in the next code cell
# fits it anew for every predictor file and every ensemble member.
# NOTE(review): no random_state is passed; if scikit-learn picks its randomized
# solver for these inputs, the results may differ slightly between runs — confirm.
pca = PCA(n_components=number_of_principle_components)

#### Calculate the PCA of the predictor fields

In [None]:
# For every predictor field and every ensemble member, one PCA is fitted over all days.
# Each input file is read in and split by ensemble member (column 'number'). The rows of
# one member are grouped by calendar date (.groupby() on year, month and day). For every
# date, the predictor's column is converted into a numpy array and reshaped into the
# dimensions of a field (.reshape(latitude, longitude)), which fails if a date does not
# contain exactly one value per grid point. All daily fields of the member are then
# flattened into a matrix with one row per date and one column per grid point, on which
# the PCA is fitted (pca.fit_transform). The resulting scores, i.e. the projections of
# every daily field onto the leading principal components (not the singular values), are
# stored as the columns '<predictor>_<member>_n<component>' of "df_input_data_pca".
number_of_ensemble_members = 11
number_of_grid_points = number_of_latitudes * number_of_longitudes

# All score columns are collected first and the dataframe is built once at the end. This
# avoids inserting several hundred columns one by one into an existing dataframe.
pca_scores_by_column_name = {}

for i, ifile_input_data in enumerate(ifiles_input_data):
    # Consider every predictor separately. The first four entries of the predictor lists
    # (time, member number, latitude, longitude) do not correspond to an input file.
    predictor_column = desired_predictors[i + 4]
    predictor_name = desired_predictor_names[i + 4]
    df_input_data_one_variable = read_in_csv_data(PATH_input_data, ifile_input_data)

    # Parse the time stamps once per file instead of once per ensemble member.
    df_input_data_one_variable[time_column_name] = pd.to_datetime(df_input_data_one_variable[time_column_name])

    # Also consider every ensemble member separately.
    for m in range(number_of_ensemble_members):
        # Select the member from the file that is processed right now (not from the
        # example dataframe of the first file). Rows with missing values are dropped.
        is_member = df_input_data_one_variable['number'] == m
        df_input_data_one_variable_one_member = df_input_data_one_variable[is_member].dropna()
        df_input_data_one_variable_one_member = df_input_data_one_variable_one_member.set_index(time_column_name)

        # Consider every date separately. The groups are iterated in sorted date order.
        member_dates = df_input_data_one_variable_one_member.index
        ds_input_data_one_variable_grouped = df_input_data_one_variable_one_member.groupby(
            [member_dates.year, member_dates.month, member_dates.day]
        )

        field_one_variable_all_days = []
        for _, df_input_data_one_variable_one_day in ds_input_data_one_variable_grouped:
            field_one_variable_one_day = df_input_data_one_variable_one_day[predictor_column].to_numpy()

            # Reshape the data into a 2d representation, datewise. This raises a
            # ValueError if the date does not hold exactly one value per grid point.
            field_one_variable_one_day = field_one_variable_one_day.reshape(number_of_latitudes, number_of_longitudes)
            field_one_variable_all_days.append(field_one_variable_one_day)

        # Reshape the data again to perform the PCA for all days (one row per date).
        field_one_variable_all_days = np.array(field_one_variable_all_days).reshape((-1, number_of_grid_points))

        # fit_transform() already returns the scores of the data the PCA was fitted on,
        # a second call of transform() on the same data is not needed.
        field_one_variable_all_days_transformed = pca.fit_transform(field_one_variable_all_days)

        # Keep the scores of the first principal components under their column names.
        for component in range(number_of_principle_components):
            column_name = f'{predictor_name}_{m}_n{component + 1}'
            pca_scores_by_column_name[column_name] = field_one_variable_all_days_transformed[:, component]

# Build the dataframe with one row per date and one column per predictor, member and
# principal component (in the order in which the columns were calculated).
df_input_data_pca = pd.DataFrame(pca_scores_by_column_name)

#### Add the month to the predictors

In [None]:
# Parse the valid time of the target variable data and keep it for later use.
time = pd.to_datetime(df_target_variable['valid_time'])
df_target_variable['valid_time'] = time

# The month is read from a temporary datetime index. Turning the index back into a
# column afterwards makes 'valid_time' a regular column again.
df_target_variable_by_time = df_target_variable.set_index('valid_time')
month = df_target_variable_by_time.index.month
df_target_variable = df_target_variable_by_time.reset_index()

In [None]:
# Add the month as predictor. "month" was derived from the time stamps of the target
# variable data, so this assumes that the rows of both dataframes refer to the same
# dates in the same order — TODO confirm.
df_input_data_pca['month'] = month

#### Add every ensemble member of the target variable to the predictors

In [None]:
# Copy the target variable of every ensemble member into the dataframe with the
# principal components. The member columns of the target variable data are read
# under the names '0.0', '1.0', ... and stored as '<target variable>_<member>'.
for member in range(11):
    source_column = f'{member}.0'
    df_input_data_pca[f'{target_variable}_{member}'] = df_target_variable[source_column]

#### Add the time information again to the reshaped data

In [None]:
# Put the valid time in front of all predictors as the first column 'time' of the
# dataframe containing the principal components of the separate ensemble members.
df_input_data_pca.insert(loc=0, column='time', value=time)

#### Doublecheck the representation of the data

In [None]:
# Display the first rows of the dataframe (notebook cell output) to check if
# everything is reshaped and sorted correctly.
df_input_data_pca.head()

In [None]:
# Display the last rows of the dataframe (notebook cell output) to also check if
# everything is sorted, renamed or removed correctly at the end of the dataframe.
df_input_data_pca.tail()

#### Save the first 10 principal components of every separate ensemble member in csv format

In [None]:
# Save the dataframe with the principal components of the separate ensemble members
# in csv-format. The output file name defined for this dataframe is
# "file_name_output_file_separate_fields" (a name "file_name_output_file" does not
# exist and would raise a NameError).
df_input_data_pca.to_csv(PATH_output_file+file_name_output_file_separate_fields)

#### Calculate the ensemble mean and variance of the first 10 principal components of the ensemble members

In [None]:
# Condense the information of the ensemble by taking the mean and variance of every
# principal component over the ensemble members, separately for every predictor.
# The columns of the separate members are named '<predictor>_<member>_n<component>'
# (e.g. 'u10_0_n1'); the results are stored as 'mean_n<component>_<predictor>' and
# 'var_n<component>_<predictor>'.
import re

# The statistics are collected first and the dataframe is built once at the end.
ensemble_statistics_by_column_name = {}

# The first four entries of the predictor names (time, number, latitude, longitude)
# are no predictor fields.
for predictor_name in desired_predictor_names[4:]:
    for component in range(1, number_of_principle_components + 1):
        # The pattern is anchored at both ends so that e.g. component 'n1' does not
        # select the columns of component 'n10' as well.
        member_column_pattern = '^' + re.escape(predictor_name) + r'_\d+_n' + str(component) + '$'
        selection = df_input_data_pca.filter(regex=member_column_pattern)

        # Without this check, a naming mismatch would silently produce columns
        # which only contain NaN.
        if selection.shape[1] == 0:
            raise ValueError(
                'No ensemble member columns found for predictor ' + predictor_name
                + ' and principal component ' + str(component)
            )

        column_suffix = 'n' + str(component) + '_' + predictor_name
        ensemble_statistics_by_column_name['mean_' + column_suffix] = selection.mean(axis=1)
        # pandas' default is used for the variance (normalized by N-1).
        ensemble_statistics_by_column_name['var_' + column_suffix] = selection.var(axis=1)

df_input_data_pca_ensemble = pd.DataFrame(ensemble_statistics_by_column_name)


#### Add the month to the condensed predictors

In [None]:
# Add the month as predictor. "month" was derived from the time stamps of the target
# variable data, so this assumes that the rows of both dataframes refer to the same
# dates in the same order — TODO confirm.
df_input_data_pca_ensemble['month'] = month

#### Add the statistics of the target variable

In [None]:
# Remove the columns 'index' and 'valid_time' from the target variable data, so that
# the row-wise statistics below are calculated over the remaining columns only
# (presumably one column per ensemble member — verify against the input file).
df_target_variable = df_target_variable.drop(columns=['index', 'valid_time'])

# Add minimum, mean, maximum and variance of the target variable per row to the
# dataframe containing the statistics of the fields.
target_variable_statistics = {
    'min_': df_target_variable.min(axis=1),
    'mean_': df_target_variable.mean(axis=1),
    'max_': df_target_variable.max(axis=1),
    'var_': df_target_variable.var(axis=1),
}
for statistic_prefix, statistic_values in target_variable_statistics.items():
    df_input_data_pca_ensemble[statistic_prefix + target_variable] = statistic_values

#### Add the time information again to the predictors

In [None]:
# Put the valid time in front of all predictors as the first column 'time' of the
# dataframe containing the ensemble statistics.
df_input_data_pca_ensemble.insert(loc=0, column='time', value=time)

#### Doublecheck the representation of the data

In [None]:
# Check the beginning of the dataframe: display its first rows (notebook cell output).
df_input_data_pca_ensemble.head()

In [None]:
# Check the end of the dataframe: display its last rows (notebook cell output).
df_input_data_pca_ensemble.tail()

#### Save the ensemble mean and variance of the first 10 principal components of the ensemble members in csv format

In [None]:
# Write the dataframe with the ensemble statistics to the output directory in
# csv-format.
output_file_ensemble_information = PATH_output_file + file_name_output_file_ensemble_information
df_input_data_pca_ensemble.to_csv(output_file_ensemble_information)

In [None]:
# End of Program