# Creating a "Principal Component Analysis (PCA) of Fields" from the Meteorological Predictor Fields as Input for RF-based ML Models
Version 18 January 2024, Selina Kiefer

### Input: csv-files
continuous timeseries of meteorological predictors as 2d-fields in csv-format
### Output: csv-file
continuous timeseries of the first 10 principal components of the meteorological predictors per date in csv-format

#### Define the path and file names

In [None]:
# Directory that contains the project's helper modules (added to sys.path further below).
PATH_defined_functions = "./Defined_Functions/"

# Input: one csv-file per predictor field, all located in the same directory. The order of
# this list has to match the order of "desired_predictors" defined further below.
PATH_input_data = "./Data_in_csv_Format/"
ifiles_input_data = [
    "era5_u10_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_z100_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_z250_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_z500_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_z850_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_t850_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_H850_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_u300_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
    "era5_msl_60W_60E_20N_80N_1950_2020_only_Oct_Apr_lead_time_14d.csv",
]

# Output: a single csv-file holding the principal components of all predictor fields.
PATH_output_file = "./Data_in_csv_Format/"
file_name_output_file = "era5_pca_n10_u10_z100_z250_z500_z850_t850_H850_u300_msl_60W_60E_20N_80N_1950_2020_lead_time_14d.csv"



#### Import the necessary packages and functions

In [None]:
# Import the necessary python packages.
import yaml
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA 

In [None]:
# Import the necessary functions.
import sys
sys.path.insert(1,PATH_defined_functions)
from read_in_csv_data import *

#### List the predictors to be combined

In [None]:
# Name of the column holding the timestamps in every input file.
time_column_name = 'time'

# Size of the grid of one predictor field; one day consists of
# number_of_latitudes * number_of_longitudes values.
number_of_latitudes = 40
number_of_longitudes = 81

# Exactly one predictor column is taken from each input file, so this list runs parallel to
# "ifiles_input_data" (list an input file several times there if more than one of its columns
# is needed). Do not list the month here: a PCA over the month is not sensible, it is meant
# to be added to the dataframe at a later stage instead.
desired_predictors = [
    'u10',
    'z100',
    'z250',
    'z500',
    'z850',
    't850',
    'H850',
    'u300',
    'msl',
]

#### Decide how many principal components should be calculated

In [None]:
# Number of leading principal components that are kept per predictor field.
number_of_principle_components = 10

# One PCA object is created here and re-fitted for every predictor field further below.
# NOTE(review): no random_state is passed; if scikit-learn selects a randomized solver, the
# components may differ slightly between runs - confirm whether reproducibility is required.
pca = PCA(
    n_components=number_of_principle_components,
)

#### Perform the PCA for every predictor field separately and then combine the principal components in one dataframe

In [None]:
# One PCA is fitted per predictor field over the complete timeseries: every day is one sample
# and every grid point (latitude x longitude) is one feature. For this, each input file is
# read in, grouped by calendar day (year, month, day) and the predictor's column of each day
# is flattened into one row of a (days x grid points) matrix. The PCA is fitted on this matrix
# and the matrix is then projected onto the fitted components. The resulting values per day
# (one per principal component) are written into the columns "<predictor>_n1" ...
# "<predictor>_n<number_of_principle_components>" of the output dataframe.
df_input_data_pca = pd.DataFrame()

# The input files and the predictors are processed pairwise, so both lists must match.
if len(ifiles_input_data) != len(desired_predictors):
    raise ValueError(
        'ifiles_input_data and desired_predictors must have the same length, got '
        f'{len(ifiles_input_data)} and {len(desired_predictors)}.'
    )

number_of_grid_points = number_of_latitudes * number_of_longitudes

for ifile_input_data, predictor in zip(ifiles_input_data, desired_predictors):
    # "df_input_data_one_variable" (indexed by time) of the last file is re-used after this
    # loop to rebuild the daily timeseries, so this name has to stay as it is.
    df_input_data_one_variable = read_in_csv_data(PATH_input_data, ifile_input_data)
    df_input_data_one_variable[time_column_name] = pd.to_datetime(df_input_data_one_variable[time_column_name])
    df_input_data_one_variable = df_input_data_one_variable.set_index(time_column_name)

    # Group all rows belonging to the same calendar day. Iterating over the groups yields
    # (key, dataframe of that day) pairs in sorted key order.
    time_index = df_input_data_one_variable.index
    days = df_input_data_one_variable.groupby([time_index.year, time_index.month, time_index.day])

    # One row per day. .reshape() raises a ValueError if a day does not contain exactly one
    # value per grid point, which guards against incomplete or duplicated days.
    field_one_variable_all_days = np.array([
        df_one_day[predictor].to_numpy().reshape(number_of_grid_points)
        for _, df_one_day in days
    ]).reshape((-1, number_of_grid_points))

    # Fit the PCA once on this predictor field and then project the field onto the fitted
    # components (shape of the result: days x number_of_principle_components).
    pca.fit(field_one_variable_all_days)
    field_one_variable_all_days_transformed = pca.transform(field_one_variable_all_days)

    for component in range(number_of_principle_components):
        column_name = f'{predictor}_n{component + 1}'
        df_input_data_pca[column_name] = field_one_variable_all_days_transformed[:, component]


#### Add the time information again to the reshaped data

In [None]:
# The PCA dataframe carries no time information, so the timestamps are recovered from the
# dataframe of the predictor that was read in last. They are stored in a separate dataframe
# together with three additional columns holding the year, month and day of each timestamp.
# NOTE(review): this presumably relies on all input files covering the same dates - confirm.
df_input_data_one_variable = df_input_data_one_variable.reset_index()
timestamps = pd.to_datetime(df_input_data_one_variable[time_column_name])
df_time = pd.DataFrame({
    time_column_name: timestamps,
    'year': timestamps.dt.year,
    'month': timestamps.dt.month,
    'day': timestamps.dt.day,
})


In [None]:
# Collapse the time dataframe to one row per calendar day: all rows of the same day are
# grouped and 'averaged' (year, month and day are constant within a group). The values are
# then converted to integers and finally to strings.
df_time = df_time.set_index(time_column_name)
date_index = df_time.index
calendar_day_keys = [date_index.year, date_index.month, date_index.day]
daily_groups = df_time.groupby(calendar_day_keys, as_index=False)
ds_time_mean = daily_groups.mean().astype(int).astype(str)
df_time_mean = pd.DataFrame(ds_time_mean)


In [None]:
# Join year, month and day of every row into one date string of the form "year-month-day"
# (nothing needs to be changed here).
year_text = df_time_mean['year'].astype(str)
month_text = df_time_mean['month'].astype(str)
day_text = df_time_mean['day']
daily_timeseries = (year_text + '-' + month_text + '-' + day_text).astype(str)


In [None]:
# Put the daily date strings in front of the principal component columns. Adding the month
# as a separate column is currently disabled (see the commented-out line).
#df_input_data_pca.insert(0, 'month', df_time['month'])
df_input_data_pca.insert(loc=0, column=time_column_name, value=daily_timeseries)


#### Double-check the representation of the data

In [None]:
# Show the first five rows to verify the time column and the principal component columns.
df_input_data_pca.head(n=5)

In [None]:
# Show the last five rows as well to verify that the end of the dataframe looks as expected.
df_input_data_pca.tail(n=5)

#### Save the data in csv format

In [None]:
# Write the dataframe with the time and all principal components to the output csv-file.
# The dataframe's index is written as well (default behaviour of .to_csv()).
output_file = f'{PATH_output_file}{file_name_output_file}'
df_input_data_pca.to_csv(output_file)

In [None]:
# End of Program