 # Create an S2S Reforecast Ensemble
 Version 18 January 2024, Selina Kiefer

### Input: grib-files
S2S reforecasts at a specific time step in grib-format (e.g. from https://apps.ecmwf.int/datasets/data/s2s-reforecasts-daily-averaged-ecmf/levtype=sfc/type=cf/)
### Output: csv-file, png-files
S2S reforecast ensemble consisting of the control run and all perturbed runs in csv-format and plotted in png-format

#### Set the paths to the defined functions, the style sheet for plotting and the configuration file, and set the name of the configuration file

In [None]:
# Directory in which the self-defined functions of this project are stored.
PATH_defined_functions = "./Defined_Functions/"

In [None]:
# Matplotlib style sheet (path and file name) which is applied to all plots.
style_file_for_plotting = "./Style_File_Matplotlib.mplstyle"

In [None]:
# Set the path and name of the configuration file. The directory is given relative to the
# notebook (like the other paths set above) instead of relative to the file system root.
PATH_configurations = './Configuration_Files/'
ifile_configurations = 'Configurations_S2S_Reforecasts_Ensemble.yaml'

#### Import the necessary python packages and functions

In [None]:
# Import the necessary python packages.
import cfgrib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
import yaml
from tqdm.notebook import tqdm

In [None]:
# Read in the necessary defined functions.
import sys
# Put the directory with the self-defined functions on the module search path. Position 1
# keeps the first entry of sys.path in front of it.
sys.path.insert(1, PATH_defined_functions)
# The wildcard import provides `read_in_csv_data`, which is used below to read the area mask.
# NOTE(review): a wildcard import may also leak further names of that module (e.g. `pd`)
# into this namespace - verify before replacing it with an explicit import.
from read_in_csv_data import *

#### Read in the style sheet for plotting

In [None]:
# Apply the style sheet to matplotlib. It updates the plotting parameters (e.g. font, font size
# and figure size). The figure size is matched to the textwidth of the LaTeX-document so that
# the plots do not have to be re-scaled, which would change the font size again.
plt.style.use(style_file_for_plotting)

# Ignore all warnings from here on.
import warnings
warnings.simplefilter('ignore')

#### Read in the configuration file and the data specified in it

In [None]:
# Read in the configuration file (nothing needs to be changed here).
# The encoding is stated explicitly so that the file is decoded in the same way on every
# platform instead of depending on the locale's default encoding.
with open(PATH_configurations+ifile_configurations, encoding='utf-8') as f:
    config = yaml.safe_load(f)

In [None]:
# Open the first control run file of the S2S reforecasts, flatten it into a table and
# display the table to show its variables.
first_control_file = config['PATH_s2s_reforecasts'] + config['ifiles_s2s_reforecasts_control'][0]
input_data_control = xr.open_dataset(first_control_file, engine='cfgrib')
df_input_data_control = input_data_control.to_dataframe().reset_index()
df_input_data_control

In [None]:
# Open the first file with the perturbed runs of the S2S reforecasts, flatten it into a table
# and display the table to show its variables.
first_perturbed_file = config['PATH_s2s_reforecasts'] + config['ifiles_s2s_reforecasts_perturbed'][0]
input_data_perturbed = xr.open_dataset(first_perturbed_file, engine='cfgrib')
df_input_data_perturbed = input_data_perturbed.to_dataframe().reset_index()
df_input_data_perturbed

In [None]:
# Load the area mask for Central Europe, remove the two leftover index columns and display it.
df_area_mask = read_in_csv_data(config['PATH_area_mask'], config['ifile_area_mask'])
df_area_mask = df_area_mask.drop(columns=['index', 'Unnamed: 0'])
df_area_mask

#### Adjust the mask based on the E-OBS elevation data to be compatible with the S2S reforecast data
E.g. upscaling from a 0.25° resolution to a resolution of 1.5°

In [None]:
# The elevation column is not needed any further, so it is removed first.
df_area_mask = df_area_mask.drop(columns=['elevation'])

In [None]:
# Snap the longitude and latitude values of the area mask to the nearest multiple of 0.5
# (double, round to an integer, halve again) in order to be compatible with the grid points
# used in the S2S reforecasts.
for coordinate in ('longitude', 'latitude'):
    df_area_mask[coordinate] = (df_area_mask[coordinate] * 2).round() / 2.0

In [None]:
# Keep only the longitudes and latitudes present in the S2S reforecasts (grid spacing of 1.5°):
# coordinates which are no multiple of 1.5 are set to NaN, then all rows containing a NaN are
# dropped and only one row per grid point is kept.
for coordinate in ('longitude', 'latitude'):
    on_s2s_grid = np.mod(df_area_mask[coordinate], 1.5) == 0
    df_area_mask[coordinate] = df_area_mask[coordinate].where(on_s2s_grid)
df_area_mask = df_area_mask.dropna().drop_duplicates(subset=['longitude', 'latitude'])

In [None]:
# Sort the mask by longitude and latitude (same as the sorting of the S2S reforecasts) and
# repeat each value of the mask so that it has the same length and order as the reforecast data.
df_area_mask = df_area_mask.sort_values(by=['longitude', 'latitude'])
number_of_valid_times = len(np.unique(df_input_data_control['valid_time']))
number_of_perturbed_runs = len(np.unique(df_input_data_perturbed['number']))

# Control run: after sorting by longitude and latitude, every grid point occupies one
# contiguous run of rows, one row per valid time.
area_mask_for_control_runs = np.repeat(df_area_mask['eroded_mask'], number_of_valid_times)

# Perturbed runs: after sorting by longitude and latitude, every grid point occupies one
# contiguous run of rows containing all valid times of all perturbed runs. Each mask value
# therefore has to be repeated in place. Tiling the control mask would give the same length
# but would cycle through the grid points once per run and so assign mask values to the
# wrong grid points.
area_mask_for_perturbed_runs = np.repeat(
    df_area_mask['eroded_mask'].to_numpy(),
    number_of_valid_times * number_of_perturbed_runs,
)

#### Create an ensemble of S2S reforecasts

In [None]:
# Create an ensemble of S2S reforecasts.
def _areal_mean_t2m(ifile, area_mask, group_keys, lon_bounds=(2, 21), lat_bounds=(44, 61)):
    """Return the masked areal mean of t2m for one S2S reforecast grib-file.

    Parameters
    ----------
    ifile : str
        Path and name of the grib-file to read.
    area_mask : array-like
        Mask values, one per row of the selected area after sorting by longitude and
        latitude. The mask is assigned by position, so its length and order must match
        these rows.
    group_keys : str or list of str
        Column(s) for which one areal mean is calculated each.
    lon_bounds, lat_bounds : tuple
        Exclusive lower and upper bound of the selected area (defaults: the area comprising
        Central Europe used in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per group with the group key column(s), the averaged remaining columns and
        'mask_applied_to_t2m', without the latitude and longitude columns.
    """
    # The context manager closes the grib-file as soon as the data are in the dataframe.
    with xr.open_dataset(ifile, engine='cfgrib') as dataset:
        df = dataset.to_dataframe().reset_index()

    # At first, drop the columns containing unnecessary information. Then, select the area
    # and sort the data by longitude and latitude.
    df = df.drop(columns=['time', 'step', 'heightAboveGround'])
    in_area = (
        (df['longitude'] > lon_bounds[0]) & (df['longitude'] < lon_bounds[1])
        & (df['latitude'] > lat_bounds[0]) & (df['latitude'] < lat_bounds[1])
    )
    # `where` followed by `dropna` is kept on purpose instead of boolean indexing: filling
    # the rows outside the area with NaNs turns integer columns such as 'number' into
    # floats, and the plots further below address the ensemble members by float labels.
    df = df.where(in_area).dropna()
    df = df.sort_values(by=['longitude', 'latitude'])

    # Add the mask and apply it by multiplication. The valid values keep their value and the
    # masked values are set to 0 by the multiplication with the binary mask. Since an areal
    # mean is calculated next, the zeros are set to NaNs.
    df['area_mask'] = np.array(area_mask)
    masked_t2m = df['t2m'] * df['area_mask']
    df['mask_applied_to_t2m'] = masked_t2m.replace(0, np.nan)
    df = df.drop(columns=['t2m', 'area_mask'])

    # Drop all rows containing NaNs and calculate the areal mean for every group.
    df = df.dropna()
    df = df.groupby(group_keys).mean().reset_index()
    return df.drop(columns=['latitude', 'longitude'])


control_files = config['ifiles_s2s_reforecasts_control']
perturbed_files = config['ifiles_s2s_reforecasts_perturbed']

# Every control run file needs the file with the perturbed runs at the same list position.
# Fail before any file is read instead of in the middle of the loop.
if len(perturbed_files) < len(control_files):
    raise ValueError('There are fewer files with perturbed runs than files with control runs.')
if not control_files:
    raise ValueError('No S2S reforecast files are given in the configuration file.')

# Collect the areal means of all files and combine them once at the end. This avoids
# copying the growing ensemble in every iteration.
ensemble_parts = []
for control_file, perturbed_file in tqdm(zip(control_files, perturbed_files), total=len(control_files)):
    # Control run: one areal mean per day.
    df_input_data_control = _areal_mean_t2m(
        config['PATH_s2s_reforecasts']+control_file,
        area_mask_for_control_runs,
        'valid_time',
    )

    # Perturbed runs: one areal mean per day and ensemble member.
    df_input_data_perturbed = _areal_mean_t2m(
        config['PATH_s2s_reforecasts']+perturbed_file,
        area_mask_for_perturbed_runs,
        ['valid_time', 'number'],
    )

    # Same order as before: the perturbed runs first, then the control run.
    ensemble_parts.extend([df_input_data_perturbed, df_input_data_control])

# Combine the control and the perturbed runs of all files.
df_s2s_reforecast_ensemble = pd.concat(ensemble_parts, ignore_index=True, sort=False)

# Sort the S2S reforecast ensemble in a sensible way.
df_s2s_reforecast_ensemble = df_s2s_reforecast_ensemble.sort_values(by=['valid_time', 'number'])
df_s2s_reforecast_ensemble = df_s2s_reforecast_ensemble.reset_index(drop=True)
     

In [None]:
# Reshape the S2S reforecast ensemble into a table with one row per valid time and one
# column per ensemble member, then remove the name of the column axis.
t2m_by_time_and_member = df_s2s_reforecast_ensemble.set_index(['valid_time', 'number'])['mask_applied_to_t2m']
df_s2s_reforecast_ensemble = (
    t2m_by_time_and_member
    .unstack('number')
    .reset_index()
    .rename_axis(None, axis=1)
)

#### Save the S2S reforecast ensemble in csv-format

In [None]:
# Save the S2S reforecast ensemble.
# The lead time is converted to a string explicitly, so the file name can also be built if
# the lead time is given as a number in the configuration file.
ofile_s2s_reforecast_ensemble = 'S2S_Reforecast_Ensemble_Lead_Time_'+str(config['lead_time'])+'_2000_2020.csv'
df_s2s_reforecast_ensemble.to_csv(config['PATH_output_files']+ofile_s2s_reforecast_ensemble)

#### Visualize the S2S reforecast ensemble for a plausibility check

In [None]:
# Plot the S2S reforecast ensemble for all winters for a plausibility check.
# Every member column present in the table is plotted instead of a fixed number of members,
# so no ensemble member is left out and a smaller ensemble does not raise a KeyError.
ensemble_members = [column for column in df_s2s_reforecast_ensemble.columns if column != 'valid_time']
lead_time = str(config['lead_time'])

fig, ax = plt.subplots()
for member in ensemble_members:
    ax.plot(df_s2s_reforecast_ensemble['valid_time'], df_s2s_reforecast_ensemble[member], marker='o', linestyle='', markersize=1)
ax.set_xlabel('time')
ax.set_ylabel('t2m')
ax.set_title('ECMWF S2S Reforecasts, Lead Time '+lead_time)
fig.savefig(config['PATH_plots']+'S2S_Reforecast_Ensemble_Lead_Time_'+lead_time+'_Winter_2000_2020.png', bbox_inches='tight')

In [None]:
# Plot the S2S reforecast ensemble for a single winter for a plausibility check.
# NOTE(review): the winter is selected by hard-coded row positions (572 to 622). These are
# presumably the rows of winter 2011/2012 for the configured files - verify whenever the
# list of input files changes.
df_s2s_reforecast_ensemble_winter_2011_2012 = df_s2s_reforecast_ensemble.iloc[572:623]

# Every member column present in the table is plotted instead of a fixed number of members,
# so no ensemble member is left out and a smaller ensemble does not raise a KeyError.
ensemble_members = [column for column in df_s2s_reforecast_ensemble_winter_2011_2012.columns if column != 'valid_time']
lead_time = str(config['lead_time'])

fig, ax = plt.subplots()
for member in ensemble_members:
    ax.plot(df_s2s_reforecast_ensemble_winter_2011_2012['valid_time'], df_s2s_reforecast_ensemble_winter_2011_2012[member], marker='o', linestyle='--', linewidth=1, markersize=3)
# The axis labels only need to be set once, after all members are drawn.
ax.set_xlabel('time')
ax.set_ylabel('t2m')
ax.set_title('ECMWF S2S Reforecasts, Lead Time '+lead_time)
fig.savefig(config['PATH_plots']+'S2S_Reforecast_Ensemble_Lead_Time_'+lead_time+'_Winter_2011_2012.png', bbox_inches='tight')

In [None]:
# End of Program.