<a name="top"></a>
<div style="width:1000px">

<div style="float:right; width:98px; height:98px;">
<img src="https://cdn.miami.edu/_assets-common/images/system/um-logo-gray-bg.png" alt="Miami Logo" style="height: 98px;">
</div>

<div style="float:right; width:98px; height:98px;">
<img src="https://media.licdn.com/dms/image/C4E0BAQFlOZSAJABP4w/company-logo_200_200/0/1548285168598?e=2147483647&v=beta&t=g4jl8rEhB7HLJuNZhU6OkJWHW4cul_y9Kj_aoD7p0_Y" alt="STI Logo" style="height: 98px;">
</div>


<h1>Calculate the Multi-Prototype Mean, Stack Along Lead Time for the UFS-S2S Database Files</h1>
By: Kayla Besong, PhD
    <br>
Last Edited: 01/09/24
<br>
<br>    
<br>
Calculates the multi-prototype mean (MPM): the mean across prototypes 5-8 of the UFS-S2S forecast model, taken at each forecast time. The notebook also groups forecasts by lead day and lead week and saves the results to separate files for each variable. It relies on previously generated database files, organized in subfolders by initialization date.
<div style="clear:both"></div>
</div>

<hr style="height:2px;">

## Import needed libraries, etc.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import pandas as pd
from dask.distributed import Client, LocalCluster
import dask.array as da
import os
import glob
from metpy.units import units
import math

In [2]:
import warnings
# Silence all warnings for the rest of the session. Both calls below install a blanket
# 'ignore' filter, so nothing emitted through the warnings module is shown after this cell.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Disable pandas' chained-assignment check (no warning or error is raised for it).
pd.options.mode.chained_assignment = None

### Run the companion notebook that defines the helper functions

In [3]:
# Execute the companion notebook inside this kernel so its definitions become available here.
# The helpers called later in this notebook (dir_maker, get_chunk, resampler_UFS) are not
# defined in this file and are presumably provided by it -- TODO confirm.
%run File_concat_mod_functions.ipynb

In [4]:
# Root folder from which the dataset read and write paths in the loops below are built.
output_dir = 'database_files'

In [5]:
# Use the entry names found under one sample init-date folder (prototype 5, 20110401)
# as the list of variable names.
# NOTE(review): os.listdir returns entries in arbitrary order, and this presumes every
# entry in that folder is a variable -- confirm.
input_vars = os.listdir('UFS_S2S/5/20110401/')

In [6]:
# Names removed from the directory listing; they are not processed by this notebook.
excluded_vars = {'hindex', 'hpbl', 'lsm'}
# Names that are not taken from the directory listing but are processed as well.
added_vars = ['wdir', 'wspeed']

# Build the final list in a single pass. Names in added_vars are filtered out before being
# re-appended, so re-running this cell does not append duplicates (the cell is idempotent).
input_vars = [name for name in input_vars if name not in excluded_vars and name not in added_vars] + added_vars
input_vars

['t2m', 'r2', 'u10', 'v10', 'gust', 'cape', 'prate', 'soilw', 'wdir', 'wspeed']

In [7]:
model = 'UFS_S2S'

# Grab files and take the mean of all prototypes

In [None]:
# One sorted list of sub-folder paths per prototype (5-8), bound to the
# parent_dirs_<prototype> names that the next cell reads.
parent_dirs_5, parent_dirs_6, parent_dirs_7, parent_dirs_8 = (
    sorted(glob.glob(os.path.join(f'{model}/{prototype_id}/', '*')))
    for prototype_id in (5, 6, 7, 8)
)


In [None]:
# Compute the multi-prototype mean (MPM) for every init date and variable.
#
# Each prototype's folders are indexed by init date (the last 8 characters of the path) so that
# prototypes are matched by date rather than by list position. Position-based matching raises
# IndexError when a prototype has fewer folders than prototype 5, and a single gap causes every
# later date to be reported as misaligned.
dirs_by_prototype = {
    5: {folder[-8:]: folder for folder in parent_dirs_5},
    6: {folder[-8:]: folder for folder in parent_dirs_6},
    7: {folder[-8:]: folder for folder in parent_dirs_7},
    8: {folder[-8:]: folder for folder in parent_dirs_8},
}

# Every init date seen in at least one prototype, in ascending order.
all_init_dates = sorted(set().union(*dirs_by_prototype.values()))

for init_date in all_init_dates:

    # An init date is only processed when all four prototypes provide it.
    missing = [proto for proto, folders in dirs_by_prototype.items() if init_date not in folders]
    if missing:
        print(f'the subfolders for each prototype are not aligned: {init_date} is missing for prototype(s) {missing}')
        continue

    # Create the output sub-folder for this init date inside the MPM directory.
    dir_maker(os.path.join(f'{output_dir}/UFS_S2S/MPM', f'{init_date}'))

    for var_name in input_vars:

        # Open the 'Abs' file of this variable for each prototype and chunk it.
        prototype_datasets = []
        for proto, folders in dirs_by_prototype.items():
            pattern = os.path.join(f'{output_dir}/{folders[init_date]}/', f'*{var_name}*Abs*')
            matches = glob.glob(pattern)
            if not matches:
                # Fail with the offending pattern instead of a bare IndexError from matches[0].
                raise FileNotFoundError(f'no file matches {pattern}')
            prototype_datasets.append(xr.open_dataset(matches[0]).chunk(get_chunk(model)))

        # Stack the four prototypes along a new 'prototype' dimension and average over it.
        merged = xr.concat(prototype_datasets, dim='prototype')
        mpm = merged.mean('prototype')

        # Hand the mean to resampler_UFS (defined outside this notebook), which presumably
        # resamples and writes it under the MPM folder for this init date -- TODO confirm.
        resampler_UFS(var_name, mpm.chunk(get_chunk(model)), f'{output_dir}/UFS_S2S/MPM', init_date, 'MPM')

# Stack along lead time 

In [9]:
# File-type tags that are matched against the database file names when stacking.
data_file_type = ['Abs', 'MAX', 'MIN', 'AVG']
# Prototypes to stack. Only a subset is active; the full list is kept on the next line for reference.
#ps = [5, 6, 7, 8, 'MPM']
ps = [7, 8, 'MPM']

In [10]:
# Echo the variable list that the stacking loop below iterates over.
input_vars

['t2m', 'r2', 'u10', 'v10', 'gust', 'cape', 'prate', 'soilw', 'wdir', 'wspeed']

In [None]:
%%time

for p in ps:                                                                                                                                # Iterate over each prototype
    for dft in data_file_type:                                                                                                              # Iterate over each data file type
        for v in range(len(input_vars)):                                                                                                    # Iterate over each variable by index

            print(f'Prototype: {p} - File Type: {dft} - Variable: {input_vars[v]}')
                   
            sub_fl = sorted(glob.glob(f"{output_dir}/{model}/{p}/*/*{input_vars[v]}*{dft}*"))                                               # Generate a sorted list of file paths matching the current prototype, variable, and data file type

            day_dict = {}                                                                                                                   # Initialize a dictionary to store daily data
            week_dict = {}                                                                                                                  # Initialize a dictionary to store weekly data
                                                              
            ct = 0                                                                                                                          # Initialize a counter
                                                              
            for f in sub_fl:                                                                                                                # Iterate over each file in the sorted list
                                                              
                df1 = xr.open_dataset(f).chunk(get_chunk(model))                                                                            # Open the dataset and chunk it according to the model
                
                df1['lead_days'] = xr.DataArray(pd.to_timedelta((df1['valid_time'] - df1['time']).values).days)                             # Calculate lead days and weeks from valid_time and time, then add as data arrays
                df1['lead_week'] = df1['lead_days'] // 7
                df1 = df1.rename({'dim_0': 'lead_dim'})                                                                                     # Rename 'dim_0' to 'lead_dim'
                                                                                              
                for wd in np.unique(df1.lead_dim):                                                                                          # Iterate over unique lead dimensions
                                                                                  
                    # Select data for the current lead day
                    selected_group_day = df1.groupby('lead_days').groups[wd]    
                    selected_valid_times_day = df1.isel(valid_time=selected_group_day).drop(['lead_days', 'lead_week'])
                    selected_valid_times_day = selected_valid_times_day.rename({'lead_dim': 'lead_days'}).sel(lead_days = wd)

                    if ct == 0:                                                                                                             # If first iteration, initialize the dictionary with the data
                        day_dict[wd] = selected_valid_times_day                                                              
                    else:                                                                                                                   # Otherwise, concatenate the new data with existing data
                        day_dict[wd] = xr.concat([day_dict[wd], selected_valid_times_day], dim = 'valid_time')
                    
                    if wd in np.arange(0,6):                                                                                                # If within the first 6 lead dimensions (weeks)
                        # Select data for the current lead week
                        selected_group_wk = df1.groupby('lead_week').groups[wd]    
                        selected_valid_times_wk = df1.isel(valid_time=selected_group_wk).drop(['lead_days', 'lead_week'])                      
                        selected_valid_times_wk['lead_dim'] = selected_valid_times_wk['lead_dim'] // 7
                        selected_valid_times_wk = selected_valid_times_wk.rename({'lead_dim': 'lead_week'}).sel(lead_week = wd)

                        if ct == 0:                                                                                                         # If first iteration, initialize the dictionary with the data
                            week_dict[wd] = selected_valid_times_wk                                                              
                        else:                                                                                                               # Otherwise, concatenate the new data with existing data
                            week_dict[wd] = xr.concat([week_dict[wd], selected_valid_times_wk], dim = 'valid_time')

                ct += 1                                                                                                                     # Increment the counter
                                                              
            for wd in np.unique(df1.lead_dim):                                                                                              # Iterate over unique lead dimensions again
                                                              
                str_wd = str(wd).zfill(2)                                                                                                   # Format the lead dimension with leading zeros
                
                day_dict[wd].to_netcdf(f"{output_dir}/{model}/LEAD/{p}/day/{input_vars[v]}_UFS_S2S_FORECAST_{dft}_lead{str_wd}.nc")         # Save the daily data to NetCDF

                if wd in np.arange(0,6):                                                                                                    # If within the first 6 lead dimensions (weeks)
                    
                    week_dict[wd].to_netcdf(f"{output_dir}/{model}/LEAD/{p}/week/{input_vars[v]}_UFS_S2S_FORECAST_{dft}_lead{str_wd}.nc")   # Save the weekly data to NetCDF