<a name="top"></a>
<div style="width:1000px">

<div style="float:right; width:98px; height:98px;">
<img src="https://cdn.miami.edu/_assets-common/images/system/um-logo-gray-bg.png" alt="Miami Logo" style="height: 98px;">
</div>

<div style="float:right; width:98px; height:98px;">
<img src="https://media.licdn.com/dms/image/C4E0BAQFlOZSAJABP4w/company-logo_200_200/0/1548285168598?e=2147483647&v=beta&t=g4jl8rEhB7HLJuNZhU6OkJWHW4cul_y9Kj_aoD7p0_Y" alt="STI Logo" style="height: 98px;">
</div>


<h1>Clean Data and Export to Absolute and 24H-Min, Max, and Avg netCDF files</h1>
By: Kayla Besong, PhD
    <br>
Last Edited: 11/16/23
<br>
<br>    
<br>

Downloaded files are cleaned and organized by model, variable, and year. The absolute (all time steps), 24H-Min, Max, and Avg outputs are each exported individually by year and variable. If only some of these outputs are desired (for example, only min or only absolute), the others will need to be commented out in the function suite notebook. Example of each output file saved to the local directory for one variable, 'air', for NCEP:
    
air_NCEP_REANALYSIS_Abs_2020.nc  

airMAX_NCEP_REANALYSIS_Daily_2020.nc 

airMIN_NCEP_REANALYSIS_Daily_2020.nc 

airAVG_NCEP_REANALYSIS_Daily_2020.nc 

NOTE: the NAM and HRRR may take some extra care/attention and using one year at a time may help runtime/memory.

<div style="clear:both"></div>
</div>

<hr style="height:2px;">

## Import needed libraries, etc.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import pandas as pd
from dask.distributed import Client, LocalCluster
import dask.array as da
import os
import glob

In [None]:
import warnings

# Silence every Python warning for the rest of the session; both calls register an 'ignore' action.
for register_filter in (warnings.filterwarnings, warnings.simplefilter):
    register_filter('ignore')

# Disable pandas' chained-assignment check.
pd.options.mode.chained_assignment = None

## Establish a dask client.

Figure out the appropriate workers, threads, and memory limit for your system.

In [None]:
# Start a local dask cluster and attach a client to it.
# NOTE(review): memory_limit is presumably applied per worker, so 8 workers x 30GB must fit on the host — confirm.
Cluster = LocalCluster(
    n_workers=8,
    threads_per_worker=4,
    memory_limit='30GB',
    processes=True,
)
client = Client(Cluster)
client  # last expression: show the client summary in the cell output

# The integral notebook of functions to run

In [None]:
# Execute the shared function-suite notebook in this kernel.
# Later cells call save_min_max_vars, get_model_var, get_chunk, get_chunk_database,
# resampler_regular_vars and grab_raw_files_and_resampler_UFS, none of which are defined
# in this notebook — presumably they all come from this %run (verify if a NameError appears).
%run ../../Universal_Functions/File_concat_mod_functions.ipynb

## These are the available options the functions are designed for

Not all variables will be available from all models but most are. 

In [None]:
# Models the export cells below are written for.
model_options = [
    'CONUS404',
    'ERA5',
    'HRRR',
    'NAM',
    'NARR',
    'NCEP',
]

# Generic variable keys; per the original note, a dictionary in the function suite
# maps each key to the correct variable name for a given model.
variable_options = [
    'PBL',
    'CAPE',
    'SOILM',
    'WIND',
    'PRECIP',
    'TEMP',
    'RH',
]

In [None]:
# Root directory handed to every export call below (save_min_max_vars, resampler_regular_vars, ...).
# NOTE(review): the "plot checker" cells further down read from the literal 'database_files/<MODEL>'
# path rather than from this variable, so changing it here does not redirect those cells.
output_dir = 'database_files'           # choose wisely, this will be referenced significantly throughout the code repository 

# CONUS404

In [None]:
%%time

for v in variable_options:                              # loop through variable options above
    print(f'starting {v}')
    save_min_max_vars('CONUS404', v, output_dir)        # call the function that concats, cleans, and generates the 4 output files per year per variable 

### plot checker

In [None]:
# Open every file in the CONUS404 output folder whose name ends in 'nc'.
conus_outs = [
    xr.open_dataset(f'database_files/CONUS404/{f}')
    for f in os.listdir(f'database_files/CONUS404/')
    if f[-2:] == 'nc'
]

In [None]:
# Quick-look check: draw seasonal-mean contour panels for each opened CONUS404 file.
# NOTE(review): `variables` lists coordinate variables as well as data variables; this
# assumes the first entry is the field to plot — confirm against the written files.
for c in conus_outs:
    field = c[list(c.variables)[0]]

    # A field with a soil-layer dimension is reduced to its first layer before averaging.
    if 'soil_layers_stag' in field.dims:
        field = field.isel(soil_layers_stag=0)

    seasonal_mean = field.groupby('Time.season').mean('Time')
    seasonal_mean.plot.contourf(x='west_east', y='south_north', col='season', levels=22)

# ERA5
expected timesteps = 1460

In [None]:
%%time

for v in variable_options: 
    print(f'starting {v}')
    save_min_max_vars('ERA5', v, output_dir)        # call the function that concats, cleans, and generates the 4 output files per year per variable 

# HRRR
expected timesteps = 1460

In [None]:
# Model key used by the HRRR cells below: it is passed to get_model_var / get_chunk /
# get_chunk_database and is also the top-level folder name in the glob pattern.
model = 'HRRR'

In [None]:
# Build a flat list of HRRR variable names. get_model_var may return either a single
# name or a list of names (the original note gives wind -> [u10, v10] as an example),
# so list results are spliced in element by element.
vars_hrrr = []

for generic_key in variable_options:
    resolved = get_model_var(model, generic_key)
    vars_hrrr.extend(resolved if type(resolved) == list else [resolved])

vars_hrrr

In [None]:
# HRRR years to process, inclusive on both ends (np.arange excludes its stop value).
hrrr_first_year, hrrr_last_year = 2014, 2017
years = np.arange(hrrr_first_year, hrrr_last_year + 1)
years

In [None]:
# Open one HRRR t2m file as a template; the next cell reads its latitude/longitude
# arrays to build the x/y coordinate vectors used by the HRRR export loop.
# NOTE(review): the path is hard-coded and relative to the notebook's working directory.
temp = xr.open_dataset('HRRR/t2m/hrrr_t2m_hrrr.20140801_00.nc')

In [None]:
# 1-D coordinate vectors taken from the template file; the export loop assigns them as 'x' and 'y'.
# x_lon: the `longitude` coordinate attached to the first row ([0, :]) of the 2-D latitude array.
# y_lat: the `latitude` coordinate attached to the first column ([:, 0]) of the 2-D longitude array.
# NOTE(review): assumes both arrays are laid out as (y, x), as the export loop's assign_coords implies — confirm.
x_lon = temp.latitude[0,:].longitude.values
y_lat = temp.longitude[:,0].latitude.values


In [None]:
def _open_trimmed(path, model_name, keep=('time', 'latitude', 'longitude')):
    """Open one netCDF file, chunk it with get_chunk(model_name), and drop every coordinate not in `keep`."""
    ds = xr.open_dataset(path).chunk(get_chunk(model_name))
    unwanted = [name for name in ds.coords if name not in keep]
    return ds.drop_vars(unwanted)                      # drop_vars replaces the deprecated Dataset.drop


for v in vars_hrrr:                                    # every HRRR variable name collected above
    for y in years:                                    # every requested year

        # Sorted list of this variable's files whose name contains the year
        year_model_list = sorted(glob.glob(os.path.join(f'{model}/{v}', f'*{y}*.nc')))

        # Nothing downloaded for this variable/year: skip instead of failing on year_model_list[0]
        if not year_model_list:
            print(f'no {model} files found for {v} in {y}; skipping')
            continue

        # Start the yearly dataset from the first file, then append the rest along time
        df1 = _open_trimmed(year_model_list[0], model)

        for f in year_model_list[1:]:
            try:
                df2 = _open_trimmed(f, model)
                df1 = xr.concat([df1, df2], dim='time').chunk(get_chunk_database(model))
            except Exception as err:
                # Best effort: report the unreadable/mismatched file (with the actual error) and keep going.
                # `Exception` rather than a bare except so Ctrl-C still interrupts the run.
                print(f'{f} may be corrupt ({err!r})')

        # Collapse latitude/longitude back to 2-D (y, x) using the first time step.
        # The guard covers the case where they never gained a time dimension (e.g. a single file).
        lat = df1.latitude
        lon = df1.longitude
        if 'time' in lat.dims:
            lat = lat.isel(time=0)
        if 'time' in lon.dims:
            lon = lon.isel(time=0)

        df1 = df1.assign_coords(latitude=(('y', 'x'), lat.values), longitude=(('y', 'x'), lon.values))
        df1 = df1.assign_coords({'x': x_lon, 'y': y_lat})   # x/y vectors taken from the template file above

        # Per the original note: generates the output files for this variable/year
        resampler_regular_vars(v, df1.chunk(get_chunk_database(model)), output_dir, model)

        del df1                                        # release the yearly dataset before the next iteration
        

# NAM
The NAM is a special problem that cannot be handled like the rest of the datasets.

In [None]:
# NAM years to process, inclusive on both ends (np.arange excludes its stop value).
nam_first_year, nam_last_year = 2013, 2018
years = np.arange(nam_first_year, nam_last_year + 1)
years

In [None]:
# Settings for the NAM cells: the model key (also the input folder name in the glob
# pattern below) and the root directory passed to the export function.
model, output_dir = 'NAM', 'database_files'

In [None]:
# Build a flat list of NAM variable names, mirroring the HRRR cell above:
# get_model_var may return a single name or a list of names, and the export loop
# below formats each entry into a folder path, so list results must be unnested.
vars_nm = []

for vvv in variable_options:
    v = get_model_var(model, vvv)
    if isinstance(v, list):
        vars_nm.extend(v)
    else:
        vars_nm.append(v)

In [None]:
# Display the NAM variable names that the export loop below will iterate over.
vars_nm

In [None]:
%%time
       
for v in vars_nm:                                                                               # Iterate over variables in the HRRR dataset
    for y in years:                                                                          # Iterate over years in the dataset
                     
        year_model_list = sorted(glob.glob(os.path.join(f'{model}/{v}', f'*{y}*.nc')))       # Generate a sorted list of file paths for the current variable and year
    
        # Open the first dataset of the year for the variable, and chunk it
        df1 = xr.open_dataset(year_model_list[0]).chunk(get_chunk(model))
        to_keep = ['time', 'latitude', 'longitude']                                          # Define coordinates to keep
        ds_coords = list(df1.coords)                                                         # List all coordinates in the dataset
        delete = [i for i in ds_coords if i not in to_keep]                                  # Identify coordinates to delete
        df1 = df1.drop(delete)                                                               # Drop unwanted coordinates
            
        for f in year_model_list[1:]:                                                        # Iterate over the rest of the files for the year
                    
            try:               
                            
                df2 = xr.open_dataset(f).chunk(get_chunk(model))                             # Open and chunk the next dataset 
                to_keep = ['time', 'latitude', 'longitude']                                  # Re-define coordinates to keep
                ds_coords = list(df2.coords)                                                 # List all coordinates in the dataset
                delete = [i for i in ds_coords if i not in to_keep]                          # Identify coordinates to delete
                df2 = df2.drop(delete)                                                       # Drop unwanted coordinates
                
                df1 = xr.concat([df1, df2], dim = 'time').chunk(get_chunk_database(model))   # Concatenate the current dataset with the aggregated dataset along the time dimension
            except:
                print(f'{f} may be corrupt')                                                 # Print a message if the file may be corrupt
                       
        resampler_regular_vars(v, df1.chunk(get_chunk_database(model)), output_dir, model)   # Call the function that concats, cleans, and generates the 4 output files per year per variable 

    del df1                                                                                  # Delete df1 dataset to free memory            

### adjusting the 2017-2018 gap

This is an example of one variable for one file type (AVG) to handle the 2017 gap between grib and grib2

In [None]:
# The two 2017 daily-average NAM files to be merged: one stored under 'sm', one under 'soilw'.
gap_files = (
    'database_files/NAM/smAVG_NAM_HISTORICAL_Daily_2017.nc',
    'database_files/NAM/soilwAVG_NAM_HISTORICAL_Daily_2017.nc',
)

# Open each file and re-chunk it with the NAM database chunking.
n1, n2 = (xr.open_dataset(path).chunk(get_chunk_database('NAM')) for path in gap_files)

In [None]:
# Rename 'soilw' to 'sm' in the second dataset so both halves use the same variable name before concatenation.
n2 = n2.rename({'soilw': 'sm'})

In [None]:
# Display both datasets for a visual comparison before they are concatenated.
n1, n2

In [None]:
# Join the two 2017 pieces along time (n1 first, then n2) and re-chunk with the NAM database chunking.
# NOTE(review): assumes n1 covers the earlier part of the year so the result is in time order — confirm.
n3 = xr.concat([n1, n2], dim = 'time').chunk(get_chunk_database('NAM'))

In [None]:
# Display the merged 2017 dataset before it is written to disk.
n3

In [None]:
%%time
n3.to_netcdf('database_files/NAM/smAVG_NAM_HISTORICAL_Daily_2017.nc')

### plot checker

In [None]:
# Open every file in the NAM output folder whose name ends in 'nc'.
nam_outs = [
    xr.open_dataset(f'database_files/NAM/{f}')
    for f in os.listdir(f'database_files/NAM/')
    if f[-2:] == 'nc'
]

In [None]:
# Quick-look check: draw seasonal-mean contour panels for each opened NAM file.
# NOTE(review): `variables` lists coordinate variables as well as data variables; this
# assumes the first entry is the field to plot — confirm against the written files.
for n in nam_outs:
    field = n[list(n.variables)[0]]

    # Datasets with more than three dimensions are reduced to the first depthBelowLandLayer entry.
    if len(n.dims) > 3:
        field = field.isel(depthBelowLandLayer=0)

    seasonal_mean = field.groupby('time.season').mean('time')
    seasonal_mean.plot.contourf(x='x', y='y', col='season', levels=22)

# NARR
expected timesteps = 1460

In [None]:
%%time

for v in variable_options: 
    print(f'starting {v}')
    save_min_max_vars('NARR', v, output_dir)        # call the function that concats, cleans, and generates the 4 output files per year per variable 

### plot checker

In [None]:
# Open every file in the NARR output folder whose name ends in 'nc'.
narr_outs = [
    xr.open_dataset(f'database_files/NARR/{f}')
    for f in os.listdir(f'database_files/NARR/')
    if f[-2:] == 'nc'
]
        

In [None]:
# Quick-look check: draw seasonal-mean contour panels for each opened NARR file.
# NOTE(review): `variables` lists coordinate variables as well as data variables; this
# assumes the first entry is the field to plot — confirm against the written files.
vars_list = []

# Extra (4th) dimensions this cell knows how to reduce, in the order they are tried.
narr_level_dims = (
    'layer_between_two_depths_below_surface_layer',
    'height_above_ground2',
    'height_above_ground1',
    'height_above_ground',
)

for j in narr_outs:
    k = j[list(j.variables)[0]]
    kdims = list(k.dims)

    if len(kdims) > 3:
        # Take index 0 along the first recognised level dimension; nothing is plotted if none is present.
        level_dim = next((name for name in narr_level_dims if name in kdims), None)
        if level_dim is None:
            continue
        k = k.isel({level_dim: 0})

    k.groupby('time.season').mean('time').plot.contourf(x='x', y='y', col='season', levels=22)


# NCEP
expected timesteps = 1460

In [None]:
%%time

for v in variable_options: 
    print(f'starting {v}')
    save_min_max_vars('NCEP', v, output_dir)        # call the function that concats, cleans, and generates the 4 output files per year per variable 

### plot checker

In [None]:
# Open every file in the NCEP output folder whose name ends in 'nc'.
ncep_outs = [
    xr.open_dataset(f'database_files/NCEP/{f}')
    for f in os.listdir(f'database_files/NCEP')
    if f[-2:] == 'nc'
]
        

In [None]:
# Quick-look check: draw seasonal-mean contour panels for each opened NCEP file.
# NOTE(review): `variables` lists coordinate variables as well as data variables; this
# assumes the first entry is the field to plot — confirm against the written files.
vars_list = []

for j in ncep_outs:
    k = j[list(j.variables)[0]]

    # Fields with a 'level' dimension are reduced to their first level before averaging.
    if 'level' in k.dims:
        k = k.isel(level=0)

    seasonal_mean = k.groupby('time.season').mean('time')
    seasonal_mean.plot.contourf(x='lon', y='lat', col='season', levels=22)


# UFS
expected timesteps = 1460

In [None]:
# Settings for the UFS cell below: the model key given to get_model_var, and `p`,
# which is forwarded as the third argument of grab_raw_files_and_resampler_UFS.
# NOTE(review): the meaning of p = 6 is not visible in this notebook — see the function suite.
model, p = 'UFS_S2S', 6

In [None]:
%%time

for v in variable_options:
    var = get_model_var(model, v)
    grab_raw_files_and_resampler_UFS(var, output_dir, p)        # call the function that concats, cleans, and generates the 4 output files per year per variable 