<a name="top"></a>
<div style="width:1000 px">

<div style="float:right; width:98 px; height:98px;">
<img src="https://cdn.miami.edu/_assets-common/images/system/um-logo-gray-bg.png" alt="Miami Logo" style="height: 98px;">
</div>

<div style="float:right; width:98 px; height:98px;">
<img src="https://media.licdn.com/dms/image/C4E0BAQFlOZSAJABP4w/company-logo_200_200/0/1548285168598?e=2147483647&v=beta&t=g4jl8rEhB7HLJuNZhU6OkJWHW4cul_y9Kj_aoD7p0_Y" alt="STI Logo" style="height: 98px;">
</div>


<h1>Data Download Functions</h1>
By: Kayla Besong, PhD
    <br>
Last Edited: 11/15/23
<br>
<br>    
<br>
This is a suite of functions complementary to Data_Grab.ipynb. Keep this file in the same directory as Data_Grab.ipynb, or point Data_Grab.ipynb to its location as needed. Imports of the more complex packages are left inside the functions that need them to help eliminate hurdles.
<div style="clear:both"></div>
</div>

<hr style="height:2px;">

In [3]:
from datetime import datetime
import numpy as np
import xarray as xr
import pandas as pd
import os
import metpy
from metpy.units import units
import cartopy.crs as ccrs
import glob
import shutil
import zarr

In [None]:
def dir_maker(path_name):

    '''This function generates a directory with the given path name if it does not exist already.
    Missing parent directories are created as well, so nested paths such as
    'data/hrrr/TMP' work even when 'data' does not exist yet.


    Inputs:
    path_name: (str) path and name of directory wishing to be created

    Returns:
    Nothing, directory created in background

    '''

    try:
        # makedirs (unlike mkdir) also builds any missing intermediate directories;
        # exist_ok=True makes repeated calls on an existing directory a no-op.
        os.makedirs(path_name, exist_ok=True)

    except FileExistsError:
        # Still raised when path_name exists but is not a directory (e.g. a file).
        # Kept silent on purpose to match the previous best-effort behaviour.
        pass

In [6]:
def hrrr_zarr_grabber(start_date, end_date, hour_range, variables, output_dir, forecast = None):

    '''This function takes user input for the start, stop, and step for the desired HRRR forecast or analysis data from AWS S3 buckets and downloads them locally to the given directory.

    Each variable is written to {output_dir}/{v}/hrrr_{v}_{YYYYMMDD}_{HH}.nc. Files that
    are already present under that name are skipped.

    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or -
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function.
    variables: (dict) maps a zarr group name to a list of variable names. Both are used as path
               components of the store: {path}/{key}/{v} and {path}/{key}/{v}/{key}/
               (presumably the key is the vertical level -- confirm against the caller).
    output_dir: (str) path where to download the files locally.
    forecast: (str, int, float, boolean) OPTIONAL. If set to anything other than None, the forecast file will be downloaded, if left alone analysis will be downloaded.

    Returns:
    non_exist: (list) list of strings of filenames that were not downloaded due to lack of availability.

    '''

    import s3fs                                            # kept local so the rest of the module works without s3fs

    non_exist = []                                         # filenames that could not be retrieved

    s3 = s3fs.S3FileSystem(anon=True)                      # anonymous access to the public bucket
    def lookup(path):
        # Wrap an S3 path in a mapping object that the zarr engine can open
        return s3fs.S3Map(path, s3=s3)

    dir_maker(output_dir)

    hrs = [str(i).zfill(2) for i in hour_range]            # zero-padded hours, e.g. 6 -> '06'

    # The projection and the store suffix do not depend on the loop variables,
    # so they are built once instead of once per downloaded variable.
    projection = ccrs.LambertConformal(central_longitude=262.5,
                                       central_latitude=38.5,
                                       standard_parallels=(38.5, 38.5),
                                       globe=ccrs.Globe(semimajor_axis=6371229,
                                                        semiminor_axis=6371229))

    # `is not None` keeps the documented contract: any value, even False or 0, selects the forecast store
    store_kind = 'fcst' if forecast is not None else 'anl'

    for date in pd.date_range(start_date, end_date):

        str_date = f'{date.year}{str(date.month).zfill(2)}{str(date.day).zfill(2)}'

        for hr in hrs:

            path = f"hrrrzarr/sfc/{str_date}/{str_date}_{hr}z_{store_kind}.zarr"

            for key, item in variables.items():

                for v in item:

                    dir_maker(f'{output_dir}/{v}')

                    if os.path.exists(f'{output_dir}/{v}/hrrr_{v}_{str_date}_{hr}.nc'):
                        print(f'hrrr_{v}_{str_date}_{hr}.nc has already been saved')
                        continue

                    try:
                        # The two stores are opened together and combined into one dataset.
                        # The context manager closes it once the NetCDF file is written.
                        with xr.open_mfdataset([lookup(f"{path}/{key}/{v}"), lookup(f"{path}/{key}/{v}/{key}/")], engine="zarr") as file:

                            # Rename coordinates and assign CRS so metpy can derive latitude/longitude
                            file3 = file.rename(projection_x_coordinate="x", projection_y_coordinate="y")
                            file3 = file3.metpy.assign_crs(projection.to_cf())
                            file3 = file3.metpy.assign_latitude_longitude()
                            # drop_vars replaces the deprecated Dataset.drop
                            file3 = file3.drop_vars('metpy_crs')
                            file4 = file3.astype('float32')
                            # astype also casts 'time'; restore the original time values
                            file4['time'] = file3['time'].values

                            file4.to_netcdf(f'{output_dir}/{v}/hrrr_{v}_{str_date}_{hr}.nc')

                    except FileNotFoundError:
                        print(f'part or all of hrrr_{v}_{str_date}_{hr}.nc does not exist')
                        non_exist.append(f'hrrr_{v}_{str_date}_{hr}.nc')

        # Printed once per date, whether or not some files were missing
        print(f'all files for {date} have been saved')

    return non_exist
                    

In [9]:
def download_blob(bucket_name, source_blob_name, destination_file_name):

    '''Fetch a single object from a Google Cloud Storage bucket using an anonymous client.

    Inputs:
    bucket_name: (str) name of the bucket
    source_blob_name: (str) path, name of file inside the bucket
    destination_file_name: (str) desired path, name of the local output file

    Outputs:
    None

    '''

    from google.cloud import storage

    # Anonymous client: no credentials are supplied
    anonymous_client = storage.Client.create_anonymous_client()

    # Resolve bucket -> blob in one chain, then write the blob to the local path
    remote_blob = anonymous_client.bucket(bucket_name).blob(source_blob_name)
    remote_blob.download_to_filename(destination_file_name)

In [15]:
def hrrr_google_grabber(start_date, end_date, hour_range, variables, output_dir, fcst_hr_step = None):


    '''This function takes user input for the start, stop, and step for the desired HRRR forecast or analysis data from Google Cloud and downloads them locally to the given directory.

    For every date, init hour and forecast step the file hrrr.t{hr}z.wrfnatf{f}.grib2 is
    downloaded into temp_hrrr_grib_files/hrrr.YYYYMMDD/, each requested variable is written to
    {output_dir}/{v}/hrrr_{v}_hrrr.YYYYMMDD_{hr}.nc, and the temporary directory is removed.
    Variables whose output file already exists are skipped; if nothing is missing the GRIB
    file is not downloaded at all.

    NOTE(review): the output filename does not contain the forecast step, so when
    fcst_hr_step holds several values only the first step is written -- later steps find
    the file already present and are skipped. Changing this would change output filenames,
    so it is left as is here.

    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or -
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function.
    variables: (dict) maps a group name to a two-item list [names, filter_by_keys], where names is a
               list of variable names to extract and filter_by_keys is the dict handed to the
               cfgrib engine when the GRIB file is opened for that group.
    output_dir: (str) path where to download the files locally.
    fcst_hr_step: (list) list of integers describing the lead times included for each init time described by hour_range. If left None, ['00'] will be input.

    Returns:
    non_exist: (list) list of strings of filenames that were not downloaded due to lack of availability.

    '''


    from google.cloud.exceptions import NotFound           # raised when the requested blob is missing

    dir_maker(output_dir)
    dir_maker('temp_hrrr_grib_files')                      # parent of the per-date temporary directories

    non_exist = []

    hrs = [str(i).zfill(2) for i in hour_range]            # zero-padded init hours

    if fcst_hr_step is None:
        fcst = [str(0).zfill(2)]                           # default: forecast step 00 only
    else:
        fcst = [str(i).zfill(2) for i in fcst_hr_step]

    bucket_name = "high-resolution-rapid-refresh"

    for date in pd.date_range(start_date, end_date):

        str_date = 'hrrr.{dt:%Y%m%d}'.format(dt = date)    # only depends on the date, so built once per date
        temp_dir = f'temp_hrrr_grib_files/{str_date}'

        for hr in hrs:
            for f in fcst:

                filename = f'hrrr.t{hr}z.wrfnatf{f}.grib2'

                # Work out which variables still need to be written, grouped like `variables`
                not_saved = {}
                for key, item in variables.items():
                    not_saved_ar = []
                    for v in item[0]:
                        dir_maker(f'{output_dir}/{v}')
                        if not os.path.exists(f'{output_dir}/{v}/hrrr_{v}_{str_date}_{hr}.nc'):
                            not_saved_ar.append(v)
                    if not_saved_ar:
                        not_saved[key] = [not_saved_ar, item[1]]

                if not not_saved:
                    continue                               # everything is already on disk

                source_blob_name = f"{str_date}/conus/{filename}"
                destination_file_name = f"{temp_dir}/{filename}"

                try:
                    dir_maker(temp_dir)
                    download_blob(bucket_name, source_blob_name, destination_file_name)

                    for group, todo in not_saved.items():
                        # The dataset is closed by the context manager before the GRIB file is deleted
                        with xr.open_dataset(destination_file_name, decode_times = True, engine = 'cfgrib',
                                             filter_by_keys = todo[1], backend_kwargs = {'indexpath': ''}) as ds:
                            for v in todo[0]:
                                try:
                                    ds[v].to_netcdf(f'{output_dir}/{v}/hrrr_{v}_{str_date}_{hr}.nc')
                                except KeyError:
                                    # Variable not present under this filter
                                    print(f'part or all of hrrr_{v}_{str_date}_{hr}.nc does not exist')
                                    non_exist.append(f'hrrr_{v}_{str_date}_{hr}.nc')

                except (FileNotFoundError, OSError, NotFound):
                    print(f'part or all of {filename} does not exist')
                    non_exist.append(f'{filename}')

                finally:
                    # Remove the temporary directory exactly once, after ALL groups have been read
                    # (removing it inside the group loop made every group after the first fail),
                    # and also when the download or the read failed.
                    if os.path.isdir(temp_dir):
                        try:
                            shutil.rmtree(temp_dir)
                        except OSError:
                            print('error in deleting temp file')

        # Printed once per date, whether or not some files were missing
        print(f'all of {date} has been saved')

    return non_exist
                

In [2]:
def ufs_s2s_grabber(start_date, end_date, to_lead_time, variables, output_dir, domain, prototype, cached_location):


    '''This function takes user input for the start, stop, and step for the desired UFS S2S forecast or analysis data from AWS S3 Buckets and downloads them locally to the given directory.

    Initialisations on the 1st and the 15th of each month that fall within
    [start_date, end_date] are processed. For each one, every lead time is fetched through a
    local cache, the requested variables are trimmed to `domain` and written to
    {output_dir}/{prototype}/{YYYYMMDD}/{v}/ufs_{prototype}_{v}_{YYYYMMDD}_{lead_time}.nc,
    and the cached GRIB file is deleted. Variables whose output file already exists are skipped.

    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or -
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -
    to_lead_time: (int) number to the lead time you want from 0 to 840 by 6. Lead times 0, 6, 12, ... are
                  downloaded, stopping at the first one that is >= to_lead_time.
    variables: (dict) maps a group name to a two-item list [names, filter_by_keys], where names is a
               list of variable names to extract and filter_by_keys is the dict handed to the
               cfgrib engine when the GRIB file is opened for that group.
    output_dir: (str) path where to download the files locally.
    domain: (list) list of integers --> [N, S, E, W] to trim the domain by. domain[0] and domain[1] are
            used, in that order, as the latitude slice bounds; domain[2] and domain[3] as the
            longitude slice bounds.
    prototype: (int) integer representing the current prototype, choose from: 5, 6, 7, or 8
    cached_location: (str) path to directory where temp cache files can be downloaded and deleted

    Returns:
    non_exist: (list) list of strings of filenames that were not downloaded due to lack of availability.

    '''

    # Not referenced by name below; imported up front so a missing optional
    # dependency is reported before any download starts.
    import s3fs
    import fsspec
    from cfgrib.dataset import DatasetBuildError

    non_exist = []

    # Build the list of init dates: the 1st and 15th of every month, restricted to the
    # requested window. 'MS' (month start) is used instead of month-end 'M' so that a month
    # is not dropped just because its last day lies after end_date, and so that init dates
    # before start_date are not included.
    window_start = pd.Timestamp(start_date).normalize()
    window_end = pd.Timestamp(end_date)

    date_range = []
    for month_start in pd.date_range(window_start.replace(day = 1), window_end, freq = 'MS'):
        for init_day in (1, 15):
            candidate = month_start.replace(day = init_day)
            if window_start <= candidate <= window_end:
                date_range.append(candidate)

    dir_maker('temp_ufs_files')
    dir_maker(output_dir)

    p = prototype

    dir_maker(os.path.join(output_dir, str(p)))

    for init in date_range:

        str_init = '{dt:%Y%m%d}'.format(dt = init)

        path_ufs_date = os.path.join(output_dir, str(p), str_init)

        dir_maker(path_ufs_date)

        lead_time = -6

        while lead_time < to_lead_time:

            lead_time += 6

            # Prototype 5 stores the files directly under .../00/; the others under .../00/atmos/
            sub_dir = '' if p == 5 else 'atmos/'
            url = 'simplecache::s3://noaa-ufs-prototypes-pds/Prototype{p}/{dt:%Y%m%d}/pgrb2/gfs.{dt:%Y%m%d}/00/{sub}gfs.t00z.pgrb2.0p25.f{hour}'.format(p = p, dt = init, sub = sub_dir, hour = str(lead_time).zfill(3))

            # Work out which variables still need to be written, grouped like `variables`
            not_saved = {}

            for key, item in variables.items():

                not_saved_ar = []

                for v in item[0]:

                    dir_maker(f'{path_ufs_date}/{v}')

                    if not os.path.exists(f'{path_ufs_date}/{v}/ufs_{p}_{v}_{str_init}_{lead_time}.nc'):
                        not_saved_ar.append(v)

                if not_saved_ar:
                    not_saved[key] = [not_saved_ar, item[1]]

            if not not_saved:
                continue                                   # everything for this lead time is on disk

            cached_file = None

            try:
                # Download the remote file into the cache directory and get its local path
                cached_file = fsspec.open_local(url, s3={'anon': True}, simplecache={'cache_storage': cached_location})

                for group, todo in not_saved.items():

                    # The dataset is closed by the context manager before the cached file is deleted
                    with xr.open_dataset(cached_file, decode_times = True, engine="cfgrib", filter_by_keys= todo[1], backend_kwargs={'indexpath': ''}) as ds:

                        for v in todo[0]:

                            try:
                                # Trim to the requested domain and save to a NetCDF file
                                ds_out = ds[v].sel(latitude = slice(domain[0], domain[1]), longitude = slice(domain[2], domain[3]))
                                ds_out.to_netcdf(f'{path_ufs_date}/{v}/ufs_{p}_{v}_{str_init}_{lead_time}.nc')

                            except KeyError:
                                # Variable not present under this filter
                                print(f'part or all of ufs_{p}_{v}_{str_init}_{lead_time}.nc does not exist')
                                non_exist.append(f'ufs_{p}_{v}_{str_init}_{lead_time}.nc')

            except (FileNotFoundError, OSError):
                print(f'part or all of {url} does not exist')
                non_exist.append(f'{url}')

            finally:
                # Delete the cached copy whether or not reading it succeeded, so failed
                # reads do not leave large GRIB files behind in the cache directory.
                if cached_file is not None:
                    try:
                        os.remove(cached_file)
                    except OSError:
                        print('error in deleting cached file')

        # Printed once per init date, whether or not some files were missing
        print(f'all files for Protype {p} {str_init} have been saved')

    return non_exist

In [12]:
def nam_grabber(start_date, end_date, hour_range, output_dir, variables_grib = None, variables_grib2 = None, fcst_hr_step = None):


    '''This function takes user input for the start, stop, and step for the desired NAM analysis data from NCEI and downloads them locally to the given directory. 

    Every requested variable is written to {output_dir}/{variable}/nam_{variable}_{YYYYMMDD}_{HH}_{FFF}.nc. 
    Variables whose output file is already present are skipped, and the remote file is only fetched when at least one variable is still missing.
    
    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or - 
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -     
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function. 
    output_dir: (str) path where to download the files locally.
    variables_grib: (dict) variable groups used for dates before 04-09-2017, when files were still in grib format. 
                    Each value is a two-item sequence: [list of variable names to extract, dict handed to cfgrib as filter_by_keys].
    variables_grib2: (dict) same structure as variables_grib, used for dates on or after 04-09-2017 when files switched to grib2 format.
    fcst_hr_step: (list) list of integers describing the lead times included for each init time described by hour_range. If left None, ['000'] will be input. All available include: [0, 1, 2, 3, 6], not all available for all time periods/variables. 
    
    Returns:
    non_exist: (list) list of strings of filenames that were not downloaded due to lack of availability. 
    
    '''

    import fsspec                                                                                  # Kept local, like the other heavier dependencies in this file

    cache_dir = 'temp_nam_files'                                                                   # Single name shared by the directory creation and the cache

    dir_maker(output_dir)                                                                          # Create the main output directory
    dir_maker(cache_dir)                                                                           # Create a temporary directory for NAM files

    non_exist = []                                                                                 # Files/variables that could not be retrieved

    base_url = 'simplecache::https://www.ncei.noaa.gov/data/north-american-mesoscale-model/access/historical/analysis/'  # Base URL for NAM data
    grib2_start = pd.to_datetime('2017-04-09')                                                     # First date served as .grb2 instead of .grb

    hrs = [str(i).zfill(2) for i in hour_range]                                                    # Hours with leading zeros

    # `is None` (not `== None`) so that a numpy array of lead times is accepted
    if fcst_hr_step is None:
        fcst = [str(0).zfill(3)]                                                                   # Default to a forecast step of 0
    else:
        fcst = [str(i).zfill(3) for i in fcst_hr_step]                                             # Lead times with leading zeros

    for date in pd.date_range(start_date, end_date):                                               # Iterate over each date in the specified range

        # The file format (and therefore the variable set) only depends on the date
        if date >= grib2_start:
            ext, variables = 'grb2', variables_grib2
        else:
            ext, variables = 'grb', variables_grib

        if variables is None:
            # Nothing was requested for this format; skip instead of failing on None.items()
            print(f'no {ext} variables were supplied, skipping {date}')
            continue

        for hr in hrs:                                                                             # Iterate over each hour

            for f in fcst:                                                                         # Iterate over each forecast step

                str_date = '{dt:%Y%m%d}_{hr}_{f}'.format(dt = date, hr = hr, f = f)                # Tag used in the output file names
                filename = '{}{dt:%Y%m}/{dt:%Y%m%d}/namanl_218_{dt:%Y%m%d}_{hr}00_{f}.{ext}'.format(base_url, dt = date, hr = hr, f = f, ext = ext)

                # Collect, per variable group, the variables that have not been written yet
                not_saved = {}
                for key, item in variables.items():
                    missing = []
                    for v in item[0]:
                        dir_maker(f'{output_dir}/{v}')                                             # Create a directory for each variable
                        if not os.path.exists(f'{output_dir}/{v}/nam_{v}_{str_date}.nc'):
                            missing.append(v)
                    if missing:
                        not_saved[key] = [missing, item[1]]

                if not not_saved:
                    continue                                                                       # Everything for this time step is already on disk

                file = None
                try:
                    # fsspec hands keyword arguments to chained filesystems by protocol name,
                    # so the cache location has to be given under 'simplecache'
                    file = fsspec.open_local(filename, simplecache = {'cache_storage': cache_dir})

                    for key, item in not_saved.items():
                        # One open per group because each group carries its own cfgrib filter
                        with xr.open_dataset(file, decode_times = True, engine = 'cfgrib',
                                             filter_by_keys = item[1], backend_kwargs = {'indexpath': ''}) as ds:
                            for v in item[0]:
                                try:
                                    ds[v].to_netcdf(f'{output_dir}/{v}/nam_{v}_{str_date}.nc')     # Save the variable to a NetCDF file
                                except KeyError:                                                   # Variable absent from this file/filter
                                    print(f'part or all of nam_{v}_{str_date}.nc does not exist')
                                    non_exist.append(f'nam_{v}_{str_date}.nc')

                except (FileNotFoundError, OSError):
                    print(f'part or all of {filename} does not exist')
                    non_exist.append(f'{filename}')

                finally:
                    # Remove the cached GRIB file only once every group has been read from it
                    if file is not None:
                        try:
                            os.remove(file)
                        except OSError:
                            print('error in deleting cached file')

        print(f'all of {date} has been saved')                                                     # Print message after processing each date

    return non_exist                                                                               # Return the list of files that don't exist or failed to download

In [None]:
def narr_grabber(start_date, end_date, hour_range, variables, output_dir):


    '''This function takes user input for the start, stop, and step for the desired NARR reanalysis data from THREDDS and downloads them locally to the given directory. 

    Every requested variable that is present in the remote file is written to {output_dir}/{variable}/narr_{variable}_{YYYYMMDD}_{HH}.nc. 
    Variables whose output file already exists are skipped, and the remote file is not opened at all when nothing is left to save.
    Requested variables that the remote file does not contain are skipped without being reported.
    
    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or - 
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -     
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function. 
    variables: (list) list of strings of variables to download. 
    output_dir: (str) path where to download the files locally.
    
    Returns:
    non_exist: (list) list of strings of remote file URLs that could not be opened or read. 
    
    '''

    non_exist = []                                                                     # Remote files that could not be opened
    base_url = 'https://www.ncei.noaa.gov/thredds/dodsC/model-narr-a-files/'           # Base URL for NARR data

    dir_maker(output_dir)                                                              # Create the main output directory

    hrs = [str(i).zfill(2) for i in hour_range]                                        # Hours with leading zeros
    requested = list(dict.fromkeys(variables))                                         # Drop duplicate names, keep the requested order

    for date in pd.date_range(start_date, end_date):                                   # Iterate over each date in the specified range

        for hr in hrs:                                                                 # Iterate over each hour

            str_date = '{dt:%Y%m%d}_{hr}'.format(dt = date, hr = hr)                   # Tag used in the output file names
            filename = '{}{dt:%Y%m}/{dt:%Y%m%d}/narr-a_221_{dt:%Y%m%d}_{hr}00_000.grb'.format(base_url, dt = date, hr = hr)

            # Work out what is still missing locally before touching the network
            pending = []
            for v in requested:
                if os.path.exists(f'{output_dir}/{v}/narr_{v}_{str_date}.nc'):
                    print(f'narr_{v}_{str_date}.nc has already been saved')
                else:
                    pending.append(v)

            if not pending:
                continue                                                               # Nothing left to fetch for this time step

            try:
                # The context manager closes the remote handle once the variables are written
                with xr.open_dataset(filename, decode_times = True) as file:

                    for v in pending:
                        if v not in file.variables:
                            continue                                                   # Only variables that are both requested and available are saved

                        dir_maker(f'{output_dir}/{v}')                                 # Create a directory for each variable
                        file[v].to_netcdf(f'{output_dir}/{v}/narr_{v}_{str_date}.nc')  # Save the variable to a NetCDF file

            except (FileNotFoundError, OSError):
                print(f'part or all of {filename} does not exist')
                non_exist.append(f'{filename}')

        print(f'all of {date} has been saved')                                         # Print message after processing each date

    return non_exist                                                                   # Return the list of files that don't exist or failed to download


In [None]:
def conus404_grabber(start_date, end_date, hour_range, variables, output_dir):

    '''This function takes user input for the start, stop, and step for the desired CONUS404 reanalysis data from RDA and downloads them locally to the given directory. 
    
    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or - 
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -     
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function. 
    variables: (list) list of strings of varibles to download. 
    output_dir: (str) path where to download the files locally.
    
    Returns:
    non_exist: (list) list of strings of filenames that were not downloaded due to lack of availability. 
    
    '''
    
    non_exist = []                                                                        # Initialize a list to track files that don't exist
    var_not_exist= []                                                                     # Initialize a list to track variables that don't exist
            
    base_url = 'https://thredds.rda.ucar.edu/thredds/dodsC/files/g/ds559.0'               # Base URL for the dataset    
            
    dir_maker(output_dir)                                                                 # Create the main output directory                          
            
    hrs = [str(i).zfill(2) for i in hour_range]                                           # Generate a list of hours with leading zeros 
            
    for date in pd.date_range(start_date, end_date):                                      # Iterate over each date in the specified range
                
        for hr in hrs:                                                                    # Iterate over each hour
                    
            str_date = '{dt:%Y%m%d}_{hr}'.format(dt = date, hr = hr)                      # Format the date and hour
                                                    
            if date.month == 10 or date.month == 11 or date.month == 12:                  # Check if the month is Oct, Nov, or Dec
                wy_str = f"wy{date.year + 1}"                                             # Set water year string for next year
            else:        
                wy_str = 'wy{dt:%Y}'.format(dt=date)                                      # Set water year string for current year   
    
            filename = '{}/{wy_str}/{dt:%Y%m}/wrf2d_d01_{dt:%Y-%m-%d}_{hr}:00:00.nc'.format(base_url, wy_str = wy_str, dt=date, hr=hr)  # Construct the filename for the data file
    
            try:
                file = xr.open_dataset(filename)                                          # Open the dataset

                variables_available = list(file.variables)                                # Get a list of available variables in the dataset
                variables_sel = list(set(variables_available) & set(variables))           # Select variables that are both available and requested
            
                for v in variables_sel:                                                   # Iterate over selected variables
                    dir_maker(f'{output_dir}/{v}')                                        # Create a directory for each variable             
                    
                    if f'conus404_{v}_{str_date}.nc' in os.listdir(f'{output_dir}/{v}'):  # Check if file already exists
                        print(f'conus404_{v}_{str_date}.nc has already been saved')       
                    
                    else:                                                                 # Save if it is does not exist
                        file[v].to_netcdf(f'{output

In [None]:
def ncep_secret_attribute_remover(files):

    '''This function strips the '_NCProperties' entry from the attributes of an opened NCEP Reanalysis file, 
    since that attribute can become problematic in preprocessing.
    
    Inputs:
    files: (xarray dataset) opened NCEP file 

    Returns:
    files: (xarray dataset) the very same object that was passed in, with the attribute removed if it was present
    
    '''

    # pop with a default is a no-op when the attribute was never there
    files.attrs.pop('_NCProperties', None)

    return files



In [None]:
def ncep_grabber(start_date, end_date, variables, output_dir, domain, levels = None):


    '''This function takes user input for the start and stop dates for the desired NCEP Reanalysis II data from THREDDS and downloads them locally to the given directory. 

    Data are fetched one yearly remote file at a time. Variables under the 'pressure' key are written once per requested level to 
    {output_dir}/{variable}/ncep_{variable}_{year}_{level}mb.nc; all other variables are written to {output_dir}/{variable}/ncep_{variable}_{year}.nc. 
    Output files that already exist are skipped individually, so a missing level is still fetched when other levels are present.
    
    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or - 
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -     
    variables: (dict) maps a remote sub-directory name (e.g. 'pressure', 'gaussian_grid') to a list of strings of variables to download from it. 
    output_dir: (str) path where to download the files locally.
    domain: (list) list of numbers to trim the domain by: lat is sliced from domain[0] to domain[1] and lon from domain[2] to domain[3]. 
            NOTE(review): this corresponds to [N, S, W, E] assuming the remote lat axis is stored north-to-south and lon west-to-east - confirm.
    levels: (list) list of floats describing pressure levels. This only applies to variables on pressure surfaces. The default = None in the function which will return only the [1000] level for each year. 

    Returns:
    non_exist: (list) list of strings of remote file URLs that could not be opened. 
    
    '''

    non_exist = []                                                                        # Remote files that could not be opened

    base_url = 'http://psl.noaa.gov/thredds/dodsC/Datasets/ncep.reanalysis2'              # Base URL for the NCEP Reanalysis 2 dataset

    dir_maker(output_dir)                                                                 # Create the main output directory

    # `is None` (not `== None`) so that a numpy array of levels is accepted
    if levels is None:
        levels = [1000]                                                                   # Default to 1000mb level if no levels are specified

    lat_bounds = slice(domain[0], domain[1])
    lon_bounds = slice(domain[2], domain[3])

    years = np.unique([date.year for date in pd.date_range(start_date, end_date)])        # Unique years within the date range

    for y in years:                                                                       # Iterate over each year
        for key, item in variables.items():                                               # Iterate over each remote sub-directory
            for v in item:                                                                # Iterate over each variable in it

                var_dir = f'{output_dir}/{v}'
                dir_maker(var_dir)                                                        # Create a directory for the variable

                # Each target is (output file name, level to select or None)
                if key == 'pressure':
                    targets = [(f'ncep_{v}_{y}_{int(lev)}mb.nc', lev) for lev in levels]
                else:
                    targets = [(f'ncep_{v}_{y}.nc', None)]

                # Check every output on its own so that one saved level does not hide the others
                pending = []
                for file_name, lev in targets:
                    if os.path.exists(f'{var_dir}/{file_name}'):
                        print(f'{file_name} has already been saved')
                    else:
                        pending.append((file_name, lev))

                if not pending:
                    continue

                filename = f'{base_url}/{key}/{v}.{y}.nc'                                 # Remote yearly file for this variable

                # Pressure files are opened without chunking, everything else with time chunks
                open_kwargs = {} if key == 'pressure' else {'chunks': {'time': '100MB'}}

                try:
                    # Open the remote file once and close it again when all targets are written
                    with xr.open_dataset(filename, **open_kwargs) as remote:
                        trimmed = remote.sel(lat = lat_bounds, lon = lon_bounds)

                        for file_name, lev in pending:
                            files = trimmed if lev is None else trimmed.sel(level = lev)
                            files = ncep_secret_attribute_remover(files)
                            files.to_netcdf(f'{var_dir}/{file_name}')

                except RuntimeError:
                    # Likely due to the data being too large to request over DAP
                    print('The data may be too large to request over DAP. You need subset the data more due to RuntimeError: NetCDF: DAP failure')

                except OSError:
                    # OSError also covers FileNotFoundError, matching the other grabbers in this file
                    print(f'part or all of {filename} does not exist')
                    non_exist.append(f'{filename}')

        print(f'ncep vars in {y} have been saved')                                       # Print message after saving variables for the year

    return non_exist                                                                     # Return the list of files that don't exist or failed to download
    

In [None]:
def era_grabber_client(year, variables, hours, output_dir, domain):

    '''This function takes user input from the era_grabber function and establishes a connection with the COPERNICUS API to download from.
    It is an altered version that is featured on their webpage while selecting and exporting data from the on site interface. 
    
    Correct installation of cdsapi and your personal token is required prior to this step. 

    One request is sent for the 'reanalysis-era5-single-levels' product covering every month and every day of the given year, 
    and the result is written to {output_dir}/era5_{year}.nc.

    Inputs:
    year: (int) year to request
    variables: (list) list of strings of variables to download, forwarded as the request's 'variable' entry
    hours: (list) list of strings forwarded as the request's 'time' entry (era_grabber builds them as 'HH:00')
    output_dir: (str) path where to download the file locally
    domain: (list) bounding box, forwarded as the request's 'area' in the order [domain[0], domain[2], domain[1], domain[3]], 
            which the CDS API reads as [North, West, South, East]

    Returns:
    Nothing, the file is written to output_dir
    
    '''

    import cdsapi

    months = [f'{m:02d}' for m in range(1, 13)]                  # '01' ... '12'
    days = [f'{d:02d}' for d in range(1, 32)]                    # '01' ... '31', the full list is sent for every month

    request = {
        'product_type': 'reanalysis',
        'format': 'netcdf',
        'variable': variables,
        'month': months,
        'year': [str(year)],
        'day': days,
        'time': hours,
        'area': [domain[0], domain[2], domain[1], domain[3]],
    }

    c = cdsapi.Client()
    c.retrieve('reanalysis-era5-single-levels', request, f'{output_dir}/era5_{year}.nc')

    print(f'era5_{year}.nc has finished')

In [None]:
def era_grabber(start_date, end_date, hour_range, variables, output_dir, domain):

    '''This function takes user input for the start and stop dates for the desired ERA5 Reanalysis from COPERNICUS and 
    downloads them locally using multiprocess to the given directory. 

    Every year from the year of start_date through the year of end_date is requested through era_grabber_client, 
    in batches of up to four years that are dispatched to the worker pool together.
    
    Inputs:
    start_date: (str) date to start loop featuring day, month, and year in any order using either / or - 
    end_date: (str) date to end loop featuring day, month, and year in any order using either / or -     
    hour_range: (list) list of numerical timesteps desired. For example, np.arange(0, 24, 6) will produce [00, 06, 12, 18] in the function. 
    variables: (list) list of strings of variables to download. 
    output_dir: (str) path where to download the files locally.
    domain: (list) bounding box used to trim the domain, passed through unchanged to era_grabber_client

    Returns:
    Nothing, the yearly files are written to output_dir by era_grabber_client
    
    '''

    import multiprocess

    dir_maker(output_dir)                                                      # Create the main output directory

    hours = [f'{str(i).zfill(2)}:00' for i in hour_range]                      # Hours in 'HH:00' format

    first_year = pd.to_datetime(start_date).year
    last_year = pd.to_datetime(end_date).year
    all_years = np.arange(first_year, last_year + 1)                           # Every year touched by the date range

    batch_size = 4                                                             # Years handed to the pool at once

    with multiprocess.get_context("spawn").Pool() as pool:                     # Worker pool using the "spawn" context

        for offset in range(0, len(all_years), batch_size):

            subyears = list(all_years[offset:offset + batch_size])             # Next batch, never running past last_year
            print(subyears)

            # One argument tuple per year; starmap returns once the whole batch has finished
            input_data = [(batch_year, variables, hours, output_dir, domain) for batch_year in subyears]
            pool.starmap(era_grabber_client, input_data)

