## Questions I have 
- bucket 
- split_loc
- fh_loc
- fxx_max
- z_fill 
- full file_list_len


In [48]:
import sys
import numpy as np
import s3fs
import argparse
from datetime import datetime, timedelta
import os.path
from pathlib import Path
import time
import glob

from boto3 import client
from botocore import UNSIGNED
from botocore.client import Config

# Functions

In [49]:
"""
 download_s3_file 
 downloads the nwp file 

 ARGS: 
 bucket: f string :: 
 ingest_path : fstring  :: file that has been downloaded 
 output_path : fstring :: where you want file stored 

 RETURNS: 

 """


def download_s3_file(bucket, ingest_path, output_path):
    fs = s3fs.S3FileSystem(anon=True, asynchronous=False)

    if fs.exists(f"{bucket}/{ingest_path}"):
        # Download file, will throw FileNotFoundError if non existent
        fs.download(f"{bucket}/{ingest_path}", output_path)
        print(f"✅ downloading {ingest_path} & saving to {output_path}")
    else:
        print(f"‼️ file not found {ingest_path}")


"""
list_s3_files 
returns a list of files that have been downloaded for nwp 

 ARGS: 
 bucket
 model : string :: the model to be downloaded 
 date : fstring :: year-month-day
 init_time : double :: 00
 data_type : string :: file format 

 RETURNS: 
 list :: the files that exist from dowload in a list 

 """


def list_s3_files(bucket, model, date, init_time, data_type):
    conn = client("s3", config=Config(signature_version=UNSIGNED))

    if model == "nam":
        prefix = f"{model}.{date}/"
    elif model == "gfs":
        prefix = f"{model}.{date}/{init_time}/atmos/"
    elif model == "hrrr":
        prefix = f"{model}.{date}/conus/"
    elif model == 'blend':
        prefix = f"{model}.{date}/{init_time}/core/"

    full_prefix = f"{prefix}{model}.t{init_time}z.{data_type}"
    print(f"Constructed prefix: {full_prefix}")  # Print the constructed prefix
    response = conn.list_objects_v2(
        Bucket=bucket, Prefix=f"{prefix}{model}.t{init_time}z.{data_type}"
    )
    files = response.get("Contents")
    print(files)
    if files:
        all_files = [file.get("Key") for file in files]
        existing_files_for_download = [
            file
            for file in all_files
            if f"t{init_time}z.{data_type}" in file
            and not file.endswith("idx")
            and not file.endswith("anl")
        ]
        existing_files_for_download.sort()
        return existing_files_for_download
    else:
        return []


"""
get_avail_files 

 ARGS: 
 s3_bucket
 model : string :: the model to be downloaded 
 year : string :: 0000
 month : string :: 00
 day : string  :: 00
 init_time : double :: 00
 data_type : string :: file format 
 split_loc: 
 fh_loc: 
 fxx_max: 
 zfill: 
 download_dir: fstring :: directory path where you want the  data to be downloaded 

 fname_out: string :: part of datapath name to be saved in your directory 

 fname_end: string :: part of datapath name
 full_filelist_len: 

 RETURNS: 
 the files that exist for dowload 

 """


def get_avail_files(
    s3_bucket,
    model,
    year,
    month,
    day,
    init_time,
    data_type,
    split_loc,
    fh_loc,
    fxx_max,
    zfill,
    download_dir,
    fname_out,
    fname_end,
    full_filelist_len,
):
    ii = 0
    len_files_for_download = [0]
    while True:
        files_for_download = list_s3_files(
            s3_bucket, model, f"{year}{month}{day}", init_time, data_type
        )
        files_for_download = [
            file
            for file in files_for_download
            if int(file.split(".")[split_loc][fh_loc:]) <= fxx_max
        ]
        print(files_for_download)
        len_files_for_download.append(len(files_for_download))
        for file in files_for_download:
            fxx = file.split(".")[split_loc][fh_loc:]
            if not os.path.isdir(download_dir):
                print("making directory: ", download_dir)
                Path(download_dir).mkdir(parents=True, exist_ok=True)
            # check to see if output_path is a directory. if not, create directory
            output_path = f"{download_dir}{fname_out}{str(fxx).zfill(zfill)}{fname_end}"
            if not os.path.exists(output_path):
                # if the file already exists, do not redownload
                download_s3_file(s3_bucket, file, output_path)
                # call the rest of the pipeline here, run_pipeline
                # running cleaning through the pipeline could be the "sleep" period
                print("FXX IS: ", fxx)
            else:
                print(f"file has already been downloaded: {output_path}")

        # STOP WHILE LOOP IF ALL DESIRED FILES HAVE BEEN DOWNLOADED ON OUR SIDE
        if os.path.isdir(download_dir):
            files_downloaded = glob.glob(f"{download_dir}{fname_out}*{fname_end}")
            num_files_downloaded = len(files_downloaded)
            print(num_files_downloaded)
            if num_files_downloaded >= full_filelist_len:
                print("exiting from while loop")
                break

        # if no additional files are available compared to last try but the full_filelist_len has not been reached yet...
        if len_files_for_download[-1] == len_files_for_download[-2]:
            ii += 1
            print("same number of available files as last try. ii=", ii)
            if (
                ii > 10
            ):  # stop waiting for additional file if we have tried 10 separate times
                print("waited too long for new file, exiting while loop")
                break

        # try again in 90 seconds
        print("sleep: ", datetime.now())
        time.sleep(90)

# Global Variables 

In [50]:
# this is how you want the files to be input for each model
# second component is the file format
data_type_dict = {"gfs": "pgrb2.0p50", "nam": "awphys", "hrrr": "wrfsfc", "blend": "core"}

# download request time
init_time = "00"
init_date = datetime(2021, 3, 21)
end_date = datetime(2021, 3, 31)

# Main 

In [51]:
# main
def main(model, data_type, init_date, init_time):
    month = str(init_date.month).zfill(2)
    print("Month: ", month)
    year = init_date.year
    print("Year", year)
    day = str(init_date.day).zfill(2)

    # where you want the files to download
    download_dir = (
        f"/home/aevans/nwp_bias/data/model_data/{model.upper()}/{year}/{month}/"
    )
    print("Download_dir: ", download_dir)

    if model == "nam":
        s3_bucket = f"noaa-{model}-pds"
    if model == "blend":
        s3_bucket = f"noaa-nbm-grib2-pds"
    else:
        s3_bucket = f"noaa-{model}-bdp-pds"

    if model == "nam":
        fxx_max = 84
        split_loc, fh_loc = -3, -2
        fname_out = f"nam_218_{year}{month}{day}_{init_time}00_"
        fname_end = ".grb2"
        zfill = 3
        full_filelist_len = len(
            np.arange(0, 37, 1).tolist() + np.arange(39, 85, 3).tolist()
        )
    elif model == "gfs":
        fxx_max = 96
        split_loc, fh_loc = -1, 1
        fname_out = f"gfs_4_{year}{month}{day}_{init_time}00_"
        fname_end = ".grb2"
        zfill = 3
        full_filelist_len = len(np.arange(0, 99, 3))
    elif model == "hrrr":
        fxx_max = 18
        split_loc, fh_loc = -2, -2
        fname_out = f"{year}{month}{day}_hrrr.t{init_time}z.wrfsfcf"
        fname_end = ".grib2"
        zfill = 2
        full_filelist_len = len(range(0, 19))
        # blend.t00z.master.f001.co.grib2
    elif model == "blend":
        fxx_max = 84
        split_loc, fh_loc = -4, -4
        fname_out = f"_blend.t{init_time}z.master..co"
        fname_end = ".grib2"
        zfill = 3
        full_filelist_len = len(range(0,84))

    get_avail_files(
        s3_bucket,
        model,
        year,
        month,
        day,
        init_time,
        data_type,
        split_loc,
        fh_loc,
        fxx_max,
        zfill,
        download_dir,
        fname_out,
        fname_end,
        full_filelist_len,
    )

    print(
        f"full download for {init_time}z initialization of the {model.upper()} complete!"
    )

# GFS

In [52]:
# delta2 = timedelta(days=1)
# while init_date < end_date:
#     main("gfs", data_type_dict.get("gfs"), init_date, init_time)
#     init_date += delta2

# NAM

In [53]:
# delta2 = timedelta(days=1)
# while init_date < end_date:
#     main("nam", data_type_dict.get("nam"), init_date, init_time)
#     init_date += delta2

# HRRR

In [54]:
# main("hrrr", data_type_dict.get("hrrr"), init_date, init_time)

Month:  03
Year 2024
Download_dir:  /home/aevans/nwp_bias/data/model_data/HRRR/2024/03/
Constructed prefix: hrrr.20240321/conus/hrrr.t00z.wrfsfc

In [None]:
#core dates = 09/30/2020->

#grib2/.master dates = ->09/30/2020

# Blend

In [55]:
main("blend", data_type_dict.get("blend"), init_date, init_time)

Month:  03
Year 2021
Download_dir:  /home/aevans/nwp_bias/data/model_data/BLEND/2021/03/
Constructed prefix: blend.20210321/00/core/blend.t00z.core
[{'Key': 'blend.20210321/00/core/blend.t00z.core.f001.ak.grib2', 'LastModified': datetime.datetime(2021, 3, 21, 1, 13, 10, tzinfo=tzlocal()), 'ETag': '"57198caf613129fd9b5ba7767a39005c"', 'Size': 36068553, 'StorageClass': 'STANDARD'}, {'Key': 'blend.20210321/00/core/blend.t00z.core.f001.ak.grib2.idx', 'LastModified': datetime.datetime(2021, 3, 21, 1, 13, 9, tzinfo=tzlocal()), 'ETag': '"d9b86e8cd36a366e08a4488840fcd08c"', 'Size': 3912, 'StorageClass': 'STANDARD'}, {'Key': 'blend.20210321/00/core/blend.t00z.core.f001.co.grib2', 'LastModified': datetime.datetime(2021, 3, 21, 1, 13, 11, tzinfo=tzlocal()), 'ETag': '"250f0bf09ac29fda5b701d738a8b9b86"', 'Size': 101189257, 'StorageClass': 'STANDARD'}, {'Key': 'blend.20210321/00/core/blend.t00z.core.f001.co.grib2.idx', 'LastModified': datetime.datetime(2021, 3, 21, 1, 13, 10, tzinfo=tzlocal()), 'ETa

ValueError: invalid literal for int() with base 10: 'core'