In [5]:
%load_ext autoreload
%autoreload 2
import sys

# Instead of packaging the source code with setup.py or building it from a
# Docker/Singularity file, put the parent directory on the import path so the
# sibling source directory can be imported from this notebook.
# This keeps implementation code out of the notebook, so the notebook only holds
# visualizations and the code itself is easier to test.
# It also helps keep variable state clean, so results do not depend on cells
# having been run out of order.
# NOTE: ".." is a relative entry, so it resolves against the kernel's current
# working directory.
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pandas as pd

In [7]:
def read_data(
    init,
    years=("2018", "2019", "2020", "2021"),
    savedir="/home/aevans/ai2es/processed_data/frcst_err/",
):
    """
    Reads and concatenates the per-year forecast/error parquet files for the GFS, NAM, and HRRR
    weather models, dropping every row whose ``lead_time_ONLY_HOURS`` equals 0.

    One file is read per (model, year) pair. Its name is
    ``{model}_fcast_and_error_df_{init}z_{year}_mask_water_ny.parquet`` inside ``savedir``.

    Args:
        init (str): Initialization time of the forecast (in UTC) as it appears in the file
            name, e.g. "00".
        years (iterable of str, optional): Years to load, concatenated in the order given.
            A single string is treated as one year. Defaults to 2018 through 2021.
        savedir (str, optional): Directory holding the parquet files. A trailing path
            separator is optional. Defaults to the original processed-data directory.

    Returns:
        tuple of pandas.DataFrame: ``(gfs, nam, hrrr)`` in that order, one dataframe per
        model, each with the rows at forecast hour 0 removed.

    Raises:
        ValueError: If ``years`` is empty (raised by ``pandas.concat``).
        KeyError: If a file has no ``lead_time_ONLY_HOURS`` column.
        Errors raised by ``pandas.read_parquet`` (for example a missing file) propagate.
    """
    import os  # local import: the notebook's import cells do not bring in `os`

    # A bare string would otherwise be iterated character by character.
    if isinstance(years, str):
        years = [years]
    # Materialize once so a one-shot iterator can be reused for all three models.
    years = list(years)

    def _load_model(model):
        """Read every requested year for one model, stack them, and drop forecast hour 0."""
        frames = [
            pd.read_parquet(
                os.path.join(
                    savedir,
                    f"{model}_fcast_and_error_df_{init}z_{year}_mask_water_ny.parquet",
                )
            )
            for year in years
        ]
        combined = pd.concat(frames)
        # remove random forecasts that have forecast hour 0
        return combined[combined["lead_time_ONLY_HOURS"] != 0.0]

    # The return order (gfs, nam, hrrr) is part of the contract callers unpack.
    return _load_model("gfs"), _load_model("nam"), _load_model("hrrr")

In [10]:
# Ad-hoc settings for inspecting a single file outside of read_data.
# `init` and `year` select which file is opened; `savedir` is where the files live.
init, year = "00", "2018"
savedir = "/home/aevans/ai2es/processed_data/frcst_err/"
# NOTE(review): `years` is not referenced by the cells visible here; confirm it is still needed.
years = ["2018", "2019", "2020", "2021"]

In [11]:
# Open the single NAM forecast/error file selected by `init` and `year` to inspect its layout.
df = pd.read_parquet(
    path=f"{savedir}nam_fcast_and_error_df_{init}z_{year}_mask_water_ny.parquet"
)

In [12]:
# Display the loaded frame; pandas abbreviates a long frame to its leading and trailing rows.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,t2m_NAM,d2m_NAM,u_total_NAM,u_dir_NAM,latitude,longitude,new_tp_NAM,prmsl_NAM,orog,t2m_error,...,prmsl_error,t2m_nysm,d2m_nysm,u_total_nysm,u_dir_nysm,new_tp_nysm,prmsl_nysm,lead_time_DAY,lead_time_HOUR,lead_time_ONLY_HOURS
station,valid_time,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ADDI,2018-01-01 01:00:00,2018-01-01,-16.981250,-20.863975,3.373392,300.379802,42.040359,-77.237259,0.000000,1032.101614,462.976663,1.256569,...,67.609610,-18.237820,-22.090469,1.456784,325.875793,0.00,964.492004,0,1,1.0
ADDI,2018-01-01 02:00:00,2018-01-01,-17.201129,-20.930252,3.749043,300.643016,42.040359,-77.237259,0.000000,1031.815312,462.976663,1.167102,...,67.528508,-18.368231,-21.647659,1.484595,305.772797,0.00,964.286804,0,2,2.0
ADDI,2018-01-01 03:00:00,2018-01-01,-17.440689,-21.045853,3.879040,300.327564,42.040359,-77.237259,0.000000,1031.428182,462.976663,1.389711,...,67.018209,-18.830400,-21.994141,1.247007,309.544586,0.00,964.409973,0,3,3.0
ADDI,2018-01-01 04:00:00,2018-01-01,-17.649172,-21.087488,3.929234,297.859397,42.040359,-77.237259,0.000000,1031.270318,462.976663,0.869468,...,66.791131,-18.518641,-21.890549,1.821808,314.346588,0.00,964.479187,0,4,4.0
ADDI,2018-01-01 05:00:00,2018-01-01,-17.584399,-20.893174,3.906099,297.858044,42.040359,-77.237259,0.000000,1031.647925,462.976663,1.051441,...,67.066199,-18.635839,-21.847321,1.780934,294.888092,0.00,964.581726,0,5,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YORK,2018-12-31 15:00:00,2018-12-30,1.196887,-2.150057,4.056059,173.854647,42.855042,-77.847763,0.000000,1022.871613,275.188109,0.191762,...,22.845612,1.005125,-2.327484,3.325408,201.293900,0.00,1000.026001,1,15,39.0
YORK,2018-12-31 18:00:00,2018-12-29,2.678166,-2.793708,4.551881,166.731068,42.855042,-77.847763,0.000000,1017.212134,275.188109,-1.226486,...,22.325720,3.904652,-0.755005,3.259900,193.793198,0.00,994.886414,2,18,66.0
YORK,2018-12-31 18:00:00,2018-12-30,3.871377,-1.336919,3.856630,150.337822,42.855042,-77.847763,0.000000,1019.299058,275.188109,-0.033275,...,24.412644,3.904652,-0.755005,3.259900,193.793198,0.00,994.886414,1,18,42.0
YORK,2018-12-31 21:00:00,2018-12-29,0.892578,0.675481,3.573278,135.236089,42.855042,-77.847763,2.695212,1013.540413,275.188109,-3.351555,...,21.705330,4.244133,2.717255,1.670582,182.039902,0.57,991.835083,2,21,69.0


In [None]:
# Turn the index levels (station, valid_time, time in the preview above) into regular columns.
# Guarded so that re-running this cell is a no-op: a second reset_index() on the
# already-flat frame would otherwise insert a spurious "index" column.
if not isinstance(df.index, pd.RangeIndex):
    df = df.reset_index()