In [1]:
%load_ext autoreload
%autoreload 2
import sys

# Make the project's source code importable without packaging it (no setup.py,
# docker or singularity build): add the parent directory to the module search
# path so code living next to this notebook's folder can be imported here.
# Keeping the logic in importable modules leaves the notebook for visualizations,
# makes the code easier to test, and avoids the hidden variable state that comes
# from running cells out of order.
# ".." is resolved against the kernel's current working directory.
# The membership check keeps this cell idempotent: re-running it does not pile
# duplicate ".." entries onto sys.path.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
import pandas as pd
import numpy as np

In [3]:
def _read_model_frames(model, init, years, savedir):
    """Load one model's yearly parquet files, stack them and drop lead-time-0 rows.

    Args:
        model (str): File-name prefix of the model ("gfs", "nam" or "hrrr").
        init (str): Initialization hour inserted before the "z" in the file name.
        years (iterable): Years to read, one parquet file per year.
        savedir (str): Directory that holds the parquet files.

    Returns:
        pandas.DataFrame: Row-wise concatenation of the yearly frames (original
        index kept), without the rows whose "lead_time_ONLY_HOURS" equals 0.0.
    """
    # Local import so this cell stays self-contained regardless of the import cell.
    import os

    yearly_frames = [
        pd.read_parquet(
            os.path.join(
                savedir,
                f"{model}_fcast_and_error_df_{init}z_{year}_mask_water_ny.parquet",
            )
        )
        for year in years
    ]
    combined = pd.concat(yearly_frames)
    # `!=` keeps rows whose lead time is NaN; only exact zeros are removed.
    return combined[combined["lead_time_ONLY_HOURS"] != 0.0]


def read_data(
    init,
    years=("2018", "2019", "2020", "2021"),
    savedir="/home/aevans/ai2es/processed_data/frcst_err/",
):
    """
    Reads and concatenates parquet files containing forecast and error data for
    the GFS, NAM, and HRRR weather models, with the rows that have a forecast
    hour of 0 removed.

    For every model and year the file
    ``{savedir}/{model}_fcast_and_error_df_{init}z_{year}_mask_water_ny.parquet``
    is read; the yearly frames of a model are concatenated row-wise.

    Args:
        init (str): A string representing the initial time of the forecast
            (in UTC); it is placed in front of the "z" in each file name.
        years (iterable, optional): Years to load. Defaults to 2018 through 2021.
        savedir (str, optional): Directory containing the parquet files. A
            trailing path separator is optional.

    Returns:
        tuple of pandas.DataFrame: ``(gfs, nam, hrrr)`` -- one dataframe per
        model, in that order, each without rows whose "lead_time_ONLY_HOURS"
        is 0.0.

    Raises:
        ValueError: If ``years`` is empty (pandas has nothing to concatenate).
        KeyError: If a loaded frame has no "lead_time_ONLY_HOURS" column.
        Errors raised by ``pandas.read_parquet`` (for example for a missing
        file) are propagated unchanged.
    """
    # Materialize once so a generator passed as `years` can serve all three models.
    years = tuple(years)

    gfs_fcast_and_error_df = _read_model_frames("gfs", init, years, savedir)
    nam_fcast_and_error_df = _read_model_frames("nam", init, years, savedir)
    hrrr_fcast_and_error_df = _read_model_frames("hrrr", init, years, savedir)

    # Return order is GFS, NAM, HRRR -- callers unpack positionally.
    return gfs_fcast_and_error_df, nam_fcast_and_error_df, hrrr_fcast_and_error_df

In [4]:
# Load the forecast/error tables for init "12" (files named "..._12z_...").
# read_data returns the frames in GFS, NAM, HRRR order.
(
    gfs_fcast_and_error_df,
    nam_fcast_and_error_df,
    hrrr_fcast_and_error_df,
) = read_data(init="12")

In [5]:
# Turn the current index into ordinary columns (drop=False keeps its values)
# and replace it with a default 0..n-1 index.
# NOTE(review): later cells filter on a "station" column, so presumably
# "station" is an index level at this point -- confirm.
# NOTE(review): the name is rebound in place, so re-running only this cell
# resets an already-reset frame and adds an extra "index" column.
nam_fcast_and_error_df = nam_fcast_and_error_df.reset_index(drop=False)

In [6]:
# Read the station table used below for its "climate_division" and "stid" columns.
# NOTE(review): absolute, user-specific path -- this cell only runs on a
# machine where that file exists.
nysm_df = pd.read_csv(
    filepath_or_buffer="/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
)

In [7]:
# Keep only the stations whose climate_division code is 1, then collect their ids.
# NOTE(review): the list is named "hudson", which presumes code 1 is the Hudson
# Valley division in nysm.csv -- confirm against the file's coding.
nysm_df = nysm_df.loc[nysm_df["climate_division"] == 1]
hudson_ls = nysm_df["stid"].tolist()

In [8]:
# Restrict the NAM table to rows whose station id is in hudson_ls.
nam_fcast_and_error_df = nam_fcast_and_error_df.loc[
    nam_fcast_and_error_df["station"].isin(hudson_ls)
]

In [9]:
# Distinct station ids that survived the filter, in order of first appearance.
stations = nam_fcast_and_error_df.loc[:, "station"].unique()

In [10]:
# Seed the wide table with the rows of the first station. The station id, the
# "time" column, the *_error columns and the lead-time columns are removed;
# "valid_time" stays because the next cell merges on it.
master_df = nam_fcast_and_error_df.loc[
    nam_fcast_and_error_df["station"] == stations[0]
].drop(
    [
        "station",
        "time",
        "t2m_error",
        "d2m_error",
        "u_total_error",
        "u_dir_error",
        "new_tp_error",
        "prmsl_error",
        "lead_time_DAY",
        "lead_time_HOUR",
        "lead_time_ONLY_HOURS",
    ],
    axis="columns",
)

In [11]:
# Widen master_df station by station: each station's rows (minus the same
# columns dropped for the seed frame) are inner-joined on "valid_time", and the
# incoming columns that clash with existing ones get a "_<station>" suffix.
# NOTE(review): the loop also visits stations[0], which already seeds
# master_df, so that station's columns end up twice (unsuffixed and
# suffixed) -- confirm this is intended.
# NOTE(review): the join key is "valid_time" alone; if a station has several
# rows per valid_time, every merge multiplies the row count -- TODO confirm
# valid_time is unique per station.
# NOTE(review): master_df is rebound on every pass, so re-running this cell
# without re-running the seed cell merges every station a second time.
for station in stations:
    df = nam_fcast_and_error_df.loc[
        nam_fcast_and_error_df["station"] == station
    ].drop(
        [
            "station",
            "time",
            "t2m_error",
            "d2m_error",
            "u_total_error",
            "u_dir_error",
            "new_tp_error",
            "prmsl_error",
            "lead_time_DAY",
            "lead_time_HOUR",
            "lead_time_ONLY_HOURS",
        ],
        axis="columns",
    )
    master_df = pd.merge(
        master_df, df, on="valid_time", suffixes=(None, f"_{station}")
    )

: 

: 

In [None]:
# Display the column labels of the NAM table.
nam_fcast_and_error_df.columns

Index(['station', 'valid_time', 'time', 't2m_NAM', 'd2m_NAM', 'u_total_NAM',
       'u_dir_NAM', 'latitude', 'longitude', 'new_tp_NAM', 'prmsl_NAM', 'orog',
       't2m_error', 'd2m_error', 'u_total_error', 'u_dir_error',
       'new_tp_error', 'prmsl_error', 't2m_nysm', 'd2m_nysm', 'u_total_nysm',
       'u_dir_nysm', 'new_tp_nysm', 'prmsl_nysm', 'lead_time_DAY',
       'lead_time_HOUR', 'lead_time_ONLY_HOURS'],
      dtype='object')