In [21]:
import os
import pandas as pd
import numpy as np

In [22]:
def read_hrrr_data():
    """
    Reads and concatenates parquet files containing forecast and error data for HRRR weather models
    for the years 2018 to 2022.

    Returns:
        pandas.DataFrame: of hrrr weather forecast information for each NYSM site.
    """

    years = ["2018", "2019", "2020", "2021", "2022"]
    savedir = "/home/aevans/nwp_bias/src/machine_learning/data/hrrr_data/ny/"

    # create empty lists to hold dataframes for each model
    hrrr_fcast_and_error = []

    # loop over years and read in parquet files for each model
    for year in years:
        for month in np.arange(1, 13):
            str_month = str(month).zfill(2)
            if (
                os.path.exists(
                    f"{savedir}HRRR_{year}_{str_month}_direct_compare_to_nysm_sites_mask_water.parquet"
                )
                == True
            ):
                hrrr_fcast_and_error.append(
                    pd.read_parquet(
                        f"{savedir}HRRR_{year}_{str_month}_direct_compare_to_nysm_sites_mask_water.parquet"
                    )
                )
            else:
                continue

    # concatenate dataframes for each model
    hrrr_fcast_and_error_df = pd.concat(hrrr_fcast_and_error)
    hrrr_fcast_and_error_df = hrrr_fcast_and_error_df.reset_index().dropna()

    # return dataframes for each model
    return hrrr_fcast_and_error_df

In [23]:
def load_nysm_data():
    # these parquet files are created by running "get_resampled_nysm_data.ipynb"
    nysm_path = "/home/aevans/nwp_bias/data/nysm/"

    nysm_1H = []
    for year in np.arange(2018, 2023):
        df = pd.read_parquet(f"{nysm_path}nysm_1H_obs_{year}.parquet")
        df.reset_index(inplace=True)
        nysm_1H.append(df)
    nysm_1H_obs = pd.concat(nysm_1H)
    nysm_1H_obs["snow_depth"] = nysm_1H_obs["snow_depth"].fillna(-999)
    nysm_1H_obs.dropna(inplace=True)
    return nysm_1H_obs

In [24]:
# tabular data paths
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
nysm_cats_df = pd.read_csv(nysm_cats_path)
nysm_cats_df = nysm_cats_df[nysm_cats_df["climate_division_name"] == "Mohawk Valley"]
stations = nysm_cats_df["stid"].to_list()

In [25]:
# load nysm and hrrr data
nysm_df = load_nysm_data()
nysm_df = nysm_df[nysm_df["station"].isin(stations)]
nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})
hrrr_df = read_hrrr_data()
hrrr_df = hrrr_df[hrrr_df["station"].isin(stations)]

In [26]:
hrrr_df.head()

Unnamed: 0,valid_time,time,station,level_0,index,latitude,longitude,t2m,sh2,d2m,...,asnow,cape,dswrf,dlwrf,gh,u_total,u_dir,lead time,lsm,new_tp
9291,2018-01-01 02:00:00,2018-01-01 00:00:00,CAMD,46857447,95844,43.343055,-75.756517,-21.211386,0.00042,-26.875479,...,4.2e-05,0.0,0.0,146.399994,5181.073242,1.87844,317.283875,2,1.0,0.003
9292,2018-01-01 03:00:00,2018-01-01 01:00:00,CAMD,46857448,301500,43.343055,-75.756517,-22.074194,0.00038,-28.055716,...,6e-06,0.0,0.0,146.699997,5179.269531,1.340138,290.091858,2,1.0,0.0
9293,2018-01-01 04:00:00,2018-01-01 02:00:00,CAMD,46857449,507156,43.343055,-75.756517,-21.868689,0.00039,-27.79357,...,1.8e-05,0.0,0.0,148.800003,5172.160645,1.517081,258.444275,2,1.0,0.001
9294,2018-01-01 05:00:00,2018-01-01 03:00:00,CAMD,46857450,712812,43.343055,-75.756517,-22.189062,0.00037,-28.383368,...,1.4e-05,0.0,0.0,148.199997,5173.573242,1.771017,280.230957,2,1.0,0.0
9295,2018-01-01 06:00:00,2018-01-01 04:00:00,CAMD,46857451,918468,43.343055,-75.756517,-22.572791,0.00036,-28.633658,...,2.2e-05,0.0,0.0,146.800003,5168.227539,1.457273,281.465393,2,1.0,0.001


In [27]:
nysm_df.head()

Unnamed: 0,station,valid_time,lat,lon,elev,tair,ta9m,td,relh,srad,pres,mslp,wspd_sonic,wmax_sonic,wdir_sonic,precip_total,snow_depth
14137,CAMD,2018-01-01 01:00:00,43.339851,-75.742989,150.326996,-22.911461,-21.920361,-25.015427,82.875397,0.001113,1009.560974,1012.570068,0.624323,1.593864,317.09549,0.0,0.235601
14138,CAMD,2018-01-01 02:00:00,43.339851,-75.742989,150.326996,-23.924709,-23.23881,-26.048706,82.580147,0.00045,1009.617981,1012.71283,0.377881,0.675845,15.13685,0.0,0.23575
14139,CAMD,2018-01-01 03:00:00,43.339851,-75.742989,150.326996,-25.616911,-24.5425,-28.066177,79.890846,0.000491,1009.711975,1012.951477,0.49983,0.778281,102.568001,0.0,0.237
14140,CAMD,2018-01-01 04:00:00,43.339851,-75.742989,150.326996,-26.42762,-25.60531,-29.012283,78.752899,0.000592,1009.929993,1013.240723,0.404716,1.408102,37.523071,0.0,0.232782
14141,CAMD,2018-01-01 05:00:00,43.339851,-75.742989,150.326996,-27.130831,-26.212669,-29.678207,78.906258,0.000562,1010.046997,1013.41925,0.660483,1.307877,340.557098,0.0,0.236806


In [28]:
df = pd.DataFrame()
for station in stations:
    nysm_df1 = nysm_df[nysm_df["station"] == (station)]
    hrrr_df1 = hrrr_df[hrrr_df["station"] == (station)]

    error_ls = []
    for i, _ in enumerate(nysm_df1["valid_time"]):
        t0 = nysm_df1["valid_time"].iloc[i]
        nysm_df2 = nysm_df1[nysm_df1["valid_time"] == (t0)]
        hrrr_df2 = hrrr_df1[hrrr_df1["valid_time"] == (t0)]

        error = hrrr_df2["t2m"] - nysm_df2["tair"]
        error_ls.append(error)

    df[station] = error_ls

ValueError: Length of values (43511) does not match length of index (42898)

In [None]:
df