In [1]:
import pandas as pd
import numpy as np
from scipy.interpolate import griddata
import matplotlib.pyplot as plt
import geopandas as gpd
import shapely.geometry
from geopandas import GeoDataFrame
from shapely.geometry import Point
import os
import re
import datetime



In [2]:
# Load a single whitespace-delimited text file from the error_dt/2018/01 directory
# into a NumPy array for a quick inspection (np.loadtxt defaults: float dtype).
# NOTE(review): the path is absolute and machine-specific; the file name suggests a
# 2018-01-01 03:00:00 timestamp -- confirm the naming convention.
arr = np.loadtxt(
    "/home/aevans/nwp_bias/src/machine_learning/frankenstein/data/error_dt/2018/01/01012018_03:00:00.txt"
)

In [3]:
# Display the dimensions of the array loaded above as the cell output.
arr.shape

(7443, 23)

In [None]:
def read_hrrr_data(year, savedir="/home/aevans/ai2es/processed_data/HRRR/ny/"):
    """
    Read and concatenate the monthly HRRR parquet files for a single year.

    For every month 01-12 the file
    ``HRRR_{year}_{MM}_direct_compare_to_nysm_sites_mask_water.parquet`` is looked
    up in ``savedir``. Months whose file does not exist are skipped.

    Args:
        year: Year used to build the file names (e.g. 2018).
        savedir: Directory that holds the monthly parquet files. Defaults to the
            location the notebook has always read from.

    Returns:
        pandas.DataFrame: the monthly frames concatenated in month order, with the
        index moved into columns (``reset_index``) and every row that contains a
        missing value removed (``dropna``).

    Raises:
        ValueError: if no monthly file for ``year`` exists in ``savedir``.
    """
    monthly_frames = []

    # loop over the twelve months and read whichever monthly files are present
    for month in range(1, 13):
        path = os.path.join(
            savedir,
            f"HRRR_{year}_{month:02d}_direct_compare_to_nysm_sites_mask_water.parquet",
        )
        if os.path.exists(path):
            monthly_frames.append(pd.read_parquet(path))

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say what is
    # actually wrong instead.
    if not monthly_frames:
        raise ValueError(
            f"No HRRR parquet files found for year {year} in {savedir}"
        )

    return pd.concat(monthly_frames).reset_index().dropna()

In [None]:
def load_nysm_data(year):
    """
    Load the NYSM observation parquet file for one year.

    Reads ``nysm_1H_obs_{year}.parquet`` from the NYSM data directory and moves
    the stored index into regular columns.

    Args:
        year: Year used to build the file name (e.g. 2018).

    Returns:
        pandas.DataFrame: the file contents with a fresh default integer index.
    """
    # these parquet files are created by running "get_resampled_nysm_data.ipynb"
    nysm_path = "/home/aevans/nwp_bias/data/nysm/"
    # Only one file is read, so there is nothing to concatenate; wrapping the
    # frame in a list and calling pd.concat would just copy it.
    return pd.read_parquet(f"{nysm_path}nysm_1H_obs_{year}.parquet").reset_index()

In [None]:
def add_suffix(df, stations):
    """
    Tag column names with the first entry of ``stations``.

    Every column except "valid_time" and "time" is renamed to
    ``<column>_<stations[0]>``. A renamed frame is returned; ``df`` itself is
    left as it was.
    """
    keep_as_is = ("valid_time", "time")
    mapping = {}
    for name in df.columns:
        if name in keep_as_is:
            continue
        mapping[name] = name + f"_{stations[0]}"
    return df.rename(columns=mapping)


def columns_drop(df):
    """
    Return ``df`` without six fixed columns.

    The columns removed are "level_0", "index", "lead time", "lsm",
    "index_nysm" and "station_nysm". All of them must be present in ``df``.
    """
    unwanted = ["level_0", "index", "lead time", "lsm", "index_nysm", "station_nysm"]
    return df.drop(columns=unwanted)


def columns_drop_v2(df):
    """
    Strip the un-suffixed variable columns and anything mentioning "new_tp".

    First removes a fixed list of column names (all of which must exist in
    ``df``), then removes every remaining column whose name matches the regex
    "new_tp". Returns the reduced frame.
    """
    unsuffixed = [
        "station", "latitude", "longitude",
        "t2m", "sh2", "d2m", "r2", "u10", "v10", "tp", "mslma", "orog", "tcc",
        "asnow", "cape", "dswrf", "dlwrf", "gh", "u_total", "u_dir", "new_tp",
        "lat", "lon", "elev",
        "tair", "ta9m", "td", "relh", "srad", "pres", "mslp",
        "wspd_sonic", "wmax_sonic", "wdir_sonic", "precip_total", "snow_depth",
        "target_error",
    ]
    trimmed = df.drop(columns=unsuffixed)

    # second pass: any surviving column whose name contains "new_tp"
    new_tp_like = trimmed.filter(regex="new_tp").columns.tolist()
    remaining = trimmed.columns.drop(new_tp_like)
    return trimmed[remaining]

In [None]:
def nwp_error(target, station, df):
    """
    Add a ``target_error`` column: forecast variable minus its observed counterpart.

    The observed column is looked up from the forecast column name:
    "t2m" -> "tair" and "mslma" -> "pres".

    Args:
        target: Name of the forecast column; must be "t2m" or "mslma".
        station: Not used by the computation; kept so existing calls keep working.
        df: Frame containing both the ``target`` column and the mapped
            observation column.

    Returns:
        pandas.DataFrame: a new frame equal to ``df`` plus ``target_error``
        (``df[target] - df[observed]``). ``df`` itself is not modified.

    Raises:
        KeyError: if ``target`` has no mapped observation column, or if a needed
            column is missing from ``df``.
    """
    vars_dict = {
        "t2m": "tair",
        "mslma": "pres",
    }
    if target not in vars_dict:
        # Without this check the lookup yields None and the failure surfaces as a
        # misleading KeyError: 'None'.
        raise KeyError(
            f"unsupported target {target!r}; expected one of {sorted(vars_dict)}"
        )
    nysm_var = vars_dict[target]

    # assign() builds a new frame, so a filtered slice passed in by the caller is
    # never written to (no SettingWithCopyWarning, no hidden mutation).
    return df.assign(target_error=df[target] - df[nysm_var])


def create_data(year):
    """
    Build one wide frame of HRRR forecasts, NYSM observations and t2m error.

    Steps:
      1. Load the NYSM and HRRR frames for ``year`` and drop rows with missing
         values.
      2. Pair each HRRR row with the NYSM row of the same ``valid_time`` AND the
         same station.
      3. Split the paired frame per station, compute ``target_error`` for "t2m",
         and merge the per-station frames side by side on ``valid_time`` so each
         station's columns carry a ``_<station>`` suffix.

    Args:
        year: Year passed to ``load_nysm_data`` and ``read_hrrr_data``.

    Returns:
        pandas.DataFrame: the wide frame keyed on ``valid_time``. Because the
        per-station merges are inner joins, only timestamps present for every
        station are kept.

    Raises:
        ValueError: if no HRRR row can be paired with a NYSM row.
    """
    print("-- loading data from nysm --")
    # read in hrrr and nysm data
    nysm_df = load_nysm_data(year).reset_index().dropna()
    print("-- loading data from hrrr --")
    hrrr_df = read_hrrr_data(year).dropna()

    # Rename the NYSM station column up front so the merge keeps it as
    # "station_nysm", the name columns_drop() expects to remove.
    nysm_df = nysm_df.rename(
        columns={"time_1H": "valid_time", "station": "station_nysm"}
    )
    nysm_df = nysm_df[nysm_df["valid_time"].isin(hrrr_df["valid_time"].unique())]

    # Join on time AND station. Joining on valid_time alone pairs every HRRR
    # station with every NYSM station at that time, and the drop_duplicates below
    # would then keep an arbitrary NYSM station's observations for each HRRR row.
    master_df = hrrr_df.merge(
        nysm_df,
        left_on=["valid_time", "station"],
        right_on=["valid_time", "station_nysm"],
        suffixes=(None, "_nysm"),
    )
    master_df = master_df.drop_duplicates(
        subset=["valid_time", "station", "t2m"], keep="first"
    )
    print("-- finalizing dataframe --")
    df = columns_drop(master_df)

    # Only stations that actually survived the pairing can contribute columns.
    sorted_stations = sorted(df["station"].unique())
    if not sorted_stations:
        raise ValueError(
            f"no station has both HRRR and NYSM data at common times for {year}"
        )

    # Seed the wide frame with the first station, its columns suffixed explicitly.
    first_station = sorted_stations[0]
    master_df = nwp_error(
        "t2m", first_station, df[df["station"] == first_station].copy()
    )
    master_df = add_suffix(master_df, sorted_stations)

    # The loop deliberately starts at the first station again: that merge adds an
    # un-suffixed copy of the variable columns, which is what forces every later
    # station's columns to collide and pick up their "_<station>" suffix. The
    # un-suffixed copy is removed by columns_drop_v2() afterwards.
    for station in sorted_stations:
        # .copy() so nwp_error never writes into a view of df
        station_df = nwp_error("t2m", station, df[df["station"] == station].copy())
        master_df = master_df.merge(
            station_df, on="valid_time", suffixes=(None, f"_{station}")
        )

    master_df = columns_drop_v2(master_df)
    return master_df.copy()