In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import sys

sys.path.append("..")

import pandas as pd
import numpy as np
from datetime import datetime
import statistics as st

from src.processing import col_drop
from src.processing import get_flag
from src.processing import encode
from src.processing import normalize
from src.processing import get_error
from src.processing import get_closest_nysm_stations

from src.data import hrrr_data
from src.data import nysm_data

In [22]:
def columns_drop_hrrr(df):
    """Return a copy of ``df`` without the bookkeeping/coordinate columns.

    Every listed column must be present in ``df``; ``DataFrame.drop`` raises
    ``KeyError`` otherwise. Any other column passes through unchanged.

    Args:
        df (pandas DataFrame): Frame containing the columns listed below.

    Returns:
        pandas DataFrame: A new frame with those columns removed.
    """
    unwanted = [
        "level_0",
        "index",
        "lead time",
        "lsm",
        "latitude",
        "longitude",
        "time",
    ]
    # Optional extra drops, currently disabled: sh2, d2m, r2, u10, v10, tp,
    # mslma, orog, tcc, asnow, cape, dswrf, dlwrf, gh, u_total, u_dir, new_tp.
    trimmed = df.drop(columns=unwanted)
    return trimmed


def columns_drop_nysm(df):
    """Return a copy of ``df`` without the listed NYSM observation columns.

    Every listed column must be present in ``df``; ``DataFrame.drop`` raises
    ``KeyError`` otherwise. Any other column passes through unchanged.

    Args:
        df (pandas DataFrame): Frame containing the columns listed below.

    Returns:
        pandas DataFrame: A new frame with those columns removed.
    """
    unwanted = (
        "ta9m",
        "td",
        "relh",
        "srad",
        "pres",
        "mslp",
        "wspd_sonic_mean",
        "wspd_sonic",
        "wmax_sonic",
        "wdir_sonic",
        "snow_depth",
        "precip_total",
    )
    return df.drop(columns=list(unwanted))


def add_suffix(master_df, station):
    """Append ``_<station>`` to every column name except the shared time keys.

    The columns ``valid_time`` and ``time`` keep their names so frames from
    different stations can still be joined on them.

    Args:
        master_df (pandas DataFrame): Frame whose columns are renamed.
        station: Value interpolated into the suffix.

    Returns:
        pandas DataFrame: A renamed copy; ``master_df`` itself is untouched.
    """
    shared_keys = ("valid_time", "time")
    renames = {}
    for name in master_df.columns:
        if name in shared_keys:
            continue
        renames[name] = name + f"_{station}"
    return master_df.rename(columns=renames)


def dataframe_wrapper(stations, df):
    """Reshape a long per-station frame into one wide frame keyed on ``valid_time``.

    Rows are selected per station via the ``station`` column, the columns of
    each selection are tagged with ``add_suffix``, and the selections are
    merged left-to-right on ``valid_time`` (pandas' default inner join).

    Args:
        stations (list): Station identifiers; the first one seeds the result.
        df (pandas DataFrame): Frame with ``station`` and ``valid_time`` columns.

    Returns:
        pandas DataFrame: One row per ``valid_time`` common to all stations.
    """
    # Build every station's suffixed slice first, remembering its identifier
    # because the merge suffix depends on it.
    tagged = [(name, add_suffix(df[df["station"] == name], name)) for name in stations]

    wide = tagged[0][1]
    for name, station_frame in tagged[1:]:
        # Any column name that still collides takes the right-hand station's suffix.
        wide = wide.merge(station_frame, on="valid_time", suffixes=(None, f"_{name}"))
    return wide


def which_fold(df, fold):
    """Split ``df`` into five contiguous blocks and hold one of them out.

    Block ``n`` covers rows ``int(0.2 * n * len(df))`` up to (but excluding)
    ``int(0.2 * (n + 1) * len(df))``. Row order is preserved, so the split is
    chronological whenever ``df`` is sorted by time.

    Args:
        df (pandas DataFrame): Frame to split.
        fold (int): Index (0-4) of the block returned as the test set.

    Returns:
        tuple: ``(df_train, df_test)`` where ``df_test`` is block ``fold`` and
        ``df_train`` is the other four blocks concatenated in original order.

    Raises:
        ValueError: If ``fold`` is not one of 0, 1, 2, 3, 4. Previously this
            case fell through and failed with ``UnboundLocalError``.
    """
    n_folds = 5
    if fold not in range(n_folds):
        raise ValueError(f"fold must be one of 0..{n_folds - 1}, got {fold!r}")

    length = len(df)
    # Keep the exact boundary arithmetic so existing splits are reproduced.
    bounds = [int(0.2 * n * length) for n in range(n_folds + 1)]

    train_blocks = []
    df_test = None
    for n in range(n_folds):
        block = df.iloc[bounds[n] : bounds[n + 1]]
        if n != fold:
            train_blocks.append(block)
        else:
            df_test = block

    # Concatenate once instead of growing a frame inside the loop.
    df_train = pd.concat(train_blocks)

    return df_train, df_test

In [23]:
def create_data_for_model(station, fh, today_date):
    """
    Build normalized train/test frames for an LSTM that targets forecast error.

    Steps, in order:
        1. Load NYSM observations and HRRR forecasts and keep only the NYSM
           rows whose ``valid_time`` occurs in the HRRR data.
        2. Keep the stations returned by
           ``get_closest_stations(nysm_df, 4, station)`` and reshape both
           sources to one row per ``valid_time`` with per-station columns.
        3. Merge the two sources and call ``get_error.nwp_error`` for ``t2m``
           (presumably this adds the ``target_error`` column used below).
        4. Encode ``valid_time`` with ``encode.encode`` and drop every column
           whose name matches ``station``.
        5. Z-score every column except ``valid_time_cos``/``valid_time_sin``.
        6. Hold out the last fifth of the rows (fold 4) as the test set and
           replace missing values with 0.

    Args:
        station (str): The station identifier for which data is being processed.
        fh (int or str): Forecast hour; zero-padded to two digits and passed to
            ``hrrr_data.read_hrrr_data``.
        today_date: Unused by this function; kept so existing callers work.

    Returns:
        df_train (pandas DataFrame): Rows used to train the model.
        df_test (pandas DataFrame): Held-out rows used to test the model.
        features (list): Names of all normalized columns except ``target_error``.
        forecast_lead (int): The lead applied to the target variable (always 0).
        stations (list): Stations returned by ``get_closest_stations``.
        target (str): Name of the target column, ``target_error_lead_0``.
    """

    # Print a message indicating the current station being processed.
    print(f"Targeting Error for {station}")

    # Load data from NYSM and HRRR sources.
    print("-- loading data from NYSM --")
    nysm_df = nysm_data.load_nysm_data()
    # Non-inplace reset so the object handed back by the loader is not mutated.
    nysm_df = nysm_df.reset_index()
    # NOTE(review): the message says NAM but the loader below is hrrr_data —
    # confirm the intended wording.
    print("-- loading data from NAM --")
    # NOTE(review): the value is logged with 3-digit padding but passed to the
    # loader with 2-digit padding — confirm which one the loader expects.
    print(str(fh).zfill(3))
    nam_df = hrrr_data.read_hrrr_data(str(fh).zfill(2))

    # Rename columns for consistency.
    nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})

    # Filter NYSM data to match valid times from HRRR data
    mytimes = nam_df["valid_time"].tolist()
    nysm_df = nysm_df[nysm_df["valid_time"].isin(mytimes)]

    # Restrict both sources to the selected stations.
    stations = get_closest_nysm_stations.get_closest_stations(nysm_df, 4, station)
    nam_df1 = nam_df[nam_df["station"].isin(stations)]
    nysm_df1 = nysm_df[nysm_df["station"].isin(stations)]

    # format for LSTM: one row per valid_time, one column per (variable, station)
    nam_df1 = columns_drop_hrrr(nam_df1)
    master_df = dataframe_wrapper(stations, nam_df1)

    nysm_df1 = nysm_df1.drop(columns=["index"])
    master_df2 = dataframe_wrapper(stations, nysm_df1)

    # combine HRRR + NYSM data on time
    master_df = master_df.merge(master_df2, on="valid_time", suffixes=(None, "_xab"))

    # Calculate the error using NWP data.
    the_df = get_error.nwp_error("t2m", station, master_df)
    # encode day of year to be cyclic
    the_df = encode.encode(the_df, "valid_time", 366)
    # drop every column whose name matches "station"
    the_df = the_df[the_df.columns.drop(list(the_df.filter(regex="station")))]
    now = datetime.now()
    print("now =", now)

    new_df = the_df.drop(columns="valid_time")

    # normalize data (z-score); the cyclic time encodings are left as they are.
    # Constant columns have zero standard deviation, so they become NaN here
    # and are zero-filled further down.
    # NOTE(review): the statistics are computed over all rows, i.e. before the
    # train/test split — confirm that is intended.
    cols = ["valid_time_cos", "valid_time_sin"]
    # Snapshot the column names so columns are not reassigned while iterating
    # the frame itself.
    for k in list(new_df.columns):
        if k in cols:
            continue
        means = st.mean(new_df[k])
        stdevs = st.pstdev(new_df[k])
        new_df[k] = (new_df[k] - means) / stdevs

    features = [c for c in new_df.columns if c != "target_error"]
    lstm_df = new_df.copy()
    target_sensor = "target_error"
    # With a lead of 0 the target is the unshifted error. A non-zero lead would
    # need the target shifted by -forecast_lead and the trailing rows trimmed.
    forecast_lead = 0
    target = f"{target_sensor}_lead_{forecast_lead}"
    lstm_df.insert(loc=(0), column=target, value=lstm_df[target_sensor])
    lstm_df = lstm_df.drop(columns=[target_sensor])

    # Split the data into training and testing sets (fold 4 = last fifth).
    lstm_df["valid_time"] = the_df["valid_time"]
    df_train, df_test = which_fold(lstm_df, 4)

    print("Test Set Fraction", len(df_test) / len(lstm_df))

    # Fill missing values with zeros in the training and testing DataFrames.
    df_train = df_train.fillna(0)
    df_test = df_test.fillna(0)

    # Print a message indicating that data processing is complete.
    print("Data Processed")
    print("--init model LSTM--")

    return df_train, df_test, features, forecast_lead, stations, target

In [24]:
# Build the train/test frames for station VOOR at forecast hour 2.
# today_date is a placeholder: create_data_for_model never reads it.
df_train, df_test, features, forecast_lead, stations, target = create_data_for_model(
    station="VOOR", fh=2, today_date="test"
)

Targeting Error for VOOR
-- loading data from NYSM --
-- loading data from NAM --
002
now = 2024-03-28 15:39:41.878129
Test Set Fraction 0.200004279997432
Data Processed
--init model LSTM--


In [25]:
features

['valid_time_cos',
 'valid_time_sin',
 't2m_VOOR',
 'sh2_VOOR',
 'd2m_VOOR',
 'r2_VOOR',
 'u10_VOOR',
 'v10_VOOR',
 'tp_VOOR',
 'mslma_VOOR',
 'orog_VOOR',
 'tcc_VOOR',
 'asnow_VOOR',
 'cape_VOOR',
 'dswrf_VOOR',
 'dlwrf_VOOR',
 'gh_VOOR',
 'u_total_VOOR',
 'u_dir_VOOR',
 'new_tp_VOOR',
 't2m_DUAN',
 'sh2_DUAN',
 'd2m_DUAN',
 'r2_DUAN',
 'u10_DUAN',
 'v10_DUAN',
 'tp_DUAN',
 'mslma_DUAN',
 'orog_DUAN',
 'tcc_DUAN',
 'asnow_DUAN',
 'cape_DUAN',
 'dswrf_DUAN',
 'dlwrf_DUAN',
 'gh_DUAN',
 'u_total_DUAN',
 'u_dir_DUAN',
 'new_tp_DUAN',
 't2m_MEDU',
 'sh2_MEDU',
 'd2m_MEDU',
 'r2_MEDU',
 'u10_MEDU',
 'v10_MEDU',
 'tp_MEDU',
 'mslma_MEDU',
 'orog_MEDU',
 'tcc_MEDU',
 'asnow_MEDU',
 'cape_MEDU',
 'dswrf_MEDU',
 'dlwrf_MEDU',
 'gh_MEDU',
 'u_total_MEDU',
 'u_dir_MEDU',
 'new_tp_MEDU',
 't2m_SCHO',
 'sh2_SCHO',
 'd2m_SCHO',
 'r2_SCHO',
 'u10_SCHO',
 'v10_SCHO',
 'tp_SCHO',
 'mslma_SCHO',
 'orog_SCHO',
 'tcc_SCHO',
 'asnow_SCHO',
 'cape_SCHO',
 'dswrf_SCHO',
 'dlwrf_SCHO',
 'gh_SCHO',
 'u_total_

In [19]:
df_train

Unnamed: 0,target_error_lead_0,valid_time_cos,valid_time_sin,t2m_VOOR,t2m_DUAN,t2m_MEDU,t2m_SCHO,lat_VOOR,lon_VOOR,elev_VOOR,...,tair_DUAN,lat_MEDU,lon_MEDU,elev_MEDU,tair_MEDU,lat_SCHO,lon_SCHO,elev_SCHO,tair_SCHO,valid_time
0,-1.372526,0.999853,0.017166,-2.670681,-2.712649,-2.649345,-2.655111,0.0,0.0,0.0,...,-2.722404,0.0,0.0,0.0,-2.549112,0.0,0.0,0.0,-2.603523,2018-01-01 02:00:00
1,-1.068621,0.999853,0.017166,-2.681138,-2.775507,-2.677342,-2.689406,0.0,0.0,0.0,...,-2.793514,0.0,0.0,0.0,-2.550531,0.0,0.0,0.0,-2.669600,2018-01-01 03:00:00
2,-0.764647,0.999853,0.017166,-2.714232,-2.808737,-2.693143,-2.752691,0.0,0.0,0.0,...,-2.886301,0.0,0.0,0.0,-2.662106,0.0,0.0,0.0,-2.748828,2018-01-01 04:00:00
3,-0.499103,0.999853,0.017166,-2.726550,-2.821105,-2.693877,-2.818422,0.0,0.0,0.0,...,-2.947565,0.0,0.0,0.0,-2.767908,0.0,0.0,0.0,-2.848726,2018-01-01 05:00:00
4,-0.431764,0.999853,0.017166,-2.750535,-2.839370,-2.735478,-2.878326,0.0,0.0,0.0,...,-2.957018,0.0,0.0,0.0,-2.783774,0.0,0.0,0.0,-2.907863,2018-01-01 06:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37378,-0.305632,0.111355,-0.993781,0.030852,-0.093158,0.009248,-0.000848,0.0,0.0,0.0,...,0.011174,0.0,0.0,0.0,0.085830,0.0,0.0,0.0,-0.004034,2022-10-08 18:00:00
37379,-0.352870,0.111355,-0.993781,0.040669,-0.083302,0.007467,0.015070,0.0,0.0,0.0,...,-0.109234,0.0,0.0,0.0,0.059059,0.0,0.0,0.0,0.095076,2022-10-08 19:00:00
37380,0.099297,0.111355,-0.993781,0.048593,-0.075345,-0.013697,0.011334,0.0,0.0,0.0,...,-0.059740,0.0,0.0,0.0,0.056424,0.0,0.0,0.0,0.120904,2022-10-08 20:00:00
37381,0.166858,0.111355,-0.993781,0.015804,-0.131538,-0.058325,-0.004376,0.0,0.0,0.0,...,-0.140668,0.0,0.0,0.0,0.012357,0.0,0.0,0.0,0.060931,2022-10-08 21:00:00


In [None]:
# Feature columns of the training frame as a plain NumPy array.
x = df_train[features].values

In [None]:
# Print the feature array as a quick visual sanity check.
print(x)

In [None]:
# Scratch cell: load a single NAM parquet file for inspection.
# NOTE(review): this is an absolute, user-specific path and will not resolve
# on another machine — consider deriving it from a configurable data directory.
df = pd.read_parquet(
    "/home/aevans/nwp_bias/src/machine_learning/data/nam_data/fh001/NAM_2022_01_direct_compare_to_nysm_sites_mask_water.parquet"
)
df

In [None]:
# List every column name of the loaded frame, one per line.
for k in df.keys():
    print(k)