In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

sys.path.append("..")
import pandas as pd 
import numpy as np 
from src.processing import hrrr_data
from src.processing import nysm_data
from src.processing import get_error
from src.processing import normalize
from src.processing import get_flag
import pandas as pd

In [3]:
# Read the NYSM station table from an absolute path under /home/aevans.
# Other cells in this notebook read its 'climate_division_name' and 'stid' columns.
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
nysm_cats_df = pd.read_csv(nysm_cats_path)

In [5]:
# List the distinct values of the 'climate_division_name' column.
nysm_cats_df['climate_division_name'].unique()

array(['Western Plateau', 'Eastern Plateau', 'Great Lakes',
       'Hudson Valley', 'Coastal', 'Central Lakes', 'Mohawk Valley',
       'Champlain Valley', 'Northern Plateau', 'St. Lawrence Valley'],
      dtype=object)

In [7]:
# Load a saved parquet file from the 20240325 / 'Hudson Valley' output folder.
# Presumably this is transformer model output (file is named ml_output) — confirm.
df = pd.read_parquet('/home/aevans/transformer_ml/src/data/temp_df/20240325/Hudson Valley/ml_output.parquet')

In [10]:
# Show the last 20 rows (by position) of the loaded frame.
df.iloc[-20:]

Unnamed: 0,0_transformer_output,0_target,1_transformer_output,1_target,2_transformer_output,2_target
7788,0.068059,,-0.222741,,0.051457,
7789,-0.3034,,-0.481618,,-0.43756,
7790,-0.41139,,-0.476483,,-0.263312,
7791,0.223385,,0.034735,,0.041536,
7792,0.615609,,0.113829,,0.639374,
7793,-0.215738,,-0.357341,,-0.16534,
7794,0.410201,,0.039925,,0.083371,
7795,-0.48018,,-0.439321,,-0.187975,
7796,-0.254938,,-0.323327,,-0.142254,
7797,-0.249045,,-0.414813,,-0.258309,


In [None]:
def columns_drop(df):
    """
    Strip the leftover merge columns from a merged HRRR/NYSM DataFrame.

    The columns "level_0", "index_x", "index_y", "lead time", "lsm",
    "station_y", "lat" and "lon" are removed, and "station_x" is renamed to
    "station". The input frame itself is not modified.

    Args:
        df (pandas DataFrame): Frame containing every column listed above;
            a missing one makes ``DataFrame.drop`` raise ``KeyError``.

    Returns:
        pandas DataFrame: A new frame without the listed columns.
    """
    unwanted = [
        "level_0",
        "index_x",
        "index_y",
        "lead time",
        "lsm",
        "station_y",
        'lat',
        'lon',
    ]
    trimmed = df.drop(labels=unwanted, axis=1)
    return trimmed.rename(columns={'station_x': 'station'})

In [None]:
def create_data_for_model(
    climate_division="Western Plateau",
    train_fraction=0.8,
    nysm_cats_path="/home/aevans/nwp_bias/src/landtype/data/nysm.csv",
):
    """
    This function creates and processes data for a LSTM machine learning model.

    HRRR and NYSM data are loaded, restricted to the stations of one NY
    climate division, merged on ``valid_time`` one station at a time, passed
    through the error / encoding / flag / normalization helpers, and split
    into leading (train) and trailing (test) rows per station. The per-station
    pieces are then stacked, so every station of the division is represented
    in the result.

    Args:
        climate_division (str): Value of ``climate_division_name`` in the
            station table that selects the stations to process.
        train_fraction (float): Fraction of each station's rows, taken from
            the start of its frame, that goes to the training set. Must be
            strictly between 0 and 1.
        nysm_cats_path (str): Path of the CSV station table; it must provide
            the ``climate_division_name`` and ``stid`` columns.

    Returns:
        df_train (pandas DataFrame): Training rows of all stations, stacked
            station after station with a fresh 0..n-1 index.
        df_test (pandas DataFrame): Testing rows of all stations, stacked the
            same way.
        features (list): Feature names as returned by
            ``normalize.normalize_df`` for the last station processed.

    Raises:
        ValueError: If ``train_fraction`` is outside (0, 1), if no station
            belongs to ``climate_division``, or if no station yields any
            merged rows.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(
            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction!r}"
        )

    # load nysm data (chained, so the object returned by the loader is not mutated)
    nysm_df = (
        nysm_data.load_nysm_data()
        .reset_index()
        .rename(columns={"time_1H": "valid_time"})
    )

    # load hrrr data
    hrrr_df = hrrr_data.read_hrrr_data()

    # Keep only the NYSM rows whose valid_time also occurs in the HRRR data.
    mytimes = hrrr_df["valid_time"].tolist()
    nysm_df = nysm_df[nysm_df["valid_time"].isin(mytimes)]

    # Filter data by NY climate division
    nysm_cats_df = pd.read_csv(nysm_cats_path)
    in_division = nysm_cats_df["climate_division_name"] == climate_division
    stations = nysm_cats_df.loc[in_division, "stid"].tolist()
    if not stations:
        raise ValueError(f"no stations found for climate division {climate_division!r}")
    nysm_df = nysm_df[nysm_df["station"].isin(stations)]
    hrrr_df = hrrr_df[hrrr_df["station"].isin(stations)]

    # Columns that must not be normalized; they are removed before
    # normalization and re-attached to the split frames afterwards.
    cols_to_carry = ['valid_time', 'station', 'latitude', 'longitude', 'flag']

    train_frames = []
    test_frames = []
    features = None

    # merge dataframes so that each row is hrrr + nysm data for the same time step
    # do this for each station individually
    for station in stations:
        nysm_df1 = nysm_df[nysm_df['station'] == station]
        hrrr_df1 = hrrr_df[hrrr_df['station'] == station]

        master_df = hrrr_df1.merge(nysm_df1, on="valid_time")
        master_df = master_df.drop_duplicates(
            subset=["valid_time", "t2m"], keep="first"
        )
        if master_df.empty:
            # Station has no overlapping HRRR/NYSM rows; nothing to contribute.
            continue
        master_df = columns_drop(master_df)

        # Calculate the error using NWP data.
        master_df = get_error.nwp_error("t2m", master_df)
        # encode for day_of_year
        master_df = normalize.encode(master_df, 'day_of_year', 366)
        # get flag for non-consecutive time steps
        master_df = get_flag.get_flag(master_df)

        new_df = master_df.drop(columns=cols_to_carry)
        new_df, features = normalize.normalize_df(new_df)
        if len(new_df) == 0:
            continue

        # Split the data into training and testing sets.
        split_at = int(len(new_df) * train_fraction)
        station_train = new_df.iloc[:split_at].copy()
        station_test = new_df.iloc[split_at:].copy()
        print("Test Set Fraction", len(station_test) / len(new_df))

        # Reintegrate the specified columns back into the training and testing DataFrames.
        # Column assignment aligns on the index.
        # NOTE(review): this assumes normalize_df keeps master_df's index labels — confirm.
        for c in cols_to_carry:
            station_train[c] = master_df[c]
            station_test[c] = master_df[c]

        # Collect this station's pieces instead of overwriting the previous ones.
        train_frames.append(station_train)
        test_frames.append(station_test)

    if not train_frames:
        raise ValueError(
            f"no merged HRRR/NYSM rows for climate division {climate_division!r}"
        )

    # Rows stay grouped by station; use the carried 'station' column to keep
    # model sequences from spanning two stations.
    df_train = pd.concat(train_frames, ignore_index=True)
    df_test = pd.concat(test_frames, ignore_index=True)

    return df_train, df_test, features

In [None]:
# df_train, df_test, features = create_data_for_model()

In [None]:
# Load the NYSM data, move its index into regular columns and rename the
# 'time_1H' column to 'valid_time'. Built as one chain with no in-place
# mutation, so the object returned by the loader is left untouched.
nysm_df = (
    nysm_data.load_nysm_data()
    .reset_index()
    .rename(columns={"time_1H": "valid_time"})
)
nysm_df.head()

In [None]:
# Load the HRRR data through the project loader and preview the first rows.
hrrr_df = hrrr_data.read_hrrr_data()
hrrr_df.head()

In [None]:
# Re-read the station table and keep only the rows whose
# climate_division_name equals 'Western Plateau'; the filtered table is the
# cell's output.
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
nysm_cats_df = pd.read_csv(nysm_cats_path).loc[
    lambda table: table['climate_division_name'] == 'Western Plateau'
]
nysm_cats_df

In [None]:
# Station identifiers ('stid') of the rows currently in nysm_cats_df, as a plain list.
stations = nysm_cats_df["stid"].tolist()

In [None]:
# Narrow both frames to the rows whose 'station' appears in `stations`.
nysm_df, hrrr_df = (
    frame.loc[frame['station'].isin(stations)] for frame in (nysm_df, hrrr_df)
)

In [None]:
# Preview the station-filtered NYSM frame.
nysm_df.head()

In [None]:
# Preview the station-filtered HRRR frame.
hrrr_df.head()

In [None]:
# Build a master list of valid_times shared by every station, so that all the
# per-station dataframes can be brought to the same shape.
# Start from every HRRR valid_time (all stations, duplicates included).
master_time = hrrr_df['valid_time'].tolist()
for station in stations:
    hrrr_dft = hrrr_df[hrrr_df["station"] == station]
    nysm_dft = nysm_df[nysm_df["station"] == station]
    times = hrrr_dft['valid_time'].tolist()
    times2 = nysm_dft['valid_time'].tolist()
    # Keep only the times found in this station's HRRR rows, in this station's
    # NYSM rows, and in what survived the previous stations. Going through
    # sets removes duplicates and leaves the list in no particular order.
    result = list(set(times) & set(master_time) & set(times2))
    master_time = result

# Second name for the same list object (no copy is made).
# NOTE(review): the merge cells shown in this notebook do not filter on this list — confirm whether that is intended.
master_time_final = master_time

In [None]:
# Sorted copy of the shared valid_times; master_time_final itself stays unsorted.
sort_me1 = (sorted(master_time_final))

In [None]:
# Display the sorted valid_times.
sort_me1

In [None]:
# Print how many valid_times are shared, then show them sorted as the cell output.
print(len(master_time_final))
sorted(master_time_final)

In [None]:
# Same check on master_time. master_time_final was assigned from it without a
# copy, so both names give the same result when the cells are run in order.
print(len(master_time))
sorted(master_time)

In [None]:
# Merge HRRR and NYSM rows on valid_time, one station at a time. Every
# station's merged frame is stored in master_dfs (keyed by station id), so no
# station's merge is thrown away when the loop moves on.
if not stations:
    raise ValueError("stations is empty; there is nothing to merge")

master_dfs = {}
for station in stations:
    nysm_df1 = nysm_df[nysm_df['station']==station]
    hrrr_df1 = hrrr_df[hrrr_df['station']==station]

    # Rows sharing both valid_time and t2m are treated as duplicates; the first is kept.
    master_dfs[station] = hrrr_df1.merge(nysm_df1, on="valid_time").drop_duplicates(
        subset=["valid_time", "t2m"], keep="first"
    )

# The following cells work on a single frame called master_df: bind it to the
# last station in the list, which is the frame a plain overwrite-per-pass loop
# would leave behind.
master_df = master_dfs[stations[-1]]

In [None]:
# Remove the leftover merge columns and rename 'station_x' to 'station' (see columns_drop).
# NOTE(review): columns_drop calls DataFrame.drop without errors="ignore", so
# re-running this cell on the already-trimmed frame is expected to raise KeyError.
master_df = columns_drop(master_df)

In [None]:
# Preview the merged frame after the column clean-up.
master_df.head()

In [None]:
# Calculate the error using NWP data, for the "t2m" variable.
# Presumably this adds the error column(s) to master_df — confirm in src.processing.get_error.
master_df = get_error.nwp_error("t2m", master_df)

In [None]:
# Encode 'day_of_year' with normalize.encode, passing 366 as the maximum value.
# Presumably a cyclical (sin/cos) encoding — TODO confirm in src.processing.normalize.
master_df = normalize.encode(master_df, 'day_of_year', 366)

In [None]:
# Preview the frame after the error calculation and day_of_year encoding.
master_df.head()

In [None]:
# Get the flag for non-consecutive time steps.
# Presumably this adds the 'flag' column that is carried separately below — confirm in src.processing.get_flag.
master_df = get_flag.get_flag(master_df)

In [None]:
# Display the flagged frame for inspection.
master_df

In [None]:
# Columns taken out of the frame before normalization and re-attached to the
# train/test frames after the split.
cols_to_carry = ['valid_time', 'station', 'latitude', 'longitude', 'flag']

In [None]:
# Frame handed to normalization: master_df without the carried columns.
new_df = master_df.drop(columns=cols_to_carry)

In [None]:
# Normalize the frame; normalize_df returns the processed frame and a second
# value used here as the list of feature names.
new_df, features = normalize.normalize_df(new_df)

In [None]:
# Preview the normalized frame.
new_df.head()

In [None]:
# Show the feature names returned by normalize_df.
features

In [None]:
# Cut new_df by row position: the first 80% of rows go to training, the rest
# to testing. Note that test_len holds the number of *training* rows, i.e. the
# position where the test rows begin.
length = new_df.shape[0]
test_len = int(length * 0.8)
df_train, df_test = (
    new_df.iloc[:test_len].copy(),
    new_df.iloc[test_len:].copy(),
)
print("Test Set Fraction", df_test.shape[0] / new_df.shape[0])

In [None]:
# Reintegrate the specified columns back into the training and testing DataFrames.
# Assigning a Series to a column aligns on the index, so each split frame picks
# up the master_df values that carry its own index labels.
# NOTE(review): this assumes normalize_df kept master_df's index labels; if it
# reset or changed the index the carried columns will be misaligned or NaN — confirm.
for c in cols_to_carry:
    df_train[c] = master_df[c]
    df_test[c] = master_df[c]

In [None]:
# Preview the training frame with the carried columns re-attached.
df_train.head()

In [None]:
# Preview the testing frame with the carried columns re-attached.
df_test.head()