Drop all columns that contain strings or uninformative values (e.g. the index).
Always make sure it is sorted correctly by time and station
edit time of year with cos/sin "encode"
df['day_of_year'] = df['valid_time'].dt.dayofyear
?? subtract a row when setting forecast hours ??

In [9]:
%load_ext autoreload
%autoreload 2
import sys

# instead of creating a package using setup.py or building from a docker/singularity file,
# import the sister directory of src code to be called on in notebook.
# This keeps the notebook free from code to only hold visualizations and is easier to test
# It also helps keep the state of variables clean such that cells aren't run out of order with a mysterious state
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
import os
import argparse
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from comet_ml import Experiment, Artifact
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.fully_sharded_data_parallel import (
    CPUOffload,
    BackwardPrefetch,
)
from torch.distributed.fsdp.wrap import (
    size_based_auto_wrap_policy,
    enable_wrap,
    wrap,
)

import pandas as pd
import numpy as np

from src.processing import col_drop
from src.processing import get_flag
from src.processing import encode
from src.processing import normalize
from src.processing import get_error

from src.data import hrrr_data
from src.data import nysm_data

from src.visuals import loss_curves
from comet_ml.integration.pytorch import log_model
from datetime import datetime

from sklearn.neighbors import BallTree
from sklearn import preprocessing
from sklearn import utils
from sklearn.feature_selection import mutual_info_classif as MIC

In [11]:
def columns_drop(df):
    """Return ``df`` without a fixed set of columns.

    Args:
        df (pandas.DataFrame): Frame that contains every column named below.

    Returns:
        pandas.DataFrame: A new frame; ``df`` itself is not modified.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    unwanted = ["level_0", "index", "lead time", "lsm", "index_nysm", "station_nysm"]
    return df.drop(columns=unwanted)

In [12]:
def add_suffix(df, stations):
    """Tag every column except the time columns with the first station id.

    Args:
        df (pandas.DataFrame): Frame whose columns are renamed.
        stations (sequence): Station ids; only ``stations[0]`` is used.

    Returns:
        pandas.DataFrame: A renamed copy in which each column other than
        ``valid_time`` and ``time`` becomes ``<column>_<stations[0]>``.
    """
    untouched = ["valid_time", "time"]
    renames = {}
    for name in df.columns:
        if name in untouched:
            continue
        renames[name] = name + f"_{stations[0]}"
    return df.rename(columns=renames)

In [13]:
def which_fold(df, fold):
    """Split ``df`` into five contiguous blocks and hold one of them out.

    The rows are cut, in their current order, into five consecutive blocks of
    (approximately) 20% each. Block number ``fold`` becomes the test set and
    the remaining four blocks, concatenated in order, become the training set.

    Args:
        df (pandas.DataFrame): Data to split; row order is preserved.
        fold (int): Index of the held-out block, 0 through 4.

    Returns:
        tuple: ``(df_train, df_test)``.

    Raises:
        ValueError: If ``fold`` is not one of 0, 1, 2, 3, 4.
    """
    n_folds = 5
    if fold not in range(n_folds):
        raise ValueError(f"fold must be one of 0..{n_folds - 1}, got {fold!r}")

    length = len(df)
    train_parts = []
    df_test = None

    for n in range(n_folds):
        block = df.iloc[int(0.2 * n * length) : int(0.2 * (n + 1) * length)]
        if n == fold:
            df_test = block
        else:
            train_parts.append(block)

    # Concatenate once: avoids re-copying the growing frame on every iteration
    # and avoids seeding the result with an empty, dtype-less DataFrame.
    df_train = pd.concat(train_parts)

    return df_train, df_test

In [14]:
nysm_df = nysm_data.load_nysm_data()

In [15]:
nysm_df["station"].unique()

array(['ADDI', 'ANDE', 'BATA', 'BEAC', 'BELD', 'BELL', 'BELM', 'BERK',
       'BING', 'BKLN', 'BRAN', 'BREW', 'BROC', 'BRON', 'BROO', 'BSPA',
       'BUFF', 'BURD', 'BURT', 'CAMD', 'CAPE', 'CHAZ', 'CHES', 'CINC',
       'CLAR', 'CLIF', 'CLYM', 'COBL', 'COHO', 'COLD', 'COPA', 'COPE',
       'CSQR', 'DELE', 'DEPO', 'DOVE', 'DUAN', 'EAUR', 'EDIN', 'EDWA',
       'ELDR', 'ELLE', 'ELMI', 'ESSX', 'FAYE', 'FRED', 'GABR', 'GFAL',
       'GFLD', 'GROT', 'GROV', 'HAMM', 'HARP', 'HARR', 'HART', 'HERK',
       'HFAL', 'ILAK', 'JOHN', 'JORD', 'KIND', 'LAUR', 'LOUI', 'MALO',
       'MEDI', 'MEDU', 'MORR', 'NBRA', 'NEWC', 'NHUD', 'OLDF', 'OLEA',
       'ONTA', 'OPPE', 'OSCE', 'OSWE', 'OTIS', 'OWEG', 'PENN', 'PHIL',
       'PISE', 'POTS', 'RAND', 'REDF', 'REDH', 'ROXB', 'RUSH', 'SARA',
       'SBRI', 'SCHA', 'SCHO', 'SCHU', 'SCIP', 'SHER', 'SOME', 'SOUT',
       'SPRA', 'SPRI', 'STAT', 'STEP', 'SUFF', 'TANN', 'TICO', 'TULL',
       'TUPP', 'TYRO', 'VOOR', 'WALL', 'WALT', 'WANT', 'WARS', 'WARW',
      

In [16]:
def create_data_for_model(station):
    """
    This function creates and processes data for a LSTM machine learning model.

    HRRR and NYSM data are loaded, restricted to the stations of the
    "Western Plateau" climate division, merged on ``valid_time`` into one wide
    frame with per-station column suffixes, and then normalized and split.

    Side effect: the time-filtered NYSM frame is written to a hard-coded CSV
    path on every call.

    Args:
        station (str): The station identifier whose forecast error is targeted.
            It is expected to belong to the climate division selected below,
            otherwise the per-station columns needed for the error will not
            exist.

    Returns:
        new_df (pandas DataFrame): A DataFrame containing processed data.
        df_train (pandas DataFrame): A DataFrame for training the machine learning model.
        df_test (pandas DataFrame): A DataFrame for testing the machine learning model.
        features (list): A list of feature names.
        forecast_lead (int): The lead time for the target variable.
    """

    # Print a message indicating the current station being processed.
    print(f"Targeting Error for {station}")

    # Load data from NYSM and HRRR sources.
    print("-- loading data from NYSM --")
    nysm_df = nysm_data.load_nysm_data()
    nysm_df.reset_index(inplace=True)
    print("-- loading data from HRRR --")
    hrrr_df = hrrr_data.read_hrrr_data()

    # Rename columns for consistency.
    nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})

    # Filter NYSM data to match valid times from HRRR data and save it to a CSV file.
    mytimes = hrrr_df["valid_time"].tolist()
    nysm_df = nysm_df[nysm_df["valid_time"].isin(mytimes)]
    nysm_df.to_csv("/home/aevans/nwp_bias/src/machine_learning/frankenstein/test.csv")

    # Set the path for tabular data.
    nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"

    # Load tabular data as a DataFrame.
    print("-- adding geo data --")
    nysm_cats_df = pd.read_csv(nysm_cats_path)

    # Identify the target data by climate division.
    print("-- locating target data --")
    category = "Western Plateau"
    nysm_cats_df1 = nysm_cats_df[nysm_cats_df["climate_division_name"] == category]
    stations = nysm_cats_df1["stid"].tolist()
    hrrr_df1 = hrrr_df[hrrr_df["station"].isin(stations)]
    nysm_df1 = nysm_df[nysm_df["station"].isin(stations)]

    # Clean the target data, merging HRRR and NYSM data and removing duplicates.
    print("-- cleaning target data --")
    master_df = hrrr_df1.merge(nysm_df1, on="valid_time", suffixes=(None, "_nysm"))
    master_df = master_df.drop_duplicates(
        subset=["valid_time", "station", "t2m"], keep="first"
    )

    # Finalize the DataFrame by dropping specific columns and adding suffixes.
    print("-- finalizing dataframe --")
    df = columns_drop(master_df)
    stations = df["station"].unique()
    master_df = df[df["station"] == stations[0]]
    # NOTE(review): this call needs the add_suffix(df, stations) variant; a later
    # notebook cell redefines add_suffix with a (master_df, station) signature.
    master_df = add_suffix(master_df, stations)

    # Merge data for different stations into a single DataFrame.
    # The loop variable is deliberately NOT called `station`, so the function
    # argument is still intact when the target error is computed below.
    # NOTE(review): the loop also re-merges stations[0], whose columns are then
    # present twice (suffixed and unsuffixed) - confirm this is intended.
    for stid in stations:
        df1 = df[df["station"] == stid]
        master_df = master_df.merge(df1, on="valid_time", suffixes=(None, f"_{stid}"))

    # Copy the master DataFrame for further processing.
    the_df = master_df.copy()

    # Drop rows with missing values.
    the_df.dropna(inplace=True)
    print("getting flag and error")
    the_df = get_flag.get_flag(the_df)

    # Calculate the error using NWP data for the requested station
    # (previously hard-coded to "OLEA", which ignored the argument).
    the_df = get_error.nwp_error("t2m", station, the_df)
    new_df = the_df.copy()

    # Get the list of valid times.
    valid_times = new_df["valid_time"].tolist()

    # Define columns to reintegrate back into the DataFrame after model training.
    cols_to_carry = ["valid_time", "flag"]

    # Set the forecast lead time.
    forecast_lead = 1

    # Define the target variable name.
    target_sensor = "target_error"

    # Normalize the data, create the target variable, and remove last rows.
    lstm_df, features = normalize.normalize_df(new_df, valid_times, forecast_lead)
    target = f"{target_sensor}_lead_{forecast_lead}"
    lstm_df[target] = lstm_df[target_sensor].shift(-forecast_lead)
    lstm_df = lstm_df.iloc[:-forecast_lead]

    # Split the data into training and testing sets (block 1 of 5 is held out).
    df_train, df_test = which_fold(lstm_df, 1)
    print("Test Set Fraction", len(df_test) / len(lstm_df))

    # Fill missing values with zeros in the training and testing DataFrames.
    df_train = df_train.fillna(0)
    df_test = df_test.fillna(0)

    # Reintegrate the specified columns back into the training and testing DataFrames.
    # The assignment aligns on the row index shared with the_df.
    for c in cols_to_carry:
        df_train[c] = the_df[c]
        df_test[c] = the_df[c]

    # Print a message indicating that data processing is complete.
    print("Data Processed")
    print("--init model LSTM--")

    return new_df, df_train, df_test, features, forecast_lead

In [17]:
# Notebook-level replay of the loading steps for a single target station.
station = "OLEA"
print(f"Targeting Error for {station}")

# Load data from NYSM and HRRR sources.
print("-- loading data from NYSM --")
nysm_df = nysm_data.load_nysm_data()
# reset_index moves the current index into a regular column.
nysm_df.reset_index(inplace=True)
print("-- loading data from HRRR --")
hrrr_df = hrrr_data.read_hrrr_data()

# Rename the NYSM time column so both sources share the "valid_time" key.
nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})

# Keep only NYSM rows whose valid_time also occurs in the HRRR data.
# NOTE: the filtered frame is written to a hard-coded absolute path (side effect).
mytimes = hrrr_df["valid_time"].tolist()
nysm_df = nysm_df[nysm_df["valid_time"].isin(mytimes)]
nysm_df.to_csv("/home/aevans/nwp_bias/src/machine_learning/frankenstein/test.csv")

# Hard-coded absolute path of the station metadata table.
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"

# Load tabular data as a DataFrame.
print("-- adding geo data --")
nysm_cats_df = pd.read_csv(nysm_cats_path)

# Select the station ids of one climate division and restrict both sources to them.
print("-- locating target data --")
category = "Western Plateau"
nysm_cats_df1 = nysm_cats_df[nysm_cats_df["climate_division_name"] == category]
stations = nysm_cats_df1["stid"].tolist()
hrrr_df1 = hrrr_df[hrrr_df["station"].isin(stations)]
nysm_df1 = nysm_df[nysm_df["station"].isin(stations)]

Targeting Error for OLEA
-- loading data from NYSM --
-- loading data from HRRR --
-- adding geo data --
-- locating target data --


In [18]:
def columns_drop_hrrr(df):
    """Return ``df`` without a fixed set of HRRR columns.

    Args:
        df (pandas.DataFrame): Frame that contains every column named below.

    Returns:
        pandas.DataFrame: A new frame; ``df`` itself is not modified.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    unwanted = [
        "level_0",
        "index",
        "lead time",
        "lsm",
        "latitude",
        "longitude",
        "time",
    ]
    trimmed = df.drop(columns=unwanted)
    return trimmed

In [19]:
def add_suffix(master_df, station):
    """Append ``_<station>`` to every column name except the time columns.

    Args:
        master_df (pandas.DataFrame): Frame whose columns are renamed.
        station: Station id used as the suffix.

    Returns:
        pandas.DataFrame: A renamed copy; ``valid_time`` and ``time`` keep
        their names.
    """
    time_cols = ["valid_time", "time"]

    def _tag(name):
        # Time columns stay as they are so they can still serve as merge keys.
        if name in time_cols:
            return name
        return name + f"_{station}"

    return master_df.rename(columns=_tag)

In [20]:
def dataframe_wrapper(stations, df, max_stations=3):
    """Build one wide frame with a suffixed column group per station.

    The rows of each selected station are renamed with ``add_suffix`` (the
    ``(master_df, station)`` variant) and merged side by side on
    ``valid_time``. ``merge`` is called with its default join type.

    Args:
        stations (sequence): Station ids; ``stations[0]`` seeds the result.
        df (pandas.DataFrame): Long-format data with ``station`` and
            ``valid_time`` columns.
        max_stations (int | None): Number of leading stations to include.
            Defaults to 3, the limit that used to be hard-coded; ``None``
            includes every station.

    Returns:
        pandas.DataFrame: The merged wide frame.
    """
    master_df = df[df["station"] == stations[0]]
    master_df = add_suffix(master_df, stations[0])
    for station in stations[1:max_stations]:
        df1 = df[df["station"] == station]
        df1 = add_suffix(df1, station)
        master_df = master_df.merge(
            df1, on="valid_time", suffixes=(None, f"_{station}")
        )
    return master_df

In [21]:
def encode(data, col, max_val):
    """Add a cyclical (cos/sin) encoding of the day of year.

    The day of year is always read from ``data["valid_time"]``; ``col`` only
    determines the names of the two new columns, ``<col>_cos`` and
    ``<col>_sin``, which end up at positions 1 and 2.

    Unlike the previous version, the caller's frame is left untouched, so the
    function can be run repeatedly on the same input without failing because
    the columns already exist.

    Args:
        data (pandas.DataFrame): Frame with a datetime ``valid_time`` column.
        col (str): Prefix for the new column names.
        max_val (float): Period used to scale the day of year (e.g. 366).

    Returns:
        pandas.DataFrame: A copy of ``data`` with the two encoded columns.
    """
    angle = 2 * np.pi * data["valid_time"].dt.dayofyear / max_val

    encoded = data.copy()
    # Both inserts use position 1, so the later one (cos) lands before sin.
    encoded.insert(loc=1, column=f"{col}_sin", value=np.sin(angle))
    encoded.insert(loc=1, column=f"{col}_cos", value=np.cos(angle))

    return encoded

In [22]:
hrrr_df1 = columns_drop_hrrr(hrrr_df1)

In [23]:
master_df = dataframe_wrapper(stations, hrrr_df1)

In [24]:
master_df.keys()

Index(['valid_time', 'station_ADDI', 't2m_ADDI', 'sh2_ADDI', 'd2m_ADDI',
       'r2_ADDI', 'u10_ADDI', 'v10_ADDI', 'tp_ADDI', 'mslma_ADDI', 'orog_ADDI',
       'tcc_ADDI', 'asnow_ADDI', 'cape_ADDI', 'dswrf_ADDI', 'dlwrf_ADDI',
       'gh_ADDI', 'u_total_ADDI', 'u_dir_ADDI', 'new_tp_ADDI', 'station_BELM',
       't2m_BELM', 'sh2_BELM', 'd2m_BELM', 'r2_BELM', 'u10_BELM', 'v10_BELM',
       'tp_BELM', 'mslma_BELM', 'orog_BELM', 'tcc_BELM', 'asnow_BELM',
       'cape_BELM', 'dswrf_BELM', 'dlwrf_BELM', 'gh_BELM', 'u_total_BELM',
       'u_dir_BELM', 'new_tp_BELM', 'station_COHO', 't2m_COHO', 'sh2_COHO',
       'd2m_COHO', 'r2_COHO', 'u10_COHO', 'v10_COHO', 'tp_COHO', 'mslma_COHO',
       'orog_COHO', 'tcc_COHO', 'asnow_COHO', 'cape_COHO', 'dswrf_COHO',
       'dlwrf_COHO', 'gh_COHO', 'u_total_COHO', 'u_dir_COHO', 'new_tp_COHO'],
      dtype='object')

In [25]:
# Remove the "index" column (presumably the one produced by the earlier
# reset_index on the NYSM frame) before reshaping the data per station.
nysm_df1 = nysm_df1.drop(
    columns=[
        "index",
    ]
)

In [26]:
master_df2 = dataframe_wrapper(stations, nysm_df1)

In [27]:
master_df2.keys()

Index(['station_ADDI', 'valid_time', 'lat_ADDI', 'lon_ADDI', 'elev_ADDI',
       'tair_ADDI', 'ta9m_ADDI', 'td_ADDI', 'relh_ADDI', 'srad_ADDI',
       'pres_ADDI', 'mslp_ADDI', 'wspd_sonic_ADDI', 'wmax_sonic_ADDI',
       'wdir_sonic_ADDI', 'precip_total_ADDI', 'snow_depth_ADDI',
       'station_BELM', 'lat_BELM', 'lon_BELM', 'elev_BELM', 'tair_BELM',
       'ta9m_BELM', 'td_BELM', 'relh_BELM', 'srad_BELM', 'pres_BELM',
       'mslp_BELM', 'wspd_sonic_BELM', 'wmax_sonic_BELM', 'wdir_sonic_BELM',
       'precip_total_BELM', 'snow_depth_BELM', 'station_COHO', 'lat_COHO',
       'lon_COHO', 'elev_COHO', 'tair_COHO', 'ta9m_COHO', 'td_COHO',
       'relh_COHO', 'srad_COHO', 'pres_COHO', 'mslp_COHO', 'wspd_sonic_COHO',
       'wmax_sonic_COHO', 'wdir_sonic_COHO', 'precip_total_COHO',
       'snow_depth_COHO'],
      dtype='object')

In [28]:
# Join the wide HRRR frame with the wide NYSM frame on valid_time; any NYSM
# column whose name collides with an HRRR column gets the "_xab" suffix.
master_df = master_df.merge(master_df2, on="valid_time", suffixes=(None, f"_xab"))

In [29]:
df_train, df_test = which_fold(master_df, 1)

In [30]:
df_train

Unnamed: 0,valid_time,station_ADDI,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,v10_ADDI,tp_ADDI,mslma_ADDI,...,td_COHO,relh_COHO,srad_COHO,pres_COHO,mslp_COHO,wspd_sonic_COHO,wmax_sonic_COHO,wdir_sonic_COHO,precip_total_COHO,snow_depth_COHO
0,2018-01-01 03:00:00,ADDI,-19.199194,0.00068,-21.993216,76.800003,3.321081,-2.272873,0.001,103034.0,...,-22.236069,83.005333,0.059599,952.700378,961.160767,4.049551,5.268734,320.398987,0.00,0.011778
1,2018-01-01 04:00:00,ADDI,-19.118689,0.00068,-21.918570,76.500000,3.298830,-1.821096,0.000,103019.0,...,-21.697083,82.664978,0.046355,952.789917,961.068542,3.752380,5.467729,302.989014,0.00,0.013398
2,2018-01-01 05:00:00,ADDI,-19.439062,0.00065,-22.570868,73.800003,2.992857,-0.877062,0.000,103027.0,...,-21.546570,81.791618,0.059598,952.710815,960.902649,4.310488,6.174314,306.762512,0.00,0.014291
3,2018-01-01 06:00:00,ADDI,-19.760291,0.00063,-22.821158,74.400002,3.115693,-0.789671,0.000,103071.0,...,-21.348312,82.621620,0.059596,952.575623,960.738586,4.632712,6.812093,296.427094,0.00,0.013012
4,2018-01-01 07:00:00,ADDI,-19.860175,0.00062,-22.993704,74.099998,3.268302,-0.054269,0.001,103052.0,...,-21.324707,82.821564,0.072838,952.858276,961.028748,3.748342,5.010695,292.594391,0.00,0.013992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,2022-12-31 19:00:00,ADDI,11.485986,0.00710,8.142847,80.300003,0.073768,5.399014,0.000,100762.0,...,7.202820,76.627151,56.429420,938.602417,938.342163,2.888842,4.793458,187.385101,0.00,0.006084
38349,2022-12-31 20:00:00,ADDI,10.447198,0.00676,7.376611,80.400002,-0.416093,5.292988,0.000,100785.0,...,8.407135,89.403557,20.635929,938.083618,938.075562,4.008613,6.724010,213.705399,0.07,0.005659
38350,2022-12-31 21:00:00,ADDI,9.658258,0.00709,8.101648,88.000000,0.234075,5.295895,0.000,100879.0,...,8.600677,91.210388,7.036471,938.252014,938.270630,4.102826,7.027788,211.484207,0.00,0.003971
38351,2022-12-31 22:00:00,ADDI,9.118555,0.00711,8.145227,92.000000,0.285457,5.515205,0.038,100849.0,...,8.706879,93.957100,0.338529,938.007629,938.103821,3.016000,5.212557,189.334000,0.00,0.005106


In [31]:
df_test

Unnamed: 0,valid_time,station_ADDI,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,v10_ADDI,tp_ADDI,mslma_ADDI,...,td_COHO,relh_COHO,srad_COHO,pres_COHO,mslp_COHO,wspd_sonic_COHO,wmax_sonic_COHO,wdir_sonic_COHO,precip_total_COHO,snow_depth_COHO
7670,2019-04-02 11:00:00,ADDI,-2.800116,0.00181,-10.229285,54.200001,-0.840036,4.357924,0.0,102469.0,...,-10.027161,59.026249,7.567856,952.435913,956.011108,3.505187,6.217631,187.397400,0.000000,0.009952
7671,2019-04-02 12:00:00,ADDI,-2.094031,0.00195,-9.271704,56.500000,-0.815294,3.543884,0.0,102545.0,...,-9.858948,53.888851,102.628220,952.309509,955.505249,3.480735,5.642790,180.033997,0.000000,0.011029
7672,2019-04-02 13:00:00,ADDI,-0.055823,0.00215,-8.094641,55.299999,-0.840937,4.726081,0.0,102528.0,...,-9.607452,53.674629,230.324387,951.843994,954.945740,4.695154,7.205263,163.311996,0.000000,0.011903
7673,2019-04-02 14:00:00,ADDI,2.602563,0.00240,-6.657996,52.099998,-0.615318,6.954502,0.0,102432.0,...,-8.999878,50.521992,336.970825,951.247986,953.948547,4.490968,8.405516,172.641693,0.000000,0.011782
7674,2019-04-02 15:00:00,ADDI,4.740533,0.00250,-6.153021,47.299999,-0.794386,7.123545,0.0,102371.0,...,-8.736786,43.522881,667.342163,950.524414,952.599670,7.000066,9.654021,184.360901,0.000000,0.013715
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15336,2020-02-20 00:00:00,ADDI,-4.032324,0.00189,-9.651648,65.599998,5.325153,-1.649920,0.0,103026.0,...,-11.441040,70.623993,0.000000,955.438721,960.122375,6.698401,10.974760,283.211700,0.000000,0.102824
15337,2020-02-20 01:00:00,ADDI,-4.166479,0.00185,-9.974219,64.699997,5.410713,-1.507067,0.0,103038.0,...,-11.768555,68.303574,0.000000,955.605530,960.266174,5.499091,8.733680,277.621307,0.000000,0.103490
15338,2020-02-20 02:00:00,ADDI,-5.727576,0.00177,-10.524817,68.300003,4.351043,-1.059071,0.0,103114.0,...,-9.626312,83.362663,0.000000,955.613586,960.379639,6.359388,9.124758,275.665100,0.000000,0.102813
15339,2020-02-20 03:00:00,ADDI,-6.150916,0.00174,-10.665686,69.500000,3.748579,-1.038778,0.0,103120.0,...,-9.033295,88.666862,0.000000,955.681885,960.504211,3.431629,5.589961,291.429596,0.000000,0.102500


In [32]:
def get_locations_for_ball_tree(df):
    """Return two identical coordinate tables with degree and radian columns.

    Args:
        df (pandas.DataFrame): Frame with ``lat`` and ``lon`` columns in degrees.

    Returns:
        tuple: ``(locations_a, locations_b)``, two independent DataFrames with
        the columns ``lat``, ``lon``, ``lat_rad`` and ``lon_rad``. The first is
        meant for building the ball tree, the second for querying it.
    """
    # .copy() makes the selection an independent frame, so adding the radian
    # columns below does not trigger pandas' chained-assignment warning.
    locations_a = df.reset_index()[["lat", "lon"]].copy()

    # Creates new columns converting coordinate degrees to radians.
    for column in ("lat", "lon"):
        locations_a[f"{column}_rad"] = np.deg2rad(locations_a[column].values)

    # Both outputs hold the same values; hand back a separate copy so callers
    # can modify one without affecting the other.
    locations_b = locations_a.copy()

    return locations_a, locations_b


def get_ball_tree_indices(model_data):
    """Return, for every row of ``model_data``, the index of its nearest point.

    A haversine ball tree is built from the rows' own coordinates and queried
    with the same coordinates for three neighbours; only the first (closest)
    neighbour of each query row is kept.

    Args:
        model_data (pandas.DataFrame): Frame with ``lat``/``lon`` in degrees.

    Returns:
        list: One tree index per row of ``model_data``.
    """
    tree_points, query_points = get_locations_for_ball_tree(model_data)
    radian_cols = ["lat_rad", "lon_rad"]

    # The tree is built from the first table and queried with the second.
    tree = BallTree(tree_points[radian_cols].values, metric="haversine")

    # Number of neighbours requested from the tree.
    n_neighbors = 3
    _, neighbor_idx = tree.query(query_points[radian_cols].values, k=n_neighbors)

    # Keep only the closest neighbour of each query point.
    return [row[0] for row in neighbor_idx]


def df_with_nysm_locations(df, df_nysm, indices_list):
    """Label the rows at each selected grid point with a station id.

    The stations of ``df_nysm`` are taken in the (sorted) order produced by
    ``groupby("station")``. The x-th station is paired with the grid point
    found at ``indices_list[x]`` of ``df``; all rows of ``df`` at exactly that
    latitude/longitude are collected and tagged with the station id.

    Args:
        df (pandas.DataFrame): Model data with ``latitude``, ``longitude`` and
            ``valid_time`` columns.
        df_nysm (pandas.DataFrame): Station data with ``station``, ``lat`` and
            ``lon`` columns.
        indices_list (list): Positional row indices into ``df``, one per
            station.

    Returns:
        pandas.DataFrame: The tagged rows indexed by ``(station, valid_time)``.

    Raises:
        ValueError: If ``df_nysm`` contains no stations.
    """
    df_closest_locs = df.iloc[indices_list][["latitude", "longitude"]].reset_index()
    station_ids = df_nysm.groupby("station")[["lat", "lon"]].mean().index

    frames = []
    for x, station_id in enumerate(station_ids):
        at_grid_point = (df.latitude == df_closest_locs.latitude[x]) & (
            df.longitude == df_closest_locs.longitude[x]
        )
        df_dummy = df[at_grid_point].reset_index()
        df_dummy["station"] = station_id
        frames.append(df_dummy)

    if not frames:
        raise ValueError("df_nysm contains no stations to attach locations to")

    # One concat at the end instead of re-copying a growing frame per station.
    df_save = pd.concat(frames)
    print("complete")
    return df_save.set_index(["station", "valid_time"])

In [33]:
def get_closest_stations(nysm_df, neighbors, target_station):
    """Return the ``neighbors`` stations closest to ``target_station``.

    Distances are great-circle (haversine) distances between station
    coordinates. The target station is part of the searched set, so it is
    normally the first entry of the result.

    Coordinates are taken per station (first non-null ``lat``/``lon`` of each
    station, in order of first appearance). The previous version de-duplicated
    ``lat`` and ``lon`` independently, which misaligns coordinates and station
    ids as soon as two stations share a latitude or longitude value.

    Args:
        nysm_df (pandas.DataFrame): Frame with ``station``, ``lat`` and ``lon``
            columns (degrees).
        neighbors (int): Number of stations to return.
        target_station (str): Station id to search around.

    Returns:
        list: Station ids ordered from nearest to farthest.

    Raises:
        ValueError: If ``target_station`` does not occur in ``nysm_df``.
    """
    # One coordinate pair per station, keeping the order of first appearance.
    station_locs = nysm_df.groupby("station", sort=False)[["lat", "lon"]].first()
    stations = station_locs.index.to_numpy()

    matches = np.flatnonzero(stations == target_station)
    if matches.size == 0:
        raise ValueError(f"station {target_station!r} not found in nysm_df")
    target_pos = matches[0]

    coords_rad = np.deg2rad(station_locs[["lat", "lon"]].to_numpy(dtype=float))
    ball = BallTree(coords_rad, metric="haversine")

    # Only the target row is needed, so query the tree with that single point.
    _, indices = ball.query(coords_rad[target_pos : target_pos + 1], k=neighbors)

    return [stations[i] for i in indices[0]]

In [34]:
def nwp_error(target, station, df):
    """
    Calculate the error between NWP model data and NYSM data for a specific target variable.

    The error is ``df["<target>_<station>"]`` minus the matching NYSM column
    ``df["<nysm_var>_<station>"]``. It is inserted into ``df`` in place as the
    column ``target_error`` at position 1, so the caller's frame is modified
    and a second call on the same frame fails because the column exists.

    Args:
        target (str): The target variable name (e.g., 't2m' for temperature).
        station (str): The station identifier for which data is being compared.
        df (pd.DataFrame): The input DataFrame containing NWP and NYSM data.

    Returns:
        df (pd.DataFrame): The input DataFrame with the 'target_error' column added.

    Raises:
        KeyError: If ``target`` has no NYSM counterpart in the mapping below, or
            if one of the two per-station columns is missing from ``df``.
    """

    # Define a dictionary to map NWP variable names to NYSM variable names.
    # NOTE(review): "mslma" -> "pres" pairs a mean-sea-level quantity with what
    # looks like station pressure - confirm the intended NYSM column and units.
    vars_dict = {
        "t2m": "tair",
        "mslma": "pres",
        # Add more variable mappings as needed.
    }

    # Fail with a clear message instead of silently looking up "None_<station>".
    try:
        nysm_var = vars_dict[target]
    except KeyError:
        raise KeyError(
            f"No NYSM variable is mapped to NWP target {target!r}; "
            f"known targets: {sorted(vars_dict)}"
        ) from None

    # Calculate the 'target_error' by subtracting NYSM data from NWP model data.
    target_error = df[f"{target}_{station}"] - df[f"{nysm_var}_{station}"]
    df.insert(loc=1, column="target_error", value=target_error)

    return df

In [35]:
master_df

Unnamed: 0,valid_time,station_ADDI,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,v10_ADDI,tp_ADDI,mslma_ADDI,...,td_COHO,relh_COHO,srad_COHO,pres_COHO,mslp_COHO,wspd_sonic_COHO,wmax_sonic_COHO,wdir_sonic_COHO,precip_total_COHO,snow_depth_COHO
0,2018-01-01 03:00:00,ADDI,-19.199194,0.00068,-21.993216,76.800003,3.321081,-2.272873,0.001,103034.0,...,-22.236069,83.005333,0.059599,952.700378,961.160767,4.049551,5.268734,320.398987,0.00,0.011778
1,2018-01-01 04:00:00,ADDI,-19.118689,0.00068,-21.918570,76.500000,3.298830,-1.821096,0.000,103019.0,...,-21.697083,82.664978,0.046355,952.789917,961.068542,3.752380,5.467729,302.989014,0.00,0.013398
2,2018-01-01 05:00:00,ADDI,-19.439062,0.00065,-22.570868,73.800003,2.992857,-0.877062,0.000,103027.0,...,-21.546570,81.791618,0.059598,952.710815,960.902649,4.310488,6.174314,306.762512,0.00,0.014291
3,2018-01-01 06:00:00,ADDI,-19.760291,0.00063,-22.821158,74.400002,3.115693,-0.789671,0.000,103071.0,...,-21.348312,82.621620,0.059596,952.575623,960.738586,4.632712,6.812093,296.427094,0.00,0.013012
4,2018-01-01 07:00:00,ADDI,-19.860175,0.00062,-22.993704,74.099998,3.268302,-0.054269,0.001,103052.0,...,-21.324707,82.821564,0.072838,952.858276,961.028748,3.748342,5.010695,292.594391,0.00,0.013992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,2022-12-31 19:00:00,ADDI,11.485986,0.00710,8.142847,80.300003,0.073768,5.399014,0.000,100762.0,...,7.202820,76.627151,56.429420,938.602417,938.342163,2.888842,4.793458,187.385101,0.00,0.006084
38349,2022-12-31 20:00:00,ADDI,10.447198,0.00676,7.376611,80.400002,-0.416093,5.292988,0.000,100785.0,...,8.407135,89.403557,20.635929,938.083618,938.075562,4.008613,6.724010,213.705399,0.07,0.005659
38350,2022-12-31 21:00:00,ADDI,9.658258,0.00709,8.101648,88.000000,0.234075,5.295895,0.000,100879.0,...,8.600677,91.210388,7.036471,938.252014,938.270630,4.102826,7.027788,211.484207,0.00,0.003971
38351,2022-12-31 22:00:00,ADDI,9.118555,0.00711,8.145227,92.000000,0.285457,5.515205,0.038,100849.0,...,8.706879,93.957100,0.338529,938.007629,938.103821,3.016000,5.212557,189.334000,0.00,0.005106


In [36]:
# Calculate the error using NWP data.
the_df = nwp_error("t2m", "ADDI", master_df)
the_df

Unnamed: 0,valid_time,target_error,station_ADDI,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,v10_ADDI,tp_ADDI,...,td_COHO,relh_COHO,srad_COHO,pres_COHO,mslp_COHO,wspd_sonic_COHO,wmax_sonic_COHO,wdir_sonic_COHO,precip_total_COHO,snow_depth_COHO
0,2018-01-01 03:00:00,-0.368794,ADDI,-19.199194,0.00068,-21.993216,76.800003,3.321081,-2.272873,0.001,...,-22.236069,83.005333,0.059599,952.700378,961.160767,4.049551,5.268734,320.398987,0.00,0.011778
1,2018-01-01 04:00:00,-0.600048,ADDI,-19.118689,0.00068,-21.918570,76.500000,3.298830,-1.821096,0.000,...,-21.697083,82.664978,0.046355,952.789917,961.068542,3.752380,5.467729,302.989014,0.00,0.013398
2,2018-01-01 05:00:00,-0.803223,ADDI,-19.439062,0.00065,-22.570868,73.800003,2.992857,-0.877062,0.000,...,-21.546570,81.791618,0.059598,952.710815,960.902649,4.310488,6.174314,306.762512,0.00,0.014291
3,2018-01-01 06:00:00,-1.059481,ADDI,-19.760291,0.00063,-22.821158,74.400002,3.115693,-0.789671,0.000,...,-21.348312,82.621620,0.059596,952.575623,960.738586,4.632712,6.812093,296.427094,0.00,0.013012
4,2018-01-01 07:00:00,-0.031725,ADDI,-19.860175,0.00062,-22.993704,74.099998,3.268302,-0.054269,0.001,...,-21.324707,82.821564,0.072838,952.858276,961.028748,3.748342,5.010695,292.594391,0.00,0.013992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,2022-12-31 19:00:00,0.660996,ADDI,11.485986,0.00710,8.142847,80.300003,0.073768,5.399014,0.000,...,7.202820,76.627151,56.429420,938.602417,938.342163,2.888842,4.793458,187.385101,0.00,0.006084
38349,2022-12-31 20:00:00,0.637534,ADDI,10.447198,0.00676,7.376611,80.400002,-0.416093,5.292988,0.000,...,8.407135,89.403557,20.635929,938.083618,938.075562,4.008613,6.724010,213.705399,0.07,0.005659
38350,2022-12-31 21:00:00,-0.462142,ADDI,9.658258,0.00709,8.101648,88.000000,0.234075,5.295895,0.000,...,8.600677,91.210388,7.036471,938.252014,938.270630,4.102826,7.027788,211.484207,0.00,0.003971
38351,2022-12-31 22:00:00,-0.493114,ADDI,9.118555,0.00711,8.145227,92.000000,0.285457,5.515205,0.038,...,8.706879,93.957100,0.338529,938.007629,938.103821,3.016000,5.212557,189.334000,0.00,0.005106


In [37]:
the_df = encode(the_df, "valid_time", 366)

In [38]:
the_df = the_df[the_df.columns.drop(list(the_df.filter(regex="station")))]

In [39]:
the_df

Unnamed: 0,valid_time,valid_time_cos,valid_time_sin,target_error,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,v10_ADDI,...,td_COHO,relh_COHO,srad_COHO,pres_COHO,mslp_COHO,wspd_sonic_COHO,wmax_sonic_COHO,wdir_sonic_COHO,precip_total_COHO,snow_depth_COHO
0,2018-01-01 03:00:00,0.999853,0.017166,-0.368794,-19.199194,0.00068,-21.993216,76.800003,3.321081,-2.272873,...,-22.236069,83.005333,0.059599,952.700378,961.160767,4.049551,5.268734,320.398987,0.00,0.011778
1,2018-01-01 04:00:00,0.999853,0.017166,-0.600048,-19.118689,0.00068,-21.918570,76.500000,3.298830,-1.821096,...,-21.697083,82.664978,0.046355,952.789917,961.068542,3.752380,5.467729,302.989014,0.00,0.013398
2,2018-01-01 05:00:00,0.999853,0.017166,-0.803223,-19.439062,0.00065,-22.570868,73.800003,2.992857,-0.877062,...,-21.546570,81.791618,0.059598,952.710815,960.902649,4.310488,6.174314,306.762512,0.00,0.014291
3,2018-01-01 06:00:00,0.999853,0.017166,-1.059481,-19.760291,0.00063,-22.821158,74.400002,3.115693,-0.789671,...,-21.348312,82.621620,0.059596,952.575623,960.738586,4.632712,6.812093,296.427094,0.00,0.013012
4,2018-01-01 07:00:00,0.999853,0.017166,-0.031725,-19.860175,0.00062,-22.993704,74.099998,3.268302,-0.054269,...,-21.324707,82.821564,0.072838,952.858276,961.028748,3.748342,5.010695,292.594391,0.00,0.013992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38348,2022-12-31 19:00:00,0.999853,-0.017166,0.660996,11.485986,0.00710,8.142847,80.300003,0.073768,5.399014,...,7.202820,76.627151,56.429420,938.602417,938.342163,2.888842,4.793458,187.385101,0.00,0.006084
38349,2022-12-31 20:00:00,0.999853,-0.017166,0.637534,10.447198,0.00676,7.376611,80.400002,-0.416093,5.292988,...,8.407135,89.403557,20.635929,938.083618,938.075562,4.008613,6.724010,213.705399,0.07,0.005659
38350,2022-12-31 21:00:00,0.999853,-0.017166,-0.462142,9.658258,0.00709,8.101648,88.000000,0.234075,5.295895,...,8.600677,91.210388,7.036471,938.252014,938.270630,4.102826,7.027788,211.484207,0.00,0.003971
38351,2022-12-31 22:00:00,0.999853,-0.017166,-0.493114,9.118555,0.00711,8.145227,92.000000,0.285457,5.515205,...,8.706879,93.957100,0.338529,938.007629,938.103821,3.016000,5.212557,189.334000,0.00,0.005106


In [40]:
cols_to_carry = ["valid_time"]

In [41]:
new_df = the_df.drop(columns=cols_to_carry)

In [42]:
import statistics as st  # kept so that later cells relying on `st` still work

# Standardize every column except the cyclical time encodings to zero mean and
# unit (population) standard deviation.
cols = ["valid_time_cos", "valid_time_sin"]
scale_cols = [k for k in new_df.columns if k not in cols]

# Vectorized pandas statistics replace the per-column pure-Python
# statistics.mean / statistics.pstdev calls; ddof=0 matches pstdev.
means = new_df[scale_cols].mean()
stdevs = new_df[scale_cols].std(ddof=0)

# Constant columns have zero spread; dividing by 1 instead of 0 leaves them at
# 0.0 rather than turning the whole column into NaN.
stdevs = stdevs.replace(0, 1.0)

new_df[scale_cols] = (new_df[scale_cols] - means) / stdevs

In [43]:
features = list(new_df.columns.difference(["target_error"]))
print(features)

['asnow_ADDI', 'asnow_BELM', 'asnow_COHO', 'cape_ADDI', 'cape_BELM', 'cape_COHO', 'd2m_ADDI', 'd2m_BELM', 'd2m_COHO', 'dlwrf_ADDI', 'dlwrf_BELM', 'dlwrf_COHO', 'dswrf_ADDI', 'dswrf_BELM', 'dswrf_COHO', 'elev_ADDI', 'elev_BELM', 'elev_COHO', 'gh_ADDI', 'gh_BELM', 'gh_COHO', 'lat_ADDI', 'lat_BELM', 'lat_COHO', 'lon_ADDI', 'lon_BELM', 'lon_COHO', 'mslma_ADDI', 'mslma_BELM', 'mslma_COHO', 'mslp_ADDI', 'mslp_BELM', 'mslp_COHO', 'new_tp_ADDI', 'new_tp_BELM', 'new_tp_COHO', 'orog_ADDI', 'orog_BELM', 'orog_COHO', 'precip_total_ADDI', 'precip_total_BELM', 'precip_total_COHO', 'pres_ADDI', 'pres_BELM', 'pres_COHO', 'r2_ADDI', 'r2_BELM', 'r2_COHO', 'relh_ADDI', 'relh_BELM', 'relh_COHO', 'sh2_ADDI', 'sh2_BELM', 'sh2_COHO', 'snow_depth_ADDI', 'snow_depth_BELM', 'snow_depth_COHO', 'srad_ADDI', 'srad_BELM', 'srad_COHO', 't2m_ADDI', 't2m_BELM', 't2m_COHO', 'ta9m_ADDI', 'ta9m_BELM', 'ta9m_COHO', 'tair_ADDI', 'tair_BELM', 'tair_COHO', 'tcc_ADDI', 'tcc_BELM', 'tcc_COHO', 'td_ADDI', 'td_BELM', 'td_COHO', 

In [57]:
def mi_score(the_df, threshold=0.2):
    """Select the features whose mutual information with the target is high.

    Missing values are replaced by -999, ``target_error`` is label-encoded and
    the mutual information of every other column with it is estimated.

    NOTE(review): ``target_error`` looks continuous; label-encoding it makes
    every distinct value its own class for the classification estimator.
    ``sklearn.feature_selection.mutual_info_regression`` may be the better
    fit - confirm before relying on the scores.

    Args:
        the_df (pandas.DataFrame): Numeric frame containing ``target_error``.
        threshold (float): Features with a score strictly above this value are
            kept. Defaults to 0.2, the value that used to be hard-coded.

    Returns:
        list: Names of the selected feature columns, in column order.
    """
    the_df = the_df.fillna(-999)
    feature_names = [n for n in the_df.columns if n != "target_error"]
    X = the_df.loc[:, the_df.columns != "target_error"]
    y = the_df["target_error"]

    # convert y values to categorical values
    y_transformed = preprocessing.LabelEncoder().fit_transform(y)

    # Named `scores` so the local no longer shadows this function's own name.
    scores = MIC(X, y_transformed)

    return [name for name, score in zip(feature_names, scores) if score > threshold]

In [58]:
features = mi_score(new_df)

In [59]:
features

['t2m_ADDI',
 'tp_ADDI',
 'orog_ADDI',
 'tcc_ADDI',
 'asnow_ADDI',
 'cape_ADDI',
 'dswrf_ADDI',
 'new_tp_ADDI',
 't2m_BELM',
 'tp_BELM',
 'orog_BELM',
 'tcc_BELM',
 'asnow_BELM',
 'cape_BELM',
 'dswrf_BELM',
 'new_tp_BELM',
 't2m_COHO',
 'tp_COHO',
 'orog_COHO',
 'tcc_COHO',
 'asnow_COHO',
 'cape_COHO',
 'dswrf_COHO',
 'new_tp_COHO',
 'lat_ADDI',
 'lon_ADDI',
 'elev_ADDI',
 'tair_ADDI',
 'ta9m_ADDI',
 'mslp_ADDI',
 'precip_total_ADDI',
 'snow_depth_ADDI',
 'lat_BELM',
 'lon_BELM',
 'elev_BELM',
 'tair_BELM',
 'ta9m_BELM',
 'pres_BELM',
 'mslp_BELM',
 'precip_total_BELM',
 'snow_depth_BELM',
 'lat_COHO',
 'lon_COHO',
 'elev_COHO',
 'tair_COHO',
 'ta9m_COHO',
 'precip_total_COHO',
 'snow_depth_COHO']

In [None]:
lstm_df = new_df.copy()
target_sensor = "target_error"
forecast_lead = 1

In [None]:
lstm_df

In [None]:
# Build the prediction target: the target column shifted up by forecast_lead
# rows, inserted as the first column. The last forecast_lead rows get NaN.
target = f"{target_sensor}_lead_{forecast_lead}"
lstm_df.insert(
    loc=(0), column=target, value=lstm_df[target_sensor].shift(-forecast_lead)
)
# The unshifted column is removed so only the lead target remains.
lstm_df = lstm_df.drop(columns=[target_sensor])

In [None]:
lstm_df = lstm_df.iloc[:-forecast_lead]

In [None]:
# Split the data into training and testing sets.
length = len(lstm_df)
test_len = int(length * 0.2)
df_train = lstm_df.iloc[test_len:].copy()
df_test = lstm_df.iloc[:test_len].copy()
print("Test Set Fraction", len(df_test) / len(lstm_df))

# Fill missing values with zeros in the training and testing DataFrames.
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [None]:
df_train = df_train.drop(columns="target_error_lead_1")

In [None]:
df_train.head()

In [None]:
# Overwrite the last 45 (3 * 15) columns of the first two rows with the -999
# sentinel. Presumably 3 stations x 15 NYSM variables - TODO confirm.
df_train.iloc[:2, -int(3 * 15) :] = -999

In [None]:
for k in df_train.keys():
    print(k)
    print(df_train[k].iloc[0])
    print()

In [None]:
# create LSTM Model
class SequenceDataset(Dataset):
    """Sliding-window dataset that masks station observations at the window start.

    Item ``i`` is the window of ``sequence_length`` feature rows ending at row
    ``i`` together with the target of row ``i``. Windows that would start
    before row 0 are left-padded with copies of row 0.

    For full windows, the first ``forecast_hr`` rows have their trailing
    ``len(stations) * NYSM_VARS_PER_STATION`` feature columns overwritten with
    ``MASK_VALUE``.

    Args:
        dataframe: pandas DataFrame holding the target and feature columns.
        target: Name of the target column.
        features: List of feature column names (defines the column order of X).
        stations: Collection of station identifiers; only its length is used.
        sequence_length: Number of rows in every returned window.
        forecast_hr: Number of leading window rows whose station columns are
            masked.

    Both tensors are moved to the device index ``RANK % cuda.device_count()``,
    so the ``RANK`` environment variable must be set and a CUDA device present.
    """

    # NOTE(review): the original code hard-codes 15 trailing columns per
    # station as the NYSM variables -- confirm against the feature layout.
    NYSM_VARS_PER_STATION = 15
    MASK_VALUE = -999.0

    def __init__(
        self, dataframe, target, features, stations, sequence_length, forecast_hr
    ):
        self.dataframe = dataframe
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.stations = stations
        self.forecast_hr = forecast_hr
        # Same device index for both tensors; computed once.
        device = int(os.environ["RANK"]) % torch.cuda.device_count()
        self.y = torch.tensor(dataframe[target].values).float().to(device)
        self.X = torch.tensor(dataframe[features].values).float().to(device)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            # Clone first: slicing returns a view, and masking a view would
            # permanently overwrite the stored features for every later item.
            x = self.X[i_start : (i + 1), :].clone()
            n_masked_cols = len(self.stations) * self.NYSM_VARS_PER_STATION
            # Guard against `-0:`, which would select (and mask) every column.
            if n_masked_cols > 0:
                x[: self.forecast_hr, -n_masked_cols:] = self.MASK_VALUE
        else:
            # NOTE(review): padded windows are not masked, as in the original
            # code -- confirm whether that is intended.
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = torch.cat((padding, self.X[0 : (i + 1), :]), 0)

        return x, self.y[i]

In [None]:
def format_climate_df(data_path):
    """
    Load a numeric climate-index text file into a pandas DataFrame.

    The file is read with ``numpy.loadtxt`` (default whitespace delimiter) and
    wrapped in a DataFrame; all other columns keep their integer positions as
    labels.

    Args:
        data_path (str): Location of the climate index file.

    Returns:
        pandas.DataFrame: The table, with column 0 renamed to "year".
    """
    table = pd.DataFrame(np.loadtxt(f"{data_path}"))
    return table.rename(columns={0: "year"})

In [None]:
def get_clim_indexes(df, valid_times):
    """
    Append one column per climate index to a weather DataFrame.

    Every ``.txt`` file in the hard-coded index directory is loaded with
    ``format_climate_df``. For each timestamp, the table row whose "year"
    equals the timestamp's year is selected, and the value stored under the
    column whose zero-padded label equals the timestamp's two-digit month is
    collected. The collected values become a new column named after the file
    (the part of the file name before the first ".").

    Parameters:
    df (pandas.DataFrame): Frame to extend. It is modified in place: a
        temporary 'valid_time' column and the index columns are written to it.
    valid_times: One timestamp per row of `df`; each value is converted with
        ``str`` and then parsed with ``parse``.

    Returns:
    pandas.DataFrame: A frame containing the added climate-index columns, with
    the temporary 'valid_time' column removed.
    """

    clim_df_path = "/home/aevans/nwp_bias/src/correlation/data/indexes/"
    index_files = os.listdir(clim_df_path)
    df["valid_time"] = valid_times

    for file_name in index_files:
        # Only text tables are treated as climate-index files.
        if not file_name.endswith(".txt"):
            continue

        clim_df = format_climate_df(f"{clim_df_path}{file_name}")
        index_name = file_name.split(".")[0]

        index_values = []
        for time_obj in df["valid_time"]:
            stamp = parse(str(time_obj))
            year = stamp.strftime("%Y")
            month = stamp.strftime("%m")

            # Rows of the index table for this year, without the year column.
            year_rows = clim_df.loc[clim_df["year"] == int(year)].drop(
                columns="year"
            )
            row_values = year_rows.values

            # Pair each remaining column label with the first row's value, then
            # keep the value whose label matches the timestamp's month.
            labelled = [
                (label, row_values[0, pos])
                for pos, label in enumerate(year_rows.keys().tolist())
            ]
            index_values.extend(
                value for label, value in labelled if str(label).zfill(2) == month
            )

        # One value per timestamp becomes the new index column.
        df[index_name] = index_values

    return df.drop(columns="valid_time")

In [None]:
df = pd.read_parquet(
    "/home/aevans/nwp_bias/src/machine_learning/data/clean_parquets/nysm_cats/cleaned_rough_lstm_nysmcat_Western Plateau.parquet"
)
df = df.dropna()

In [None]:
valid_times = df["valid_time"].tolist()

In [None]:
df

In [None]:
for k in df.keys():
    print(k)

In [None]:
# Columns to reintegrate back into the df after the model is done running.
cols_to_carry = [
    "valid_time",
    "flag",
    "day_of_year_sin",
    "day_of_year_cos",
]

In [None]:
df = get_flag(df)
df = nwp_error("t2m", "ADDI", df)

In [None]:
new_df = df.copy()

In [None]:
def normalize_df(df, valid_times):
    """
    Normalize the columns of `df` and attach time and climate-index features.

    Steps, in order:
      1. `col_drop(df)` (project helper) is applied and rows containing NaN
         are dropped.
      2. Every column is visited once. Constant columns are scaled against the
         first-row values of the columns sharing their name minus the last
         five characters. Columns whose name contains "t2m" are replaced by
         the columns of `emd.sift.sift(...)` (one new column per IMF). All
         other columns are z-scored with their own mean and population
         standard deviation.
      3. Remaining NaN are filled with 0 and columns whose names match
         "latitude", "longitude", "u_total", "mslp" or "orog" are removed.
      4. `encode(final_df, "day_of_year", 366, valid_times)` and
         `get_clim_indexes(final_df, valid_times)` are applied.

    Args:
        df (pandas.DataFrame): Input data.
        valid_times: Passed through to `encode` and `get_clim_indexes`.

    Returns:
        tuple: `(final_df, new_features)`, where `new_features` lists every
        column of `final_df` except "target_error".
    """
    print("init normalizer")
    df = col_drop(df)
    the_df = df.dropna()
    # `r` (the column values) is unused; only the column name `k` drives the loop.
    for k, r in the_df.items():
        # Constant column: scale it against the first-row values of all columns
        # whose name matches this column's name minus its last five characters.
        if len(the_df[k].unique()) == 1:
            org_str = str(k)
            # NOTE(review): presumably strips a 5-character station suffix such
            # as "_ADDI" -- confirm against the column naming scheme.
            my_str = org_str[:-5]
            vals = the_df.filter(regex=my_str)
            # NOTE(review): `.loc[0]` is label-based, so a row labelled 0 must
            # have survived `dropna()` -- confirm.
            vals = vals.loc[0].tolist()
            means = st.mean(vals)
            stdevs = st.pstdev(vals)
            the_df[k] = (the_df[k] - means) / stdevs

            the_df = the_df.fillna(0)
            # |sh2|d2m|r2|u10|v10|tp|mslma|tcc|asnow|cape|dswrf|dlwrf|gh|utotal|u_dir|new_tp
        # Columns whose name contains "t2m" are decomposed with EMD sifting and
        # replaced by one column per intrinsic mode function (IMF).
        if re.search(
            "t2m",
            k,
        ):
            ind_val = the_df.columns.get_loc(k)
            x = the_df[k]
            imf = emd.sift.sift(x)
            the_df = the_df.drop(columns=k)
            for i in range(imf.shape[1]):
                imf_ls = imf[:, i].tolist()
                # Insert IMF i at the removed column's former position plus i,
                # so the IMFs take the place of the original column in order.
                my_loc = ind_val + i
                the_df.insert(loc=(my_loc), column=f"{k}_imf_{i}", value=imf_ls)

        # NOTE(review): this `else` pairs with the "t2m" check only, so constant
        # columns handled by the first `if` (and not matching "t2m") are
        # z-scored a second time here -- confirm whether `elif` was intended.
        else:
            means = st.mean(the_df[k])
            stdevs = st.pstdev(the_df[k])
            the_df[k] = (the_df[k] - means) / stdevs

    final_df = the_df.fillna(0)
    print("!!! Dropping Columns !!!")
    # Each line removes every column whose name matches the given regex.
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="latitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="longitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="u_total")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="mslp")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="orog")))]

    print("--- configuring data ---")
    # `encode` is a project helper; presumably it adds cyclical day-of-year
    # features with period 366 -- confirm in src.processing.
    final_df = encode(final_df, "day_of_year", 366, valid_times)
    final_df = get_clim_indexes(final_df, valid_times)
    # Every column except the target is treated as a model feature.
    new_features = list(final_df.columns.difference(["target_error"]))
    print("---normalize successful---")

    return final_df, new_features

In [None]:
def normalize_df_station(df):
    """
    Normalize the columns of `df`, decomposing the "_ADDI" columns with EMD.

    Rows containing NaN are dropped, then every column is visited once.
    Constant columns are scaled against the first-row values of the columns
    sharing their name minus the last five characters. Non-constant columns
    whose name contains "_ADDI" are replaced by the columns of
    `emd.sift.sift(...)` (one new column per IMF). All remaining columns are
    z-scored with their own mean and population standard deviation. Finally
    NaN are filled with 0 and columns whose names match "latitude",
    "longitude", "u_total", "mslp" or "orog" are removed.

    Args:
        df (pandas.DataFrame): Input data.

    Returns:
        tuple: `(final_df, new_features)`, where `new_features` lists every
        column of `final_df` except "target_error".
    """
    print("init normalizer")
    the_df = df.dropna()
    # `r` (the column values) is unused; only the column name `k` drives the loop.
    for k, r in the_df.items():
        # Constant column: scale it against the first-row values of all columns
        # whose name matches this column's name minus its last five characters.
        if len(the_df[k].unique()) == 1:
            org_str = str(k)
            # NOTE(review): presumably strips a 5-character station suffix such
            # as "_ADDI" -- confirm against the column naming scheme.
            my_str = org_str[:-5]
            vals = the_df.filter(regex=my_str)
            # NOTE(review): `.loc[0]` is label-based, so a row labelled 0 must
            # have survived `dropna()` -- confirm.
            vals = vals.loc[0].tolist()
            means = st.mean(vals)
            stdevs = st.pstdev(vals)
            the_df[k] = (the_df[k] - means) / stdevs

            the_df = the_df.fillna(0)
        # Non-constant "_ADDI" columns are decomposed with EMD sifting and
        # replaced by one column per intrinsic mode function (IMF).
        if not (len(the_df[k].unique()) == 1) and re.search("_ADDI", k):
            ind_val = the_df.columns.get_loc(k)
            x = the_df[k]
            imf = emd.sift.sift(x)
            the_df = the_df.drop(columns=k)
            for i in range(imf.shape[1]):
                imf_ls = imf[:, i].tolist()
                # Insert IMF i at the removed column's former position plus i,
                # so the IMFs take the place of the original column in order.
                my_loc = ind_val + i
                the_df.insert(loc=(my_loc), column=f"{k}_imf_{i}", value=imf_ls)

        # NOTE(review): this `else` pairs with the "_ADDI" check only, so
        # constant columns handled by the first `if` are z-scored a second time
        # here -- confirm whether `elif` was intended.
        else:
            means = st.mean(the_df[k])
            stdevs = st.pstdev(the_df[k])
            the_df[k] = (the_df[k] - means) / stdevs

    final_df = the_df.fillna(0)
    print("!!! Dropping Columns !!!")
    # Each line removes every column whose name matches the given regex.
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="latitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="longitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="u_total")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="mslp")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="orog")))]
    # Every column except the target is treated as a model feature.
    new_features = list(final_df.columns.difference(["target_error"]))
    print("---normalize successful---")
    return final_df, new_features

## create train and test set

In [None]:
target_sensor = "target_error"
# `new_features` is fixed here, before the lead column is added below, so the
# look-ahead target is never part of the feature list.
the_df, new_features = normalize_df(new_df, valid_times)

# Number of rows to look ahead when building the prediction target.
forecast_lead = 30
target = f"{target_sensor}_lead_{forecast_lead}"

# Pair each row with the target value found forecast_lead rows later, then drop
# the trailing rows that have no future value (NaN after the shift).
the_df[target] = the_df[target_sensor].shift(-forecast_lead)
the_df = the_df.iloc[:-forecast_lead]

# Normalize

In [None]:
for k in the_df.keys():
    print(k)

In [None]:
length = len(the_df)

# Despite its name, `test_len` is the number of TRAINING rows: the first 75%
# of the rows are used for training and the final 25% for testing.
test_len = int(length * 0.75)

# Positional split without shuffling.
df_train = the_df.iloc[:test_len].copy()
df_test = the_df.iloc[test_len:].copy()

print("Test Set Fraction", len(df_test) / len(the_df))

## Normalize

In [None]:
for c in cols_to_carry:
    df_train[c] = df[c]
    df_test[c] = df[c]

In [None]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [None]:
df_test

In [None]:
# import torch

# data = torch.tensor(df_train.values)


# # Perform PCA
# U, S, V = torch.pca_lowrank(data)

# # Print the results
# print(U)
# print(S)
# print(V)

In [None]:
# U.shape

In [None]:
# S.shape

In [None]:
# V.shape

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)
# pca.fit(df_train)
# print(pca.components_) # Reformat and view results
# loadings = pd.DataFrame(pca.components_.T,
# columns=['PC%s' % _ for _ in range((pca.components_.shape[0]))],
# index=df_train.columns)
# print(loadings)

# plt.plot(pca.explained_variance_ratio_)
# plt.ylabel('Explained Variance')
# plt.xlabel('Components')
# plt.show()

In [None]:
# df = pd.DataFrame(pca.components_)
# df

In [None]:
# loadings

## Create LSTM

In [None]:
class SequenceDataset(Dataset):
    """Sliding-window dataset returning (window, target, flag) triples.

    Item ``i`` consists of the ``sequence_length`` feature rows ending at row
    ``i`` (left-padded with copies of row 0 when fewer rows are available),
    the target value of row ``i`` and the "flag" value of row ``i``.

    Args:
        dataframe: pandas DataFrame with the target, feature and "flag" columns.
        target: Name of the target column.
        features: List of feature column names.
        sequence_length: Number of rows in every returned window.
    """

    def __init__(self, dataframe, target, features, sequence_length):
        self.dataframe = dataframe
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        keep_sample = self.dataframe.iloc[i]["flag"]

        # Number of rows that would lie before the start of the data.
        missing = self.sequence_length - i - 1
        if missing <= 0:
            first = i - self.sequence_length + 1
            window = self.X[first : (i + 1), :]
        else:
            # Not enough history yet: repeat the very first row as padding.
            pad = self.X[0].repeat(missing, 1)
            window = torch.cat((pad, self.X[0 : (i + 1), :]), 0)

        return window, self.y[i], keep_sample

In [None]:
import torch
import torch.nn as nn


class PeepholeLSTMCell(nn.Module):
    """Single LSTM cell with peephole connections.

    The input, forget and output gates each receive an extra element-wise
    ("diagonal") contribution from the cell state, with dedicated peephole
    weights ``p_i``, ``p_f`` and ``p_o``. Previously the peephole term reused
    ``W_c`` (an ``input_size -> hidden_size`` layer that also drives the
    candidate state) on the cell state, which only type-checks when
    ``input_size == hidden_size``.

    Args:
        input_size: Number of features of one time step.
        hidden_size: Size of the hidden and cell state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Input-to-hidden projections for the three gates and the candidate.
        self.W_i = nn.Linear(input_size, hidden_size)
        self.W_f = nn.Linear(input_size, hidden_size)
        self.W_o = nn.Linear(input_size, hidden_size)
        self.W_c = nn.Linear(input_size, hidden_size)

        # Hidden-to-hidden projections.
        self.U_i = nn.Linear(hidden_size, hidden_size)
        self.U_f = nn.Linear(hidden_size, hidden_size)
        self.U_o = nn.Linear(hidden_size, hidden_size)
        self.U_c = nn.Linear(hidden_size, hidden_size)

        # Element-wise peephole weights, initialised on the same scale that
        # nn.Linear uses for a layer with `hidden_size` inputs.
        bound = hidden_size ** -0.5
        self.p_i = nn.Parameter(torch.empty(hidden_size))
        self.p_f = nn.Parameter(torch.empty(hidden_size))
        self.p_o = nn.Parameter(torch.empty(hidden_size))
        for peephole in (self.p_i, self.p_f, self.p_o):
            nn.init.uniform_(peephole, -bound, bound)

    def forward(self, x, h_prev, c_prev):
        """Advance the cell by one time step.

        Args:
            x: Input of shape (batch, input_size).
            h_prev: Previous hidden state, shape (batch, hidden_size).
            c_prev: Previous cell state, shape (batch, hidden_size).

        Returns:
            tuple: The new ``(h, c)`` states, each (batch, hidden_size).
        """
        i = torch.sigmoid(self.W_i(x) + self.U_i(h_prev) + self.p_i * c_prev)
        f = torch.sigmoid(self.W_f(x) + self.U_f(h_prev) + self.p_f * c_prev)
        c_tilde = torch.tanh(self.W_c(x) + self.U_c(h_prev))
        c = f * c_prev + i * c_tilde
        # The output gate peeks at the *updated* cell state.
        o = torch.sigmoid(self.W_o(x) + self.U_o(h_prev) + self.p_o * c)
        h = o * torch.tanh(c)
        return h, c


class ShallowRegressionPeepholeLSTM(nn.Module):
    """Peephole-LSTM regressor producing one value per input sequence.

    Args:
        num_sensors: Number of features per time step.
        hidden_units: Size of the hidden state.
    """

    def __init__(self, num_sensors, hidden_units):
        super().__init__()
        self.num_sensors = num_sensors
        self.hidden_units = hidden_units
        # Kept for attribute compatibility; only one cell is created below.
        self.num_layers = 3

        # Create a list of LSTM cells
        self.lstm_cells = nn.ModuleList([PeepholeLSTMCell(num_sensors, hidden_units)])

        # # Add additional LSTM layers if needed
        # for _ in range(1, self.num_layers):
        #     self.lstm_cells.append(PeepholeLSTMCell(hidden_units, hidden_units))

        self.linear = nn.Linear(in_features=hidden_units, out_features=1)

    def forward(self, x):
        """Run the sequence through the cell(s) and regress from the last state.

        Args:
            x: Tensor of shape (batch, sequence_length, num_sensors).

        Returns:
            Tensor of shape (batch,) with one prediction per sequence.
        """
        batch_size, seq_len = x.shape[0], x.shape[1]

        layer_input = x
        h = x.new_zeros(batch_size, self.hidden_units)
        for lstm_cell in self.lstm_cells:
            # Every layer starts from a zero state on x's device and dtype.
            h = x.new_zeros(batch_size, self.hidden_units)
            c = x.new_zeros(batch_size, self.hidden_units)
            step_outputs = []
            # Feed the cell one time step at a time; the previous code passed
            # the whole 3-D tensor to the cell in a single call.
            for t in range(seq_len):
                h, c = lstm_cell(layer_input[:, t, :], h, c)
                step_outputs.append(h)
            if step_outputs:
                # Hidden states of this layer are the inputs of the next one.
                layer_input = torch.stack(step_outputs, dim=1)

        # Use the final hidden state of every sample (previously `h[0]`, which
        # produced a prediction for the first sample only).
        out = self.linear(h).flatten()
        return out

In [None]:
class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    Args:
        patience: Number of "bad" checks tolerated before stopping.
        min_delta: Margin above the best loss that a loss must exceed to count
            as a bad check.
    """

    def __init__(self, patience, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record one validation loss; return True when training should stop."""
        # New best: remember it and reset the bad-check counter.
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Losses within `min_delta` of the best neither count nor reset.
        if not validation_loss > (self.min_validation_loss + self.min_delta):
            return False

        self.counter += 1
        return self.counter >= self.patience

In [None]:
def remove_elements_from_batch(X, y, s):
    """Return X, y and s restricted to the positions where `s` is non-zero."""
    keep_idx = np.where(s)
    kept = tuple(member[keep_idx] for member in (X, y, s))
    return kept

In [None]:
def train_model(data_loader, model, loss_function, optimizer):
    """Train `model` for one pass over `data_loader`.

    Each batch yields (features, target, flag); the flag is currently unused
    because filtering with `remove_elements_from_batch` is disabled.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    n_batches = len(data_loader)
    running_loss = 0
    model.train()

    with tqdm(data_loader, unit="batch") as progress:
        for features, labels, _flags in progress:
            prediction = model(features)
            batch_loss = loss_function(prediction, labels)

            # Standard step: clear old gradients, backpropagate, update.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

    mean_loss = running_loss / n_batches
    print(f"Train loss: {mean_loss}")
    return mean_loss


def test_model(data_loader, model, loss_function):
    """Evaluate `model` on `data_loader` without tracking gradients.

    Each batch yields (features, target, flag); the flag is currently unused
    because filtering with `remove_elements_from_batch` is disabled.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    n_batches = len(data_loader)
    running_loss = 0

    model.eval()
    with torch.no_grad(), tqdm(data_loader, unit="batch") as progress:
        for features, labels, _flags in progress:
            batch_loss = loss_function(model(features), labels)
            running_loss += batch_loss.item()

    mean_loss = running_loss / n_batches
    print(f"Test loss: {mean_loss}")

    return mean_loss

In [None]:
# Fixed seed so weight initialisation and shuffling are reproducible.
torch.manual_seed(101)
batch_size = 400
sequence_length = 400
learning_rate = 5e-4
num_hidden_units = 550

# The Comet API key is a credential and must not be committed to source
# control: it is read from the COMET_API_KEY environment variable instead.
# When the variable is unset, None is passed and Comet falls back to its own
# configuration lookup. The key that used to be hard-coded here should be
# treated as leaked and rotated.
experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="fh_2_hrrr",
    workspace="shmaronshmevans",
)
# # Report multiple hyperparameters using a dictionary:
# hyper_params = {
#     "num_layers": num_layers,
#     "learning_rate": learning_rate,
#     "sequence_length": sequence_length,
#     "batch_size": batch_size,
#     "num_hidden_units": num_hidden_units,
#     "forecast_lead": forecast_lead,
# }

train_dataset = SequenceDataset(
    df_train, target=target, features=new_features, sequence_length=sequence_length
)
test_dataset = SequenceDataset(
    df_test, target=target, features=new_features, sequence_length=sequence_length
)

# Only the training loader is shuffled; the test loader keeps row order so
# predictions can later be written back next to their rows.
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)

# Pull one batch as a shape sanity check (X and y are reused by later cells).
X, y, s = next(iter(train_loader))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

model = ShallowRegressionPeepholeLSTM(
    num_sensors=len(new_features), hidden_units=num_hidden_units
)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
early_stopper = EarlyStopper(patience=10, min_delta=0)


print("Untrained test\n--------")
test_model(test_loader, model, loss_function)
print()

# NOTE(review): with only 3 epochs and patience=10 the early stopper can never
# trigger; also the test set doubles as the validation set here -- confirm.
for ix_epoch in range(3):
    print(f"Epoch {ix_epoch}\n---------")
    train_loss = train_model(train_loader, model, loss_function, optimizer=optimizer)
    val_loss = test_model(test_loader, model, loss_function)
    print()
    experiment.log_epoch_end(ix_epoch)
    # experiment.log_parameters(hyper_params, step = ix_epoch)
    if early_stopper.early_stop(val_loss):
        break

# Seamlessly log your Pytorch model
log_model(experiment, model, model_name="exp1")
experiment.end()

## Evaluate Model

In [None]:
def predict(data_loader, model):
    """Run `model` over every batch of `data_loader` and join the outputs.

    The model is put in eval mode and gradients are disabled. Batches yield
    (features, target, flag); only the features are used.

    Returns:
        torch.Tensor: All batch outputs concatenated along dimension 0 (an
        empty tensor when the loader yields no batches).
    """
    # Start from an empty tensor so an empty loader still returns a tensor.
    pieces = [torch.tensor([])]
    model.eval()
    with torch.no_grad():
        for X, _, s in data_loader:
            pieces.append(model(X))
        joined = torch.cat(pieces, 0)

    return joined


# Unshuffled loader over the training set so predictions line up with rows.
train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)


# Store the model output next to the data it was computed from.
ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_eval_loader, model).numpy()
df_test[ystar_col] = predict(test_loader, model).numpy()
print(df_test[ystar_col])

# Train rows followed by test rows, keeping only target and prediction.
df_out = pd.concat((df_train, df_test))[[target, ystar_col]]

# NOTE(review): this rescales each column with the mean/std of the column
# itself (i.e. of the already-normalised values), not with the statistics that
# were used when the data was normalised -- confirm this is the intended
# "de-normalisation".
for c in df_out.columns:
    vals = df_out[c].values.tolist()
    mean = st.mean(vals)
    std = st.pstdev(vals)
    df_out[c] = df_out[c] * std + mean

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Global plotly theme plus a layout template with larger fonts.
pio.templates.default = "plotly_white"
plot_template = dict(
    layout=go.Layout(
        {"font_size": 18, "xaxis_title_font_size": 24, "yaxis_title_font_size": 24}
    )
)


# One line per column of df_out (target and model forecast).
fig = px.line(df_out, labels=dict(created_at="Date", value="Forecast Error"))
# Vertical marker at 75% of the row count, the same fraction used for the
# train/test split.
# NOTE(review): x is given in data coordinates, so this assumes df_out has a
# 0..N-1 integer index -- confirm.
fig.add_vline(x=(length * 0.75), line_width=4, line_dash="dash")
fig.add_annotation(
    xref="paper", x=0.75, yref="paper", y=0.8, text="Test set start", showarrow=False
)
fig.update_layout(
    template=plot_template, legend=dict(orientation="h", y=1.02, title_text="")
)
fig.show()

In [None]:
# import shap

# # Use the training data for deep explainer => can use fewer instances
# explainer = shap.DeepExplainer(model, X_train)
# # explain the the testing instances (can use fewer instanaces)
# # explaining each prediction requires 2 * background dataset size runs
# shap_values = explainer.shap_values(X_test)
# # init the JS visualization code
# shap.initjs()
# shap.force_plot(explainer.expected_value[0], shap_values[0][0], features)

In [None]:
# scikit-learn related imports
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# pytorch relates imports
import torch
import torch.nn as nn
import torch.optim as optim

# imports from captum library
from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
from captum.attr import (
    IntegratedGradients,
    DeepLift,
    GradientShap,
    NoiseTunnel,
    FeatureAblation,
)

# Captum

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).view(-1, 1).float()

X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).view(-1, 1).float()

In [None]:
# import shap

# # Use the training data for deep explainer => can use fewer instances
# explainer = shap.DeepExplainer(model, y_train)
# # explain the the testing instances (can use fewer instanaces)
# # explaining each prediction requires 2 * background dataset size runs
# shap_values = explainer.shap_values(X_test)
# # init the JS visualization code
# shap.initjs()
# shap.force_plot(explainer.expected_value[0], shap_values[0][0], features)

In [None]:
model.eval()
outputs = model(X_test)
err = np.sqrt(mean_squared_error(outputs.detach().numpy(), y_test.detach().numpy()))

print("model err: ", err)

In [None]:
ig = IntegratedGradients(model)
ig_nt = NoiseTunnel(ig)
dl = DeepLift(model)
gs = GradientShap(model)
fa = FeatureAblation(model)

In [None]:
ig_nt_attr_test = ig_nt.attribute(X_test)

In [None]:
# ig_attr_test_norm_sum.shape[1]

In [None]:
# X_test.shape[]

In [None]:
new_features

In [None]:
ig_nt_attr_test_norm_sum.shape

In [None]:
# Paging state: each figure shows positions b..e-1; n selects one column of
# the normalised attribution sum to plot.
b = 0
e = 20
n = 10
# prepare attributions for visualization

# NOTE(review): X_test.shape[1] is the second axis of X_test; if X_test is
# (batch, sequence, features) this is the sequence axis, not the feature axis,
# yet the labels below are looked up in new_features -- confirm.
x_axis_data = np.arange(X_test.shape[1])
x_axis_data_labels = list(map(lambda idx: new_features[idx], x_axis_data))

# One figure per page of 20 positions. The loop ends as soon as e reaches
# len(x_axis_data), so a final page (including an exact multiple of 20) is not
# drawn.
while e < len(x_axis_data):
    # Sum attributions over axis 0 and L1-normalise. These values do not depend
    # on b/e and are recomputed on every iteration.
    ig_nt_attr_test_sum = ig_nt_attr_test.detach().numpy().sum(0)
    ig_nt_attr_test_norm_sum = ig_nt_attr_test_sum / np.linalg.norm(
        ig_nt_attr_test_sum, ord=1
    )

    # L1-normalised weights of the model's final linear layer.
    lin_weight = model.linear.weight[0].detach().numpy()
    y_axis_lin_weight = lin_weight / np.linalg.norm(lin_weight, ord=1)

    width = 0.14
    legends = ["Int Grads w/SmoothGrad", "Weights"]

    plt.figure(figsize=(20, 10))

    ax = plt.subplot()
    ax.set_title(
        "Comparing input feature importances across multiple algorithms and learned weights"
    )
    ax.set_ylabel("Attributions")

    FONT_SIZE = 16
    plt.rc("font", size=FONT_SIZE)  # fontsize of the text sizes
    plt.rc("axes", titlesize=FONT_SIZE)  # fontsize of the axes title
    plt.rc("axes", labelsize=FONT_SIZE)  # fontsize of the x and y labels
    plt.rc("legend", fontsize=FONT_SIZE - 4)  # fontsize of the legend

    print(x_axis_data.shape)

    # Attribution bars for the current page (column n of the normalised sum).
    ax.bar(
        x_axis_data[b:e] + width,
        ig_nt_attr_test_norm_sum[b:e, n],
        width,
        align="center",
        alpha=0.7,
        color="#A90000",
    )
    # Linear-layer weight bars, offset to the right of the attribution bars.
    ax.bar(
        x_axis_data[b:e] + 5 * width,
        y_axis_lin_weight[b:e],
        width,
        align="center",
        alpha=1.0,
        color="grey",
    )
    ax.autoscale_view()
    plt.tight_layout()

    ax.set_xticks(x_axis_data[b:e] + 0.5)
    ax.set_xticklabels(x_axis_data_labels[b:e], rotation=90)

    plt.legend(legends, loc=3)
    plt.show()

    # Advance to the next page of 20 positions.
    b += 20
    e += 20

Number of feature columns = 50
Compute LSTM Feature Importance
After we train (or load) each fold model, we will compute LSTM feature importance for all of our features. We do this with a for-loop of size N where N is the number of features we have. 

For each feature we wish to evaluate, we run inference on the out-of-fold (OOF) data with that feature column randomly shuffled. If the feature is important to our LSTM model, the OOF mean absolute error (MAE) will get worse for that iteration of the for-loop. After the loop, we plot one bar per feature whose height is the amount by which the MAE worsened when that feature was shuffled; this is the importance of each feature.

Note that computing LSTM feature importance after each fold will add about 1 minute for every 5 features.