drop all cols with strings and unvaluable info like index
Always make sure it is sorted correctly by time and station
edit time of year with cos/sin "encode"
df['day_of_year'] = df['valid_time'].dt.dayofyear
?? subrtract a row in setting forecast hours ??

In [1]:
%load_ext autoreload
%autoreload 2
import sys

# instead of creating a package using setup.py or building from a docker/singularity file,
# import the sister directory of src code to be called on in notebook.
# This keeps the notebook free from code to only hold visualizations and is easier to test
# It also helps keep the state of variables clean such that cells aren't run out of order with a mysterious state
sys.path.append(".")

In [2]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
from comet_ml import Optimizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import os
import datetime as dt
from dateutil.parser import parse
from tqdm import tqdm
import re
import emd
import statistics as st
from multiprocessing import Process

In [3]:
def col_drop(df):
    df = df.drop(
        columns=[
            "day_of_year",
            "flag",
            "station",
            "latitude",
            "longitude",
            "t2m",
            "sh2",
            "d2m",
            "r2",
            "u10",
            "v10",
            "tp",
            "mslma",
            "orog",
            "tcc",
            "asnow",
            "cape",
            "dswrf",
            "dlwrf",
            "gh",
            "u_total",
            "u_dir",
            "new_tp",
            "lat",
            "lon",
            "elev",
            "tair",
            "ta9m",
            "td",
            "relh",
            "srad",
            "pres",
            "mslp",
            "wspd_sonic",
            "wmax_sonic",
            "wdir_sonic",
            "precip_total",
            "snow_depth",
            "day_of_year",
            "day_of_year_sin",
            "day_of_year_cos",
            "11_nlcd",
            "21_nlcd",
            "22_nlcd",
            "23_nlcd",
            "24_nlcd",
            "31_nlcd",
            "41_nlcd",
            "42_nlcd",
            "43_nlcd",
            "52_nlcd",
            "71_nlcd",
            "81_nlcd",
            "82_nlcd",
            "90_nlcd",
            "95_nlcd",
            "19_aspect",
            "21_aspect",
            "24_aspect",
            "27_aspect",
            "28_aspect",
            "22_aspect",
            "23_aspect",
            "25_aspect",
            "26_aspect",
            "31_aspect",
            "33_aspect",
            "32_aspect",
            "34_aspect",
            "38_aspect",
            "std_elev",
            "variance_elev",
            "skew_elev",
            "med_dist_elev",
        ]
    )
    df = df[df.columns.drop(list(df.filter(regex="time")))]
    df = df[df.columns.drop(list(df.filter(regex="station")))]
    df = df[df.columns.drop(list(df.filter(regex="tair")))]
    df = df[df.columns.drop(list(df.filter(regex="ta9m")))]
    df = df[df.columns.drop(list(df.filter(regex="td")))]
    df = df[df.columns.drop(list(df.filter(regex="relh")))]
    df = df[df.columns.drop(list(df.filter(regex="srad")))]
    df = df[df.columns.drop(list(df.filter(regex="pres")))]
    df = df[df.columns.drop(list(df.filter(regex="wspd")))]
    df = df[df.columns.drop(list(df.filter(regex="wmax")))]
    df = df[df.columns.drop(list(df.filter(regex="wdir")))]
    df = df[df.columns.drop(list(df.filter(regex="precip_total")))]
    df = df[df.columns.drop(list(df.filter(regex="snow_depth")))]

    return df

In [4]:
def get_flag(hrrr_df):
    """
    Create a flag column in the input DataFrame indicating consecutive hourly time intervals.

    This function takes a DataFrame containing weather data for different stations, with a 'station' column
    representing the station ID and a 'valid_time' column containing timestamps of the weather data.
    It calculates the time difference between consecutive timestamps for each station and marks it as 'True'
    in a new 'flag' column if the difference is exactly one hour, indicating consecutive hourly time intervals.
    Otherwise, it marks the 'flag' as 'False'.

    Parameters:
    hrrr_df (pandas.DataFrame): Input DataFrame containing weather data for different stations.

    Returns:
    pandas.DataFrame: The input DataFrame with an additional 'flag' column indicating consecutive hourly time intervals.

    Example:
      station           valid_time   flag
    0        1 2023-08-01 00:00:00   True
    1        1 2023-08-01 01:00:00   False
    2        1 2023-08-01 03:00:00   False
    3        2 2023-08-01 08:00:00   True
    4        2 2023-08-01 09:00:00   False
    5        2 2023-08-01 11:00:00   True
    """

    # Get unique station IDs
    stations_ls = hrrr_df["station"].unique()

    # Define a time interval of one hour
    one_hour = dt.timedelta(hours=1)

    # Initialize a list to store flags for each time interval
    flag_ls = []

    # Loop through each station and calculate flags for consecutive hourly time intervals
    for station in stations_ls:
        # Filter DataFrame for the current station
        df = hrrr_df[hrrr_df["station"] == station]

        # Get the list of valid_time timestamps for the current station
        time_ls = df["valid_time"].tolist()

        # Compare each timestamp with the next one to determine consecutive intervals
        for now, then in zip(time_ls, time_ls[1:]):
            if now + one_hour == then:
                flag_ls.append(True)
            else:
                flag_ls.append(False)

    # Append an extra True to indicate the last time interval (since it has no next timestamp for comparison)
    flag_ls.append(True)

    # Add the 'flag' column to the DataFrame
    hrrr_df["flag"] = flag_ls

    return hrrr_df

In [5]:
def nwp_error(target, station, df):
    vars_dict = {
        "t2m": "tair",
        "mslma": "pres",
    }
    nysm_var = vars_dict.get(target)

    df["target_error"] = df[f"{target}_{station}"] - df[f"{nysm_var}_{station}"]
    return df

In [6]:
def encode(data, col, max_val, valid_times):
    data["valid_time"] = valid_times
    data = data[data.columns.drop(list(data.filter(regex="day")))]
    data["day_of_year"] = data["valid_time"].dt.dayofyear
    data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val).astype(float)
    data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)
    data = data.drop(columns=["valid_time", "day_of_year"]).astype(float)

    return data

In [7]:
def format_climate_df(data_path):
    """
    Formats a climate data file located at the specified `data_path` into a pandas DataFrame.

    Args:
        data_path (str): The file path for the climate data file.

    Returns:
        pandas.DataFrame: A DataFrame containing the climate data, with the first column renamed to "year".
    """
    raw_index = np.loadtxt(f"{data_path}")
    cl_index = pd.DataFrame(raw_index)
    cl_index = cl_index.rename(columns={0: "year"})
    return cl_index

In [8]:
def get_clim_indexes(df, valid_times):
    """
    Fetch climate indexes data and add corresponding index values to the input DataFrame.

    This function takes a DataFrame (`df`) containing weather data with a 'valid_time' column representing
    timestamps. It reads climate indexes data from text files in the specified directory and extracts index
    values corresponding to the month and year of each timestamp in the DataFrame. The extracted index values
    are then added to the DataFrame with new columns named after each index.

    Parameters:
    df (pandas.DataFrame): Input DataFrame containing weather data with a 'valid_time' column.

    Returns:
    pandas.DataFrame: The input DataFrame with additional columns for each climate index containing their values.
    """

    clim_df_path = "/home/aevans/nwp_bias/src/correlation/data/indexes/"
    directory = os.listdir(clim_df_path)
    df["valid_time"] = valid_times

    # Loop through each file in the specified directory
    for d in directory:
        if d.endswith(".txt"):
            # Read the climate index data from the file and format it into a DataFrame
            clim_df = format_climate_df(f"{clim_df_path}{d}")
            index_name = d.split(".")[0]

            clim_ind_ls = []
            for t, _ in enumerate(df["valid_time"]):
                time_obj = df["valid_time"].iloc[t]
                dt_object = parse(str(time_obj))
                year = dt_object.strftime("%Y")
                month = dt_object.strftime("%m")
                # Filter the climate DataFrame to get data for the specific year
                df1 = clim_df.loc[clim_df["year"] == int(year)]
                df1 = df1.drop(columns="year")
                row_list = df1.values
                keys = df1.keys()
                key_vals = keys.tolist()

                # Extract the index value corresponding to the month of the timestamp
                the_list = []
                for n, _ in enumerate(key_vals):
                    val1 = key_vals[n]
                    val2 = row_list[0, n]
                    tup = (val1, val2)
                    the_list.append(tup)
                for k, r in the_list:
                    if str(k).zfill(2) == month:
                        clim_ind_ls.append(r)

            # Add the climate index values as a new column in the DataFrame
            df[index_name] = clim_ind_ls

    df = df.drop(columns="valid_time")
    return df

In [9]:
df = pd.read_parquet(
    "/home/aevans/nwp_bias/src/machine_learning/data/clean_parquets/nysm_cats/cleaned_rough_lstm_nysmcat_Western Plateau.parquet"
)
df = df.dropna()

In [10]:
valid_times = df["valid_time"].tolist()

In [11]:
df

Unnamed: 0,valid_time,time,station_ADDI,latitude_ADDI,longitude_ADDI,t2m_ADDI,sh2_ADDI,d2m_ADDI,r2_ADDI,u10_ADDI,...,26_aspect_RAND,31_aspect_RAND,33_aspect_RAND,32_aspect_RAND,34_aspect_RAND,38_aspect_RAND,std_elev_RAND,variance_elev_RAND,skew_elev_RAND,med_dist_elev_RAND
0,2018-01-01 03:00:00,2018-01-01 01:00:00,ADDI,42.045955,-77.218867,-19.199194,0.00068,-21.993216,76.800003,3.321081,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
1,2018-01-01 04:00:00,2018-01-01 02:00:00,ADDI,42.045955,-77.218867,-19.118689,0.00068,-21.918570,76.500000,3.298830,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
2,2018-01-01 05:00:00,2018-01-01 03:00:00,ADDI,42.045955,-77.218867,-19.439062,0.00065,-22.570868,73.800003,2.992857,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
3,2018-01-01 06:00:00,2018-01-01 04:00:00,ADDI,42.045955,-77.218867,-19.760291,0.00063,-22.821158,74.400002,3.115693,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
4,2018-01-01 07:00:00,2018-01-01 05:00:00,ADDI,42.045955,-77.218867,-19.860175,0.00062,-22.993704,74.099998,3.268302,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22486,2022-12-31 19:00:00,2022-12-31 17:00:00,ADDI,42.045955,-77.218867,11.485986,0.00710,8.142847,80.300003,0.073768,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
22487,2022-12-31 20:00:00,2022-12-31 18:00:00,ADDI,42.045955,-77.218867,10.447198,0.00676,7.376611,80.400002,-0.416093,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
22488,2022-12-31 21:00:00,2022-12-31 19:00:00,ADDI,42.045955,-77.218867,9.658258,0.00709,8.101648,88.000000,0.234075,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002
22489,2022-12-31 22:00:00,2022-12-31 20:00:00,ADDI,42.045955,-77.218867,9.118555,0.00711,8.145227,92.000000,0.285457,...,5.217391,0.0,0.0,0.0,0.0,0.0,77.124478,5896.461701,0.569549,405.940002


In [12]:
for k in df.keys():
    print(k)

valid_time
time
station_ADDI
latitude_ADDI
longitude_ADDI
t2m_ADDI
sh2_ADDI
d2m_ADDI
r2_ADDI
u10_ADDI
v10_ADDI
tp_ADDI
mslma_ADDI
orog_ADDI
tcc_ADDI
asnow_ADDI
cape_ADDI
dswrf_ADDI
dlwrf_ADDI
gh_ADDI
u_total_ADDI
u_dir_ADDI
new_tp_ADDI
lat_ADDI
lon_ADDI
elev_ADDI
tair_ADDI
ta9m_ADDI
td_ADDI
relh_ADDI
srad_ADDI
pres_ADDI
mslp_ADDI
wspd_sonic_ADDI
wmax_sonic_ADDI
wdir_sonic_ADDI
precip_total_ADDI
snow_depth_ADDI
day_of_year_ADDI
day_of_year_sin_ADDI
day_of_year_cos_ADDI
11_nlcd_ADDI
21_nlcd_ADDI
22_nlcd_ADDI
23_nlcd_ADDI
24_nlcd_ADDI
31_nlcd_ADDI
41_nlcd_ADDI
42_nlcd_ADDI
43_nlcd_ADDI
52_nlcd_ADDI
71_nlcd_ADDI
81_nlcd_ADDI
82_nlcd_ADDI
90_nlcd_ADDI
95_nlcd_ADDI
19_aspect_ADDI
21_aspect_ADDI
24_aspect_ADDI
27_aspect_ADDI
28_aspect_ADDI
22_aspect_ADDI
23_aspect_ADDI
25_aspect_ADDI
26_aspect_ADDI
31_aspect_ADDI
33_aspect_ADDI
32_aspect_ADDI
34_aspect_ADDI
38_aspect_ADDI
std_elev_ADDI
variance_elev_ADDI
skew_elev_ADDI
med_dist_elev_ADDI
time_ADDI
station
latitude
longitude
t2m
sh2
d2m
r2
u10

In [13]:
# columns to reintigrate back into the df after model is done running
cols_to_carry = cols_to_carry = [
    "valid_time",
    "flag",
    "day_of_year_sin",
    "day_of_year_cos",
]

In [14]:
df = get_flag(df)
df = nwp_error("t2m", "ADDI", df)

In [15]:
new_df = df.copy()

In [16]:
def normalize_df(df, valid_times):
    print("init normalizer")
    df = col_drop(df)
    the_df = df.dropna()
    for k, r in the_df.items():
        if len(the_df[k].unique()) == 1:
            org_str = str(k)
            my_str = org_str[:-5]
            vals = the_df.filter(regex=my_str)
            vals = vals.loc[0].tolist()
            means = st.mean(vals)
            stdevs = st.pstdev(vals)
            the_df[k] = (the_df[k] - means) / stdevs

            the_df = the_df.fillna(0)
            # |sh2|d2m|r2|u10|v10|tp|mslma|tcc|asnow|cape|dswrf|dlwrf|gh|utotal|u_dir|new_tp
        if re.search(
            "t2m",
            k,
        ):
            ind_val = the_df.columns.get_loc(k)
            x = the_df[k]
            imf = emd.sift.sift(x)
            the_df = the_df.drop(columns=k)
            for i in range(imf.shape[1]):
                imf_ls = imf[:, i].tolist()
                # Inserting the column at the
                # beginning in the DataFrame
                my_loc = ind_val + i
                the_df.insert(loc=(my_loc), column=f"{k}_imf_{i}", value=imf_ls)

        else:
            means = st.mean(the_df[k])
            stdevs = st.pstdev(the_df[k])
            the_df[k] = (the_df[k] - means) / stdevs

    final_df = the_df.fillna(0)
    print("!!! Dropping Columns !!!")
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="latitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="longitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="u_total")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="mslp")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="orog")))]

    print("--- configuring data ---")
    final_df = encode(final_df, "day_of_year", 366, valid_times)
    final_df = get_clim_indexes(final_df, valid_times)
    new_features = list(final_df.columns.difference(["target_error"]))
    print("---normalize successful---")

    return final_df, new_features

In [17]:
def normalize_df_station(df):
    print("init normalizer")
    the_df = df.dropna()
    for k, r in the_df.items():
        if len(the_df[k].unique()) == 1:
            org_str = str(k)
            my_str = org_str[:-5]
            vals = the_df.filter(regex=my_str)
            vals = vals.loc[0].tolist()
            means = st.mean(vals)
            stdevs = st.pstdev(vals)
            the_df[k] = (the_df[k] - means) / stdevs

            the_df = the_df.fillna(0)
        if not (len(the_df[k].unique()) == 1) and re.search("_ADDI", k):
            ind_val = the_df.columns.get_loc(k)
            x = the_df[k]
            imf = emd.sift.sift(x)
            the_df = the_df.drop(columns=k)
            for i in range(imf.shape[1]):
                imf_ls = imf[:, i].tolist()
                # Inserting the column at the
                # beginning in the DataFrame
                my_loc = ind_val + i
                the_df.insert(loc=(my_loc), column=f"{k}_imf_{i}", value=imf_ls)

        else:
            means = st.mean(the_df[k])
            stdevs = st.pstdev(the_df[k])
            the_df[k] = (the_df[k] - means) / stdevs

    final_df = the_df.fillna(0)
    print("!!! Dropping Columns !!!")
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="latitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="longitude")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="u_total")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="mslp")))]
    final_df = final_df[final_df.columns.drop(list(final_df.filter(regex="orog")))]
    new_features = list(final_df.columns.difference(["target_error"]))
    print("---normalize successful---")
    return final_df, new_features

## create train and test set

In [18]:
target_sensor = "target_error"
the_df, new_features = normalize_df(new_df, valid_times)

forecast_lead = 30
target = f"{target_sensor}_lead_{forecast_lead}"

the_df[target] = the_df[target_sensor].shift(-forecast_lead)
the_df = the_df.iloc[:-forecast_lead]

init normalizer


  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]
  out_args[idx] = out_args[idx][:, np.newaxis]


!!! Dropping Columns !!!
--- configuring data ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["day_of_year"] = data["valid_time"].dt.dayofyear
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col + "_sin"] = np.sin(2 * np.pi * data[col] / max_val).astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col + "_cos"] = np.cos(2 * np.pi * data[col] / max_val)


---normalize successful---


# Normalize

In [19]:
for k in the_df.keys():
    print(k)

t2m_ADDI_imf_0
t2m_ADDI_imf_1
t2m_ADDI_imf_2
t2m_ADDI_imf_3
t2m_ADDI_imf_4
t2m_ADDI_imf_5
t2m_ADDI_imf_6
t2m_ADDI_imf_7
t2m_ADDI_imf_8
t2m_ADDI_imf_9
sh2_ADDI
d2m_ADDI
r2_ADDI
u10_ADDI
v10_ADDI
tp_ADDI
mslma_ADDI
tcc_ADDI
asnow_ADDI
cape_ADDI
dswrf_ADDI
dlwrf_ADDI
gh_ADDI
u_dir_ADDI
new_tp_ADDI
lat_ADDI
lon_ADDI
elev_ADDI
11_nlcd_ADDI
21_nlcd_ADDI
22_nlcd_ADDI
23_nlcd_ADDI
24_nlcd_ADDI
31_nlcd_ADDI
41_nlcd_ADDI
42_nlcd_ADDI
43_nlcd_ADDI
52_nlcd_ADDI
71_nlcd_ADDI
81_nlcd_ADDI
82_nlcd_ADDI
90_nlcd_ADDI
95_nlcd_ADDI
19_aspect_ADDI
21_aspect_ADDI
24_aspect_ADDI
27_aspect_ADDI
28_aspect_ADDI
22_aspect_ADDI
23_aspect_ADDI
25_aspect_ADDI
26_aspect_ADDI
31_aspect_ADDI
33_aspect_ADDI
32_aspect_ADDI
34_aspect_ADDI
38_aspect_ADDI
variance_elev_ADDI
skew_elev_ADDI
med_dist_elev_ADDI
t2m_BELM_imf_0
t2m_BELM_imf_1
t2m_BELM_imf_2
t2m_BELM_imf_3
t2m_BELM_imf_4
t2m_BELM_imf_5
t2m_BELM_imf_6
t2m_BELM_imf_7
t2m_BELM_imf_8
t2m_BELM_imf_9
sh2_BELM
d2m_BELM
r2_BELM
u10_BELM
v10_BELM
tp_BELM
mslma_BELM
tcc_B

In [20]:
length = len(the_df)

test_len = int(length * 0.75)

df_train = the_df.iloc[:test_len].copy()
df_test = the_df.iloc[test_len:].copy()

print("Test Set Fraction", len(df_test) / len(the_df))

Test Set Fraction 0.25003339121143314


## Normalize

In [21]:
for c in cols_to_carry:
    df_train[c] = df[c]
    df_test[c] = df[c]

In [22]:
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

In [23]:
df_test

Unnamed: 0,t2m_ADDI_imf_0,t2m_ADDI_imf_1,t2m_ADDI_imf_2,t2m_ADDI_imf_3,t2m_ADDI_imf_4,t2m_ADDI_imf_5,t2m_ADDI_imf_6,t2m_ADDI_imf_7,t2m_ADDI_imf_8,t2m_ADDI_imf_9,...,enso4,enso_1_2,pna,enso3,ao,pdo,nao,target_error_lead_30,valid_time,flag
16845,0.241526,0.297581,0.460891,2.755017,-1.468827,0.280847,-4.253160,5.165458,-1.203823,1.216701,...,-0.68,-1.12,0.68,-1.03,0.093,-2.52,-0.33,0.466901,2021-11-22 10:00:00,True
16846,-0.164721,-0.042090,0.362151,2.474688,-1.453629,0.263343,-4.250228,5.158583,-1.203730,1.216437,...,-0.68,-1.12,0.68,-1.03,0.093,-2.52,-0.33,0.799416,2021-11-22 11:00:00,True
16847,0.105367,-0.486741,0.219456,2.184923,-1.436017,0.245770,-4.247241,5.151696,-1.203636,1.216173,...,-0.68,-1.12,0.68,-1.03,0.093,-2.52,-0.33,-0.351758,2021-11-22 12:00:00,True
16848,-0.017604,-0.715117,0.060210,1.887383,-1.416064,0.228130,-4.244199,5.144799,-1.203542,1.215909,...,-0.68,-1.12,0.68,-1.03,0.093,-2.52,-0.33,-0.158703,2021-11-22 13:00:00,True
16849,-0.114607,-0.689329,-0.090533,1.583723,-1.393846,0.210426,-4.241100,5.137889,-1.203447,1.215644,...,-0.68,-1.12,0.68,-1.03,0.093,-2.52,-0.33,-0.263606,2021-11-22 14:00:00,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22456,-0.253649,-1.508868,0.229911,8.899499,4.637280,1.112167,-5.254929,-0.400453,0.520442,0.340615,...,-0.84,-0.46,-0.96,-0.81,-2.719,-9.90,-0.22,0.136727,2022-12-30 13:00:00,True
22457,-0.029539,-0.930426,0.733629,8.857566,4.741251,1.117889,-5.254474,-0.401975,0.520271,0.340688,...,-0.84,-0.46,-0.96,-0.81,-2.719,-9.90,-0.22,0.119368,2022-12-30 14:00:00,True
22458,0.110616,-0.024843,1.202153,8.800177,4.844479,1.123339,-5.253991,-0.403489,0.520099,0.340761,...,-0.84,-0.46,-0.96,-0.81,-2.719,-9.90,-0.22,-0.694242,2022-12-30 15:00:00,True
22459,-0.078151,0.957164,1.602189,8.727438,4.946911,1.128514,-5.253480,-0.404995,0.519927,0.340834,...,-0.84,-0.46,-0.96,-0.81,-2.719,-9.90,-0.22,-0.717157,2022-12-30 16:00:00,True


In [24]:
# import torch

# data = torch.tensor(df_train.values)


# # Perform PCA
# U, S, V = torch.pca_lowrank(data)

# # Print the results
# print(U)
# print(S)
# print(V)

In [25]:
# U.shape

In [26]:
# S.shape

In [27]:
# V.shape

In [28]:
# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)
# pca.fit(df_train)
# print(pca.components_) # Reformat and view results
# loadings = pd.DataFrame(pca.components_.T,
# columns=['PC%s' % _ for _ in range((pca.components_.shape[0]))],
# index=df_train.columns)
# print(loadings)

# plt.plot(pca.explained_variance_ratio_)
# plt.ylabel('Explained Variance')
# plt.xlabel('Components')
# plt.show()

In [29]:
# df = pd.DataFrame(pca.components_)
# df

In [30]:
# loadings

## Create LSTM

In [31]:
class SequenceDataset(Dataset):
    def __init__(self, dataframe, target, features, sequence_length):
        self.dataframe = dataframe
        self.features = features
        self.target = target
        self.sequence_length = sequence_length
        self.y = torch.tensor(dataframe[target].values).float()
        self.X = torch.tensor(dataframe[features].values).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        keep_sample = self.dataframe.iloc[i]["flag"]
        if i >= self.sequence_length - 1:
            i_start = i - self.sequence_length + 1
            x = self.X[i_start : (i + 1), :]
        else:
            padding = self.X[0].repeat(self.sequence_length - i - 1, 1)
            x = self.X[0 : (i + 1), :]
            x = torch.cat((padding, x), 0)

        return x, self.y[i], keep_sample

In [86]:
import torch
import torch.nn as nn


class PeepholeLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PeepholeLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        self.W_i = nn.Linear(input_size, hidden_size)
        self.W_f = nn.Linear(input_size, hidden_size)
        self.W_o = nn.Linear(input_size, hidden_size)
        self.W_c = nn.Linear(input_size, hidden_size)

        self.U_i = nn.Linear(hidden_size, hidden_size)
        self.U_f = nn.Linear(hidden_size, hidden_size)
        self.U_o = nn.Linear(hidden_size, hidden_size)
        self.U_c = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, h_prev, c_prev):
        i = torch.sigmoid(self.W_i(x) + self.U_i(h_prev) + self.W_c(c_prev))
        f = torch.sigmoid(self.W_f(x) + self.U_f(h_prev) + self.W_c(c_prev))
        c_tilde = torch.tanh(self.W_c(x) + self.U_c(h_prev))
        c = f * c_prev + i * c_tilde
        o = torch.sigmoid(self.W_o(x) + self.U_o(h_prev) + self.W_c(c))
        h = o * torch.tanh(c)
        return h, c


class ShallowRegressionPeepholeLSTM(nn.Module):
    def __init__(self, num_sensors, hidden_units):
        super().__init__()
        self.num_sensors = num_sensors
        self.hidden_units = hidden_units
        self.num_layers = 3

        # Create a list of LSTM cells
        self.lstm_cells = nn.ModuleList([PeepholeLSTMCell(num_sensors, hidden_units)])

        # # Add additional LSTM layers if needed
        # for _ in range(1, self.num_layers):
        #     self.lstm_cells.append(PeepholeLSTMCell(hidden_units, hidden_units))

        self.linear = nn.Linear(in_features=hidden_units, out_features=1)

    def forward(self, x):
        batch_size = x.shape[0]
        h0 = torch.zeros(batch_size, self.hidden_units).requires_grad_()
        c0 = torch.zeros(batch_size, self.hidden_units).requires_grad_()
        h, c = h0, c0

        # Forward pass through each LSTM cell
        for lstm_cell in self.lstm_cells:
            h, c = lstm_cell(x, h, c)

        out = self.linear(h[0]).flatten()
        return out

In [87]:
class EarlyStopper:
    def __init__(self, patience, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

In [88]:
def remove_elements_from_batch(X, y, s):
    cond = np.where(s)
    return X[cond], y[cond], s[cond]

In [89]:
def train_model(data_loader, model, loss_function, optimizer):
    num_batches = len(data_loader)
    total_loss = 0
    model.train()

    with tqdm(data_loader, unit="batch") as tepoch:
        for X, y, s in tepoch:
            # X, y, s = remove_elements_from_batch(X, y, s)
            output = model(X)
            loss = loss_function(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    # loss
    avg_loss = total_loss / num_batches
    print(f"Train loss: {avg_loss}")
    return avg_loss


def test_model(data_loader, model, loss_function):
    num_batches = len(data_loader)
    total_loss = 0

    model.eval()
    with torch.no_grad():
        with tqdm(data_loader, unit="batch") as tepoch:
            for X, y, s in tepoch:
                # X, y, s = remove_elements_from_batch(X, y, s)
                output = model(X)
                total_loss += loss_function(output, y).item()

    # loss
    avg_loss = total_loss / num_batches
    print(f"Test loss: {avg_loss}")

    return avg_loss

In [90]:
torch.manual_seed(101)
batch_size = 400
sequence_length = 400
learning_rate = 5e-4
num_hidden_units = 550

experiment = Experiment(
    api_key="leAiWyR5Ck7tkdiHIT7n6QWNa",
    project_name="fh_2_hrrr",
    workspace="shmaronshmevans",
)
# # Report multiple hyperparameters using a dictionary:
# hyper_params = {
#     "num_layers": num_layers,
#     "learning_rate": learning_rate,
#     "sequence_length": sequence_length,
#     "batch_size": batch_size,
#     "num_hidden_units": num_hidden_units,
#     "forecast_lead": forecast_lead,
# }

batch_size = batch_size
sequence_length = sequence_length

train_dataset = SequenceDataset(
    df_train, target=target, features=new_features, sequence_length=sequence_length
)
test_dataset = SequenceDataset(
    df_test, target=target, features=new_features, sequence_length=sequence_length
)

train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)

X, y, s = next(iter(train_loader))

print("Features shape:", X.shape)
print("Target shape:", y.shape)

learning_rate = learning_rate
num_hidden_units = num_hidden_units

model = ShallowRegressionPeepholeLSTM(
    num_sensors=len(new_features), hidden_units=num_hidden_units
)
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
early_stopper = EarlyStopper(patience=10, min_delta=0)


print("Untrained test\n--------")
test_model(test_loader, model, loss_function)
print()

for ix_epoch in range(3):
    print(f"Epoch {ix_epoch}\n---------")
    train_loss = train_model(train_loader, model, loss_function, optimizer=optimizer)
    val_loss = test_model(test_loader, model, loss_function)
    print()
    experiment.log_epoch_end(ix_epoch)
    # experiment.log_parameters(hyper_params, step = ix_epoch)
    if early_stopper.early_stop(val_loss):
        break

# Seamlessly log your Pytorch model
log_model(experiment, model, model_name="exp1")
experiment.end()

Features shape: torch.Size([400, 400, 550])
Target shape: torch.Size([400])
Untrained test
--------


 93%|█████████▎| 14/15 [00:59<00:04,  4.24s/batch]


RuntimeError: The size of tensor a (400) must match the size of tensor b (16) at non-singleton dimension 1

## Evaluate Model

In [None]:
def predict(data_loader, model):
    output = torch.tensor([])
    model.eval()
    with torch.no_grad():
        for X, _, s in data_loader:
            y_star = model(X)
            # print(y_star)
            output = torch.cat((output, y_star), 0)

    return output


train_eval_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)


ystar_col = "Model forecast"
df_train[ystar_col] = predict(train_eval_loader, model).numpy()
df_test[ystar_col] = predict(test_loader, model).numpy()
print(df_test[ystar_col])

df_out = pd.concat((df_train, df_test))[[target, ystar_col]]

for c in df_out.columns:
    vals = df_out[c].values.tolist()
    mean = st.mean(vals)
    std = st.pstdev(vals)
    df_out[c] = df_out[c] * std + mean

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.templates.default = "plotly_white"
plot_template = dict(
    layout=go.Layout(
        {"font_size": 18, "xaxis_title_font_size": 24, "yaxis_title_font_size": 24}
    )
)


fig = px.line(df_out, labels=dict(created_at="Date", value="Forecast Error"))
fig.add_vline(x=(length * 0.75), line_width=4, line_dash="dash")
fig.add_annotation(
    xref="paper", x=0.75, yref="paper", y=0.8, text="Test set start", showarrow=False
)
fig.update_layout(
    template=plot_template, legend=dict(orientation="h", y=1.02, title_text="")
)
fig.show()

In [None]:
# import shap

# # Use the training data for deep explainer => can use fewer instances
# explainer = shap.DeepExplainer(model, X_train)
# # explain the the testing instances (can use fewer instanaces)
# # explaining each prediction requires 2 * background dataset size runs
# shap_values = explainer.shap_values(X_test)
# # init the JS visualization code
# shap.initjs()
# shap.force_plot(explainer.expected_value[0], shap_values[0][0], features)

In [None]:
# scikit-learn related imports
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# pytorch relates imports
import torch
import torch.nn as nn
import torch.optim as optim

# imports from captum library
from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
from captum.attr import (
    IntegratedGradients,
    DeepLift,
    GradientShap,
    NoiseTunnel,
    FeatureAblation,
)

# Captum

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
X_train = torch.tensor(X_train).float()
y_train = torch.tensor(y_train).view(-1, 1).float()

X_test = torch.tensor(X_test).float()
y_test = torch.tensor(y_test).view(-1, 1).float()

In [None]:
# import shap

# # Use the training data for deep explainer => can use fewer instances
# explainer = shap.DeepExplainer(model, y_train)
# # explain the the testing instances (can use fewer instanaces)
# # explaining each prediction requires 2 * background dataset size runs
# shap_values = explainer.shap_values(X_test)
# # init the JS visualization code
# shap.initjs()
# shap.force_plot(explainer.expected_value[0], shap_values[0][0], features)

In [None]:
model.eval()
outputs = model(X_test)
err = np.sqrt(mean_squared_error(outputs.detach().numpy(), y_test.detach().numpy()))

print("model err: ", err)

In [None]:
ig = IntegratedGradients(model)
ig_nt = NoiseTunnel(ig)
dl = DeepLift(model)
gs = GradientShap(model)
fa = FeatureAblation(model)

In [None]:
ig_nt_attr_test = ig_nt.attribute(X_test)

In [None]:
# ig_attr_test_norm_sum.shape[1]

In [None]:
# X_test.shape[]

In [None]:
new_features

In [None]:
ig_nt_attr_test_norm_sum.shape

In [None]:
b = 0
e = 20
n = 10
# prepare attributions for visualization

x_axis_data = np.arange(X_test.shape[1])
x_axis_data_labels = list(map(lambda idx: new_features[idx], x_axis_data))

while e < len(x_axis_data):
    ig_nt_attr_test_sum = ig_nt_attr_test.detach().numpy().sum(0)
    ig_nt_attr_test_norm_sum = ig_nt_attr_test_sum / np.linalg.norm(
        ig_nt_attr_test_sum, ord=1
    )

    lin_weight = model.linear.weight[0].detach().numpy()
    y_axis_lin_weight = lin_weight / np.linalg.norm(lin_weight, ord=1)

    width = 0.14
    legends = ["Int Grads w/SmoothGrad", "Weights"]

    plt.figure(figsize=(20, 10))

    ax = plt.subplot()
    ax.set_title(
        "Comparing input feature importances across multiple algorithms and learned weights"
    )
    ax.set_ylabel("Attributions")

    FONT_SIZE = 16
    plt.rc("font", size=FONT_SIZE)  # fontsize of the text sizes
    plt.rc("axes", titlesize=FONT_SIZE)  # fontsize of the axes title
    plt.rc("axes", labelsize=FONT_SIZE)  # fontsize of the x and y labels
    plt.rc("legend", fontsize=FONT_SIZE - 4)  # fontsize of the legend

    print(x_axis_data.shape)

    ax.bar(
        x_axis_data[b:e] + width,
        ig_nt_attr_test_norm_sum[b:e, n],
        width,
        align="center",
        alpha=0.7,
        color="#A90000",
    )
    ax.bar(
        x_axis_data[b:e] + 5 * width,
        y_axis_lin_weight[b:e],
        width,
        align="center",
        alpha=1.0,
        color="grey",
    )
    ax.autoscale_view()
    plt.tight_layout()

    ax.set_xticks(x_axis_data[b:e] + 0.5)
    ax.set_xticklabels(x_axis_data_labels[b:e], rotation=90)

    plt.legend(legends, loc=3)
    plt.show()

    b += 20
    e += 20

Number of feature columns = 50
Compute LSTM Feature Importance
After we train (or load) each fold model, we will compute LSTM feature importance for all of our features. We do this with a for-loop of size N where N is the number of features we have. 

For each feature we wish to evaluate, we infer our OOF with that feature column randomly shuffled. If this feature column is important to our LSTM model, then the OOF MAE will become worse for that for-loop step. After our for-loop, we display bars equal to the size of how much MAE worsened without each feature, which is the importance of each feature.

Note that computing LSTM feature importance after each fold will add about 1 minute for every 5 features.