In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import sys

sys.path.append("..")
import pandas as pd 
import numpy as np 
from src.processing import hrrr_data
from src.processing import nysm_data
from src.processing import get_error
from src.processing import normalize
from src.processing import get_flag

In [44]:
def columns_drop(df):
    df = df.drop(
        columns=[
            "level_0",
            "index_x",
            "index_y",
            "lead time",
            "lsm",
            "station_y",
            'lat',
            'lon'
        ]
    )
    df = df.rename(columns={'station_x':'station'})
    return df

In [45]:
def create_data_for_model():
    """
    This function creates and processes data for a LSTM machine learning model.

    Args:
        station (str): The station identifier for which data is being processed.

    Returns:
        new_df (pandas DataFrame): A DataFrame containing processed data.
        df_train (pandas DataFrame): A DataFrame for training the machine learning model.
        df_test (pandas DataFrame): A DataFrame for testing the machine learning model.
        features (list): A list of feature names.
        forecast_lead (int): The lead time for the target variable.
    """
    # load nysm data
    nysm_df = nysm_data.load_nysm_data()
    nysm_df.reset_index(inplace=True)
    nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})

    # load hrrr data
    hrrr_df = hrrr_data.read_hrrr_data()

    # Filter NYSM data to match valid times from HRRR data and save it to a CSV file.
    mytimes = hrrr_df["valid_time"].tolist()
    nysm_df = nysm_df[nysm_df["valid_time"].isin(mytimes)]

    # Filter data by NY climate division 
    nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
    nysm_cats_df = pd.read_csv(nysm_cats_path)
    nysm_cats_df = nysm_cats_df[nysm_cats_df['climate_division_name']=='Western Plateau']
    stations = nysm_cats_df["stid"].tolist()
    nysm_df = nysm_df[nysm_df['station'].isin(stations)]
    hrrr_df = hrrr_df[hrrr_df['station'].isin(stations)]

    # merge dataframes so that each row is hrrr + nysm data for the same time step
    # do this for each station individually 
    for station in stations:
        nysm_df1 = nysm_df[nysm_df['station']==station]
        hrrr_df1 = hrrr_df[hrrr_df['station']==station]

        master_df = hrrr_df1.merge(nysm_df1, on="valid_time")
        master_df = master_df.drop_duplicates(
            subset=["valid_time", "t2m"], keep="first"
        )
        master_df = columns_drop(master_df)

        # Calculate the error using NWP data.
        master_df = get_error.nwp_error("t2m", master_df)
        # encode for day_of_year
        master_df = normalize.encode(master_df, 'day_of_year', 366)
        # get flag for non-consecutive time steps
        master_df = get_flag.get_flag(master_df)

        cols_to_carry = ['valid_time', 'station', 'latitude', 'longitude', 'flag']

        new_df = master_df.drop(columns=cols_to_carry)

        new_df, features = normalize.normalize_df(new_df)

        # Split the data into training and testing sets.
        length = len(new_df)
        test_len = int(length * 0.8)
        df_train = new_df.iloc[:test_len].copy()
        df_test = new_df.iloc[test_len:].copy()
        print("Test Set Fraction", len(df_test) / len(new_df))

        # Reintegrate the specified columns back into the training and testing DataFrames.
        for c in cols_to_carry:
            df_train[c] = master_df[c]
            df_test[c] = master_df[c]

    return df_train, df_test, features

In [46]:
# df_train, df_test, features = create_data_for_model()

In [47]:
nysm_df = nysm_data.load_nysm_data()
nysm_df.reset_index(inplace=True)
nysm_df = nysm_df.rename(columns={"time_1H": "valid_time"})
nysm_df.head()

Unnamed: 0,index,station,valid_time,lat,lon,elev,tair,ta9m,td,relh,srad,pres,mslp,wspd_sonic,wmax_sonic,wdir_sonic,precip_total,snow_depth
0,1,ADDI,2018-01-01 01:00:00,42.040359,-77.237259,507.614014,-18.23782,-18.122169,-22.090469,71.722794,0.0,964.492004,971.503723,1.456784,3.037127,325.875793,0.0,0.045877
1,2,ADDI,2018-01-01 02:00:00,42.040359,-77.237259,507.614014,-18.368231,-18.33906,-21.647659,75.388893,0.0,964.286804,971.328857,1.484595,2.211236,305.772797,0.0,0.046327
2,3,ADDI,2018-01-01 03:00:00,42.040359,-77.237259,507.614014,-18.8304,-18.33363,-21.994141,76.068916,0.0,964.409973,971.574524,1.247007,1.992487,309.544586,0.0,0.045029
3,4,ADDI,2018-01-01 04:00:00,42.040359,-77.237259,507.614014,-18.518641,-18.328711,-21.890549,74.752434,0.0,964.479187,971.563843,1.821808,3.12947,314.346588,0.0,0.047812
4,5,ADDI,2018-01-01 05:00:00,42.040359,-77.237259,507.614014,-18.635839,-18.578341,-21.847321,75.787628,0.0,964.581726,971.698914,1.780934,2.918397,294.888092,0.0,0.047251


In [48]:
hrrr_df = hrrr_data.read_hrrr_data()
hrrr_df.head()

Unnamed: 0,valid_time,time,station,level_0,index,latitude,longitude,t2m,sh2,d2m,...,asnow,cape,dswrf,dlwrf,gh,u_total,u_dir,lead time,lsm,new_tp
1,2018-01-01 03:00:00,2018-01-01 01:00:00,ADDI,59218879,326784,42.045955,-77.218867,-19.199194,0.00068,-21.993216,...,7e-06,0.0,0.0,146.800003,5215.425781,4.024367,304.386841,2,1.0,0.001
2,2018-01-01 04:00:00,2018-01-01 02:00:00,ADDI,59218880,532440,42.045955,-77.218867,-19.118689,0.00068,-21.91857,...,1e-06,0.0,0.0,147.100006,5210.410645,3.768112,298.900574,2,1.0,0.0
3,2018-01-01 05:00:00,2018-01-01 03:00:00,ADDI,59218881,738096,42.045955,-77.218867,-19.439062,0.00065,-22.570868,...,0.0,0.0,0.0,147.600006,5207.041992,3.118722,286.333313,2,1.0,0.0
4,2018-01-01 06:00:00,2018-01-01 04:00:00,ADDI,59218882,943752,42.045955,-77.218867,-19.760291,0.00063,-22.821158,...,0.0,0.0,0.0,146.899994,5201.477539,3.214206,284.222107,2,1.0,0.0
5,2018-01-01 07:00:00,2018-01-01 05:00:00,ADDI,59218883,1149408,42.045955,-77.218867,-19.860175,0.00062,-22.993704,...,9e-06,0.0,0.0,146.600006,5200.875,3.268753,270.951294,2,1.0,0.001


In [49]:
# Set the path for tabular data.
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
nysm_cats_df = pd.read_csv(nysm_cats_path)
nysm_cats_df = nysm_cats_df[nysm_cats_df['climate_division_name']=='Western Plateau']
nysm_cats_df

Unnamed: 0,stid,number,name,lat [degrees],lon [degrees],elevation [m],county,nearest_city,state,distance_from_town [km],direction_from_town [degrees],climate_division,climate_division_name,wfo,commissioned,decommissioned
0,ADDI,107,Addison,42.04036,-77.23726,507.614,Steuben,Addison,NY,6.9,S,1,Western Plateau,BGM,2016-08-10 18:15:00 UTC,
6,BELM,70,Belmont,42.24249,-78.03958,417.576,Allegany,Belmont,NY,2.2,N,1,Western Plateau,BUF,2016-05-16 20:45:00 UTC,
28,COHO,74,Cohocton,42.51178,-77.43762,599.328,Steuben,Cohocton,NY,5.8,E,1,Western Plateau,BGM,2016-05-17 16:30:00 UTC,
34,DELE,67,Delevan,42.418464,-78.4232,622.31,Cattaraugus,Delevan,NY,9.2,SSE,1,Western Plateau,BUF,2016-03-08 21:15:00 UTC,
43,ELMI,73,Elmira,42.11332,-76.83664,332.445,Chemung,Elmira,NY,3.5,NW,1,Western Plateau,BGM,2016-06-30 17:35:00 UTC,
51,GROV,71,Grove,42.48951,-77.94946,609.722,Allegany,Grove,NY,8.9,NW,1,Western Plateau,BUF,2016-07-20 18:40:00 UTC,
55,HART,40,Hartsville,42.211221,-77.689733,729.569,Steuben,Hartsville,NY,4.0,S,1,Western Plateau,BGM,2016-08-09 21:00:00 UTC,
73,OLEA,65,Olean,42.09141,-78.40743,457.23,Cattaraugus,Olean,NY,2.1,ENE,1,Western Plateau,BUF,2016-06-02 23:15:00 UTC,
85,RAND,125,Randolph,42.14928,-78.90096,448.97,Cattaraugus,Randolph,NY,6.3,ESE,1,Western Plateau,BUF,2016-06-02 15:50:00 UTC,


In [50]:
stations = nysm_cats_df["stid"].tolist()

In [51]:
nysm_df = nysm_df[nysm_df['station'].isin(stations)]
hrrr_df = hrrr_df[hrrr_df['station'].isin(stations)]

In [52]:
nysm_df.head()

Unnamed: 0,index,station,valid_time,lat,lon,elev,tair,ta9m,td,relh,srad,pres,mslp,wspd_sonic,wmax_sonic,wdir_sonic,precip_total,snow_depth
0,1,ADDI,2018-01-01 01:00:00,42.040359,-77.237259,507.614014,-18.23782,-18.122169,-22.090469,71.722794,0.0,964.492004,971.503723,1.456784,3.037127,325.875793,0.0,0.045877
1,2,ADDI,2018-01-01 02:00:00,42.040359,-77.237259,507.614014,-18.368231,-18.33906,-21.647659,75.388893,0.0,964.286804,971.328857,1.484595,2.211236,305.772797,0.0,0.046327
2,3,ADDI,2018-01-01 03:00:00,42.040359,-77.237259,507.614014,-18.8304,-18.33363,-21.994141,76.068916,0.0,964.409973,971.574524,1.247007,1.992487,309.544586,0.0,0.045029
3,4,ADDI,2018-01-01 04:00:00,42.040359,-77.237259,507.614014,-18.518641,-18.328711,-21.890549,74.752434,0.0,964.479187,971.563843,1.821808,3.12947,314.346588,0.0,0.047812
4,5,ADDI,2018-01-01 05:00:00,42.040359,-77.237259,507.614014,-18.635839,-18.578341,-21.847321,75.787628,0.0,964.581726,971.698914,1.780934,2.918397,294.888092,0.0,0.047251


In [53]:
hrrr_df.head()

Unnamed: 0,valid_time,time,station,level_0,index,latitude,longitude,t2m,sh2,d2m,...,asnow,cape,dswrf,dlwrf,gh,u_total,u_dir,lead time,lsm,new_tp
1,2018-01-01 03:00:00,2018-01-01 01:00:00,ADDI,59218879,326784,42.045955,-77.218867,-19.199194,0.00068,-21.993216,...,7e-06,0.0,0.0,146.800003,5215.425781,4.024367,304.386841,2,1.0,0.001
2,2018-01-01 04:00:00,2018-01-01 02:00:00,ADDI,59218880,532440,42.045955,-77.218867,-19.118689,0.00068,-21.91857,...,1e-06,0.0,0.0,147.100006,5210.410645,3.768112,298.900574,2,1.0,0.0
3,2018-01-01 05:00:00,2018-01-01 03:00:00,ADDI,59218881,738096,42.045955,-77.218867,-19.439062,0.00065,-22.570868,...,0.0,0.0,0.0,147.600006,5207.041992,3.118722,286.333313,2,1.0,0.0
4,2018-01-01 06:00:00,2018-01-01 04:00:00,ADDI,59218882,943752,42.045955,-77.218867,-19.760291,0.00063,-22.821158,...,0.0,0.0,0.0,146.899994,5201.477539,3.214206,284.222107,2,1.0,0.0
5,2018-01-01 07:00:00,2018-01-01 05:00:00,ADDI,59218883,1149408,42.045955,-77.218867,-19.860175,0.00062,-22.993704,...,9e-06,0.0,0.0,146.600006,5200.875,3.268753,270.951294,2,1.0,0.001


In [54]:
# need to create a master list for valid_times so that all the dataframes are the same shape
master_time = hrrr_df['valid_time'].tolist()
for station in stations:
    hrrr_dft = hrrr_df[hrrr_df["station"] == station]
    nysm_dft = nysm_df[nysm_df["station"] == station]
    times = hrrr_dft['valid_time'].tolist()
    times2 = nysm_dft['valid_time'].tolist()
    result = list(set(times) & set(master_time) & set(times2))
    master_time = result

master_time_final = master_time

In [55]:
sort_me1 = (sorted(master_time_final))

In [56]:
sort_me1

[Timestamp('2018-01-01 03:00:00'),
 Timestamp('2018-01-01 04:00:00'),
 Timestamp('2018-01-01 05:00:00'),
 Timestamp('2018-01-01 06:00:00'),
 Timestamp('2018-01-01 07:00:00'),
 Timestamp('2018-01-01 08:00:00'),
 Timestamp('2018-01-01 09:00:00'),
 Timestamp('2018-01-01 10:00:00'),
 Timestamp('2018-01-01 11:00:00'),
 Timestamp('2018-01-01 12:00:00'),
 Timestamp('2018-01-01 13:00:00'),
 Timestamp('2018-01-01 14:00:00'),
 Timestamp('2018-01-01 15:00:00'),
 Timestamp('2018-01-01 16:00:00'),
 Timestamp('2018-01-01 17:00:00'),
 Timestamp('2018-01-01 18:00:00'),
 Timestamp('2018-01-01 19:00:00'),
 Timestamp('2018-01-01 20:00:00'),
 Timestamp('2018-01-01 21:00:00'),
 Timestamp('2018-01-01 22:00:00'),
 Timestamp('2018-01-01 23:00:00'),
 Timestamp('2018-01-02 00:00:00'),
 Timestamp('2018-01-02 01:00:00'),
 Timestamp('2018-01-02 02:00:00'),
 Timestamp('2018-01-02 03:00:00'),
 Timestamp('2018-01-02 04:00:00'),
 Timestamp('2018-01-02 05:00:00'),
 Timestamp('2018-01-02 06:00:00'),
 Timestamp('2018-01-

In [57]:
print(len(master_time_final))
sorted(master_time_final)

33958


[Timestamp('2018-01-01 03:00:00'),
 Timestamp('2018-01-01 04:00:00'),
 Timestamp('2018-01-01 05:00:00'),
 Timestamp('2018-01-01 06:00:00'),
 Timestamp('2018-01-01 07:00:00'),
 Timestamp('2018-01-01 08:00:00'),
 Timestamp('2018-01-01 09:00:00'),
 Timestamp('2018-01-01 10:00:00'),
 Timestamp('2018-01-01 11:00:00'),
 Timestamp('2018-01-01 12:00:00'),
 Timestamp('2018-01-01 13:00:00'),
 Timestamp('2018-01-01 14:00:00'),
 Timestamp('2018-01-01 15:00:00'),
 Timestamp('2018-01-01 16:00:00'),
 Timestamp('2018-01-01 17:00:00'),
 Timestamp('2018-01-01 18:00:00'),
 Timestamp('2018-01-01 19:00:00'),
 Timestamp('2018-01-01 20:00:00'),
 Timestamp('2018-01-01 21:00:00'),
 Timestamp('2018-01-01 22:00:00'),
 Timestamp('2018-01-01 23:00:00'),
 Timestamp('2018-01-02 00:00:00'),
 Timestamp('2018-01-02 01:00:00'),
 Timestamp('2018-01-02 02:00:00'),
 Timestamp('2018-01-02 03:00:00'),
 Timestamp('2018-01-02 04:00:00'),
 Timestamp('2018-01-02 05:00:00'),
 Timestamp('2018-01-02 06:00:00'),
 Timestamp('2018-01-

In [58]:
print(len(master_time))
sorted(master_time)

33958


[Timestamp('2018-01-01 03:00:00'),
 Timestamp('2018-01-01 04:00:00'),
 Timestamp('2018-01-01 05:00:00'),
 Timestamp('2018-01-01 06:00:00'),
 Timestamp('2018-01-01 07:00:00'),
 Timestamp('2018-01-01 08:00:00'),
 Timestamp('2018-01-01 09:00:00'),
 Timestamp('2018-01-01 10:00:00'),
 Timestamp('2018-01-01 11:00:00'),
 Timestamp('2018-01-01 12:00:00'),
 Timestamp('2018-01-01 13:00:00'),
 Timestamp('2018-01-01 14:00:00'),
 Timestamp('2018-01-01 15:00:00'),
 Timestamp('2018-01-01 16:00:00'),
 Timestamp('2018-01-01 17:00:00'),
 Timestamp('2018-01-01 18:00:00'),
 Timestamp('2018-01-01 19:00:00'),
 Timestamp('2018-01-01 20:00:00'),
 Timestamp('2018-01-01 21:00:00'),
 Timestamp('2018-01-01 22:00:00'),
 Timestamp('2018-01-01 23:00:00'),
 Timestamp('2018-01-02 00:00:00'),
 Timestamp('2018-01-02 01:00:00'),
 Timestamp('2018-01-02 02:00:00'),
 Timestamp('2018-01-02 03:00:00'),
 Timestamp('2018-01-02 04:00:00'),
 Timestamp('2018-01-02 05:00:00'),
 Timestamp('2018-01-02 06:00:00'),
 Timestamp('2018-01-

In [None]:
for station in stations:
    nysm_df1 = nysm_df[nysm_df['station']==station]
    hrrr_df1 = hrrr_df[hrrr_df['station']==station]

    master_df = hrrr_df1.merge(nysm_df1, on="valid_time")
    master_df = master_df.drop_duplicates(
        subset=["valid_time", "t2m"], keep="first"
    )

In [None]:
master_df = columns_drop(master_df)

In [None]:
master_df.head()

In [None]:
# Calculate the error using NWP data.
master_df = get_error.nwp_error("t2m", master_df)

In [None]:
master_df = normalize.encode(master_df, 'day_of_year', 366)

In [None]:
master_df.head()

In [None]:
master_df = get_flag.get_flag(master_df)

In [None]:
master_df

In [None]:
cols_to_carry = ['valid_time', 'station', 'latitude', 'longitude', 'flag']

In [None]:
new_df = master_df.drop(columns=cols_to_carry)

In [None]:
new_df, features = normalize.normalize_df(new_df)

In [None]:
new_df.head()

In [None]:
features

In [None]:
# Split the data into training and testing sets.
length = len(new_df)
test_len = int(length * 0.8)
df_train = new_df.iloc[:test_len].copy()
df_test = new_df.iloc[test_len:].copy()
print("Test Set Fraction", len(df_test) / len(new_df))

In [None]:
# Reintegrate the specified columns back into the training and testing DataFrames.
for c in cols_to_carry:
    df_train[c] = master_df[c]
    df_test[c] = master_df[c]

In [None]:
df_train.head()

In [None]:
df_test.head()