In [18]:
%load_ext autoreload
%autoreload 2
import sys

# Rather than packaging the source code (setup.py) or building a Docker/Singularity image,
# put the parent directory on the import path so the sibling source code can be imported here.
# This keeps the notebook limited to visualizations, with the logic in modules that are easier to test.
# It also helps keep variable state clean, so cells are not run out of order against a mysterious state.
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
from comet_ml import Experiment
from comet_ml.integration.pytorch import log_model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import os
import datetime as dt
import xarray as xr
import glob
import metpy.calc as mpcalc
from metpy.units import units

In [20]:
def get_flag(hrrr_df):
    """
    Flag rows that are followed by a break in the hourly time series.

    Within each station, a row is flagged ``True`` when the next row of the same
    station is not exactly one hour later. The last row of every station has no
    successor and is flagged ``False``. Rows are compared in their existing order.

    The flags are computed per station and aligned on the frame's index, so the
    result has one flag per row for any number of stations and any interleaving
    of stations. (Collecting the flags in one flat list with a single trailing
    ``False`` only produces the right length when there is exactly one station.)

    Args:
        hrrr_df (pandas.DataFrame): frame with a "station" column and a
            "valid_time" column of timestamps.

    Returns:
        pandas.DataFrame: the same frame, modified in place, with a boolean
        "flag" column added.
    """
    one_hour = dt.timedelta(hours=1)
    by_station = hrrr_df.groupby("station", sort=False)

    # valid_time of the following row within the same station (missing on the last row)
    next_time = by_station["valid_time"].shift(-1)
    # reverse running count per station; 0 marks a station's last row
    has_next = by_station.cumcount(ascending=False) > 0

    hrrr_df["flag"] = has_next & ~(hrrr_df["valid_time"] + one_hour == next_time)

    return hrrr_df

In [21]:
def read_hrrr_data(
    years=("2018", "2019"),
    savedir="/home/aevans/ai2es/processed_data/HRRR/ny/",
):
    """
    Read and concatenate the monthly HRRR forecast-and-error parquet files.

    Files are looked up as
    ``{savedir}HRRR_{year}_{month}_direct_compare_to_nysm_sites_mask_water.parquet``
    for every requested year and every month 01-12. Months without a file on
    disk are skipped.

    Args:
        years (iterable): years to load (strings or ints). Defaults to 2018 and 2019.
        savedir (str): directory holding the parquet files, including the
            trailing slash.

    Returns:
        pandas.DataFrame: all available months concatenated, with the index
        reset into a column and every row containing a missing value dropped.

    Raises:
        ValueError: if no parquet file exists for any requested year/month.
    """
    monthly_frames = []

    for year in years:
        for month in range(1, 13):
            # build the path once and reuse it for the existence check and the read
            path = f"{savedir}HRRR_{year}_{month:02d}_direct_compare_to_nysm_sites_mask_water.parquet"
            if os.path.exists(path):
                monthly_frames.append(pd.read_parquet(path))

    if not monthly_frames:
        # pd.concat([]) would also raise ValueError, but without saying what was searched
        raise ValueError(
            f"No HRRR parquet files found in {savedir!r} for years {list(years)}"
        )

    return pd.concat(monthly_frames).reset_index().dropna()

In [22]:
def read_gfs_data(
    fh,
    years=("2018", "2019"),
    data_root="/home/aevans/nwp_bias/src/machine_learning/data/gfs_data/",
):
    """
    Read and concatenate the monthly GFS forecast-and-error parquet files for one
    forecast-hour directory.

    Files are looked up as
    ``{data_root}fh{fh}/GFS_{year}_{month}_direct_compare_to_nysm_sites_mask_water.parquet``
    for every requested year and every month 01-12. Months without a file on
    disk are skipped.

    Args:
        fh (str): forecast-hour directory suffix, e.g. "003" selects the "fh003" folder.
        years (iterable): years to load (strings or ints). Defaults to 2018 and 2019.
        data_root (str): directory containing the ``fh*`` folders, including the
            trailing slash.

    Returns:
        pandas.DataFrame: all available months concatenated, with their original
        index kept and missing values left in place.

    Raises:
        ValueError: if no parquet file exists for any requested year/month.
    """
    savedir = f"{data_root}fh{fh}/"
    monthly_frames = []

    for year in years:
        for month in range(1, 13):
            # build the path once and reuse it for the existence check and the read
            path = f"{savedir}GFS_{year}_{month:02d}_direct_compare_to_nysm_sites_mask_water.parquet"
            if os.path.exists(path):
                monthly_frames.append(pd.read_parquet(path))

    if not monthly_frames:
        # pd.concat([]) would also raise ValueError, but without saying what was searched
        raise ValueError(
            f"No GFS parquet files found in {savedir!r} for years {list(years)}"
        )

    return pd.concat(monthly_frames)

In [23]:
def add_tabular(hrrr_df, geo_df, suffix):
    """
    Broadcast per-station values from ``geo_df`` onto the matching rows of ``hrrr_df``.

    For every row of ``geo_df`` and every one of its columns ``k`` (including
    "station" itself), the value is written into column ``f"{k}_{suffix}"`` of
    all ``hrrr_df`` rows whose "station" equals that row's station.

    Args:
        hrrr_df (pandas.DataFrame): frame with a "station" column; modified in place.
        geo_df (pandas.DataFrame): one row of static values per station, with a
            "station" column.
        suffix (str): text appended (after an underscore) to each new column name.

    Returns:
        pandas.DataFrame: ``hrrr_df`` with the suffixed columns added.
    """
    columns = geo_df.keys()

    for row_pos in range(len(geo_df["station"])):
        # rows of the target frame that belong to this geo_df row's station
        is_station = hrrr_df["station"] == geo_df["station"].iloc[row_pos]
        for column in columns:
            hrrr_df.loc[is_station, f"{column}_{suffix}"] = geo_df[column].iloc[row_pos]

    return hrrr_df

In [24]:
def encode(data, col, max_val):
    """
    Add sine and cosine encodings of a cyclical column.

    The column is mapped onto an angle ``2 * pi * value / max_val`` and the sine
    and cosine of that angle are stored in ``f"{col}_sin"`` and ``f"{col}_cos"``.

    Args:
        data (pandas.DataFrame): frame holding the column; modified in place.
        col (str): name of the column to encode.
        max_val (number): value of ``col`` that corresponds to one full cycle.

    Returns:
        pandas.DataFrame: ``data`` with the two new columns added.
    """
    angle = 2 * np.pi * data[col] / max_val
    data[f"{col}_sin"] = np.sin(angle)
    data[f"{col}_cos"] = np.cos(angle)

    return data

In [25]:
def get_flag(hrrr_df=None):
    """
    Flag rows that are followed by a break in the hourly time series.

    Within each station, a row is flagged ``True`` when the next row of the same
    station is not exactly one hour later. The last row of every station has no
    successor and is flagged ``False``. Rows are compared in their existing order.

    The flags are computed per station and aligned on the frame's index, so the
    result has one flag per row for any number of stations. (Collecting the
    flags in one flat list with a single trailing ``False`` only produces the
    right length when there is exactly one station.)

    Args:
        hrrr_df (pandas.DataFrame, optional): frame with a "station" column and a
            "valid_time" column of timestamps. When omitted, the notebook-level
            variable ``hrrr_df`` is used, which keeps the zero-argument call working.

    Returns:
        pandas.DataFrame: the same frame, modified in place, with a boolean
        "flag" column added.

    Raises:
        NameError: if no frame is passed and no global ``hrrr_df`` exists.
    """
    if hrrr_df is None:
        # the parameter shadows the global of the same name, so look it up explicitly
        try:
            hrrr_df = globals()["hrrr_df"]
        except KeyError:
            raise NameError("name 'hrrr_df' is not defined") from None

    one_hour = dt.timedelta(hours=1)
    by_station = hrrr_df.groupby("station", sort=False)

    # valid_time of the following row within the same station (missing on the last row)
    next_time = by_station["valid_time"].shift(-1)
    # reverse running count per station; 0 marks a station's last row
    has_next = by_station.cumcount(ascending=False) > 0

    hrrr_df["flag"] = has_next & ~(hrrr_df["valid_time"] + one_hour == next_time)

    return hrrr_df

In [26]:
def load_nysm_data():
    """
    Load the hourly NYSM observation parquet files for 2018 and 2019.

    The parquet files are created by running "get_resampled_nysm_data.ipynb".
    Each yearly file has its index reset into a column before the years are
    concatenated; rows containing any missing value are then dropped.

    Returns:
        pandas.DataFrame: the concatenated observations without missing values.
    """
    nysm_path = "/home/aevans/nwp_bias/data/nysm/"

    yearly_frames = [
        pd.read_parquet(f"{nysm_path}nysm_1H_obs_{year}.parquet").reset_index()
        for year in (2018, 2019)
    ]

    return pd.concat(yearly_frames).dropna()

In [28]:
# Read one monthly GFS file (fh096 folder, 2018-04) and keep only the rows for station "ADDI".
df = pd.read_parquet(
    "/home/aevans/nwp_bias/src/machine_learning/data/gfs_data/fh096/GFS_2018_04_direct_compare_to_nysm_sites_mask_water.parquet"
).loc[lambda frame: frame["station"] == "ADDI"]

In [29]:
# Display the ADDI-only GFS sample loaded in the previous cell.
df

Unnamed: 0,time,latitude,longitude,t2m,sh2,d2m,r2,u10,v10,u_total,...,cin,dswrf,dlwrf,gh,station,valid_time,level_0,index,lead time,landn
2018-04-05 00:00:00,2018-04-01,42.040359,-77.237259,-1.752735,0.002370,-6.954848,67.564892,5.995449,-2.795196,6.619442,...,-1.468936,318.268707,270.169617,5356.411487,ADDI,2018-04-05 00:00:00,0.0,0.0,0.0,0.0
2018-04-05 06:00:00,2018-04-01,42.040359,-77.237259,-5.363739,0.001702,-11.119905,64.677582,5.555916,-3.149315,6.388231,...,-0.113953,0.000000,213.278107,5322.060792,ADDI,2018-04-05 06:00:00,0.0,0.0,0.0,0.0
2018-04-05 12:00:00,2018-04-01,42.040359,-77.237259,-4.024001,0.001730,-10.930130,59.089184,5.982168,-1.221361,6.114904,...,-0.730127,9.009796,213.887024,5274.619798,ADDI,2018-04-05 12:00:00,0.0,0.0,0.0,0.0
2018-04-05 18:00:00,2018-04-01,42.040359,-77.237259,1.377793,0.001858,-10.085236,42.544949,8.165777,1.145828,8.254450,...,0.444458,493.516998,264.384003,5311.288021,ADDI,2018-04-05 18:00:00,0.0,0.0,0.0,0.0
2018-04-06 00:00:00,2018-04-01,42.040359,-77.237259,-0.905446,0.002078,-8.681306,55.739490,3.067875,-0.107490,3.070001,...,0.490723,430.501312,222.139862,5358.655597,ADDI,2018-04-06 00:00:00,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-05-03 18:00:00,2018-04-01,42.040359,-77.237259,21.573272,0.012757,17.143937,75.596592,5.580210,5.001514,7.504849,...,-0.715625,482.678619,369.579315,5783.877796,ADDI,2018-05-03 18:00:00,0.0,0.0,0.0,0.0
2018-05-04 00:00:00,2018-04-01,42.040359,-77.237259,17.415724,0.012409,16.692720,95.259930,4.672379,0.472822,4.718947,...,-10.032837,122.151764,394.838562,5778.286235,ADDI,2018-05-04 00:00:00,0.0,0.0,0.0,0.0
2018-05-04 06:00:00,2018-04-01,42.040359,-77.237259,14.796630,0.010754,14.438827,97.633713,1.222315,1.447019,1.917787,...,-32.723877,0.000000,338.283325,5753.355619,ADDI,2018-05-04 06:00:00,0.0,0.0,0.0,0.0
2018-05-04 12:00:00,2018-04-01,42.040359,-77.237259,16.759781,0.011877,15.879318,95.194820,3.955989,4.084792,5.884556,...,-56.076504,3.098694,377.799011,5745.660430,ADDI,2018-05-04 12:00:00,0.0,0.0,0.0,0.0


In [None]:
# Inputs for the array-slicing experiment in the following cells.
forecast_hr = 6
# placeholder station list; not used by the slicing cells below
stations = ["a", "b", "c", "d"]
# first 10 rows of the frame as a plain NumPy array
X = df.iloc[0:10].values
X

In [None]:
# Overwrite the last 10 columns of the first `forecast_hr / 3` rows with the
# values from row `forecast_hr / 3` (a single row broadcast over the rows before it).
# Work on a copy: `x = X` would only alias the array, so the assignment below
# would silently rewrite `X` as well.
x = X.copy()
x[: int(forecast_hr / 3), -10:] = x[int(forecast_hr / 3), -10:]
x

In [None]:
# Rows that were overwritten above (last 10 columns).
x[: int(forecast_hr / 3), -10:]

In [None]:
# Row the values were copied from (last 10 columns), for comparison with the cell above.
x[int(forecast_hr / 3), -int(10) :]

In [None]:
# Load the hourly NYSM observations.
# NOTE(review): this rebinds `df`, which held the GFS ADDI sample until now.
df = load_nysm_data()

In [None]:
# Display the NYSM observation frame.
df

In [None]:
# Load the GFS files from the "fh003" folder.
gfs_df = read_gfs_data("003")

In [None]:
# Distinct station identifiers present in the GFS frame.
gfs_df["station"].unique()

In [None]:
# Display the GFS frame.
gfs_df

In [None]:
# Column labels of the GFS frame.
gfs_df.keys()

In [None]:
# NOTE(review): `df1` is not assigned in any visible cell, so this raises NameError on a
# fresh kernel; confirm which frame was meant (possibly `df` or `gfs_df`).
df1["station"].unique()

In [None]:
# `hrrr_df` is used from this cell onward but was never assigned in the notebook,
# so load it here to let the notebook run top to bottom on a fresh kernel.
hrrr_df = read_hrrr_data()

# Rename the NYSM time column to match the HRRR frame so the two can be joined.
# A non-mutating `reset_index(drop=True)` keeps the cell idempotent; an in-place
# reset adds another index column on every re-run and eventually raises.
df = df.reset_index(drop=True).rename(columns={"time_1H": "valid_time"})
mytimes = hrrr_df["valid_time"].tolist()
# nysm_df = df[df['valid_time'].isin(mytimes)]

In [None]:
# (rows, columns) of the NYSM frame.
df.shape

In [None]:
# (rows, columns) of the HRRR frame, for comparison with the NYSM frame above.
hrrr_df.shape

In [None]:
# Locations of the static CSV inputs read in the next cell.
nysm_cats_path = "/home/aevans/nwp_bias/src/landtype/data/nysm.csv"
nlcd_path = "/home/aevans/nwp_bias/src/correlation/data/nlcd_nam.csv"
aspect_path = "/home/aevans/nwp_bias/src/correlation/data/aspect_nam.csv"
elev_path = "/home/aevans/nwp_bias/src/correlation/data/elev_nam.csv"

In [None]:
# Read the static CSV tables; the nlcd/aspect/elev frames are later passed to add_tabular,
# and the NYSM catalogue supplies the climate-division grouping of stations.
nlcd_df = pd.read_csv(nlcd_path)
aspect_df = pd.read_csv(aspect_path)
elev_df = pd.read_csv(elev_path)
nysm_cats_df = pd.read_csv(nysm_cats_path)

In [None]:
# Peek at the climate-division name stored under index label 1.
nysm_cats_df["climate_division_name"][1]

In [None]:
# Distinct climate-division values, in order of first appearance.
cats = nysm_cats_df["climate_division"].unique()

In [None]:
# Restrict both frames to the stations of the first climate division.
nysm_cats_df1 = nysm_cats_df[nysm_cats_df["climate_division"] == cats[0]]
category_name = nysm_cats_df1["climate_division_name"].unique()[0]
stations = nysm_cats_df1["stid"].tolist()
hrrr_df1 = hrrr_df[hrrr_df["station"].isin(stations)]
nysm_df1 = df[df["station"].isin(stations)]

# Join on station as well as time. Both frames hold several stations, so a join on
# "valid_time" alone pairs every forecast row with every station's observation at
# that time, i.e. mostly with observations from the wrong station.
master_df = hrrr_df1.merge(
    nysm_df1, on=["valid_time", "station"], suffixes=(None, "_nysm")
)
# master_df['day_of_year'] = master_df['valid_time'].dt.dayofyear
# master_df = encode(master_df, 'day_of_year', 366)
# master_df = add_tabular(master_df, nlcd_df, 'nlcd')
# master_df = add_tabular(master_df, aspect_df, 'aspect')
# master_df = add_tabular(master_df, elev_df, 'elev')

# master_df.to_parquet(f'/home/aevans/nwp_bias/src/machine_learning/data/rough_parquets/rough_lstm_nysmcat_{category_name}.parquet')

In [None]:
# Read a single HRRR file from the fh_04/2022 folder into `hrrr` (separate from `hrrr_df`) and display it.
hrrr = pd.read_parquet("/home/aevans/ai2es/lstm/fh_04/2022/20220101_hrrr_fh04.parquet")
hrrr

In [None]:
# Keep only the first row for each (valid_time, station, t2m) combination.
master_df = master_df.drop_duplicates(
    subset=["valid_time", "station", "t2m"], keep="first"
)

In [None]:
# Display the merged forecast/observation frame.
master_df

In [None]:
# Add the cyclical day-of-year encoding and the static nlcd/aspect/elev columns to the HRRR frame.
# NOTE(review): the load below is commented out, so this cell relies on `hrrr_df` already
# existing in the kernel.
# hrrr_df = read_hrrr_data()
hrrr_df["day_of_year"] = hrrr_df["valid_time"].dt.dayofyear
# 366 is passed as the length of one full cycle
hrrr_df = encode(hrrr_df, "day_of_year", 366)
hrrr_df = add_tabular(hrrr_df, nlcd_df, "nlcd")
hrrr_df = add_tabular(hrrr_df, aspect_df, "aspect")
hrrr_df = add_tabular(hrrr_df, elev_df, "elev")

In [None]:
# First distinct climate-division name in the NYSM catalogue.
nysm_cats_df["climate_division_name"].unique()[0]

In [None]:
# Read a previously saved "Western Plateau" parquet file.
# NOTE(review): this reads from .../data/, whereas the commented-out to_parquet call in this
# notebook writes to .../data/rough_parquets/; confirm which location is current.
new_df = pd.read_parquet(
    "/home/aevans/nwp_bias/src/machine_learning/data/rough_lstm_nysmcat_Western Plateau.parquet"
)

In [None]:
# Column labels of the saved frame.
the_keys = new_df.keys()

In [None]:
# Print every column label on its own line.
for k in the_keys:
    print(k)