#### This notebook will take all NYSM data for a specified year & resample to model output times (e.g., 1H & 3H)


In [4]:
%matplotlib inline
import pandas as pd
import xarray as xr
import glob
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units

In [2]:
def get_raw_nysm_data(year):
    # first, find the available months in the year directory
    nysm_path = f"/home/aevans/nysm/archive/nysm/netcdf/proc/{year}/"
    file_dirs = glob.glob(f"{nysm_path}/*")
    file_dirs.sort()
    avail_months = [int(x.split("/")[-1]) for x in file_dirs]

    df_nysm_list = []
    for x in range(avail_months[0], avail_months[-1] + 1):
        print("month index: ", x)
        ds_nysm_month = xr.open_mfdataset(f"{nysm_path}{str(x).zfill(2)}/*.nc")
        df_nysm_list.append(ds_nysm_month.to_dataframe())

    df_nysm = pd.concat(df_nysm_list)

    temp = units.Quantity(df_nysm["tair"].values, "degC")
    relh = df_nysm["relh"].values / 100.0
    df_nysm["td"] = mpcalc.dewpoint_from_relative_humidity(temp, relh).magnitude

    altimeter_value = units.Quantity(df_nysm["pres"].values, "hPa")
    height = units.Quantity(
        df_nysm["elev"].values + 1.5, "m"
    )  # + 1.5 to adjust for barometer height
    df_nysm["mslp"] = mpcalc.altimeter_to_sea_level_pressure(
        altimeter_value, height, temp
    )

    nysm_sites = df_nysm.reset_index()["station"].unique()

    return df_nysm, nysm_sites


def get_resampled_data(df, interval, method):
    """
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled
    method: min, max, mean, etc. [str]
    """
    return (
        df.reset_index()
        .set_index("time_5M")
        .groupby("station")
        .resample(interval, label="right")
        .apply(method)
        .rename_axis(index={"time_5M": f"time_{interval}"})
    )


def get_valid_time_data(df, hours_list, interval):
    df = df.reset_index()
    # extract hourly observations at top of the hour in provided list
    df_return = df[
        (df["time_5M"].dt.hour.isin(hours_list)) & (df["time_5M"].dt.minute == 0)
    ]
    return df_return.set_index(["station", "time_5M"]).rename_axis(
        index={"time_5M": f"time_{interval}"}
    )


def get_resampled_precip_data(df, interval, method):
    """
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled
    method: min, max, mean, etc. [str]
    """
    precip_diff = df.groupby("station").diff().reset_index().set_index("time_5M")
    # remove unrealistic precipitation values (e.g., > 500 mm / 5 min)
    precip_diff.loc[precip_diff["precip_total"] > 500.0, "precip_total"] = np.nan
    a = (
        precip_diff.groupby("station")
        .resample(interval, label="right")
        .apply(method)
        .rename_axis(index={"time_5M": f"time_{interval}"})
    )
    return a


def get_resampled_wind_data(df, interval, method):
    """
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled
    method: min, max, mean, etc. [str]
    """
    df = df.reset_index()
    wind_resampled = (
        df.groupby(["station", pd.Grouper(freq=interval, key="time_5M")])["wspd_sonic"]
        .apply(method)
        .rename(f"wspd_sonic_{method}")
        .rename_axis(index={"time_5M": f"time_{interval}"})
        .reset_index()
        .set_index(["station", f"time_{interval}"])
    )
    return wind_resampled


def get_nysm_dataframe_for_resampled(df_nysm, freq):
    nysm_vars = [
        "lat",
        "lon",
        "elev",
        "tair",
        "ta9m",
        "td",
        "relh",
        "srad",
        "pres",
        "mslp",
        "wspd_sonic",
        "wmax_sonic",
        "wdir_sonic",
        "precip_total",
        "snow_depth",
    ]
    if freq == "1H":
        hours_list = np.arange(0, 24)  # every hour
    elif freq == "3H":
        hours_list = np.arange(0, 24, 3)  # every 3 hours

    precip_dfs = []
    wind_dfs = []

    for var in nysm_vars:
        print(var)
        if var == "precip_total":
            precip_dfs.append(get_resampled_precip_data(df_nysm[var], freq, "sum"))
        elif var == "wspd_sonic":
            wind_resampled = get_resampled_wind_data(df_nysm[var], freq, "mean")
            wind_valid_time = get_valid_time_data(df_nysm[var], hours_list, freq)
            # Combine wind data with valid time data
            wind_dfs.append(wind_resampled)
            wind_dfs.append(wind_valid_time)
        else:
            wind_dfs.append(get_valid_time_data(df_nysm[var], hours_list, freq))

    precip_combined = pd.concat(precip_dfs, axis=1)
    wind_combined = pd.concat(wind_dfs, axis=1)

    # Concatenate precip and wind data frames
    nysm_obs = pd.concat([wind_combined, precip_combined], axis=1)

    # Apply condition to precip_total column
    nysm_obs["precip_total"] = nysm_obs["precip_total"].apply(
        lambda x: np.where(x < 0.0, np.nan, x)
    )
    return nysm_obs

In [3]:
def main(year):
    # inputs
    save_path = f"/home/aevans/nwp_bias/data/nysm/"

    # get the raw nysm data
    print("--- get_raw_nysm_data ---")
    df_nysm, nysm_sites = get_raw_nysm_data(year)

    # resample the data to 1H and 3H frequencies
    print("--- get_nysm_dataframe_for_resampled ---")
    nysm_1H_obs = get_nysm_dataframe_for_resampled(df_nysm, "1H")
    nysm_3H_obs = get_nysm_dataframe_for_resampled(df_nysm, "3H")

    # nysm_1H_obs.to_parquet(f'{save_path}nysm_1H_obs_{year}.parquet')
    # nysm_3H_obs.to_parquet(f'{save_path}nysm_3H_obs_{year}.parquet')

    return nysm_1H_obs

In [4]:
df = main(2018)
df

--- get_raw_nysm_data ---
month index:  1
month index:  2
month index:  3
month index:  4
month index:  5
month index:  6
month index:  7
month index:  8
month index:  9
month index:  10
month index:  11
month index:  12
--- get_nysm_dataframe_for_resampled ---
lat
lon
elev
tair
ta9m
td
relh
srad
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total


  precip_diff.groupby("station")


snow_depth
lat
lon
elev
tair
ta9m
td
relh
srad
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total


  precip_diff.groupby("station")


snow_depth


Unnamed: 0_level_0,Unnamed: 1_level_0,precip_total,lat,lon,elev,tair,ta9m,td,relh,srad,pres,mslp,wspd_sonic_mean,wspd_sonic,wmax_sonic,wdir_sonic,snow_depth
station,time_1H,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ADDI,2018-01-01 01:00:00,0.0,42.040359,-77.237259,507.614014,-18.237820,-18.122169,-22.090469,71.722794,0.000000,964.492004,971.503723,1.485138,1.456784,3.037127,325.875793,0.045877
ADDI,2018-01-01 02:00:00,0.0,42.040359,-77.237259,507.614014,-18.368231,-18.339060,-21.647659,75.388893,0.000000,964.286804,971.328857,1.565861,1.484595,2.211236,305.772797,0.046327
ADDI,2018-01-01 03:00:00,0.0,42.040359,-77.237259,507.614014,-18.830400,-18.333630,-21.994141,76.068916,0.000000,964.409973,971.574524,1.550868,1.247007,1.992487,309.544586,0.045029
ADDI,2018-01-01 04:00:00,0.0,42.040359,-77.237259,507.614014,-18.518641,-18.328711,-21.890549,74.752434,0.000000,964.479187,971.563843,1.884284,1.821808,3.129470,314.346588,0.047812
ADDI,2018-01-01 05:00:00,0.0,42.040359,-77.237259,507.614014,-18.635839,-18.578341,-21.847321,75.787628,0.000000,964.581726,971.698914,1.649987,1.780934,2.918397,294.888092,0.047251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WGAT,2018-01-01 00:00:00,,43.532410,-75.158600,442.966003,,,,,,,,,,,,
WHIT,2018-01-01 00:00:00,,43.485073,-73.423073,36.563801,-16.304770,-15.851770,-23.937332,51.771172,0.220189,1021.562988,1022.429016,2.180317,1.393547,3.209960,50.748920,0.213306
WOLC,2018-01-01 00:00:00,,43.228680,-76.842613,121.219002,,,,,,,,,,,,
YORK,2018-01-01 00:00:00,,42.855042,-77.847763,177.942001,-20.394730,-17.461361,-22.580978,82.625633,0.009835,1007.806030,1011.042786,0.985538,1.769329,1.986342,216.832993,0.076018


In [5]:
for k in df.keys():
    print(k)

precip_total
lat
lon
elev
tair
ta9m
td
relh
srad
pres
mslp
wspd_sonic_mean
wspd_sonic
wmax_sonic
wdir_sonic
snow_depth


In [6]:
df = df.reset_index()

In [7]:
df.iloc[:15]

Unnamed: 0,station,time_1H,precip_total,lat,lon,elev,tair,ta9m,td,relh,srad,pres,mslp,wspd_sonic_mean,wspd_sonic,wmax_sonic,wdir_sonic,snow_depth
0,ADDI,2018-01-01 01:00:00,0.0,42.040359,-77.237259,507.614014,-18.23782,-18.122169,-22.090469,71.722794,0.0,964.492004,971.503723,1.485138,1.456784,3.037127,325.875793,0.045877
1,ADDI,2018-01-01 02:00:00,0.0,42.040359,-77.237259,507.614014,-18.368231,-18.33906,-21.647659,75.388893,0.0,964.286804,971.328857,1.565861,1.484595,2.211236,305.772797,0.046327
2,ADDI,2018-01-01 03:00:00,0.0,42.040359,-77.237259,507.614014,-18.8304,-18.33363,-21.994141,76.068916,0.0,964.409973,971.574524,1.550868,1.247007,1.992487,309.544586,0.045029
3,ADDI,2018-01-01 04:00:00,0.0,42.040359,-77.237259,507.614014,-18.518641,-18.328711,-21.890549,74.752434,0.0,964.479187,971.563843,1.884284,1.821808,3.12947,314.346588,0.047812
4,ADDI,2018-01-01 05:00:00,0.0,42.040359,-77.237259,507.614014,-18.635839,-18.578341,-21.847321,75.787628,0.0,964.581726,971.698914,1.649987,1.780934,2.918397,294.888092,0.047251
5,ADDI,2018-01-01 06:00:00,0.0,42.040359,-77.237259,507.614014,-18.700809,-18.628189,-22.008728,75.137451,0.0,964.307983,971.436768,1.326938,1.593597,2.972308,300.071198,0.045546
6,ADDI,2018-01-01 07:00:00,0.0,42.040359,-77.237259,507.614014,-19.828449,-18.99943,-22.554947,78.869072,0.0,964.62439,972.054993,1.063964,0.881706,1.747103,299.200897,0.047525
7,ADDI,2018-01-01 08:00:00,0.0,42.040359,-77.237259,507.614014,-19.028799,-18.68709,-21.532196,80.559052,0.0,964.603577,971.824036,1.256366,1.40832,2.517742,295.739594,0.046681
8,ADDI,2018-01-01 09:00:00,0.0,42.040359,-77.237259,507.614014,-18.81242,-18.61508,-21.197678,81.426468,0.0,964.42157,971.581787,1.663465,1.245601,3.024723,302.81839,0.044967
9,ADDI,2018-01-01 10:00:00,0.0,42.040359,-77.237259,507.614014,-18.851509,-18.62261,-21.637848,78.620453,0.0,964.62439,971.798584,1.443731,1.240315,2.010922,306.347809,0.046067


In [8]:
df["precip_total"].unique()

array([ 0.        ,  0.1499939 ,  0.06001282, ...,  3.78001404,
       11.68997192,  0.12000006])

In [None]:
wind_mean_hourly = (
    df.groupby(["station", pd.Grouper(freq="1H", key="time_5M")])["wspd_sonic"]
    .mean()
    .rename("wspd_sonic_mean")
    .rename_axis(index={"time_5M": "time_1H"})
)

In [None]:
wind_mean_hourly

In [None]:
freq = "1H"

In [None]:
if freq == "1H":
    hours_list = np.arange(0, 24)

In [None]:
wind_diff

In [None]:
# a = wind_diff.groupby("station").resample('1H', label="right").apply('mean').rename_axis(index={"time_5M": f"time_{freq}"})

In [None]:
a

In [None]:
years = [str(x) for x in np.arange(2018, 2022)]
print(years)

In [None]:
for year in years:
    print(year)
    main(year)

In [None]:
year = 2018
df_nysm, nysm_sites = get_raw_nysm_data(year)

In [None]:
df_nysm