#### This notebook will take all NYSM data for a specified year & resample to model output times (e.g., 1H & 3H)

In [27]:
%matplotlib inline
import pandas as pd
import xarray as xr
import glob
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units
import datetime
import functools as ft

In [28]:
def get_raw_oksm_data(year):
    oksm_path = f"/home/aevans/nwp_bias/src/landtype/NY_cartopy/csv_city"
    file_dirs = glob.glob(f"{oksm_path}/*")
    file_dirs.sort()

    df_oksm_list = []
    print(f"importing files...")
    for x, _ in enumerate(file_dirs):
        ds_oksm = pd.read_csv(file_dirs[x])

        find_year = ds_oksm.where(ds_oksm["TIME"] < str(year + 1))
        find_year_r2 = find_year.where(find_year["TIME"] > str(year))
        df_oksm_list.append(find_year_r2)

    df_oksm = pd.concat(df_oksm_list).dropna()
    df_oksm = format_ok(df_oksm).dropna()

    # import elevations to dataframe
    df_lon = pd.read_csv("/home/aevans/landtype/geoinfo.csv")
    station_list = df_lon["stid"].tolist()
    elev_list = df_lon["elev"].tolist()
    lon_list = df_lon["elon"].tolist()
    lat_list = df_lon["nlat"].tolist()
    elevdict = {}
    londict = {}
    latdict = {}
    for x, _ in enumerate(station_list):
        elevdict.update({station_list[x]: elev_list[x]})
        londict.update({station_list[x]: lon_list[x]})
        latdict.update({station_list[x]: lat_list[x]})
    df_oksm["elev"] = df_oksm["station"].map(elevdict)
    df_oksm["lon"] = df_oksm["station"].map(londict)
    df_oksm["lat"] = df_oksm["station"].map(latdict)

    # format variables
    temp = units.Quantity(df_oksm["tair"].values, "degC")
    relh = df_oksm["relh"].values / 100.0
    df_oksm["td"] = mpcalc.dewpoint_from_relative_humidity(temp, relh).magnitude
    altimeter_value = units.Quantity(df_oksm["pres"].values, "hPa")
    # + 1.5 to adjust for barometer height
    height = units.Quantity(df_oksm["elev"].values + 1.5, "m")
    df_oksm["mslp"] = mpcalc.altimeter_to_sea_level_pressure(
        altimeter_value, height, temp
    )
    df_oksm["valid_time"] = pd.to_datetime(
        df_oksm["valid_time"], format="%Y-%m-%d %H:%M:%S"
    )
    df_oksm_ = (
        df_oksm.reset_index(drop=True)
        .set_index(["station", "valid_time"])
        .drop(df_oksm.columns[0], axis=1)
    )

    oksm_sites = df_oksm.reset_index()["station"].unique()

    return df_oksm_, oksm_sites

In [29]:
def get_valid_time_data(df, hours_list, interval):
    df = df.reset_index()
    freq = interval
    df_return = df[
        (df["valid_time"].dt.hour.isin(hours_list)) & (df["valid_time"].dt.minute == 0)
    ]
    # try putting this after concat at end
    df_return = df_return.set_index(["station", "valid_time"]).rename_axis(
        index={"valid_time": f"time_{freq}"}
    )
    return df_return

In [30]:
def get_resampled_precip_data(df, interval, method):
    """
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled
    method: min, max, mean, etc. [str]
    """
    precip_diff = df.groupby("station").diff().reset_index().set_index("valid_time")
    # remove unrealistic precipitation values (e.g., > 500 mm / 5 min)
    precip_diff.loc[precip_diff["precip_total"] > 500.0, "precip_total"] = np.nan
    a = (
        precip_diff.groupby("station")
        .resample(interval, label="right")
        .apply(method)
        .rename_axis(index={"valid_time": f"time_{interval}"})
    )
    return a

In [31]:
def format_ok(df):
    df = df.rename(
        columns={
            "STID": "station",
            "TIME": "valid_time",
            "PRES": "pres",
            "TAIR": "tair",
            "TDEW": "td",
            "RELH": "relh",
            "WSPD": "wspd_sonic",
            "WMAX": "wmax_sonic",
            "WDIR": "wdir_sonic",
            "RAIN": "precip_total",
        }
    )
    return df

In [32]:
def get_oksm_dataframe_for_resampled(df_oksm, freq):
    oksm_vars = [
        "lat",
        "lon",
        "elev",
        "tair",
        "td",
        "relh",
        "SRAD",
        "pres",
        "mslp",
        "wspd_sonic",
        "wmax_sonic",
        "wdir_sonic",
        "precip_total",
    ]
    if freq == "1H":
        hours_list = np.arange(0, 24)  # every hour
    elif freq == "3H":
        hours_list = np.arange(0, 24, 3)  # every 3 hours
    dfs = []

    for var in oksm_vars:
        if var in ["precip_total"]:
            print(var)
            dfs += [get_resampled_precip_data(df_oksm[var], freq, "sum")]
        else:
            print(var)
            dfs += [get_valid_time_data(df_oksm[var], hours_list, freq)]

    oksm_obs = pd.concat(dfs, axis=1)
    oksm_obs["precip_total"] = oksm_obs["precip_total"].apply(
        lambda x: np.where(x < 0.0, np.nan, x)
    )
    oksm_obs["tair"] = (oksm_obs["tair"] - 32) * (5 / 9)

    return oksm_obs

In [33]:
def main(year):
    # inputs
    save_path = f"/home/aevans/nwp_bias/data/oksm/"

    # get the raw nysm data
    print("--- get_raw_oksm_data ---")
    df_oksm, oksm_sites = get_raw_oksm_data(year)

    # resample the data to 1H and 3H frequencies
    print("--- get_oksm_dataframe_for_resampled ---")
    oksm_1H_obs = get_oksm_dataframe_for_resampled(df_oksm, "1H").dropna()
    oksm_3H_obs = get_oksm_dataframe_for_resampled(df_oksm, "3H").dropna()

    oksm_1H_obs.to_parquet(f"{save_path}oksm_1H_obs_{year}.parquet")
    oksm_3H_obs.to_parquet(f"{save_path}oksm_3H_obs_{year}.parquet")

In [34]:
df = main(2018)
df

--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total


Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,elev,tair,td,relh,SRAD,pres,mslp,wspd_sonic,wmax_sonic,wdir_sonic,precip_total
station,time_1H,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ACME,2018-01-01 01:00:00,34.80833,-98.02325,397.0,-8.888889,4.697881,47.0,0.0,29.22,28.205207,13.0,17.0,4.0,0.0
ACME,2018-01-01 02:00:00,34.80833,-98.02325,397.0,-9.444444,4.949214,51.0,0.0,29.25,28.238972,11.0,15.0,7.0,0.0
ACME,2018-01-01 03:00:00,34.80833,-98.02325,397.0,-10.000000,4.841935,54.0,0.0,29.26,28.253338,9.0,11.0,9.0,0.0
ACME,2018-01-01 04:00:00,34.80833,-98.02325,397.0,-10.000000,5.619356,57.0,0.0,29.28,28.272779,8.0,11.0,7.0,0.0
ACME,2018-01-01 05:00:00,34.80833,-98.02325,397.0,-10.555556,5.177945,59.0,0.0,29.31,28.306629,8.0,11.0,15.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YUKO,2018-12-31 19:00:00,35.55671,-97.75538,407.0,7.777778,42.643189,84.0,313.0,28.31,27.165527,7.0,10.0,229.0,0.0
YUKO,2018-12-31 20:00:00,35.55671,-97.75538,407.0,8.888889,41.914577,73.0,553.0,28.34,27.187105,10.0,14.0,277.0,0.0
YUKO,2018-12-31 21:00:00,35.55671,-97.75538,407.0,10.000000,42.195341,67.0,358.0,28.37,27.208762,7.0,11.0,282.0,0.0
YUKO,2018-12-31 22:00:00,35.55671,-97.75538,407.0,9.444444,41.247731,67.0,48.0,28.44,27.280012,8.0,11.0,326.0,0.0


In [35]:
years = [int(x) for x in np.arange(2019, 2022)]
print(years)

[2019, 2020, 2021]


In [36]:
for year in years:
    print(year)
    main(year)

2019
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
2020
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
2021
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
lat
lon
elev
tair
td
relh
SRAD
pres
mslp
wspd_sonic
wmax_sonic
wdir_sonic
precip_total
