In [788]:
%load_ext autoreload
%autoreload 2
import sys

# instead of creating a package using setup.py or building from a docker/singularity file,
# import the sister directory of src code to be called on in notebook.
# This keeps the notebook free from code to only hold visualizations and is easier to test
# It also helps keep the state of variables clean such that cells aren't run out of order with a mysterious state
sys.path.append("..")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [789]:
import glob
import multiprocessing as mp
import os
import warnings
import cfgrib
import metpy.calc as mpcalc
import numpy as np
import pandas as pd
import time
from metpy.units import units
from scipy import interpolate
from sklearn.neighbors import BallTree
from multiprocessing import Process

In [790]:
def load_nysm_data(year):
    # these parquet files are created by running "get_resampled_nysm_data.ipynb"
    nysm_path = "/home/aevans/nwp_bias/data/nysm/"

    nysm_1H_obs = pd.read_parquet(f"{nysm_path}nysm_1H_obs_{year}.parquet")
    nysm_3H_obs = pd.read_parquet(f"{nysm_path}nysm_3H_obs_{year}.parquet")
    return nysm_1H_obs, nysm_3H_obs

In [791]:
def read_data_ny(model, month, year, fh):
    cleaned_data_path = f"/home/aevans/ai2es/lstm/{model.upper()}/fh_{fh}/"

    filelist = glob.glob(f"{cleaned_data_path}{year}/{month}/*.parquet")
    filelist.sort()

    li = []
    for filename in filelist:
        df_temp = pd.read_parquet(filename)
        li.append(
            df_temp.reset_index()
        )  # reset the index in case indices are different among files
    df = pd.concat(li)
    return df

In [792]:
def reformat_df(df):
    df = df.dropna()
    df = df.reset_index()
    df["longitude"] = ((df["longitude"] + 180) % 360) - 180
    df[["t2m", "d2m"]] = df[["t2m", "d2m"]] - 273.15  # convert from K to deg C
    u10 = units.Quantity(df["u10"].values, "m/s")
    v10 = units.Quantity(df["v10"].values, "m/s")
    df["u_total"] = mpcalc.wind_speed(u10, v10).magnitude
    df["u_dir"] = mpcalc.wind_direction(u10, v10, convention="from").magnitude

    lead_time_delta = df["valid_time"] - df["time"]
    df["lead time"] = (24.0 * lead_time_delta.dt.days) + divmod(
        lead_time_delta.dt.seconds, 3600
    )[0]

    df = df.set_index("valid_time")
    return df

In [793]:
def interpolate_func_griddata(values, model_lon, model_lat, xnew, ynew):
    if np.mean(values) == np.nan:
        print("SOME VALS ARE NAN")
    vals = interpolate.griddata(
        (model_lon, model_lat), values, (xnew, ynew), method="linear"
    )
    if np.mean(values) == np.nan:
        print("SOME VALS ARE NAN")
    return vals

In [794]:
def datetime_convert(df, col):
    new_vals = []
    for i,_ in enumerate(df[col]):
        seconds = df[col].iloc[1] / 1e9  # Convert nanoseconds to seconds
        dt = datetime.datetime.utcfromtimestamp(seconds)
        new_vals.append(dt)
    df[col] = new_vals
    return df

In [795]:
def interpolate_model_data_to_nysm_locations_groupby(df_model, df_nysm, vars_to_interp):
    """
    Use this function if you would like to interpolate to NYSM locations rather than use the ball tree with
    the existing model grid.
    This function interpolates the model grid to the NYSM site locations for each variable in dataframe.
    The new dataframe is then returned.

    This function should not be used with the HRRR grid, as it is incredibly slow.
    """
    # New York
    df_nysm = df_nysm.groupby("station").mean()[["lat", "lon"]]
    xnew = df_nysm["lon"]
    ynew = df_nysm["lat"]

    df_model = df_model.reset_index().set_index("time")

    # if vals != points in interpolation routine
    # called a few lines below, it's because this line is finding multiple of the same valid_times which occurs at times later in the month
    # grab a smaller subset of the data encompassing New York State and Oklahoma
    df_model_ny = df_model[
        (df_model.latitude >= 39.0)
        & (df_model.latitude <= 47.0)
        & (df_model.longitude <= -71.0)
        & (df_model.longitude >= -80.0)
    ]

    model_lon_lat_ny = df_model_ny[
        df_model_ny["valid_time"] == df_model_ny["valid_time"].unique().min()
    ]

    model_lon_ny = model_lon_lat_ny["longitude"].values
    model_lat_ny = model_lon_lat_ny["latitude"].values

    df_model = df_model_ny[df_model_ny["longitude"].isin(model_lon_ny)]
    df_model = df_model.reset_index()

    # NEED TO FIX THIS
    df = pd.DataFrame()
    for v, var in enumerate(vars_to_interp):
        print(var)
        df[var] = df_model.groupby(["valid_time"])[var].apply(
            interpolate_func_griddata, model_lon_ny, model_lat_ny, xnew, ynew
        )
    print(df)
    df_explode = df.apply(lambda col: col.explode())
    print("explode1",df_explode)

    # add in the lat & lon & station
    if "latitude" in df_explode.keys():
        print("adding NYSM site column")
        nysm_sites = df_nysm.reset_index().station.unique()
        model_interp_lats = df_explode.latitude.unique()
        map_dict = {model_interp_lats[i]: nysm_sites[i] for i in range(len(nysm_sites))}
        df_explode["station"] = df_explode["latitude"].map(map_dict)
    
    df_explode = datetime_convert(df_explode, 'valid_time')
    df_explode = datetime_convert(df_explode, 'time')
    print("explode2",df_explode)
    return df_explode

In [796]:
def get_locations_for_ball_tree(df, nysm_1H_obs):
    locations_a = df.reset_index()[["latitude", "longitude"]]
    locations_b = nysm_1H_obs[["lat", "lon"]].dropna().drop_duplicates().reset_index()

    # ball tree to find nysm site locations
    # locations_a ==> build the tree
    # locations_b ==> query the tree
    # Creates new columns converting coordinate degrees to radians.
    for column in locations_a[["latitude", "longitude"]]:
        rad = np.deg2rad(locations_a[column].values)
        locations_a[f"{column}_rad"] = rad

    for column in locations_b[["lat", "lon"]]:
        rad = np.deg2rad(locations_b[column].values)
        locations_b[f"{column}_rad"] = rad

    return locations_a, locations_b

In [797]:
def haversine(lon1, lat1,lon2, lat2):
    import math
    R = 6371000  # radius of Earth in meters
    phi_1 = math.radians(lat1)
    phi_2 = math.radians(lat2)

    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2.0) ** 2 + math.cos(phi_1) * math.cos(phi_2) * math.sin(delta_lambda / 2.0) ** 2
    
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    meters = R * c  # output distance in meters
    km = meters / 1000.0  # output distance in kilometers

    km = round(km, 3)

    return km

In [798]:
def get_ball_tree_indices_ny(model_data, nysm_1H_obs):
    locations_a, locations_b = get_locations_for_ball_tree(model_data, nysm_1H_obs)
    # Takes the first group's latitude and longitude values to construct the ball tree.

    ball = BallTree(
        locations_a[["latitude_rad", "longitude_rad"]].values, metric="haversine"
    )
    # k: The number of neighbors to return from tree
    k = 1
    # Executes a query with the second group. This will also return two arrays.
    distances, indices = ball.query(locations_b[["lat_rad", "lon_rad"]].values, k=k)
    # get indices in a format where we can query the df
    indices_list = [indices[x][0] for x in range(len(indices))]
    distances_list = [distances[x][0] for x in range(len(distances))]
    return indices_list

In [799]:
def find_closest_station(query_lat, query_lon, df):
    """
    Find the station with the closest latitude and longitude to the given query point.

    Parameters:
    query_lat (float): Latitude of the query point.
    query_lon (float): Longitude of the query point.
    df (pandas.DataFrame): DataFrame containing stations, latitudes, and longitudes.

    Returns:
    str: Name of the station with the closest latitude and longitude.
    float: Latitude of the closest station.
    float: Longitude of the closest station.
    """
    # Calculate distances for each station in the DataFrame
    df['distance'] = np.sqrt((df['latitude'] - query_lat)**2 + (df['longitude'] - query_lon)**2)
    
    # Find the station with the minimum distance
    closest_station = df.loc[df['distance'].idxmin()]
    
    try: 
        lon_q = closest_station['longitude'].iloc[0]
        lat_q = closest_station['latitude'].iloc[0]
    except:
        lon_q = closest_station['longitude']
        lat_q = closest_station['latitude']
    
    return closest_station, lon_q, lat_q

In [800]:
def df_with_nysm_locations(df, df_nysm, indices_list):
    # needs to mirror the df manipulations in get_locations_for_ball_tree locations a and b
    df = df.reset_index()
    df_nysm = df_nysm.dropna().reset_index()
    df_closest_locs = df.iloc[indices_list][["latitude", "longitude"]]
    df_closest_locs = df_closest_locs.drop_duplicates()
    df_nysm_station_locs = df_nysm.groupby("station")[["lat", "lon"]].mean()
    distances = []
    stations = []

    for x in range(len(df_nysm_station_locs.index)):
        df_dummy = pd.DataFrame()
        temp_ = df.iloc[indices_list]
        station_q, longitude_q, latitide_q = find_closest_station(df_nysm_station_locs.lat[x], df_nysm_station_locs.lon[x], temp_) 

        df_dummy = df[
                (df['latitude'] == latitide_q)
                & (df['longitude'] == longitude_q)
            ]
        df_dummy["station"] = df_nysm_station_locs.index[x]
        if x == 0:
            df_save = df_dummy
        else:
            df_save = pd.concat([df_save, df_dummy])

        dx = haversine(df_dummy['longitude'].iloc[0],df_dummy['latitude'].iloc[0], df_nysm_station_locs.lon[x], df_nysm_station_locs.lat[x])
        # get distances of GFS grid points to NYSM sites 
        distances.append(dx)
        #append station
        stations.append(df_dummy['station'].iloc[0])
        #plot the points
        # plot_points(df_dummy['longitude'].iloc[0],df_dummy['latitude'].iloc[0], df_nysm_station_locs.lon[x], df_nysm_station_locs.lat[x], df_dummy['station'].iloc[0])

    temp_df = pd.DataFrame()
    temp_df['station'] = stations
    temp_df['distance'] = distances
    interpolate_stations = []
    for i,_ in enumerate(temp_df['station']):
        if temp_df['distance'].iloc[i] > 5.0:
            interpolate_stations.append(temp_df['station'].iloc[i])
    df_save = df_save.set_index(["station", "valid_time"])

    return df_save, interpolate_stations

In [801]:
def redefine_precip_intervals_NAM(data, dt):
    # dt is 1 for 1H and 3 for 3H
    tp_data = data.reset_index().set_index(["valid_time", "time", "station"])[["tp"]]
    # get valid times 00, 06, 12, & 18
    tp_data["new tp keep"] = tp_data[
        (tp_data.index.get_level_values(level=0).hour == 12 + dt)
        | (tp_data.index.get_level_values(level=0).hour == 0 + dt)
    ]
    tp_data["tp to diff"] = tp_data[
        (tp_data.index.get_level_values(level=0).hour != 12 + dt)
        | (tp_data.index.get_level_values(level=0).hour != 0 + dt)
    ]["tp"]
    dummy = (
        tp_data.reset_index()
        .set_index(["station", "time", "valid_time"])
        .sort_index(level=1)
        .shift(periods=1)
    )
    tp_data["tp shifted"] = dummy.reset_index().set_index(
        ["valid_time", "time", "station"]
    )["tp"]
    tp_data["tp diff"] = tp_data["tp to diff"] - tp_data["tp shifted"]
    tp_data["new_tp"] = tp_data["new tp keep"].combine_first(tp_data["tp diff"])
    tp_data = tp_data.drop(
        columns=["new tp keep", "tp to diff", "tp shifted", "tp diff"]
    )

    # merge in with original dataframe
    data = data.reset_index().set_index(["valid_time", "time", "station"])
    data["new_tp"] = tp_data["new_tp"].clip(lower=0)
    return data

In [802]:
def redefine_precip_intervals_GFS(data):
    tp_data = data.reset_index().set_index(["valid_time", "time", "station"])[
        ["tp", "lead time"]
    ]
    # get valid times 00, 06, 12, & 18
    tp_data["new tp 1"] = tp_data.loc[
        (tp_data.index.get_level_values(level=0).hour % 6 == 0)
    ]["tp"]
    dummy = (
        tp_data.reset_index()
        .set_index(["station", "time", "valid_time"])
        .sort_index(level=1)
        .shift(periods=1)
    )
    tp_data["tp shifted"] = dummy.reset_index().set_index(
        ["valid_time", "time", "station"]
    )["tp"]
    tp_data["tp diff"] = tp_data["new tp 1"] - tp_data["tp shifted"]
    tp_data["new tp 2"] = tp_data.loc[
        (tp_data.index.get_level_values(level=0).hour % 6 != 0)
    ]["tp"]
    tp_data["new_tp"] = tp_data["new tp 2"].combine_first(tp_data["tp diff"])
    tp_data = tp_data.drop(columns=["tp shifted", "tp diff", "new tp 1", "new tp 2"])

    # merge in with original dataframe
    data = data.reset_index().set_index(["valid_time", "time", "station"])
    data["new_tp"] = tp_data["new_tp"].clip(lower=0)
    return data

In [803]:
def redefine_precip_intervals_HRRR(data):
    # dt is 1 for 1H and 3 for 3H
    tp_data = data.reset_index().set_index(["valid_time", "time", "station"])[
        ["tp", "lead time"]
    ]
    # should use lead time here instead of valid_time (to be more generalizable)
    tp_data["new tp keep"] = tp_data[tp_data["lead time"] == 1]["tp"]
    tp_data["tp to diff"] = tp_data[(tp_data["lead time"] != 1)]["tp"]
    dummy = (
        tp_data.reset_index()
        .set_index(["station", "time", "valid_time"])
        .sort_index(level=1)
        .shift(periods=1)
    )
    tp_data["tp shifted"] = dummy.reset_index().set_index(
        ["valid_time", "time", "station"]
    )["tp"]
    tp_data["tp diff"] = tp_data["tp"].diff()
    tp_data["new_tp"] = tp_data["new tp keep"].combine_first(tp_data["tp diff"])
    tp_data = tp_data.drop(
        columns=["new tp keep", "tp to diff", "tp shifted", "tp diff", "lead time"]
    )

    # merge in with original dataframe
    data = data.reset_index().set_index(["valid_time", "time", "station"])
    # replacing negative values with 0...the negative values are occurring during the forecast period which is unexpected behavior
    # as the precipitation forecast should accumulate throughout forecast period
    data["new_tp"] = tp_data["new_tp"].clip(lower=0)
    return data

In [804]:
def drop_unwanted_time_diffs(df_model_both_sites, t_int):
    # get rid of uneven time intervals that mess up precipitation forecasts

    # t_int == 3 for GFS and NAM > f36
    # t_int == 1 for NAM <= f36 & HRRR
    df_model_both_sites["lead time diff"] = df_model_both_sites.groupby(
        ["station", "time"]
    )["lead time"].diff()
    # following line fixes the issue where the lead time difference is nan for f01 and f39 because of the diff - we don't want to drop these later in the func
    df_model_both_sites = df_model_both_sites.fillna(value={"lead time diff": t_int})

    df_model_both_sites = df_model_both_sites.drop(
        df_model_both_sites[
            (df_model_both_sites["lead time diff"] > t_int)
            | (df_model_both_sites["lead time diff"].isnull())
        ].index
    )
    df_model_both_sites = df_model_both_sites.drop(columns=["lead time diff"])
    return df_model_both_sites

In [805]:
def mask_out_water(model, df_model):
    df_model = df_model.reset_index()
    # read in respective data
    # these files are hard coded since we only need land surface information that was not extracted from original files
    # within the data cleaning script
    indir = f"/home/aevans/nwp_bias/data/model_data/{model.upper()}/2018/01/"
    if model.upper() == "GFS":
        filename = "gfs_4_20180101_0000_003.grb2"
        ind = 42
        var = "landn"
    elif model.upper() == "NAM":
        filename = "nam_218_20180101_0000_003.grb2"
        ind = 26
        var = "lsm"
    elif model.upper() == "HRRR":
        filename = "20180101_hrrr.t00z.wrfsfcf03.grib2"
        ind = 34
        var = "lsm"
    ds = cfgrib.open_datasets(f"{indir}{filename}")

    ds_tointerp = ds[ind]  # extract the data array that contains land surface class
    ds_tointerp = ds_tointerp.assign_coords(
        {"longitude": (((ds_tointerp.longitude + 180) % 360) - 180)}
    )
    if model.upper() == "GFS":
        ds_tointerp = ds_tointerp.sortby("longitude")
    df_tointerp = ds_tointerp.to_dataframe(dim_order=None).reset_index()

    # will need to use this dataframe & merge with data (that needs to be interpolated)
    # based on lat/lon values
    df_model_merge = df_model.merge(
        df_tointerp[["latitude", "longitude", var]], on=["latitude", "longitude"]
    )
    df_model_merge = df_model_merge[
        df_model_merge[var] == 1
    ]  # only return grid cells over land

    return df_model_merge

In [806]:
def main(month, year, model, fh, mask_water=True):
    start_time = time.time()
    """
    This function loads in the parquet data cleaned from the grib files and interpolates (GFS, NAM) or finds the nearest
    grid neighbor (HRRR) for each specified variable to each NYSM site location across NYS. It also calculates the
    precipitation accumulation over 1-h (HRRR, NAM when forecast hour <= 36) and 3-h (GFS, NAM when forecast hour > 36)
    increments. These data are saved as parquet files.

    The following parameters need to be passed into main():

    month (str) - integer corresponding to calendar month (e.g. '01' is January, '02' is Februrary, etc.)
    year (str) - the year of interest (e.g., '2020')
    model (str) - hrrr, nam, gfs
    init(str) - initilization time for model, '00' or '12' UTC
    mask_water (bool) - true to mask out grid cells over water before interpolation/nearest-neighbor,
                        false to leave all grid cells available for interpolation/nearest-neighbor
    """
    if model == "HRRR":
        pres = "mslma"
    else:
        pres = "prmsl"

    nysm_1H_obs, nysm_3H_obs = load_nysm_data(year)
    df_model_ny = read_data_ny(model, month, year, fh)
    model = model.upper()

    # drop some info that got carried over from xarray data array
    keep_vars = [
        "valid_time",
        "time",
        "latitude",
        "longitude",
        "t2m",
        "sh2",
        "d2m",
        "r2",
        "u10",
        "v10",
        "tp",
        pres,
        "orog",
        "tcc",
        #"asnow",
        "cape",
        "cin",
        "dswrf",
        "dlwrf",
        "gh",
        ]

    if "x" in df_model_ny.keys():
        df_model_ny = df_model_ny.drop(
            columns=["x", "y"]
        )  # drop x & y if they're columns since reindex will fail with them in original index

    df_model_ny = df_model_ny.reset_index()[keep_vars]
    df_model_ny = reformat_df(df_model_ny)

    if mask_water==True:
        print("Masking Water")
        # before interpolation or nearest neighbor methods, mask out any grid cells over water
        df_model_ny = mask_out_water(model, df_model_ny)

    if model in ["GFS", "NAM"]:
        print("interpolating variables")
        vars_to_interp = [
                "valid_time",
                "time",
                "latitude",
                "longitude",
                "t2m",
                "sh2",
                "d2m",
                "r2",
                "u10",
                "v10",
                "u_total",
                "u_dir",
                "tp",
                pres,
                "orog",
                "tcc",
                #"asnow",
                "cape",
                "cin",
                "dswrf",
                "dlwrf",
                "gh",
                ]

        indices_list_ny = get_ball_tree_indices_ny(df_model_ny, nysm_1H_obs)


        # nearest neighbor
        df_model_nysm_sites_nn, interpolate_stations = df_with_nysm_locations(
            df_model_ny, nysm_1H_obs, indices_list_ny
        )
        print("Nearest neighbor", df_model_nysm_sites_nn)

        #interpolation 
        df_model_nysm_sites_interp = interpolate_model_data_to_nysm_locations_groupby(
            df_model_ny, nysm_1H_obs, vars_to_interp
        )
        print("interpolate", df_model_nysm_sites_interp)
        df_model_nysm_sites_nn["lead time"] = (
            df_model_nysm_sites_nn["lead time"].astype(float).round(0).astype(int)
        )

        df_model_nysm_sites_nn.reset_index(inplace=True)

        # print("nearest neighbor", df_model_nysm_sites_nn)
        # print("interpolation", df_model_nysm_sites_interp)
        # print('interpolate_stations', interpolate_stations)

        # join dataframes
        # Filter out rows where 'station' is not in interpolate_stations
        df_model_nysm_sites_nn = df_model_nysm_sites_nn[~df_model_nysm_sites_nn['station'].isin(interpolate_stations)]
        # Filter out rows where 'station' is in interpolate_stations
        df_model_nysm_sites_interp = df_model_nysm_sites_interp[df_model_nysm_sites_interp['station'].isin(interpolate_stations)]


        df_model_nysm_sites = pd.concat([df_model_nysm_sites_interp, df_model_nysm_sites_nn], axis=0)

    elif model == "HRRR":
        indices_list_ny = get_ball_tree_indices_ny(df_model_ny, nysm_1H_obs)
        df_model_nysm_sites = df_with_nysm_locations(
            df_model_ny, nysm_1H_obs, indices_list_ny
        )

        # to avoid future issues, convert lead time to float, round, and then convert to integer
        # without rounding first, the conversion to int will round to the floor, leading to incorrect lead times
        df_model_nysm_sites["lead time"] = (
            df_model_nysm_sites["lead time"].astype(float).round(0).astype(int)
        )

    # now get precip forecasts in smallest intervals (e.g., 1-h and 3-h) possible
    if model == "NAM":
        model_data_1H_ny = df_model_nysm_sites[df_model_nysm_sites["lead time"] <= 36]
        model_data_3H_ny = df_model_nysm_sites[df_model_nysm_sites["lead time"] > 36]

        # NY
        df_model_sites_1H_ny = redefine_precip_intervals_NAM(model_data_1H_ny, 1)
        df_model_sites_1H_ny = drop_unwanted_time_diffs(df_model_sites_1H_ny, 1.0)
        df_model_sites_3H_ny = redefine_precip_intervals_NAM(model_data_3H_ny, 3)
        df_model_sites_3H_ny = drop_unwanted_time_diffs(df_model_sites_3H_ny, 3.0)
        df_model_sites_1H_ny = redefine_precip_intervals_NAM(model_data_1H_ny, 1)
        df_model_sites_1H_ny = drop_unwanted_time_diffs(df_model_sites_1H_ny, 1.0)
        df_model_sites_3H_ny = redefine_precip_intervals_NAM(model_data_3H_ny, 3)
        df_model_sites_3H_ny = drop_unwanted_time_diffs(df_model_sites_3H_ny, 3.0)

        df_model_nysm_sites = pd.concat([df_model_sites_1H_ny, df_model_sites_3H_ny])

    # elif model == "GFS":
    #     df_model_nysm_sites = redefine_precip_intervals_GFS(df_model_nysm_sites)
    #     df_model_nysm_sites = drop_unwanted_time_diffs(df_model_nysm_sites, 3.0)
    # elif model == "HRRR":
    #     df_model_nysm_sites = redefine_precip_intervals_HRRR(df_model_nysm_sites)
    #     df_model_nysm_sites = drop_unwanted_time_diffs(df_model_nysm_sites, 1.0)

    savedir = f"/home/aevans/nwp_bias/src/machine_learning/data/gfs_data/fh{fh}/"
    df_model_nysm_sites=df_model_nysm_sites.fillna(0)
    print("final", df_model_nysm_sites)
    # savedir = f'/home/lgaudet/model-data/GFS/GFSv16_parallel/interp/'
    # if mask_water:
    #     df_model_nysm_sites.to_parquet(
    #         f"{savedir}/{model}_{year}_{month}_direct_compare_to_nysm_sites_mask_water.parquet"
    #     )
    # else:
    #     df_model_nysm_sites.to_parquet(
    #         f"{savedir}/{model}_-{year}_{month}_direct_compare_to_nysm_sites.parquet"
    #     )

    timer9 = time.time() - start_time

    print(f"Saving New Files For :: {model} : {year}--{month}")
    print("--- %s seconds ---" % (timer9))
    return df_model_nysm_sites

In [807]:
df_model_nysm_sites = main(str(1).zfill(2), 2018, "gfs", str(6).zfill(3))

Masking Water
interpolating variables


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = np.sqrt((df['latitude'] - query_lat)**2 + (df['longitude'] - query_lon)**2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dummy["station"] = df_nysm_station_locs.index[x]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['distance'] = np.sqrt((df['latitude'] - query_lat)**2 + (

Nearest neighbor                              level_0  index                time  latitude  \
station valid_time                                                          
ADDI    2018-01-01 06:00:00    43648    352 2018-01-01 00:00:00      42.0   
        2018-01-01 12:00:00    43649   1131 2018-01-01 06:00:00      42.0   
        2018-01-01 18:00:00    43650   1910 2018-01-01 12:00:00      42.0   
        2018-01-02 00:00:00    43651   2689 2018-01-01 18:00:00      42.0   
        2018-01-02 06:00:00    43652   3468 2018-01-02 00:00:00      42.0   
...                              ...    ...                 ...       ...   
YORK    2018-01-31 00:00:00    54183  93137 2018-01-30 18:00:00      43.0   
        2018-01-31 06:00:00    54184  93916 2018-01-31 00:00:00      43.0   
        2018-01-31 12:00:00    54185  94695 2018-01-31 06:00:00      43.0   
        2018-01-31 18:00:00    54186  95474 2018-01-31 12:00:00      43.0   
        2018-02-01 00:00:00    54187  96253 2018-01-31 18:0

In [808]:
df_model_nysm_sites

Unnamed: 0,valid_time,time,latitude,longitude,t2m,sh2,d2m,r2,u10,v10,...,cape,cin,dswrf,dlwrf,gh,station,level_0,index,lead time,landn
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.040359,-77.237259,-17.423811,0.000752,-20.837325,86.490169,2.821225,-2.485386,...,0.0,0.048242,0.0,152.145966,5202.046425,ADDI,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.182270,-74.801392,-20.474970,0.000589,-23.826337,91.103712,1.679245,-2.638395,...,0.0,0.048242,0.0,162.006020,5193.900439,ANDE,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,43.019939,-78.135658,-17.166581,0.000837,-19.292143,96.405929,2.662002,0.036011,...,0.0,0.048242,0.0,208.039013,5183.887393,BATA,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,41.528751,-73.945267,-15.657553,0.000741,-20.583329,74.069574,1.729408,-1.199113,...,0.0,0.048242,0.0,160.196030,5218.062449,BEAC,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.223221,-75.668518,-16.664159,0.000792,-20.249795,84.752106,2.293479,-3.262487,...,0.0,0.048242,0.0,186.261871,5193.905687,BELD,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10163,2018-01-31 00:00:00,2018-01-30 18:00:00,43.500000,-74.500000,-11.019012,0.001153,-16.191132,69.561378,2.675266,-4.410044,...,0.0,0.008301,97.0,243.160934,5273.935059,PISE,58399.0,93171.0,6.0,1.0
10164,2018-01-31 06:00:00,2018-01-31 00:00:00,43.500000,-74.500000,-13.291962,0.000986,-17.969986,73.891678,2.534788,-2.108374,...,0.0,-0.315125,0.0,195.307037,5299.503906,PISE,58400.0,93950.0,6.0,1.0
10165,2018-01-31 12:00:00,2018-01-31 06:00:00,43.500000,-74.500000,-14.316986,0.000795,-20.473679,65.777115,2.585244,-0.964770,...,0.0,0.161377,0.0,159.000000,5412.228027,PISE,58401.0,94729.0,6.0,1.0
10166,2018-01-31 18:00:00,2018-01-31 12:00:00,43.500000,-74.500000,-7.459534,0.001108,-16.631042,49.330124,1.037886,2.453921,...,0.0,-0.151855,300.0,186.000000,5440.027344,PISE,58402.0,95508.0,6.0,1.0


In [809]:
df1 = df_model_nysm_sites[df_model_nysm_sites.isna().any(axis=1)]

In [810]:
df1

Unnamed: 0,valid_time,time,latitude,longitude,t2m,sh2,d2m,r2,u10,v10,...,cape,cin,dswrf,dlwrf,gh,station,level_0,index,lead time,landn


In [812]:
df_model_nysm_sites

Unnamed: 0,valid_time,time,latitude,longitude,t2m,sh2,d2m,r2,u10,v10,...,cape,cin,dswrf,dlwrf,gh,station,level_0,index,lead time,landn
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.040359,-77.237259,-17.423811,0.000752,-20.837325,86.490169,2.821225,-2.485386,...,0.0,0.048242,0.0,152.145966,5202.046425,ADDI,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.182270,-74.801392,-20.474970,0.000589,-23.826337,91.103712,1.679245,-2.638395,...,0.0,0.048242,0.0,162.006020,5193.900439,ANDE,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,43.019939,-78.135658,-17.166581,0.000837,-19.292143,96.405929,2.662002,0.036011,...,0.0,0.048242,0.0,208.039013,5183.887393,BATA,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,41.528751,-73.945267,-15.657553,0.000741,-20.583329,74.069574,1.729408,-1.199113,...,0.0,0.048242,0.0,160.196030,5218.062449,BEAC,0.0,0.0,0.0,0.0
2018-01-01 06:00:00,2018-01-01 06:00:00,2018-01-01 00:00:00,42.223221,-75.668518,-16.664159,0.000792,-20.249795,84.752106,2.293479,-3.262487,...,0.0,0.048242,0.0,186.261871,5193.905687,BELD,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10163,2018-01-31 00:00:00,2018-01-30 18:00:00,43.500000,-74.500000,-11.019012,0.001153,-16.191132,69.561378,2.675266,-4.410044,...,0.0,0.008301,97.0,243.160934,5273.935059,PISE,58399.0,93171.0,6.0,1.0
10164,2018-01-31 06:00:00,2018-01-31 00:00:00,43.500000,-74.500000,-13.291962,0.000986,-17.969986,73.891678,2.534788,-2.108374,...,0.0,-0.315125,0.0,195.307037,5299.503906,PISE,58400.0,93950.0,6.0,1.0
10165,2018-01-31 12:00:00,2018-01-31 06:00:00,43.500000,-74.500000,-14.316986,0.000795,-20.473679,65.777115,2.585244,-0.964770,...,0.0,0.161377,0.0,159.000000,5412.228027,PISE,58401.0,94729.0,6.0,1.0
10166,2018-01-31 18:00:00,2018-01-31 12:00:00,43.500000,-74.500000,-7.459534,0.001108,-16.631042,49.330124,1.037886,2.453921,...,0.0,-0.151855,300.0,186.000000,5440.027344,PISE,58402.0,95508.0,6.0,1.0


In [None]:
import datetime as datetime
# Convert Unix time to datetime
unix_time = 1514786400000000000.0
seconds = unix_time / 1e9  # Convert nanoseconds to seconds
dt = datetime.datetime.utcfromtimestamp(seconds)

print(dt)

2018-01-01 06:00:00


In [813]:
df2 = pd.read_parquet('/home/aevans/nwp_bias/src/machine_learning/data/gfs_data/fh003/GFS_2018_01_direct_compare_to_nysm_sites_mask_water.parquet')
df2

Unnamed: 0,valid_time,time,latitude,longitude,t2m,sh2,d2m,r2,u10,v10,...,cape,cin,dswrf,dlwrf,gh,station,level_0,index,lead time,landn
2018-01-01 03:00:00.000000,2018-01-01 03:00:00,2018-01-01 00:00:00,42.040359,-77.237259,-17.114106,0.000741,-20.952301,83.162865,2.759705,-2.812990,...,0.0,-0.046729,0.0,152.963806,5215.356623,ADDI,0.0,0.0,0.0,0.0
2018-01-01 03:00:00.000000,2018-01-01 03:00:00,2018-01-01 00:00:00,42.182270,-74.801392,-19.109242,0.000660,-22.514039,88.838295,2.464176,-3.848759,...,0.0,-0.046729,0.0,165.653893,5209.961267,ANDE,0.0,0.0,0.0,0.0
2018-01-01 03:00:00.000000,2018-01-01 03:00:00,2018-01-01 00:00:00,43.019939,-78.135658,-16.927134,0.000862,-18.896244,97.445172,2.837316,0.408013,...,0.0,-0.046729,0.0,207.307663,5192.911397,BATA,0.0,0.0,0.0,0.0
2018-01-01 03:00:00.000000,2018-01-01 03:00:00,2018-01-01 00:00:00,41.528751,-73.945267,-15.344768,0.000751,-20.388625,73.349531,2.037120,-0.844642,...,0.0,-0.046729,0.0,160.752762,5235.698266,BEAC,0.0,0.0,0.0,0.0
2018-01-01 03:00:00.000000,2018-01-01 03:00:00,2018-01-01 00:00:00,42.223221,-75.668518,-16.330978,0.000806,-19.961235,83.403336,3.139344,-3.517746,...,0.0,-0.046729,0.0,182.921539,5210.042822,BELD,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.010163,2018-01-30 21:00:00,2018-01-30 18:00:00,43.500000,-74.500000,-7.653076,0.001576,-12.426880,70.871735,2.575454,-3.591877,...,1.0,-0.049512,180.0,265.800507,5278.076172,PISE,58399.0,93171.0,3.0,1.0
1970-01-01 00:00:00.010164,2018-01-31 03:00:00,2018-01-31 00:00:00,43.500000,-74.500000,-12.238831,0.001094,-16.745422,74.113548,3.562432,-3.312236,...,0.0,0.011597,0.0,206.000000,5264.423340,PISE,58400.0,93950.0,3.0,1.0
1970-01-01 00:00:00.010165,2018-01-31 09:00:00,2018-01-31 06:00:00,43.500000,-74.500000,-14.172058,0.000802,-20.362549,65.400002,3.111321,-1.632822,...,0.0,0.338501,0.0,158.000000,5367.766113,PISE,58401.0,94729.0,3.0,1.0
1970-01-01 00:00:00.010166,2018-01-31 15:00:00,2018-01-31 12:00:00,43.500000,-74.500000,-11.312134,0.000930,-18.708679,58.095161,1.759897,0.606235,...,0.0,0.375000,170.0,169.000000,5433.176758,PISE,58402.0,95508.0,3.0,1.0
