#### This notebook takes all NYSM data for each requested year and resamples it to the model output intervals (1H and 3H), writing one CSV per year and interval.

In [1]:
%matplotlib inline
import pandas as pd
import xarray as xr
import glob
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units
import datetime 
import functools as ft

Matplotlib created a temporary config/cache directory at /tmp/tmp.uUXKAIYoDF/matplotlib-zgsdhudg because the default path (/home/aevans/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def get_raw_oksm_data(year):
    '''
    Read every station CSV, keep the rows for one year, and add derived columns.

    year: calendar year to keep [int]; rows are kept when their 'TIME' string
          compares greater than str(year) and less than str(year + 1)

    Returns a tuple (df, sites):
      df:    dataframe indexed by ('STID', 'TIME') with the added columns
             'elev', 'lon', 'lat', 'TDEW' and 'mslp'; the first column of the
             concatenated CSV data is dropped
      sites: array of the unique 'STID' values

    Raises ValueError when no input files are found.

    Note: any row containing a missing value in any column is discarded by the
    dropna() call, not only the rows that fall outside the requested year.
    '''
    oksm_path = '/home/aevans/landtype/NY_cartopy/csv_city/'
    file_dirs = sorted(glob.glob(f'{oksm_path}/*'))
    if not file_dirs:
        # fail early with a clear message instead of pd.concat's
        # "No objects to concatenate"
        raise ValueError(f'no input files found under {oksm_path}')

    print(f'importing files...')
    year_start = str(year)
    year_end = str(year + 1)
    df_oksm_list = []
    for file_path in file_dirs:
        ds_oksm = pd.read_csv(file_path)
        # rows outside the year become all-NaN here and are removed by the
        # dropna() after the concat
        in_year = (ds_oksm['TIME'] > year_start) & (ds_oksm['TIME'] < year_end)
        df_oksm_list.append(ds_oksm.where(in_year))

    df_oksm = pd.concat(df_oksm_list).dropna()

    # import elevations to dataframe
    df_lon = pd.read_csv('/home/aevans/landtype/geoinfo.csv')
    station_list = df_lon['stid'].tolist()
    elevdict = dict(zip(station_list, df_lon['elev'].tolist()))
    londict = dict(zip(station_list, df_lon['elon'].tolist()))
    latdict = dict(zip(station_list, df_lon['nlat'].tolist()))
    df_oksm['elev'] = df_oksm['STID'].map(elevdict)
    df_oksm['lon'] = df_oksm['STID'].map(londict)
    df_oksm['lat'] = df_oksm['STID'].map(latdict)

    # format variables
    temp = units.Quantity(df_oksm['TAIR'].values, 'degC')
    # RELH is divided by 100 before being used as the relative humidity
    relh = df_oksm['RELH'].values/100.
    df_oksm['TDEW'] = mpcalc.dewpoint_from_relative_humidity(temp, relh).magnitude
    altimeter_value = units.Quantity(df_oksm['PRES'].values, 'hPa')
    # + 1.5 to adjust for barometer height
    height = units.Quantity(df_oksm['elev'].values + 1.5, 'm')
    # store plain numbers (as done for TDEW) rather than a unit-carrying Quantity
    df_oksm['mslp'] = mpcalc.altimeter_to_sea_level_pressure(altimeter_value, height, temp).magnitude
    df_oksm['TIME'] = pd.to_datetime(df_oksm['TIME'], format='%Y-%m-%d %H:%M:%S')
    df_oksm_ = df_oksm.reset_index(drop=True).set_index(['STID','TIME']).drop(df_oksm.columns[0], axis =1)

    oksm_sites = df_oksm['STID'].unique()

    return df_oksm_, oksm_sites

In [3]:
def get_valid_time_data(df, hours_list, interval):
    '''
    Keep only the observations that fall exactly on the requested hours.

    df: data indexed by 'STID' and 'TIME' [pandas series or dataframe];
        'TIME' must be datetime-like
    hours_list: hours of the day (0-23) to keep
    interval: frequency label used to name the time index level [str]

    Returns the rows whose 'TIME' has minute == 0 and an hour in hours_list,
    indexed by ('STID', f'time_{interval}').

    Note: set_index/rename_axis return new objects; the result must be
    returned, otherwise the caller receives an integer-indexed frame.
    Assumes ('STID', 'TIME') pairs are unique when the result is later aligned
    with other frames - TODO confirm against the input data.
    '''
    df = df.reset_index()
    on_valid_hour = df['TIME'].dt.hour.isin(hours_list) & (df['TIME'].dt.minute == 0)
    df_return = df[on_valid_hour]
    return df_return.set_index(['STID', 'TIME']).rename_axis(index={'TIME': f'time_{interval}'})

In [4]:
def get_resampled_precip_data(df, interval, method):
    '''
    Turn per-station 'RAIN' values into increments and aggregate them per interval.

    df: 'RAIN' data indexed by 'STID' and 'TIME' [pandas series or dataframe]
    interval: resampling frequency passed to resample() [str]
    method: aggregation applied to each resampled bin, e.g. 'sum' [str]

    Returns the aggregated data grouped by 'STID', with the time index level
    renamed to f'time_{interval}' and each bin labelled by its right edge.
    '''
    # difference between consecutive values within each station
    increments = df.groupby('STID').diff()
    by_time = increments.reset_index().set_index('TIME')

    # remove unrealistic precipitation values (e.g., > 500 mm / 5 min)
    too_large = by_time['RAIN'] > 500.
    by_time.loc[too_large, 'RAIN'] = np.nan

    per_station = by_time.groupby('STID').resample(interval, label='right')
    aggregated = per_station.apply(method)
    return aggregated.rename_axis(index={'TIME': f'time_{interval}'})

In [5]:
def get_oksm_dataframe_for_resampled(df_oksm, freq):
    '''
    Build one dataframe holding every variable at the requested frequency.

    df_oksm: dataframe containing the columns listed in oksm_vars
    freq: output frequency, '1H' or '3H' [str]

    'RAIN' goes through get_resampled_precip_data with 'sum'; every other
    variable goes through get_valid_time_data for the hours matching freq.
    The pieces are concatenated column-wise, negative 'RAIN' values are
    replaced with NaN, and repeated column labels are dropped (first kept).

    Raises ValueError when freq is not a supported frequency.
    '''
    oksm_vars = ['PRES', 'TAIR', 'TMIN', 'TMAX', 'TDEW',
       'RELH', 'WDIR', 'WSPD', 'WMAX', 'RAIN', 'SRAD', 'elev', 'lon', 'lat']
    hours_by_freq = {
        '1H': np.arange(0,24), # every hour
        '3H': np.arange(0,24,3), # every 3 hours
    }
    if freq not in hours_by_freq:
        # previously an unknown freq surfaced later as an unbound hours_list
        raise ValueError(f'unsupported freq {freq!r}; expected one of {sorted(hours_by_freq)}')
    hours_list = hours_by_freq[freq]

    dfs = []
    for var in oksm_vars:
        print(var)
        if var == 'RAIN':
            dfs.append(get_resampled_precip_data(df_oksm[var], freq, 'sum'))
        else:
            dfs.append(get_valid_time_data(df_oksm[var], hours_list, freq))

    oksm_obs = pd.concat(dfs, axis=1)
    # negative totals are not valid precipitation; NaN entries stay NaN
    oksm_obs['RAIN'] = oksm_obs['RAIN'].mask(oksm_obs['RAIN'] < 0.)
    oksm_obs_df = oksm_obs.loc[:, ~oksm_obs.columns.duplicated()]
    return oksm_obs_df

In [6]:
def main(year):
    '''
    Load one year of raw data, resample it to 1H and 3H, and write both CSVs.

    year: year passed to get_raw_oksm_data and used in the output file names

    Files are written to save_path as oksm_1H_obs_<year>.csv and
    oksm_3H_obs_<year>.csv.
    '''
    # output directory
    save_path = '/home/aevans/nwp_bias/data/oksm/'

    # load the raw observations (the returned site list is not needed here)
    print('--- get_raw_oksm_data ---')
    df_oksm, _ = get_raw_oksm_data(year)

    # build both resampled frames before anything is written to disk
    print('--- get_oksm_dataframe_for_resampled ---')
    resampled = {
        freq: get_oksm_dataframe_for_resampled(df_oksm, freq)
        for freq in ('1H', '3H')
    }

    for freq, obs in resampled.items():
        obs.to_csv(f'{save_path}oksm_{freq}_obs_{year}.csv')

In [7]:
# years to process: 2018 through 2021 inclusive, as plain Python ints
years = list(range(2018, 2022))
print(years)

[2018, 2019, 2020, 2021]


In [8]:
# Run the full pipeline once per entry in `years`; main() writes that year's
# 1H and 3H CSV files. The year is printed first as a progress marker.
for year in years:
    print(year)
    main(year)

2018
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
2019
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
2020
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
2021
--- get_raw_oksm_data ---
importing files...


  val = np.log(vapor_pressure / mpconsts.nounit.sat_pressure_0c)
  magnitude = new_self._magnitude**exponent


--- get_oksm_dataframe_for_resampled ---
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
PRES
TAIR
TMIN
TMAX
TDEW
RELH
WDIR
WSPD
WMAX
RAIN
SRAD
elev
lon
lat
