#### This notebook will take all NYSM data for a specified year & resample to model output times (e.g., 1H & 3H)


In [1]:
%matplotlib inline
import pandas as pd
import xarray as xr
import glob
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units

Matplotlib created a temporary config/cache directory at /tmp/tmp.lRvxTTut4W/matplotlib-hm5966_t because the default path (/home/aevans/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def get_raw_nysm_data(year):
    #first, find the available months in the year directory
    nysm_path = f'/home/aevans/nysm/archive/nysm/netcdf/proc/{year}/'
    file_dirs = glob.glob(f'{nysm_path}/*')
    file_dirs.sort()
    avail_months = [int(x.split('/')[-1]) for x in file_dirs]
    
    df_nysm_list = []
    for x in range(avail_months[0], avail_months[-1]+1):
        print('month index: ',x)
        ds_nysm_month = xr.open_mfdataset(f'{nysm_path}{str(x).zfill(2)}/*.nc')
        df_nysm_list.append(ds_nysm_month.to_dataframe())

    df_nysm = pd.concat(df_nysm_list)
    
    temp = units.Quantity(df_nysm['tair'].values, 'degC')
    relh = df_nysm['relh'].values/100.
    df_nysm['td'] = mpcalc.dewpoint_from_relative_humidity(temp, relh).magnitude
    
    altimeter_value = units.Quantity(df_nysm['pres'].values, 'hPa')
    height = units.Quantity(df_nysm['elev'].values + 1.5, 'm') # + 1.5 to adjust for barometer height
    df_nysm['mslp'] = mpcalc.altimeter_to_sea_level_pressure(altimeter_value, height, temp)
    print(df_nysm)

    nysm_sites = df_nysm.reset_index()['station'].unique()
    
    return df_nysm, nysm_sites

def get_resampled_data(df, interval, method):
    '''
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled 
    method: min, max, mean, etc. [str]
    '''
    return df.reset_index().set_index('time_5M').groupby('station').resample(interval, label='right').apply(method).rename_axis(index={'time_5M':f'time_{interval}'})

def get_valid_time_data(df, hours_list, interval):
    df = df.reset_index()
    # extract hourly observations at top of the hour in provided list
    df_return = df[(df['time_5M'].dt.hour.isin(hours_list)) & (df['time_5M'].dt.minute==0)]
    return df_return.set_index(['station','time_5M']).rename_axis(index={'time_5M':f'time_{interval}'})
    
def get_resampled_precip_data(df, interval, method):
    '''
    df: main dataframe [pandas dataframe]
    interval: the frequency at which the data should be resampled 
    method: min, max, mean, etc. [str]
    '''
    precip_diff = df.groupby('station').diff().reset_index().set_index('time_5M')
    # remove unrealistic precipitation values (e.g., > 500 mm / 5 min)
    precip_diff.loc[precip_diff['precip_total'] > 500.,'precip_total'] = np.nan
    return precip_diff.groupby('station').resample(interval, label='right').apply(method).rename_axis(index={'time_5M':f'time_{interval}'})
   
def get_nysm_dataframe_for_resampled(df_nysm, freq):
    nysm_vars = ['lat', 'lon', 'elev', 'tair', 'ta9m', 'td', 'relh', 'srad', 'pres', 'mslp',
                'wspd_sonic', 'wmax_sonic', 'wdir_sonic', 'precip_total', 'snow_depth']
    if freq=='1H':
        hours_list = np.arange(0,24) # every hour
    elif freq=='3H':
        hours_list = np.arange(0,24,3) # every 3 hours
    dfs = []
    
    for var in nysm_vars:
        if var in ['precip_total']:
            print(var)
            dfs += [get_resampled_precip_data(df_nysm[var], freq, 'sum')]
        else:
            print(df_nysm)
            dfs += [get_valid_time_data(df_nysm[var], hours_list, freq)]

    nysm_obs = pd.concat(dfs, axis=1)
    nysm_obs['precip_total'] = nysm_obs['precip_total'].apply(lambda x: np.where(x < 0., np.nan, x))
    return nysm_obs

In [3]:
def main(year):
    # inputs
    save_path = f'/home/aevans/nwp_bias/data/nysm/'

    # get the raw nysm data
    print('--- get_raw_nysm_data ---')
    df_nysm, nysm_sites = get_raw_nysm_data(year)
        
    # resample the data to 1H and 3H frequencies
    print('--- get_nysm_dataframe_for_resampled ---')
    nysm_1H_obs = get_nysm_dataframe_for_resampled(df_nysm, '1H')
    nysm_3H_obs = get_nysm_dataframe_for_resampled(df_nysm, '3H')
    
    nysm_1H_obs.to_parquet(f'{save_path}nysm_1H_obs_{year}.parquet')
    nysm_3H_obs.to_parquet(f'{save_path}nysm_3H_obs_{year}.parquet')

In [4]:
years = [str(x) for x in np.arange(2018,2022)]
print(years)

['2018', '2019', '2020', '2021']


In [5]:
for year in years:
    print(year)
    main(year)

2018
--- get_raw_nysm_data ---
month index:  1
month index:  2
month index:  3
month index:  4
month index:  5
month index:  6
month index:  7
month index:  8
month index:  9
month index:  10
month index:  11
month index:  12
                                   lat        lon        elev       tair  \
station time_5M                                                            
ADDI    2018-01-01 00:00:00  42.040359 -77.237259  507.614014 -17.669901   
        2018-01-01 00:05:00  42.040359 -77.237259  507.614014 -17.794640   
        2018-01-01 00:10:00  42.040359 -77.237259  507.614014 -17.645809   
        2018-01-01 00:15:00  42.040359 -77.237259  507.614014 -17.831619   
        2018-01-01 00:20:00  42.040359 -77.237259  507.614014 -17.961750   
...                                ...        ...         ...        ...   
YORK    2018-12-31 23:35:00  42.855042 -77.847763  177.942001   5.168409   
        2018-12-31 23:40:00  42.855042 -77.847763  177.942001   5.103686   
        2018-1

  return precip_diff.groupby('station').resample(interval, label='right').apply(method).rename_axis(index={'time_5M':f'time_{interval}'})


                                   lat        lon        elev       tair  \
station time_5M                                                            
ADDI    2018-01-01 00:00:00  42.040359 -77.237259  507.614014 -17.669901   
        2018-01-01 00:05:00  42.040359 -77.237259  507.614014 -17.794640   
        2018-01-01 00:10:00  42.040359 -77.237259  507.614014 -17.645809   
        2018-01-01 00:15:00  42.040359 -77.237259  507.614014 -17.831619   
        2018-01-01 00:20:00  42.040359 -77.237259  507.614014 -17.961750   
...                                ...        ...         ...        ...   
YORK    2018-12-31 23:35:00  42.855042 -77.847763  177.942001   5.168409   
        2018-12-31 23:40:00  42.855042 -77.847763  177.942001   5.103686   
        2018-12-31 23:45:00  42.855042 -77.847763  177.942001   5.017123   
        2018-12-31 23:50:00  42.855042 -77.847763  177.942001   5.054668   
        2018-12-31 23:55:00  42.855042 -77.847763  177.942001   5.048200   

           

  return precip_diff.groupby('station').resample(interval, label='right').apply(method).rename_axis(index={'time_5M':f'time_{interval}'})


                                   lat        lon        elev       tair  \
station time_5M                                                            
ADDI    2018-01-01 00:00:00  42.040359 -77.237259  507.614014 -17.669901   
        2018-01-01 00:05:00  42.040359 -77.237259  507.614014 -17.794640   
        2018-01-01 00:10:00  42.040359 -77.237259  507.614014 -17.645809   
        2018-01-01 00:15:00  42.040359 -77.237259  507.614014 -17.831619   
        2018-01-01 00:20:00  42.040359 -77.237259  507.614014 -17.961750   
...                                ...        ...         ...        ...   
YORK    2018-12-31 23:35:00  42.855042 -77.847763  177.942001   5.168409   
        2018-12-31 23:40:00  42.855042 -77.847763  177.942001   5.103686   
        2018-12-31 23:45:00  42.855042 -77.847763  177.942001   5.017123   
        2018-12-31 23:50:00  42.855042 -77.847763  177.942001   5.054668   
        2018-12-31 23:55:00  42.855042 -77.847763  177.942001   5.048200   

           

KeyboardInterrupt: 

In [6]:
year = 2018
df_nysm, nysm_sites = get_raw_nysm_data(year)

month index:  1
month index:  2
month index:  3
month index:  4
month index:  5
month index:  6
month index:  7
month index:  8
month index:  9
month index:  10
month index:  11
month index:  12
                                   lat        lon        elev       tair  \
station time_5M                                                            
ADDI    2018-01-01 00:00:00  42.040359 -77.237259  507.614014 -17.669901   
        2018-01-01 00:05:00  42.040359 -77.237259  507.614014 -17.794640   
        2018-01-01 00:10:00  42.040359 -77.237259  507.614014 -17.645809   
        2018-01-01 00:15:00  42.040359 -77.237259  507.614014 -17.831619   
        2018-01-01 00:20:00  42.040359 -77.237259  507.614014 -17.961750   
...                                ...        ...         ...        ...   
YORK    2018-12-31 23:35:00  42.855042 -77.847763  177.942001   5.168409   
        2018-12-31 23:40:00  42.855042 -77.847763  177.942001   5.103686   
        2018-12-31 23:45:00  42.855042 -77.84

In [7]:
df_nysm

Unnamed: 0_level_0,Unnamed: 1_level_0,lat,lon,elev,tair,ta9m,tslo,relh,srad,pres,wspd_sonic,...,ts50,sm05,sm25,sm50,frozen05,frozen25,frozen50,snow_depth,td,mslp
station,time_5M,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ADDI,2018-01-01 00:00:00,42.040359,-77.237259,507.614014,-17.669901,-17.604080,-17.583130,67.137527,0.0,964.405212,1.560830,...,2.80,,0.330,0.383,1.0,0.0,0.0,0.046819,-22.295731,971.268127
ADDI,2018-01-01 00:05:00,42.040359,-77.237259,507.614014,-17.794640,-17.662420,-17.687160,67.611877,0.0,964.434387,1.546818,...,2.80,,0.330,0.383,1.0,0.0,0.0,0.047248,-22.335403,971.330078
ADDI,2018-01-01 00:10:00,42.040359,-77.237259,507.614014,-17.645809,-17.634680,-17.625460,67.834053,0.0,964.440430,1.807150,...,2.80,,0.330,0.383,1.0,0.0,0.0,0.046736,-22.155151,971.297791
ADDI,2018-01-01 00:15:00,42.040359,-77.237259,507.614014,-17.831619,-17.746981,-17.703220,67.691513,0.0,964.463623,1.452414,...,2.80,,0.330,0.383,1.0,0.0,0.0,0.045442,-22.357529,971.369263
ADDI,2018-01-01 00:20:00,42.040359,-77.237259,507.614014,-17.961750,-17.824751,-17.828440,68.333061,0.0,964.480774,1.331697,...,2.80,,0.330,0.383,1.0,0.0,0.0,0.048790,-22.375336,971.420776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YORK,2018-12-31 23:35:00,42.855042,-77.847763,177.942001,5.168409,5.193782,5.248834,93.443542,0.0,986.861084,4.547054,...,3.80,0.323,0.286,0.298,0.0,0.0,0.0,-0.006658,4.197632,987.749329
YORK,2018-12-31 23:40:00,42.855042,-77.847763,177.942001,5.103686,5.104034,5.168116,93.958893,0.0,986.771484,3.812987,...,3.80,0.323,0.287,0.298,0.0,0.0,0.0,-0.007459,4.211853,987.664246
YORK,2018-12-31 23:45:00,42.855042,-77.847763,177.942001,5.017123,5.032194,5.079066,94.446716,0.0,986.671082,4.015590,...,3.80,0.325,0.287,0.296,0.0,0.0,0.0,-0.006172,4.199707,987.570374
YORK,2018-12-31 23:50:00,42.855042,-77.847763,177.942001,5.054668,5.054992,5.085561,94.659950,0.0,986.576782,4.247073,...,3.82,0.327,0.287,0.298,0.0,0.0,0.0,-0.007302,4.269165,987.472351
