In [1]:
# peak hour and peak period volumes for typical weekday

In [2]:
import sys, os, gzip, shutil
import datetime as dt
import pandas as pd
#import geopandas as gpd
import numpy as np
import holidays
#from shapely.geometry import Point, LineString
#import matplotlib.pyplot as plt
#%matplotlib inline

In [3]:
# --- Paths -------------------------------------------------------------------
# INDIR holds stable_locations.csv and the yearly pems_station_hour_<year>.h5
# files read below; OUTDIR receives the summary CSV.
INDIR = r'Q:\Model Projects\101_280\data\pems'
OUTDIR = r'Q:\Model Projects\101_280\data\pems\with_holidays'

# --- Data source -------------------------------------------------------------
# `data_type` used to be assigned twice in this cell ('station_hour' first,
# then overwritten); only the value that survived is kept.
# NOTE(review): `data_type`, `district` and `ca_holidays` are not read by any
# of the visible cells below -- confirm whether they are still needed.
data_type = 'processed_station_hour'
district = 4
ca_holidays = holidays.UnitedStates(state='CA')

# --- Run options -------------------------------------------------------------
start_year, end_year = 2005, 2022  # inclusive range of yearly files to load
threshold = 25                     # records with obs_pct below this are dropped
typical_weekday = True             # restrict to months 3-5 / 9-11 and day_of_week 1-3
include_holidays = True            # see the holiday handling in the yearly loop
sf_only = False                    # keep only locations with county code 75
continuous_only = False            # NOTE(review): not read by the visible cells
is_update = False                  # merge results into a previously written CSV

In [4]:
def get_dir(base, year=2020, data_type='station_hour', district=4):
    """Build the path of one year of PeMS data below ``base``.

    Parameters
    ----------
    base : str
        Root folder the returned path is joined onto.
    year : int, default 2020
        Year inserted into the folder / file name.
    data_type : str, default 'station_hour'
        'station_hour', 'station_5min' and 'station_meta' yield the folder
        ``D<district>_Data_<year>`` followed by a backslash and the data
        type.  'processed_station_hour' yields the file
        ``pems/pems_station_hour_<year>.h5``.
    district : int, default 4
        District number; only used for the three raw data types.

    Returns
    -------
    str or None
        The joined path, or None when ``data_type`` is not one of the
        values listed above.
    """
    if data_type in ('station_hour', 'station_5min', 'station_meta'):
        # Raw string: the previous non-raw literal relied on '\{' being an
        # unrecognised escape sequence, which newer Python versions warn
        # about.  The resulting text (a literal backslash) is unchanged.
        sub_dir = r'D{}_Data_{}\{}'.format(district, year, data_type)
        return os.path.join(base, sub_dir)
    if data_type == 'processed_station_hour':
        return os.path.join(base, 'pems', 'pems_station_hour_{}.h5'.format(year))
    # Unknown data types keep yielding None, as before.
    return None
    
def get_columns(data_type, num_cols):
    """Return the column names for a PeMS file of the given type.

    Parameters
    ----------
    data_type : str
        'station_meta', 'station_hour' or 'station_5min'.
    num_cols : int
        Total number of columns in the file.  For the two station data
        types the columns beyond the fixed leading ones are named per lane
        (``lane_<i>_<field>``); any remainder that does not fill a complete
        lane group is left unnamed.  Ignored for 'station_meta'.

    Returns
    -------
    list of str
        A newly built list of column names.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values (this used to
        surface as an UnboundLocalError on the return statement).
    """
    if data_type == 'station_meta':
        return ['station', 'route', 'dir', 'district', 'county', 'city',
                'state_postmile', 'abs_postmile', 'latitude', 'longitude',
                'length', 'type', 'lanes', 'name',
                'user_id_1', 'user_id_2', 'user_id_3', 'user_id_4']

    if data_type == 'station_hour':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed',
                   'delay_35', 'delay_40', 'delay_45', 'delay_50', 'delay_55', 'delay_60']
        lane_fields = ('flow', 'avg_occ', 'avg_speed')
    elif data_type == 'station_5min':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed']
        lane_fields = ('samples', 'flow', 'avg_occ', 'avg_speed', 'avg_obs')
    else:
        raise ValueError('unsupported data_type: {!r}'.format(data_type))

    # Columns after the fixed ones (18 for station_hour, 12 for station_5min)
    # come in groups of len(lane_fields) per lane; int() truncates toward
    # zero exactly like the original expression.
    n_lanes = int((num_cols - len(columns)) / len(lane_fields))
    for i in range(n_lanes):
        columns += [f'lane_{i}_{field}' for field in lane_fields]
    return columns

In [5]:
# Load the station locations; start_date / end_date are parsed as datetimes.
# `infer_datetime_format=True` was dropped: it is deprecated in pandas 2.x and
# was only a parsing-speed hint, so the parsed values are unaffected.
locations = pd.read_csv(
    os.path.join(INDIR, 'stable_locations.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [6]:
# Reduce both period bounds from timestamps to plain calendar dates.
# NOTE: re-running this cell without reloading `locations` fails, because the
# values are no longer timestamps after the first pass.
for date_col in ('start_date', 'end_date'):
    locations[date_col] = locations[date_col].map(lambda stamp: stamp.date())

In [7]:
# Collects one frame per year holding station / era / hour / total_flow.
dfs = []

if sf_only:
    # Keep only locations whose county code is 75.
    locations = locations.loc[locations['county'].eq(75)]

for year in range(start_year, end_year + 1):
    f = os.path.join(INDIR, 'pems_station_hour_{}.h5'.format(year))
    # Progress log.  The previous `f.split()[1]` split the path on whitespace
    # and only worked because the folder name happens to contain one space;
    # it raised IndexError for any path without a space.
    print(os.path.basename(f))
    df = pd.read_hdf(f)

    # NOTE(review): groupby_cols is extended below but is not used by any of
    # the visible cells (the aggregation groups on station/hour/era directly).
    groupby_cols = ['station', 'era', 'hour']

    if typical_weekday and include_holidays:
        # Typical weekday: months 3-5 and 9-11, day_of_week 1-3, no holidays.
        # NOTE(review): day_of_week 1-3 is presumably Tue-Thu -- confirm the
        # encoding used when the .h5 files were written.
        df = df.loc[df['month'].isin([3, 4, 5, 9, 10, 11])
                    & df['day_of_week'].isin([1, 2, 3])
                    & ~df['is_holiday']]
    else:
        groupby_cols = groupby_cols[:-1] + ['month', 'day_of_week'] + groupby_cols[-1:]

    if include_holidays and not typical_weekday:
        groupby_cols = groupby_cols[:-1] + ['is_holiday'] + groupby_cols[-1:]
    else:
        # NOTE(review): with typical_weekday=True holidays are dropped even
        # when include_holidays=True -- confirm this is intended.
        df = df.loc[~df['is_holiday']]

    # Drop records whose observed percentage is below the threshold.
    df = df.loc[df['obs_pct'].ge(threshold)]
    before = len(df)
    # Attach the location periods of each station, then keep only the records
    # dated inside one of that station's periods.
    df = pd.merge(locations, df, on='station', suffixes=['', '_obs'])
    df = df.loc[df['date'].between(df['start_date'], df['end_date'])]
    after = len(df)
    print('removing stations with unknown location before: {}, after: {}'.format(before, after))

    # Years before 2020 share one era; 2020 onward get one era per year.
    era = 'covid-{}'.format(year) if year >= 2020 else 'pre-covid'

    df.insert(0, 'era', era)
    m = df[['station', 'era', 'hour', 'total_flow']]
    dfs.append(m)

Projects\101_280\data\pems\pems_station_hour_2005.h5
removing stations with unknown location before: 1379145, after: 871514
Projects\101_280\data\pems\pems_station_hour_2006.h5
removing stations with unknown location before: 1390234, after: 861155
Projects\101_280\data\pems\pems_station_hour_2007.h5
removing stations with unknown location before: 1554620, after: 930782
Projects\101_280\data\pems\pems_station_hour_2008.h5
removing stations with unknown location before: 1703117, after: 992256
Projects\101_280\data\pems\pems_station_hour_2009.h5
removing stations with unknown location before: 1681720, after: 977793
Projects\101_280\data\pems\pems_station_hour_2010.h5
removing stations with unknown location before: 2121676, after: 1383790
Projects\101_280\data\pems\pems_station_hour_2011.h5
removing stations with unknown location before: 2067558, after: 1429346
Projects\101_280\data\pems\pems_station_hour_2012.h5
removing stations with unknown location before: 1786928, after: 1240011
Proje

In [8]:
# Stack the per-year frames collected in `dfs` into one long table.
df = pd.concat(dfs)

In [9]:
# Number of rows per era, displayed as a quick check of the combined table.
df.groupby('era').size()

era
covid-2020     3091534
covid-2021     2248968
pre-covid     27094348
dtype: int64

In [10]:
# Recode zero flows as missing so that the mean / std / count aggregation
# computed next leaves them out (those aggregations skip NaN).
df['total_flow'] = df['total_flow'].replace(0, np.nan)

In [11]:
# Mean, standard deviation and number of non-missing flow observations for
# every station / hour / era combination.
group_keys = ['station', 'hour', 'era']
m = df.groupby(group_keys)['total_flow'].agg(['mean', 'std', 'count'])
m.columns = ['mean', 'std', 'n']

In [12]:
if is_update:
    # Fold the fresh statistics into the previously written summary, keeping
    # a copy of the earlier file next to it.  `shutil` comes from the import
    # cell at the top of the notebook.
    current_csv = os.path.join(OUTDIR, 'station_flow_mean_std_{}_{}.csv'.format(start_year, end_year))
    backup_csv = os.path.join(OUTDIR, 'station_flow_mean_std_{}_{}_old.csv'.format(start_year, end_year))

    old = pd.read_csv(current_csv)
    shutil.copy(current_csv, backup_csv)

    old = old.set_index(['station', 'hour', 'era'])
    # NOTE(review): DataFrame.update appears to overwrite only rows already
    # present in `old`; station/hour/era combinations that exist solely in `m`
    # would not be added -- confirm this is the intended behaviour.
    old.update(m)
    m = old.copy()

In [13]:
# Turn the station / hour / era index levels back into ordinary columns.
m = m.reset_index()

In [14]:
# Write the summary table to OUTDIR; index=False keeps the row index out of the file.
summary_csv = os.path.join(OUTDIR, 'station_flow_mean_std_{}_{}.csv'.format(start_year, end_year))
m.to_csv(summary_csv, index=False)