In [1]:
# peak hour and peak period volumes for typical weekday

In [2]:
import sys, os, gzip, shutil
import datetime as dt
import pandas as pd
import geopandas as gpd
import numpy as np
import holidays
from shapely.geometry import Point, LineString
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Holiday calendar for California, built once for the whole notebook.
ca_holidays = holidays.UnitedStates(state='CA')

# Default selection values. NOTE(review): `data_type` is reassigned in the
# aggregation cell further down and `district` is not referenced in the cells
# visible here -- confirm whether they are still needed.
district = 4
data_type = 'station_meta'

# Input and output roots. The names suggest PEMSDIR holds the observed PeMS
# data and OUTDIR the project's working data -- verify against the share layout.
OUTDIR = r'Q:\Model Projects\101_280\data'
PEMSDIR = r'Q:\Data\Observed\Streets\PeMS'

In [4]:
def get_dir(base, year=2020, data_type='station_hour', district=4):
    """Build the path for one kind of PeMS data under ``base``.

    Parameters
    ----------
    base : str
        Root directory the returned path is joined onto.
    year : int
        Year substituted into the directory or file name.
    data_type : str
        'station_hour', 'station_5min' or 'station_meta' select a
        per-district, per-year directory; 'processed_station_hour' selects
        the per-year HDF5 file under ``pems``.
    district : int
        District number substituted into the directory name. Ignored for
        'processed_station_hour'.

    Returns
    -------
    str
        ``<base>/D<district>_Data_<year>/<data_type>`` for the three
        directory types, ``<base>/pems/pems_station_hour_<year>.h5`` for the
        processed type.

    Raises
    ------
    ValueError
        If ``data_type`` is none of the recognised values.
    """
    if data_type in ('station_hour', 'station_5min', 'station_meta'):
        # Hand the components to os.path.join separately instead of embedding
        # a literal backslash: '\{' is an invalid escape sequence in a normal
        # string literal and the result was only a valid path on Windows.
        return os.path.join(base, 'D{}_Data_{}'.format(district, year), data_type)
    if data_type == 'processed_station_hour':
        return os.path.join(base, 'pems', 'pems_station_hour_{}.h5'.format(year))
    # Fail loudly rather than falling off the end and returning None.
    raise ValueError('unrecognized data_type: {!r}'.format(data_type))
    
def get_columns(data_type, num_cols):
    """Return the column names for a PeMS file of the given type.

    Parameters
    ----------
    data_type : str
        'station_meta', 'station_hour' or 'station_5min'.
    num_cols : int
        Total number of columns in the file. Used to work out how many
        per-lane column groups follow the fixed columns; not used for
        'station_meta', whose layout is fixed.

    Returns
    -------
    list of str
        The fixed columns for the type, followed (for the hourly and 5-minute
        types) by one group of ``lane_<i>_<field>`` names per lane, with ``i``
        starting at 0.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the recognised values.
    """
    if data_type == 'station_meta':
        return ['station','route','dir','district','county','city','state_postmile','abs_postmile','latitude','longitude',
                'length','type','lanes','name','user_id_1','user_id_2','user_id_3','user_id_4']

    if data_type == 'station_hour':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed',
                   'delay_35','delay_40','delay_45','delay_50','delay_55','delay_60']
        lane_fields = ('flow', 'avg_occ', 'avg_speed')
    elif data_type == 'station_5min':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed']
        lane_fields = ('samples', 'flow', 'avg_occ', 'avg_speed', 'avg_obs')
    else:
        # Previously an unknown type reached `return columns` with the name
        # unbound and surfaced as an UnboundLocalError.
        raise ValueError('unrecognized data_type: {!r}'.format(data_type))

    # Whatever remains after the fixed columns is split into whole per-lane
    # groups; a non-positive count simply yields no lane columns.
    n_lanes = (num_cols - len(columns)) // len(lane_fields)
    for i in range(n_lanes):
        columns += [f'lane_{i}_{field}' for field in lane_fields]
    return columns

In [5]:
# Station location table; start_date / end_date are parsed as datetimes and
# are used further down to keep only observations dated inside that range.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where it only produces a warning.
locations = pd.read_csv(
    os.path.join(OUTDIR, 'stable_locations.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [6]:
# Reduce the parsed timestamps to plain datetime.date values.
# NOTE(review): presumably this matches the dtype of the observations' `date`
# column compared against these bounds later -- confirm.
# Going through pd.to_datetime keeps the cell re-runnable: on a second run the
# values are already date objects, which have no .date() method.
for date_col in ('start_date', 'end_date'):
    locations[date_col] = pd.to_datetime(locations[date_col]).dt.date

In [7]:
# Aggregate hourly station flow year by year. Each year contributes one
# grouped frame to each of three lists, which differ only in the minimum
# obs_pct an observation must have to be included (0, 20 or 50).
hour_00, hour_20, hour_50 = [], [], []
data_type = 'processed_station_hour'

typical_weekday = True
sf_only = True
continuous_only = False  # not referenced anywhere in this cell

# The grouping keys and aggregations do not depend on the year, so they are
# built once. When not restricting to typical weekdays, month / day_of_week /
# is_holiday become grouping keys, inserted just before 'hour'.
groupby_cols = ['route','dir','type','station','county','city','start_name','end_name','is_complete','year','hour']
if not typical_weekday:
    groupby_cols = groupby_cols[:-1] + ['month','day_of_week','is_holiday'] + groupby_cols[-1:]
agg_args = {'total_flow':['mean','count','std']}

if sf_only:
    # County code 75 -- presumably San Francisco, given the flag name; confirm.
    locations = locations.loc[locations['county'].eq(75)]

for year in np.arange(2005,2022):
    # Build the path from OUTDIR / data_type instead of repeating the literal
    # path, and report it by file name. The earlier f.split()[1] only worked
    # because the path happened to contain exactly one space.
    f = get_dir(OUTDIR, year, data_type)
    print(os.path.basename(f))
    df = pd.read_hdf(f)

    if typical_weekday:
        # Months 3-5 and 9-11, day_of_week 1-3, holidays excluded.
        # NOTE(review): day_of_week 1-3 is Tue-Thu only if Monday == 0 -- confirm.
        df = df.loc[df['month'].isin([3,4,5,9,10,11]) & df['day_of_week'].isin([1,2,3]) & ~df['is_holiday']]

    before = len(df)
    # Inner join drops stations absent from `locations`; the date filter then
    # keeps only observations inside each location's start/end range.
    df = pd.merge(locations, df, on='station', suffixes=['','_obs'])
    df = df.loc[df['date'].between(df['start_date'], df['end_date'])]
    after = len(df)
    print('removing stations with unknown location before: {}, after: {}'.format(before, after))

    # Sanity check: route/dir from the location table vs. the observations.
    print('mismatched route/dir {}'.format(len(df.loc[df['route'].ne(df['route_obs']) | df['dir'].ne(df['dir_obs'])])))
    df00 = df.loc[df['obs_pct'].ge(0)].groupby(groupby_cols).agg(agg_args)
    df20 = df.loc[df['obs_pct'].ge(20)].groupby(groupby_cols).agg(agg_args)
    df50 = df.loc[df['obs_pct'].ge(50)].groupby(groupby_cols).agg(agg_args)
    hour_00.append(df00)
    hour_20.append(df20)
    hour_50.append(df50)

Projects\101_280\data\pems\pems_station_hour_2005.h5
removing stations with unknown location before: 2359008, after: 96048
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2006.h5
removing stations with unknown location before: 2095872, after: 70344
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2007.h5
removing stations with unknown location before: 2273688, after: 76632
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2008.h5
removing stations with unknown location before: 2250168, after: 74688
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2009.h5
removing stations with unknown location before: 2249904, after: 72000
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2010.h5
removing stations with unknown location before: 3283080, after: 84720
mismatched route/dir 0
Projects\101_280\data\pems\pems_station_hour_2011.h5
removing stations with unknown location before: 3769584, after: 96048
m

In [8]:
def _stack_years(yearly_frames):
    """Concatenate the per-year aggregates into one flat frame.

    The three aggregate columns are relabelled 'flow', 'obs' and 'stdev', and
    the group keys are moved out of the index into ordinary columns.
    """
    stacked = pd.concat(yearly_frames)
    stacked.columns = ['flow','obs','stdev']
    return stacked.reset_index()


# Each name is rebound from a list of yearly frames to the combined frame.
hour_00 = _stack_years(hour_00)
hour_20 = _stack_years(hour_20)
hour_50 = _stack_years(hour_50)

In [9]:
# The output file name encodes the region, the day selection and the minimum
# obs_pct threshold (zero-padded to two digits).
if sf_only:
    loc_part = 'sf'
else:
    loc_part = 'd4'
if typical_weekday:
    days_part = 'typwkdy'
else:
    days_part = 'month'
fname = '{}_station_{}_hour_obs{:02d}.csv'

# One CSV per threshold, written under OUTDIR/pems.
for min_obs, frame in ((0, hour_00), (20, hour_20), (50, hour_50)):
    out_path = os.path.join(OUTDIR,'pems',fname.format(loc_part, days_part, min_obs))
    frame.to_csv(out_path)