In [1]:
# peak hour and peak period volumes for typical weekday

In [2]:
import csv
import datetime as dt
import gzip
import os
import shutil
import sys

import geopandas as gpd
import holidays
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from shapely.geometry import Point, LineString

%matplotlib inline

In [3]:
# Root folder of the raw PeMS downloads; get_dir() builds per-district,
# per-year paths underneath it.
PEMSDIR = r'Q:\Data\Observed\Streets\PeMS'
# Project data folder; the loading cell writes its HDF5 output to the
# 'pems' subfolder of this directory.
OUTDIR = r'Q:\Model Projects\101_280\data'
# Initial dataset selection. The loading cell assigns data_type again
# before it reads any files, so this value is only a default.
data_type = 'station_meta'
# District number, used in the D<district>_Data_<year> folder name and in
# the d<NN>_text_meta_ file-name prefix.
district = 4
# Holiday calendar for California, used to set the is_holiday flag on each
# record.
# NOTE(review): newer releases of `holidays` appear to prefer `subdiv='CA'`
# over `state='CA'` -- confirm against the installed version.
ca_holidays = holidays.UnitedStates(state='CA')

In [4]:
def get_dir(base, year=2020, data_type='station_hour', district=4):
    """Return the directory that holds one year of PeMS files of one type.

    The path is built as ``<base>/D<district>_Data_<year>/<data_type>`` using
    the platform's path separator.

    Parameters
    ----------
    base : str
        Root directory of the PeMS archive.
    year : int
        Data year, used in the ``D<district>_Data_<year>`` folder name.
    data_type : str
        One of ``'station_hour'``, ``'station_5min'`` or ``'station_meta'``.
    district : int
        District number, used in the ``D<district>_Data_<year>`` folder name.

    Returns
    -------
    str
        The directory path.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values. (Returning
        ``None`` here would make ``os.listdir`` silently list the current
        working directory instead.)
    """
    supported = ('station_hour', 'station_5min', 'station_meta')
    if data_type not in supported:
        raise ValueError(
            'unsupported data_type {!r}; expected one of {}'.format(data_type, supported))
    # Join the components separately so no separator is hard-coded in a
    # string literal, and honour the `base` argument rather than a global.
    return os.path.join(base, 'D{}_Data_{}'.format(district, year), data_type)
def get_columns(data_type, num_cols):
    """Return the list of column names for a headerless PeMS file.

    Parameters
    ----------
    data_type : str
        ``'station_meta'``, ``'station_hour'`` or ``'station_5min'``.
    num_cols : int
        Number of columns actually present in the file. For the hourly and
        5-minute types, the columns beyond the fixed leading ones are named
        as repeating per-lane groups ``lane_<i>_<field>`` with ``i``
        starting at 0. Ignored for ``'station_meta'``, which always returns
        the same 18 names.

    Returns
    -------
    list of str
        Column names. Trailing columns that do not fill a complete per-lane
        group are not named, so the result can be shorter than ``num_cols``.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    if data_type == 'station_meta':
        return ['station', 'route', 'dir', 'district', 'county', 'city',
                'state_postmile', 'abs_postmile', 'latitude', 'longitude',
                'length', 'type', 'lanes', 'name',
                'user_id_1', 'user_id_2', 'user_id_3', 'user_id_4']

    if data_type == 'station_hour':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed',
                   'delay_35', 'delay_40', 'delay_45', 'delay_50', 'delay_55', 'delay_60']
        lane_fields = ('flow', 'avg_occ', 'avg_speed')
    elif data_type == 'station_5min':
        columns = ['timestamp', 'station', 'district', 'route', 'dir', 'lane_type', 'station_length',
                   'samples', 'obs_pct', 'total_flow', 'avg_occupancy', 'avg_speed']
        lane_fields = ('samples', 'flow', 'avg_occ', 'avg_speed', 'avg_obs')
    else:
        raise ValueError('unsupported data_type {!r}'.format(data_type))

    # Whatever follows the fixed columns is split into whole per-lane groups.
    # range() of a zero or negative count is empty, so short files simply get
    # no lane columns.
    n_lanes = (num_cols - len(columns)) // len(lane_fields)
    for i in range(n_lanes):
        columns += [f'lane_{i}_{field}' for field in lane_fields]
    return columns

In [5]:
# Load every PeMS file of the selected type for each year, tag the records
# with calendar attributes, and write one combined HDF5 file per year.

unzip = False  # not read anywhere in this cell
source = 'gz'  # 'gz' reads the *.gz files; any other value reads the plain *.txt files
save_h5 = True  # write the combined yearly frame to HDF5 when True
data_type = 'station_hour'
sep = ','

# The station_meta branch collects its frames here. Creating the list in this
# cell keeps it runnable on a fresh kernel and stops re-runs from appending
# duplicates.
if data_type == 'station_meta':
    meta = []

compression = 'gzip' if source == 'gz' else None
header = 0 if data_type == 'station_meta' else None

# Arguments shared by the first read attempt and the retry.
read_kwargs = dict(
    sep=sep,
    header=header,
    index_col=False,
    # Only the hourly/5-minute files start with a timestamp; the first column
    # of a meta file is the station id and must not be parsed as a date.
    parse_dates=False if data_type == 'station_meta' else [0],
    infer_datetime_format=True,
    compression=compression,
)

for year in range(2021, 2022):
    year_dfs = []
    path = get_dir(PEMSDIR, year, data_type, district)
    outpath = os.path.join(OUTDIR, 'pems')
    contents = os.listdir(path)
    # os.listdir order is arbitrary; sort so the concatenated output is
    # reproducible from run to run.
    gzs = sorted(name for name in contents if os.path.splitext(name)[1] == '.gz')
    txts = sorted(name for name in contents if os.path.splitext(name)[1] == '.txt')
    files = gzs if source == 'gz' else txts

    for f in files:
        print(f)
        fpath = os.path.join(path, f)
        try:
            df = pd.read_csv(fpath, **read_kwargs)
        except Exception as e:
            print(e)
            print('trying no quotechar...')
            try:
                # quotechar=None is only accepted together with
                # quoting=csv.QUOTE_NONE; without it the retry itself raises.
                df = pd.read_csv(fpath,
                                 quotechar=None,
                                 quoting=csv.QUOTE_NONE,
                                 **read_kwargs)
            except Exception as e2:
                # Best effort: report the unreadable file and move on.
                print(e2)
                continue

        try:
            df.columns = get_columns(data_type, len(df.columns))
        except Exception as e3:
            # Column count did not match the expected layout; skip the file.
            print(e3)
            continue

        if data_type == 'station_meta':
            # The snapshot date is encoded in the file name as <yyyy>_<mm>_<dd>.
            yr, mo, dy = f.replace('d{:02d}_text_meta_'.format(district), '').replace('.txt', '').split('_')
            ts = dt.datetime(int(yr), int(mo), int(dy))
            date = ts.date()
            df['timestamp'] = ts
            df['date'] = date
            df['year'] = yr
            df['month'] = mo
            df['day'] = dy
            meta.append(df)
        elif data_type == 'station_hour':
            # Vectorised datetime accessors instead of one Python call per row.
            stamps = df['timestamp']
            dates = stamps.dt.date
            df['date'] = dates
            df['year'] = stamps.dt.year
            df['month'] = stamps.dt.month
            df['day'] = stamps.dt.day
            df['hour'] = stamps.dt.hour
            df['day_of_week'] = stamps.dt.weekday
            # Look each distinct date up once, then broadcast the result.
            holiday_by_date = {day: day in ca_holidays for day in dates.unique()}
            df['is_holiday'] = dates.map(holiday_by_date)
            year_dfs.append(df)

    if not year_dfs:
        # pd.concat raises on an empty list; nothing to combine or save.
        print('no {} records loaded for {}'.format(data_type, year))
        continue

    y = pd.concat(year_dfs)
    if save_h5:
        try:
            y.to_hdf(os.path.join(outpath, 'pems_{}_{}.h5'.format(data_type, year)), key='data')
        except Exception as e:
            print(e)

d04_text_station_hour_2021_01.txt.gz
d04_text_station_hour_2021_02.txt.gz
d04_text_station_hour_2021_03.txt.gz


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block4_values] [items->Index(['dir', 'lane_type', 'date'], dtype='object')]

  encoding=encoding,
