#### Import packages

In [35]:
import pandas as pd
import numpy as np
import pandas_access as mdb
import os
import sys
from datetime import datetime
import glob
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
import geopandas as gpd
import itertools
import folium

from scipy.spatial import cKDTree

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Widen the pandas display limits so wide/long frames are shown in full.
pd.options.display.max_columns = 75
pd.options.display.max_rows = 250

#### Functions

In [2]:
def get_time():
    """
    Returns the current timestamp as text,
    formatted as "MM-DD-YYYY HH:MM:SS".
    """
    stamp_format = "%m-%d-%Y %H:%M:%S"
    now = datetime.now()
    return now.strftime(stamp_format)

#### Configuration

In [3]:
# --- Which rawnav records to analyse ---
# Month keys; each one selects a `rawnav_data_{month}.parquet` file.
months = [202103, 202104]
# Routes and days handed to the rawnav reader; routes also filter the schedule patterns.
routes = ['30N', '30S', '31', '33']
days = ['Tuesday']
# Schedule directions kept after the pattern merge.
directions = ['EAST', 'SOUTH']

# --- Time-of-day window applied to `start_date_time` ---
start_time = '05:00:00'
end_time = '10:00:00'

# --- Spatial reference used for every GeoDataFrame ---
CRS = 'EPSG:4326'

# --- Trip selection thresholds ---
# Allowed deviation from the per-pattern median trip distance
# (presumably feet, matching the `odom_ft` column -- confirm).
distance_thresh = 400
# Upper bound on trip duration as a multiple of the per-pattern median.
duration_thresh = 1.5

#### Directory

In [4]:
# Root folder holding the input data (rawnav parquet files, schedule database, TSP intersections).
# NOTE(review): machine-specific absolute path -- must be edited to run elsewhere.
root_directory = os.path.normpath(r'C:\Users\amondal\OneDrive - Cambridge Systematics\CS PROJECTS\WMATA Bus Priority\data\tsp_exploratory')

# Location of the WMATA_AVL code repository; appended to sys.path so that
# the `wmatarawnav` package can be imported below.
path_repo = os.path.join(r'C:\Users\amondal\OneDrive - Cambridge Systematics\CS PROJECTS\WMATA Bus Priority\codebase\WMATA_AVL')
sys.path.append(path_repo)

# Import wmatarawnav library
import wmatarawnav as wr

#### Read rawnav data

In [5]:
def read_rawnav_parquet(root_directory, month, routes, days):
    """
    Reads one month of cleaned rawnav data through `wr.read_cleaned_rawnav`.

    The file is expected at
    `<root_directory>/rawnav_data_wisconsin/rawnav_data_<month>.parquet`;
    `routes` and `days` are forwarded as the reader's route and day filters.
    Returns whatever `wr.read_cleaned_rawnav` returns.
    """
    parquet_path = os.path.join(root_directory,
                                'rawnav_data_wisconsin',
                                f'rawnav_data_{month}.parquet')
    return wr.read_cleaned_rawnav(analysis_routes_ = routes,
                                  analysis_days_ = days,
                                  path = parquet_path)

# Read every configured month and stack the results into a single frame.
data = pd.concat([read_rawnav_parquet(root_directory, month, routes, days) for month in months])

# Sort data by `start_date_time`, `route`, and `index_loc`
data = data.sort_values(by = ['start_date_time', 'route', 'index_loc'])

# Apply filters | AM Peak (5 am - 10 am)
# `between_time` works on the index, so `start_date_time` is made the index
# temporarily; the end of the window is excluded.
# NOTE(review): `include_end` is deprecated in newer pandas in favour of
# `inclusive` -- confirm against the installed pandas version.
data = data.set_index('start_date_time').between_time(start_time, end_time, include_end = False).reset_index()

# Create unique trip ID
# Built from the run's starting index and the source file name.
data['trip_id'] = data.index_run_start.astype(int).astype(str) + ' - ' + data.filename

#### Read and process schedule database

In [6]:
# Set up connection to WMATA Schedule database
# Create cursor object
# The database is an Access .mdb file opened through the Microsoft Access ODBC driver.
# NOTE(review): `cursor` is not used by the cells shown here, and the
# connection is never closed -- confirm whether later cells rely on them.

schedule_filename = 'Schedule_082719-201718.mdb'
database_path = os.path.join(root_directory, 'wmata_schedule', schedule_filename)
connection = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=' + database_path)
cursor = connection.cursor()

In [7]:
# Load the four schedule tables in full through the open ODBC connection.
# Column trimming happens in the next cell.

pattern = pd.read_sql('SELECT * FROM Pattern', connection)
stop = pd.read_sql('SELECT * FROM Stop', connection)
stop_info = pd.read_sql('SELECT * FROM StopInfo', connection)
stop_list = pd.read_sql('SELECT * FROM StopList', connection)

In [8]:
# Keep only the columns used downstream from each schedule table.

pattern = pattern.loc[:, ['PatternID', 'TARoute', 'PatternName', 'CDVariation', 'Direction', 'Distance', 'RouteKey']]
stop = stop.loc[:, ['GeoID', 'GeoDescription', 'Longitude', 'Latitude']]
stop_info = stop_info.loc[:, ['StopID', 'StopDesc']]
stop_list = stop_list.loc[:, ['RouteKey', 'StopSequence', 'StopID', 'GeoID']]

In [9]:
# Filter and merge datasets

# Keep only the patterns that belong to the routes under analysis.
pattern = pattern[pattern.TARoute.isin(routes)].sort_values(by = ['TARoute', 'CDVariation'])
# Attach stop coordinates/description to each stop-list entry (GeoID is the join key).
stops = stop_list.merge(stop, on = 'GeoID', how = 'left')
# Restrict the stop list to the RouteKeys of the retained patterns, ordered by stop sequence.
stops = stops[stops.RouteKey.isin(pattern.RouteKey.unique())].sort_values(by = ['RouteKey', 'StopSequence'])
# Add pattern attributes to every stop row; the right join keeps all stop rows.
stops = pattern.merge(stops, on = 'RouteKey', how = 'right')

In [10]:
# Build the `route_pattern` key used to join with rawnav:
# route code followed by the pattern variation, zero-padded to two characters.
stops['route_pattern'] = stops['TARoute'].astype(str) + stops['CDVariation'].astype(str).str.zfill(2)

In [11]:
# Strip surrounding whitespace from the rawnav `route_pattern` key so it
# matches the key built from the schedule tables.
data['route_pattern'] = data['route_pattern'].str.strip()

# Bring in the scheduled Distance and Direction of each pattern
# (one row per distinct route_pattern / Distance / Direction combination).
data = data.merge(stops[['route_pattern', 'Distance', 'Direction']].drop_duplicates(keep = 'first'),
                  how = 'left',
                  on = 'route_pattern')

# Keep only the directions of interest
data = data.loc[data['Direction'].isin(directions)]

#### Import TSP Intersection Coordinates

In [12]:
# Load the TSP intersection sheet, then turn it into a point GeoDataFrame.
tsp_int = pd.read_excel(os.path.join(root_directory,
                                     'tsp_intersections',
                                     'wisconsin_tsp_intersections.xlsx'),
                        sheet_name = 'tsp_intersections')

# Normalise column names (the source header 'Intersection ' carries a trailing space).
tsp_int = tsp_int.rename(columns = {'Intersection ' : 'intersection', 'xcoor' : 'int_long', 'ycoor' : 'int_lat'})

# Drop rows without an intersection name and keep only the needed columns.
tsp_int = tsp_int.dropna(subset = ['intersection'])
tsp_int = tsp_int[['intersection', 'int_lat', 'int_long']]

# Point geometry is built as (x = longitude, y = latitude).
tsp_int = gpd.GeoDataFrame(tsp_int,
                           geometry = gpd.points_from_xy(tsp_int['int_long'], tsp_int['int_lat']),
                           crs = CRS)

#### Plot Intersection Coordinates

In [33]:
def plot_points_on_map(gdf, y_col, x_col, label, radius = 25, color = 'crimson'):
    """
    Draws every row of `gdf` as an unfilled circle on a folium map.

    The map is centred on the mean of `y_col` / `x_col` with a starting
    zoom of 15. Each circle is placed at [row[y_col], row[x_col]], uses the
    given `radius` and `color`, and shows row[label] as its popup.
    Returns the folium map object.
    """
    centre = [gdf[y_col].mean(), gdf[x_col].mean()]
    point_map = folium.Map(location = centre, zoom_start = 15)

    # One circle per record
    for _, record in gdf.iterrows():
        circle = folium.Circle(radius = radius,
                               location = [record[y_col], record[x_col]],
                               popup = record[label],
                               color = color,
                               fill = False)
        circle.add_to(point_map)

    return point_map

In [36]:
# Show the TSP intersections on an interactive map; each popup is the intersection name.
plot_points_on_map(gdf = tsp_int, 
                   x_col = 'int_long', 
                   y_col = 'int_lat', 
                   label = 'intersection')

#### Trip Filtering

- For each pattern and for each trip:
    - Consider the trip segment from the first stop to the last stop. Use the schedule data and stop sequence to identify the closest `rawnav` ping to these two terminal points and drop all pings that are before the first stop and after the last stop.
    - Reset the odometer and timer based on the first stop and recalculate the cumulative distance and duration.
    - Record the trip length and duration (from the first stop to the last stop).
- For each pattern:
    - Calculate the median distance and duration, excluding the NULL/ZERO values.
    - Select the trips:
        - whose distance is within 400 ft of the median trip distance.
        - whose duration is less than 1.5 times the median trip duration.

In [38]:
# Based on https://gis.stackexchange.com/questions/222315/geopandas-find-nearest-point-in-other-dataframe
def get_nearest_point(gdA, gdB):
    '''
    For every point in gdA, look up the closest point in gdB.

    Coordinates are taken from the `geometry` column of both frames
    (x, y) and matched with a k-d tree built on gdB. The result holds
    gdA's rows (index reset) side by side with the non-geometry columns
    of the matched gdB row, plus a `dist` column with the distance
    reported by the tree (in the units of the input coordinates).
    '''
    coords_a = np.array([(pt.x, pt.y) for pt in gdA.geometry])
    coords_b = np.array([(pt.x, pt.y) for pt in gdB.geometry])

    # Single nearest neighbour in gdB for each gdA coordinate
    nearest_dist, nearest_pos = cKDTree(coords_b).query(coords_a, k = 1)

    matched_b = gdB.iloc[nearest_pos].drop(columns = 'geometry').reset_index(drop = True)
    left = gdA.reset_index(drop = True)
    distances = pd.Series(nearest_dist, name = 'dist')
    return pd.concat([left, matched_b, distances], axis = 1)

In [39]:
# For every route pattern and every trip of that pattern: trim the trip to the
# pings between its first and last matched stop, tag pings with the nearest
# stop / TSP intersection, and record the trimmed trip's distance and duration.

# Create a list of all available patterns.
pattern_list = data.route_pattern.unique().tolist()

# Create empty dictionary to store 
# trip distance and duration.
# dist_dict / dura_dict : {route_pattern: {trip_id: value}}
# processed_df_dict     : {trip_id: trimmed trip GeoDataFrame}
dist_dict = {}
dura_dict = {}
processed_df_dict = {}

# 1-based pattern counter, used only for the progress message.
p_count = 1
for p in pattern_list:
    # Create a stop GeoDataFrame for Pattern 'p'
    stops_pattern = stops[stops.route_pattern == p][['route_pattern', 'StopSequence', 'StopID', 'Latitude', 'Longitude']]
    stops_pattern = gpd.GeoDataFrame(stops_pattern, 
                                 geometry = gpd.points_from_xy(stops_pattern.Longitude, stops_pattern.Latitude),
                                 crs = CRS)
    
    # Create a rawnav DataFrame for Pattern 'p'
    # Also, create a list of all trips for Pattern 'p'
    df = data[data.route_pattern == p]
    trip_list = df.trip_id.unique().tolist()
    
    # 1-based trip counter (progress message only) and per-pattern result holders.
    t_count = 1
    _dist_dict = {}
    _dura_dict = {}
    
    for t in trip_list:
        # Create a rawnav GeoDataFrame for Pattern 'p' and Trip 't'
        df_single_trip = df[df.trip_id == t]
        df_single_trip = gpd.GeoDataFrame(df_single_trip, 
                                  geometry = gpd.points_from_xy(df_single_trip['long'], df_single_trip['lat']),
                                  crs = CRS)
        
        # Get the closest rawnav pings to each stop and intersection
        # Each stop / intersection is matched to its nearest ping of this trip;
        # the returned `dist` is not used here, so no distance cut-off is applied.
        nearest_rawnav_from_stop = get_nearest_point(stops_pattern, df_single_trip[['index_loc', 'geometry']])
        nearest_rawnav_from_tsp = get_nearest_point(tsp_int, df_single_trip[['index_loc', 'geometry']])
        
        # Filter the records before the first stop and after the last stop
        # Merge stop level information and intersections with rawnav records
        # The trimming window is the smallest / largest `index_loc` among the pings matched to any stop.
        index_loc_min, index_loc_max = nearest_rawnav_from_stop.index_loc.min(), nearest_rawnav_from_stop.index_loc.max()
        
        # Left merges: pings that are not the nearest ping of a stop / intersection
        # get NaN in the added columns.
        # NOTE(review): if several stops (or intersections) match the same ping,
        # that ping's row is repeated by the merge -- confirm this is intended.
        df_single_trip = df_single_trip.merge(nearest_rawnav_from_stop[['StopID', 'StopSequence', 'index_loc']],
                                              on = 'index_loc', how = 'left')
        df_single_trip = df_single_trip.merge(nearest_rawnav_from_tsp[['intersection', 'int_lat', 'int_long', 'index_loc']],
                                              on = 'index_loc', how = 'left')
        df_single_trip = df_single_trip[(df_single_trip.index_loc >= index_loc_min) & (df_single_trip.index_loc <= index_loc_max)]
        
        # Adjust odometer reading and runtime based on first stop
        # Both are re-based by subtracting the minimum value within the trimmed trip.
        df_single_trip['adj_odom_ft_first_stop'] = df_single_trip['odom_ft'] - df_single_trip['odom_ft'].min()
        df_single_trip['adj_dur_sec_first_stop'] = df_single_trip['sec_past_st'] - df_single_trip['sec_past_st'].min()
        
        # Store trip distance and duration for each trip
        # The maximum of each re-based column is the trimmed trip's total.
        _dist_dict[t] = df_single_trip.adj_odom_ft_first_stop.max()
        _dura_dict[t] = df_single_trip.adj_dur_sec_first_stop.max()
        processed_df_dict[t] = df_single_trip
        # Progress line; end = '\r' makes each message overwrite the previous one.
        print(f'Pattern {p} - {p_count} of {len(pattern_list)} | Trip {t_count * 100/len(trip_list): .2f}%', end = '\r')
        t_count += 1
    
    # Store trip distance and duration of all trips
    # for Pattern 'p'
    dist_dict[p] = _dist_dict
    dura_dict[p] = _dura_dict
    p_count += 1

Pattern 3101 - 4 of 4 | Trip  100.00%%

In [45]:
def dict2_to_df(dictionary, col_name_1, col_name_2, col_name_3):
    '''
    Flattens a 2-layer nested dictionary into a Pandas DataFrame.

    Each row holds one (outer key, inner key, value) triple; the three
    columns are named col_name_1, col_name_2 and col_name_3 respectively.
    '''
    # One single-column frame per outer key, indexed by the inner keys
    frames = {}
    for outer_key, inner in dictionary.items():
        frames[outer_key] = pd.DataFrame.from_dict(inner, orient = 'index')

    # Stacking the dict of frames yields a two-level index (outer, inner),
    # which reset_index exposes as 'level_0' / 'level_1'; the value column is 0.
    stacked = pd.concat(frames).reset_index()
    new_names = {'level_0' : col_name_1, 'level_1' : col_name_2, 0 : col_name_3}
    return stacked.rename(columns = new_names)

In [46]:
# Build one row per (route_pattern, trip_id) with the trip's distance ...
trip_sel_df = dict2_to_df(dist_dict, 'route_pattern', 'trip_id', 'distance')

# ... add the trip's duration ...
trip_sel_df = trip_sel_df.merge(dict2_to_df(dura_dict, 'route_pattern', 'trip_id', 'duration'),
                                how = 'left',
                                on = ['route_pattern', 'trip_id'])

# ... and turn zeros into missing values.
trip_sel_df = trip_sel_df.replace(0, np.nan)

In [47]:
# Add median distance and duration for each route pattern
# (joined back onto every trip row as `rp_med_distance` / `rp_med_duration`).
# The string alias 'median' is used instead of `np.median`: zeros were replaced
# by NaN above, so the NaN-skipping groupby median is required. Passing the
# NumPy callable is deprecated in recent pandas and would eventually be applied
# directly, which returns NaN for any group containing a missing value.
trip_sel_df = (trip_sel_df.set_index('route_pattern')
               .join(trip_sel_df.groupby(['route_pattern'])
                     .agg({'distance' : 'median', 'duration' : 'median'})
                     .add_prefix('rp_med_')))
# Selection criteria
# Keep trips whose distance is strictly within +/- distance_thresh of the
# pattern median and whose duration is below duration_thresh times the
# pattern median. Rows with a missing distance or duration compare False
# and are therefore dropped.
select_criteria = ((trip_sel_df.distance > trip_sel_df.rp_med_distance - distance_thresh) &
                   (trip_sel_df.distance < trip_sel_df.rp_med_distance + distance_thresh) &
                   (trip_sel_df.duration < trip_sel_df.rp_med_duration * duration_thresh))

trip_sel_df = trip_sel_df.loc[select_criteria, :]

In [48]:
# Stack all processed trips, then keep only those that passed the
# selection in 'trip_sel_df'.
data_filtered = pd.concat([*processed_df_dict.values()])
data_filtered = data_filtered.loc[data_filtered['trip_id'].isin(trip_sel_df['trip_id'].unique())]

In [None]:
# Export data for visualization
# Written to the current working directory, without the row index.
data_filtered.to_csv('WISCONSIN_TSP_VIZ_RAWNAV.csv', index = False)