In [1]:
# 0 Housekeeping. Clear variable space
########################################################################################################################
from IPython import get_ipython  # run magic commands

# InteractiveShell.magic("name args") is deprecated; run_line_magic(name, args) is the
# supported way to run a line magic from code.
ipython = get_ipython()
ipython.run_line_magic("reset", "-f")
# The reset clears user-defined names (including `ipython`), so fetch the shell again
# before loading the autoreload extension.
ipython = get_ipython()
ipython.run_line_magic("load_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")

In [2]:
# 1 Import Libraries and Set Global Parameters
####################################################################################################
# 1.1 Import Python Libraries
############################################
from datetime import datetime
import os, sys, shutil
import pandas as pd
import geopandas as gpd
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
# 1.2 Set Global Parameters
############################################
# Paths are machine-specific and are selected by the OS login name. Later cells read:
#   path_working        - repository root; becomes the working directory and is added
#                         to sys.path
#   path_processed_data - folder holding rawnav_data.parquet / rawnav_summary.parquet and
#                         receiving the segment outputs
#   path_segments       - folder containing segments.geojson
#   path_sp             - folder containing wmata_schedule_data_q_jump_routes.csv
# path_source_data is defined for every user but is not read in this part of the notebook.
login_name = os.getlogin()
if login_name == "WylieTimmerman":
    path_working = r"C:\OD\OneDrive - Foursquare ITP\Projects\WMATA_AVL"
    os.chdir(path_working)
    sys.path.append(path_working)
    path_sp = r"C:\Users\WylieTimmerman\Documents\projects_local\wmata_avl_local"
    path_source_data = os.path.join(path_sp,"data","00-raw")
    path_processed_data = os.path.join(path_sp, "data","02-processed")
    path_segments = os.path.join(path_working,"data","02-processed")
elif login_name == "abibeka":
    path_working = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\Github\WMATA_AVL"
    os.chdir(path_working)
    sys.path.append(path_working)
    path_source_data = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\WMATA-AVL\Data"
    path_processed_data = os.path.join(path_source_data, "ProcessedData")
    path_segments = path_processed_data
    # NOTE(review): path_sp is not defined for this user, but the schedule-data cell
    # below reads os.path.join(path_sp, ...) and will raise NameError. Set path_sp here
    # to the folder that holds wmata_schedule_data_q_jump_routes.csv.
elif login_name == "E048374":
    # Working Paths
    path_working = r"C:\Users\E048374\OneDrive - WMATA\rawnav_rachel_fork\WMATA_AVL"
    os.chdir(path_working)
    sys.path.append(path_working)
    path_source_data = r"\\l-600730\RawNavArchive"
    path_sp = r"C:\Users\E048374\Documents\RawNav"
    path_processed_data = os.path.join(path_sp, "data", "02-processed")
    path_segments = path_processed_data
else:
    # Adjacent string literals are used instead of a backslash continuation, which
    # embedded a long run of spaces in the message; the names listed are the ones this
    # notebook actually uses.
    raise FileNotFoundError(
        "Define path_working, path_source_data, path_processed_data, path_segments "
        f"and path_sp in a new elif block for user '{login_name}'"
    )

In [4]:
# Globals
# Routes and days of the week to process. analysis_routes is bound to the same list
# object as q_jump_route_list.
q_jump_route_list = ['S4']
analysis_routes = q_jump_route_list
# Alternative route sets:
# analysis_routes = ['S1']
# analysis_routes = ['S1', 'S9', 'H4', 'G8', '64']
# analysis_routes = ['S2','S4','H1','H2','H3','79','W47']
analysis_days = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]
# Alternative day set:
# analysis_days = ['Wednesday','Thursday','Friday']

In [5]:
# EPSG code for WMATA-area work; used below as the target CRS for the segment shapes
# and the rawnav points.
# NOTE(review): EPSG:2248 is presumably NAD83 / Maryland in US survey feet, which would
# make lengths and distances computed below come out in feet -- confirm.
wmata_crs = 2248
# 1.3 Import User-Defined Package
############################################
# Imported here rather than with the other libraries because section 1.2 appends
# path_working to sys.path first (presumably so this local package resolves).
import wmatarawnav as wr

In [33]:
from io import StringIO
# Scratch example: parse a small crosswalk from inline CSV text.
# The literal is indented for readability, so skipinitialspace=True is required to keep
# that padding out of the header (otherwise the first column is named ' route') and out
# of the first field of every row.
csv_string = ''' route,direction,seg_name_id,stop_id
                 79,SOUTH,georgia_columbia_stub,10981
                 79,SOUTH,georgia_piney_branch_stub,4217'''
df = pd.read_csv(StringIO(csv_string), skipinitialspace=True)
df

Unnamed: 0,route,direction,seg_name_id,stop_id
0,79,SOUTH,georgia_columbia_stub,10981
1,79,SOUTH,georgia_piney_branch_stub,4217


In [6]:
# 2 Load Relevant Static Files 
####################################################################################################

# 2.1. Load segment-pattern-stop crosswalk 
# This crosswalk is used to connect segment shapes to rawnav data. It is entered by hand,
# one row per route / direction / segment / stop combination:
#   'route'       - must match the same string used in the rawnav data.
#   'direction'   - looked up in a moment against the WMATA schedule data and replaced
#                   with the matching pattern code.
#   'seg_name_id' - also found in the segment geometry file.
#   'stop_id'     - matches the stop identifier in the WMATA schedule database.
# The first argument lists the column names; the remaining values are read row by row.
xwalk_seg_pattern_stop_in = wr.tribble(
             ['route',        'direction',                        'seg_name_id','stop_id'], 
                 "79",            "SOUTH",               "georgia_columbia_stub",   10981, 
                 "79",            "SOUTH",           "georgia_piney_branch_stub",    4217, 
                 "70",            "SOUTH",                 "georgia_irving_stub",   19186, #irving stop
                 "70",            "SOUTH",               "georgia_columbia_stub",   10981, #columbia stop 
                 "70",            "SOUTH",           "georgia_piney_branch_stub",    4217,
                 "S1",            "NORTH",                    "sixteenth_u_stub",   18042,
                 "S2",            "NORTH",                    "sixteenth_u_stub",   18042,
                 "S4",            "NORTH",                    "sixteenth_u_stub",   18042,
                 "S9",            "NORTH",                    "sixteenth_u_stub",   18042,
                 "64",            "NORTH",            "eleventh_i_new_york_stub",   16490,
                 "G8",             "EAST",            "eleventh_i_new_york_stub",   16490,
                "D32",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368,
                 "H1",            "NORTH",     "irving_fifteenth_sixteenth_stub",    2368,
                 "H2",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368,
                 "H3",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368,
                 "H4",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368,
                 "H8",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368,
                "W47",             "EAST",     "irving_fifteenth_sixteenth_stub",    2368
  )

In [7]:
# Preview the first three rows of the hand-entered crosswalk
xwalk_seg_pattern_stop_in.head(3)

Unnamed: 0,route,direction,seg_name_id,stop_id
0,79,SOUTH,georgia_columbia_stub,10981
1,79,SOUTH,georgia_piney_branch_stub,4217
2,70,SOUTH,georgia_irving_stub,19186


In [8]:
# Load the WMATA schedule extract for the queue-jump routes.
# index_col=0 consumes the first CSV column as the index and reset_index(drop=True)
# then discards it in favour of a fresh positional index.
# 'route' is pinned to string so that numeric-looking routes (e.g. "52", "79") never get
# inferred as integers; the crosswalk this table is merged with stores routes as strings.
wmata_schedule_dat = (
    pd.read_csv(
        os.path.join(path_sp, "wmata_schedule_data_q_jump_routes.csv"),
        index_col = 0,
        dtype = {'route': str}
    )
    .reset_index(drop=True)
)

In [9]:
# Distinct direction / route / pattern combinations present in the schedule data
xwalk_wmata_route_dir_pattern = (
    wmata_schedule_dat
    .filter(items=['direction', 'route', 'pattern'])
    .drop_duplicates()
)
xwalk_wmata_route_dir_pattern

Unnamed: 0,direction,route,pattern
0,SOUTH,52,1
42,NORTH,52,2
104,SOUTH,52,3
163,NORTH,52,4
208,SOUTH,S4,1
262,NORTH,S4,2


In [10]:
# Replace 'direction' in the hand-entered crosswalk with the schedule's 'pattern' code.
# merge() defaults to an inner join, so crosswalk rows whose route/direction pair is
# absent from the schedule data are dropped here.
xwalk_seg_pattern_stop = (
    xwalk_seg_pattern_stop_in
    .merge(
        xwalk_wmata_route_dir_pattern,
        on = ['route','direction']
    )
    # columns= keyword: passing the axis positionally (.drop('direction', 1)) was
    # deprecated and is rejected by pandas 2.x
    .drop(columns = 'direction')
    .reindex(columns = ['route','pattern','seg_name_id','stop_id'])
)

In [11]:
# Display the crosswalk after 'direction' has been replaced by 'pattern'
xwalk_seg_pattern_stop

Unnamed: 0,route,pattern,seg_name_id,stop_id
0,S4,2,sixteenth_u_stub,18042


In [12]:
# Remove the hand-entered crosswalk now that the pattern-level version has been built.
# NOTE: once this has run, the merge that builds xwalk_seg_pattern_stop cannot be re-run
# without first re-running the cell that creates xwalk_seg_pattern_stop_in.
del xwalk_seg_pattern_stop_in

In [13]:
# Segment-to-pattern crosswalk: the stop-level crosswalk without stop_id, de-duplicated
# so each route / pattern / segment combination appears once.
xwalk_seg_pattern = (
    xwalk_seg_pattern_stop
    # columns= keyword: passing the axis positionally (.drop('stop_id', 1)) was
    # deprecated and is rejected by pandas 2.x
    .drop(columns = 'stop_id')
    .drop_duplicates()
)
xwalk_seg_pattern

Unnamed: 0,route,pattern,seg_name_id
0,S4,2,sixteenth_u_stub


In [14]:
# 3. load shapes
# Segments: read the segment geometry file and reproject it to the working CRS.
# seg_name_id is the unique identifier; the other fields are optional.
# Note that these are not yet updated to reflect the extension of the 11th street segment
# further south to give the stop more breathing room.
segments = gpd.read_file(
    os.path.join(path_segments, "segments.geojson")
).to_crs(wmata_crs)

In [15]:
# 3 Merge Additional Geometry
####################################################################################################

# 3.1 Rawnav-Segment ########################

# Make Output Directories: each parquet dataset root is removed (errors ignored) and
# recreated empty, so results from an earlier run are cleared first.
path_seg_summary = os.path.join(path_processed_data, "segment_summary.parquet")
path_seg_index = os.path.join(path_processed_data, "segment_index.parquet")

for path_out in (path_seg_summary, path_seg_index):
    shutil.rmtree(path_out, ignore_errors=True)
    os.mkdir(path_out)

# issues with the script below - try one combination step by step

In [16]:
# Step through a single route/day combination by hand.
analysis_route = 'S4'
analysis_day = 'Monday'
try:
    rawnav_dat = (
        wr.read_cleaned_rawnav(
           analysis_routes_ = analysis_route,
           analysis_days_ = analysis_day,
           path = os.path.join(path_processed_data, "rawnav_data.parquet"))
        .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
        )
except Exception:
    # `except Exception` rather than a bare except, so KeyboardInterrupt is not swallowed.
    print(f'No data on analysis route {analysis_route} for {analysis_day}')
    # Re-raise: every following cell needs rawnav_dat, and continuing would either hit a
    # NameError or silently reuse a rawnav_dat left over from an earlier run.
    raise

In [17]:
# Preview the rawnav records loaded for this route and day
rawnav_dat.head()

Unnamed: 0,index_loc,lat,long,heading,door_state,veh_state,odom_ft,sec_past_st,stop_window,row_before_apc,route_pattern,pattern,index_run_start,index_run_end,filename,start_date_time,route,wday
0,0.0,38.9937,-77.030458,245.0,C,S,0.0,0.0,,0.0,S401,1,0.0,1584.0,rawnav04463200707.txt,2020-07-06 09:04:36,S4,Monday
1,1.0,38.9937,-77.030458,252.0,C,S,0.0,1.0,X-1,0.0,S401,1,0.0,1584.0,rawnav04463200707.txt,2020-07-06 09:04:36,S4,Monday
2,2.0,38.993697,-77.030457,198.0,C,M,2.0,34.0,E00,0.0,S401,1,0.0,1584.0,rawnav04463200707.txt,2020-07-06 09:04:36,S4,Monday
3,3.0,38.993697,-77.030457,179.0,C,M,21.0,36.0,,0.0,S401,1,0.0,1584.0,rawnav04463200707.txt,2020-07-06 09:04:36,S4,Monday
4,4.0,38.993697,-77.030457,179.0,C,M,23.0,36.0,X-1,0.0,S401,1,0.0,1584.0,rawnav04463200707.txt,2020-07-06 09:04:36,S4,Monday


In [18]:
# Run-level summary records for the same route and day
rawnav_summary_dat = wr.read_cleaned_rawnav(
    analysis_routes_ = analysis_route,
    analysis_days_ = analysis_day,
    path = os.path.join(path_processed_data, "rawnav_summary.parquet")
)

# Subset Rawnav Data to Records Desired: discard runs with run_duration_from_sec below 600
# or dist_odom_mi below 2
rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

# Right join on the run keys: keeps the rawnav rows of runs that survived the filter (a
# surviving run with no rawnav rows still yields one row of missing values).
run_keys = ['filename', 'index_run_start']
rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_dat[run_keys], on=run_keys, how='right')

In [19]:
# Address Remaining Col Format issues
# Build point geometry from the long/lat columns, declared as EPSG:4326, then reproject
# to the working CRS.
rawnav_points = gpd.points_from_xy(rawnav_qjump_dat.long, rawnav_qjump_dat.lat)
rawnav_qjump_gdf = gpd.GeoDataFrame(
    rawnav_qjump_dat,
    geometry = rawnav_points,
    crs = 'EPSG:4326'
).to_crs(epsg=wmata_crs)

In [20]:
# Subset the segment-pattern crosswalk to the combinations applicable to this route
# (`@analysis_route` refers to the Python variable of that name)
xwalk_seg_pattern_subset = xwalk_seg_pattern.query('route == @analysis_route')

In [21]:
# Display the crosswalk rows for the current route
xwalk_seg_pattern_subset

Unnamed: 0,route,pattern,seg_name_id
0,S4,2,sixteenth_u_stub


In [22]:
# Take the first segment id listed for this route as the segment to step through
seg = xwalk_seg_pattern_subset['seg_name_id'].values[0]
seg

'sixteenth_u_stub'

In [23]:
# Segment geometry record(s) whose seg_name_id matches the current segment
segments.loc[segments.seg_name_id == seg]

Unnamed: 0,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,geometry
4,20,687.439976,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.","LINESTRING (1301947.256 454915.428, 1301949.43..."


In [24]:
# Crosswalk rows (route, pattern) that apply to the current segment
xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]

Unnamed: 0,route,pattern,seg_name_id
0,S4,2,sixteenth_u_stub


In [112]:
# We pass the rawnav data and summary tables, check against a segment,
# and use the patterns_by_seg to indicate which patterns should be examined.
# The call returns two tables: the segment start/end index records and the per-run
# segment summary.
# Debugging notes: this call was failing, so it is broken down step by step in the debug
# cells further below; the function was then updated to call reset_index(drop=True) on
# index_run_segment_start_end_1.
index_run_segment_start_end, summary_run_segment = (
    wr.merge_rawnav_segment(
        rawnav_gdf_=rawnav_qjump_gdf,
        rawnav_sum_dat_=rawnav_summary_dat,
        target_=segments.loc[segments.seg_name_id == seg],
        patterns_by_seg_=xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]
    )
)

In [115]:
# Inspect the first record of the segment start/end index table
index_run_segment_start_end.iloc[0]

filename                           rawnav04465200713.txt
index_run_start                                      137
index_loc                                            470
OBJECTID                                              20
Shape_Length                                      687.44
nicename                     16th & U St NW (S1, S2, S4)
seg_name_id                             sixteenth_u_stub
sort_order                                             7
RT_D                                                None
Note                     Applies to S1, S2, and S4 only.
route                                                 S4
pattern                                                2
location                                           first
odom_ft                                             9306
sec_past_st                                          485
lat                                              38.9158
long                                            -77.0365
dist_to_nearest_point          

In [116]:
# Inspect the first record of the per-run segment summary
summary_run_segment.iloc[0]

filename                                                     rawnav04465200713.txt
index_run_start                                                                137
seg_name_id                                                       sixteenth_u_stub
route                                                                           S4
pattern                                                                          2
start_date_time                                                2020-07-12 09:26:01
wday                                                                        Sunday
flag_too_far_any                                                             False
flag_wrong_order_any                                                         False
flag_too_long_odom                                                           False
flag_secs_total_mismatch                                                     False
flag_odom_total_mismatch                                                     False
star

## debug cells below

In [111]:
# Debug: reproduce the `target_` argument passed to wr.merge_rawnav_segment above
is_current_segment = segments['seg_name_id'] == seg
target_ = segments.loc[is_current_segment]
target_

Unnamed: 0,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,geometry
4,20,687.439976,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.","LINESTRING (1301947.256 454915.428, 1301949.43..."


In [52]:
# Exactly one segment record is expected. The message is passed as a string: the earlier
# form used print(...) as the message, which evaluates to None and left the
# AssertionError without any text.
assert len(target_) == 1, "Function expects a segments file with one record"

In [54]:
# why not use "shape_length" field?
# Length of the single segment, measured after projecting to the working CRS. wmata_crs
# is used instead of repeating the literal 2248 so this stays in step with the CRS
# applied to the segments and rawnav points elsewhere in the notebook.
seg_length = (
        target_
        .to_crs(wmata_crs)
        .geometry
        .length
        .iloc[0] #return a float
    )
seg_length

687.4392398036563

In [55]:
# Crosswalk rows (route, pattern) that apply to the current segment
patterns_by_seg_ = xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]
# Add the route and pattern identifiers to the segment shape. The left join keeps the
# segment record even if no crosswalk row matches it.
seg_pattern_shape = target_.merge(patterns_by_seg_, on=['seg_name_id'], how="left")
seg_pattern_shape

Unnamed: 0,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,geometry,route,pattern
0,20,687.439976,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.","LINESTRING (1301947.256 454915.428, 1301949.43...",S4,2


In [57]:
# Prepare segment shape for merge
# NOTE(review): judging by the displayed result, wr.explode_first_last returns one point
# row per segment end, labelled 'first' / 'last' in a `location` column -- confirm
# against the package source.
seg_pattern_first_last = wr.explode_first_last(seg_pattern_shape)
seg_pattern_first_last

Unnamed: 0,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,route,pattern,location,geometry
0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428)
1,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,last,POINT (1301947.448 455602.820)


In [77]:
rawnav_gdf_ = rawnav_qjump_gdf
# Find rawnav point nearest each segment boundary point, then replace the resulting
# index with a fresh positional one.
nearest_rawnav_points = wr.merge_rawnav_target(
    target_dat=seg_pattern_first_last,
    rawnav_dat=rawnav_gdf_,
)
index_run_segment_start_end_1 = nearest_rawnav_points.reset_index(drop=True)
index_run_segment_start_end_1.head()

Unnamed: 0,filename,index_run_start,index_loc,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,route,pattern,location,geometry,odom_ft,sec_past_st,lat,long,dist_to_nearest_point
0,rawnav04468200721.txt,2307.0,2307.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428),0.0,0.0,39.047118,-77.108382,52012.887472
1,rawnav04468200721.txt,2307.0,2307.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,last,POINT (1301947.448 455602.820),0.0,0.0,39.047118,-77.108382,51381.331772
2,rawnav04468200721.txt,5489.0,5489.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428),0.0,0.0,39.047118,-77.108382,52012.887472
3,rawnav04468200721.txt,5489.0,5489.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,last,POINT (1301947.448 455602.820),0.0,0.0,39.047118,-77.108382,51381.331772
4,rawnav04470200707.txt,2738.0,3321.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428),12897.0,1256.0,38.915792,-77.036527,7.913626


In [78]:
# Flag matches whose nearest rawnav point lies more than 50 units from the segment
# boundary point. A vectorized comparison replaces the per-element lambda and gives the
# same booleans (missing distances compare as False either way).
# NOTE(review): the distance is in the units of the working CRS, presumably feet -- confirm.
index_run_segment_start_end_1['flag_too_far'] = (
    index_run_segment_start_end_1['dist_to_nearest_point'].gt(50)
)
index_run_segment_start_end_1.head(3)

Unnamed: 0,filename,index_run_start,index_loc,OBJECTID,Shape_Length,nicename,seg_name_id,sort_order,RT_D,Note,route,pattern,location,geometry,odom_ft,sec_past_st,lat,long,dist_to_nearest_point,flag_too_far
0,rawnav04468200721.txt,2307.0,2307.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428),0.0,0.0,39.047118,-77.108382,52012.887472,True
1,rawnav04468200721.txt,2307.0,2307.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,last,POINT (1301947.448 455602.820),0.0,0.0,39.047118,-77.108382,51381.331772,True
2,rawnav04468200721.txt,5489.0,5489.0,20,687.44,"16th & U St NW (S1, S2, S4)",sixteenth_u_stub,7,,"Applies to S1, S2, and S4 only.",S4,2,first,POINT (1301947.256 454915.428),0.0,0.0,39.047118,-77.108382,52012.887472,True


In [80]:
# Flag rows whose index_loc is lower than the previous row's within the same run.
# Within each (filename, index_run_start) group, diff() gives index_loc minus the previous
# row's index_loc; the first row of a group has no predecessor, so fillna(0) turns its
# missing value into 0 and it is never flagged. lt(0) is True where the difference is
# negative. sort=False keeps the groups in their order of first appearance.
# NOTE(review): presumably rows come ordered 'first' then 'last' for each run, so True
# marks a 'last' boundary matched earlier in the run than 'first' -- confirm the ordering
# produced by wr.merge_rawnav_target.
index_run_segment_start_end_1.loc[:,'flag_wrong_order'] = (index_run_segment_start_end_1
                                                         .groupby(['filename','index_run_start']
                                                                  , sort = False)
                                                     .index_loc
                                                     .diff()
                                                     .fillna(0)
                                                     .lt(0))

In [84]:
rawnav_sum_dat_ = rawnav_summary_dat
# Generate Summary from the rawnav points, the run summary table, the flagged
# nearest-boundary records and the segment length computed above
summary_run_segment = wr.include_segment_summary(
    rawnav_q_dat=rawnav_gdf_,
    rawnav_sum_dat=rawnav_sum_dat_,
    nearest_seg_boundary_dat=index_run_segment_start_end_1,
    seg_length_=seg_length,
)

In [85]:
# Drop the geometry only now, after the summary has been generated.
# Could do this earlier, but need to remove geometry reference in get_first_last_stop_rawnav if so
index_run_segment_start_end = wr.drop_geometry(index_run_segment_start_end_1)

In [113]:
# Iterate over every analysis route and day, and within each over every segment that
# applies to the route, writing the per-run segment summary and the segment start/end
# index records to partitioned parquet datasets.
for analysis_route in analysis_routes:
    print("*" * 100)
    print(f'Processing analysis route {analysis_route}')
    for analysis_day in analysis_days:
        print(f'Processing analysis route {analysis_route} for {analysis_day}...')

        # Reload data
        try:
            rawnav_dat = (
                wr.read_cleaned_rawnav(
                   analysis_routes_ = analysis_route,
                   analysis_days_ = analysis_day,
                   path = os.path.join(path_processed_data, "rawnav_data.parquet"))
                .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
                )
        except Exception as err:
            # Best effort: skip this route/day and carry on. `except Exception` (not a
            # bare except) lets KeyboardInterrupt through, and the cause is reported so a
            # genuine failure is not mistaken for "no data".
            print(f'No data on analysis route {analysis_route} for {analysis_day}')
            print(f'    ({type(err).__name__}: {err})')
            continue

        # Reload Data
        rawnav_summary_dat = (
            wr.read_cleaned_rawnav(
                analysis_routes_ = analysis_route,
                analysis_days_ = analysis_day,
                path = os.path.join(path_processed_data, "rawnav_summary.parquet")
            )
        )

        # Subset Rawnav Data to Records Desired
        rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

        rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_dat[['filename', 'index_run_start']],
                                            on=['filename', 'index_run_start'],
                                            how='right')

        # Address Remaining Col Format issues: build points from long/lat (EPSG:4326)
        # and reproject to the working CRS
        rawnav_qjump_gdf = (
            gpd.GeoDataFrame(
                rawnav_qjump_dat,
                geometry = gpd.points_from_xy(
                    rawnav_qjump_dat.long,
                    rawnav_qjump_dat.lat
                ),
                crs='EPSG:4326')
            .to_crs(epsg=wmata_crs)
        )

        # Iterate over Pattern-Segment Combinations Applicable to Route
        xwalk_seg_pattern_subset = xwalk_seg_pattern.query('route == @analysis_route')

        for seg in xwalk_seg_pattern_subset.seg_name_id.unique():
            print('Processing segment {} ...'.format(seg))

            # We pass the rawnav data and summary tables, check against a segment,
            # and use the patterns_by_seg to indicate which patterns should be examined
            index_run_segment_start_end, summary_run_segment = (
                wr.merge_rawnav_segment(
                    rawnav_gdf_=rawnav_qjump_gdf,
                    rawnav_sum_dat_=rawnav_summary_dat,
                    target_=segments.loc[segments.seg_name_id == seg],
                    patterns_by_seg_=xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]
                )
            )
            # Note that because seg_pattern_first_last is defined for route and pattern,
            # our summary will implicitly drop any runs that are on 'wrong' pattern(s) for
            # a route.

            # Both tables are stamped with the day being processed, which is also used as
            # a partition column below.
            index_run_segment_start_end['wday'] = analysis_day
            summary_run_segment['wday'] = analysis_day

            # The additional partitioning here is excessive, but it fits better in the
            # iterative/chunking process above
            pq.write_to_dataset(
                table = pa.Table.from_pandas(summary_run_segment),
                root_path = path_seg_summary,
                partition_cols = ['route','wday','seg_name_id']
            )

            pq.write_to_dataset(
                table = pa.Table.from_pandas(index_run_segment_start_end),
                root_path = path_seg_index,
                partition_cols = ['route','wday','seg_name_id']
            )

****************************************************************************************************
Processing analysis route S4
Processing analysis route S4 for Monday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Tuesday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Wednesday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Thursday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Friday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Saturday...
Processing segment sixteenth_u_stub ...
Processing analysis route S4 for Sunday...
Processing segment sixteenth_u_stub ...
