# find nearest stops and segment ends 

In [74]:
# 0 Housekeeping. Clear variable space
########################################################################################################################
from IPython import get_ipython  # run magic commands

# `InteractiveShell.magic("<name> <args>")` is deprecated in IPython; the
# supported call is `run_line_magic("<name>", "<args>")`, used throughout below.
ipython = get_ipython()
ipython.run_line_magic("reset", "-f")
# Fetch the shell handle again after the reset -- presumably the reset removed
# the `ipython` name bound above along with the rest of the user variables.
ipython = get_ipython()
#https://stackoverflow.com/questions/36572282/ipython-autoreload-magic-function-not-found
# Load the autoreload extension and enable it in mode 2.
ipython.run_line_magic("load_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")
# 1 Import Libraries and Set Global Parameters
########################################################################################################################
# 1.1 Import Python Libraries
############################################
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
print("Run Section 1 Import Libraries and Set Global Parameters...")
# The timer starts before the imports below, so their load time is part of
# whatever is later measured against `begin_time`.
begin_time = datetime.now()
import os
import sys
import pandas as pd
import geopandas as gpd

# Only install the blanket "ignore" filter when no warning options were
# supplied to the interpreter.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")  # Stop Pandas warnings

# Project root: used as the working directory and appended to the import path
# (presumably so the local `wmatarawnav` package resolves -- confirm).
path_working = r"C:\Users\E048374\OneDrive - WMATA\rawnav_rachel_fork\WMATA_AVL"
os.chdir(path_working)
sys.path.append(path_working)

# Other data locations referenced by this workflow.
path_source_data = r"\\l-600730\RawNavArchive"
path_sp = r"C:\Users\E048374\Documents\RawNav"
path_processed_data = os.path.join(path_working, "data", "02-processed")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Run Section 1 Import Libraries and Set Global Parameters...


In [75]:
# Globals

# Routes to process. `analysis_routes` is a second name bound to the very same
# list object, not a copy.
q_jump_route_list = ['52']
analysis_routes = q_jump_route_list

# Pattern identifier interpolated into the schedule file name.
pattern_id = '5201' # see schedule filename below -- probably could just have full schedule..

# Days of the week to iterate over.
analysis_days = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

# EPSG code of the projected CRS that geodata is converted to.
wmata_crs = 2248

In [76]:
# 1.3 Import User-Defined Package
############################################
import wmatarawnav as wr

# Time elapsed since `begin_time`; splitting on '.' drops the fractional
# seconds from the timedelta's string form.
elapsed_section_1 = datetime.now() - begin_time
executionTime = str(elapsed_section_1).split('.')[0]
print("Run Time Section 1 Import Libraries and Set Global Parameters : {}".format(executionTime))
print("*" * 100)

# Schedule table for the pattern named by `pattern_id`, read from the
# processed-data folder with explicit dtypes for 'pattern' and 'route'.
schedule_csv_path = os.path.join(path_processed_data, f"bus_sched_{pattern_id}.csv")
wmata_schedule_dat = pd.read_csv(
    schedule_csv_path,
    dtype={'pattern': 'int32', 'route': 'str'},
)

# Point geometry built from the stop_lon / stop_lat columns, declared as
# EPSG:4326 and then reprojected to the EPSG code held in `wmata_crs`.
schedule_stop_points = gpd.points_from_xy(
    wmata_schedule_dat.stop_lon,
    wmata_schedule_dat.stop_lat,
)
wmata_schedule_gdf = gpd.GeoDataFrame(
    wmata_schedule_dat,
    geometry=schedule_stop_points,
    crs='EPSG:4326',
).to_crs(epsg=wmata_crs)

# Make Output Directory
# `os.makedirs(..., exist_ok=True)` is a no-op when the directory is already
# there and also creates missing parent folders, so no separate existence
# check is needed (and there is no gap between checking and creating).
path_stop_summary = os.path.join(path_processed_data, "stop_summary.parquet")
os.makedirs(path_stop_summary, exist_ok=True)

path_stop_index = os.path.join(path_processed_data, "stop_index.parquet")
os.makedirs(path_stop_index, exist_ok=True)

Run Time Section 1 Import Libraries and Set Global Parameters : 0:00:00
****************************************************************************************************


In [86]:
# Match rawnav records to the WMATA schedule, one route/day chunk at a time.
# For every chunk: load the rawnav points and the run summary table, keep only
# the runs that pass the duration/distance filter, call
# wr.merge_rawnav_wmata_schedule, and write the two returned tables as parquet
# datasets partitioned by route and wday.

# Input locations do not change between iterations, so build them once.
rawnav_data_path = os.path.join(path_processed_data, "rawnav_data.parquet")
rawnav_summary_path = os.path.join(path_processed_data, "rawnav_summary.parquet")

for analysis_route in analysis_routes:
    print("*" * 100)
    print('Processing analysis route {}'.format(analysis_route))
    for analysis_day in analysis_days:
        print('Processing analysis route {} for {}...'.format(analysis_route,analysis_day))

        # Reload data. Only this read is guarded: a failure here skips the chunk.
        try:
            rawnav_dat = (
                wr.read_cleaned_rawnav(
                   analysis_routes_ = analysis_route,
                   analysis_days_ = analysis_day,
                   path = rawnav_data_path
                )
                .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
            )
        except Exception as e:
            print(e)  # usually no data found or something similar
            continue

        # Reached only when the read above succeeded (the handler `continue`s),
        # so no `else:` branch is needed.
        rawnav_summary_dat = (
            wr.read_cleaned_rawnav(
                analysis_routes_ = analysis_route,
                analysis_days_ = analysis_day,
                path = rawnav_summary_path
            )
        )

        # Subset Rawnav Data to Records Desired: drop runs flagged by either
        # threshold (going by the column names: under 600 seconds or under 2 miles).
        rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

        rawnav_summary_keys_col = rawnav_summary_dat[['filename', 'index_run_start']]

        # Right join: the result is driven by the run keys that survived the
        # filter above.
        rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_keys_col,
                                            on=['filename', 'index_run_start'],
                                            how='right')

        # Points from long/lat (declared EPSG:4326), reprojected to `wmata_crs`.
        rawnav_qjump_gdf = (
            gpd.GeoDataFrame(
                rawnav_qjump_dat,
                geometry=gpd.points_from_xy(rawnav_qjump_dat.long, rawnav_qjump_dat.lat),
                crs='EPSG:4326'
            )
            .to_crs(epsg=wmata_crs)
        )

        stop_summary, stop_index = (
            wr.merge_rawnav_wmata_schedule(
                analysis_route_=analysis_route,
                analysis_day_=analysis_day,
                rawnav_dat_=rawnav_qjump_gdf,
                rawnav_sum_dat_=rawnav_summary_dat,
                wmata_schedule_dat_=wmata_schedule_gdf
            )
        )

        # Identity test against None instead of comparing type objects.
        if stop_summary is None:
            print('No data on analysis route {} for {}'.format(analysis_route,analysis_day))
            continue

        # Sub-path of this chunk's partition inside each output dataset.
        partition_subdir = os.path.join(
            "route={}".format(analysis_route),
            "wday={}".format(analysis_day)
        )

        # Write Summary Table. The existing partition folder is removed first,
        # presumably so a rerun replaces earlier output instead of adding to it.
        shutil.rmtree(
            os.path.join(path_stop_summary, partition_subdir),
            ignore_errors=True
        )

        pq.write_to_dataset(
            table=pa.Table.from_pandas(stop_summary),
            root_path=path_stop_summary,
            partition_cols=['route', 'wday']
        )

        # Write Index Table
        shutil.rmtree(
            os.path.join(path_stop_index, partition_subdir),
            ignore_errors=True
        )

        stop_index = wr.drop_geometry(stop_index)

        # 'wday' is one of the partition columns, so it is added before writing.
        stop_index = stop_index.assign(wday=analysis_day)

        pq.write_to_dataset(
            table=pa.Table.from_pandas(stop_index),
            root_path=path_stop_index,
            partition_cols=['route', 'wday']
        )

# Time elapsed since `begin_time` (which this block does not reset), with the
# fractional seconds dropped from the printed value.
elapsed_section_2 = datetime.now() - begin_time
executionTime = str(elapsed_section_2).split('.')[0]
print(
      "Run Time Section Section 2: Read, analyze and summarize rawnav, WMATA schedule data : {}"
      .format(executionTime)
)
print("*" * 100)

****************************************************************************************************
Processing analysis route 52
Processing analysis route 52 for Monday...
deleted 43 rows of 1000 rows with distance to the nearest stop > 100 ft. from index table
deleted 3 of 957 stops with incorrect order from index table
Processing analysis route 52 for Tuesday...
deleted 32 rows of 880 rows with distance to the nearest stop > 100 ft. from index table
deleted 0 of 848 stops with incorrect order from index table
Processing analysis route 52 for Wednesday...
deleted 13 rows of 440 rows with distance to the nearest stop > 100 ft. from index table
deleted 1 of 427 stops with incorrect order from index table
Processing analysis route 52 for Thursday...
deleted 35 rows of 960 rows with distance to the nearest stop > 100 ft. from index table
deleted 0 of 925 stops with incorrect order from index table
Processing analysis route 52 for Friday...
deleted 69 rows of 1400 rows with distance to th

# find nearest rawnav to segments

In [88]:
# Segment geometries: read the GeoJSON, reproject to `wmata_crs`, and keep a
# fixed set of columns.
# NOTE(review): `dtype` is not a documented geopandas.read_file argument, and
# 'pattern' is not among the columns kept below -- confirm it has any effect.
segments_geojson_path = os.path.join(path_processed_data, "seg_5201_by_intersection.geojson")
segment_columns = ['seg_name_id', 'name_str', 'geoid', 'stop_id', 'length', 'geometry']
segments_projected = (
    gpd.read_file(segments_geojson_path, dtype={'pattern':'int32'})
    .to_crs(wmata_crs)
)
segments = segments_projected[segment_columns]

# Crosswalk table read with explicit dtypes for its route and pattern columns.
seg_pattern_csv_path = os.path.join(path_processed_data, "stop_seq_pattern_5201_by_intersection.csv")
seg_pattern = pd.read_csv(
    seg_pattern_csv_path,
    dtype={'route':str, 'PATTERN_ID':str, 'pattern':'int32'},
)

In [90]:
# Make Output Directory
path_seg_summary = os.path.join(path_processed_data, "segment_summary.parquet")
path_seg_index = os.path.join(path_processed_data, "segment_index.parquet")

# Each output folder is deleted (errors ignored) and then created again, so
# both start out empty.
for seg_output_dir in (path_seg_summary, path_seg_index):
    shutil.rmtree(seg_output_dir, ignore_errors=True)
    os.mkdir(seg_output_dir)

In [91]:
# 3 Merge Additional Geometry
####################################################################################################

# 3.1 Rawnav-Segment ########################
# For every route/day chunk: load rawnav points and run summaries, keep the
# runs that pass the duration/distance filter, then call
# wr.merge_rawnav_segment once per segment and append both returned tables to
# parquet datasets partitioned by route, wday and seg_name_id.

# The pattern/segment crosswalk and the list of segment ids do not depend on
# the route or day being processed, so they are built once up front.
# NOTE(review): the crosswalk is not filtered by route here -- confirm that is
# intended if more than one route is ever analysed.
xwalk_seg_pattern_subset = seg_pattern[['route','pattern','seg_name_id']].copy()
seg_name_ids = xwalk_seg_pattern_subset.seg_name_id.unique()

seg_rawnav_data_path = os.path.join(path_processed_data, "rawnav_data.parquet")
seg_rawnav_summary_path = os.path.join(path_processed_data, "rawnav_summary.parquet")

# Iterate
for analysis_route in analysis_routes:
    print("*" * 100)
    print(f'Processing analysis route {analysis_route}')
    for analysis_day in analysis_days:
        print(f'Processing analysis route {analysis_route} for {analysis_day}...')

        # Reload data
        try:
            rawnav_dat = (
                wr.read_cleaned_rawnav(
                   analysis_routes_ = analysis_route,
                   analysis_days_ = analysis_day,
                   path = seg_rawnav_data_path)
                .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
                )
        except Exception as err:
            # `except Exception` (rather than a bare `except:`) lets
            # KeyboardInterrupt / SystemExit propagate. The error itself is
            # printed too, so a real failure is not reported only as "no data".
            print(f'No data on analysis route {analysis_route} for {analysis_day}')
            print(err)
            continue

        # Reload Data (reached only when the read above succeeded).
        rawnav_summary_dat = (
            wr.read_cleaned_rawnav(
                analysis_routes_ = analysis_route,
                analysis_days_ = analysis_day,
                path = seg_rawnav_summary_path
            )
        )

        # Subset Rawnav Data to Records Desired
        rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

        # Right join: the result is driven by the run keys that survived the
        # filter above.
        rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_dat[['filename', 'index_run_start']],
                                            on=['filename', 'index_run_start'],
                                            how='right')

        # Address Remaining Col Format issues: points from long/lat (declared
        # EPSG:4326), reprojected to `wmata_crs`.
        rawnav_qjump_gdf = (
            gpd.GeoDataFrame(
                rawnav_qjump_dat,
                geometry = gpd.points_from_xy(
                    rawnav_qjump_dat.long,
                    rawnav_qjump_dat.lat
                ),
                crs='EPSG:4326')
            .to_crs(epsg=wmata_crs)
        )

        # Iterate over the segments listed in the pattern/segment crosswalk
        for seg in seg_name_ids:
            print('Processing segment {} ...'.format(seg))

            # We pass the rawnav data and summary tables, check against a segment,
            # and use the patterns_by_seg to indicate which patterns should be examined
            index_run_segment_start_end, summary_run_segment = (
                wr.merge_rawnav_segment(
                    rawnav_gdf_=rawnav_qjump_gdf,
                    rawnav_sum_dat_=rawnav_summary_dat,
                    target_=segments.loc[segments.seg_name_id == seg],
                    patterns_by_seg_=xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]
                )
            )
            # Note that because the crosswalk passed as patterns_by_seg_ is defined
            # for route and pattern, the summary presumably drops any runs that are
            # on 'wrong' pattern(s) for a route -- confirm in wr.merge_rawnav_segment.

            # 'wday' is one of the partition columns used below.
            index_run_segment_start_end['wday'] = analysis_day
            summary_run_segment['wday'] = analysis_day

            # The additional partitioning here is excessive, but it fits better in the
            # iterative/chunking process above
            pq.write_to_dataset(
                table = pa.Table.from_pandas(summary_run_segment),
                root_path = path_seg_summary,
                partition_cols = ['route','wday','seg_name_id']
            )

            pq.write_to_dataset(
                table = pa.Table.from_pandas(index_run_segment_start_end),
                root_path = path_seg_index,
                partition_cols = ['route','wday','seg_name_id']
            )

****************************************************************************************************
Processing analysis route 52
Processing analysis route 52 for Monday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Tuesday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Wednesday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Thursday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Friday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Saturday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Sunday...
Pro