In [1]:
# 0 Housekeeping. Clear variable space
########################################################################################################################
# NOTE: `InteractiveShell.magic()` is deprecated in IPython; `run_line_magic(name, args)`
# is the supported way to run a line magic from code, so it is used throughout this cell.
from IPython import get_ipython  # run magic commands

# Wipe the user namespace without prompting so the notebook starts from a clean slate.
get_ipython().run_line_magic("reset", "-f")

# `reset -f` has just cleared every user-defined name, so the shell handle is fetched
# again here instead of being held across the reset.
ipython = get_ipython()

# Load the autoreload extension and enable mode 2 so edits to imported modules are
# picked up without restarting the kernel.
ipython.run_line_magic("load_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")

In [2]:
# 1 Import Libraries and Set Global Parameters
####################################################################################################
# 1.1 Import Python Libraries
############################################
from datetime import datetime
import os, sys, shutil
import pandas as pd
import geopandas as gpd
import pyarrow as pa
import pyarrow.parquet as pq

In [8]:
# 1.2 Set Global Parameters
############################################
# Machine-specific paths, selected by the OS login name. Names consumed by later cells:
#   path_working        - repository root; becomes the working directory and is appended
#                         to sys.path (the project package is imported after this cell)
#   path_sp             - folder the WMATA schedule CSV is read from
#   path_processed_data - folder holding the rawnav parquet inputs and the segment outputs
#   path_segments       - folder holding the segment geojson and segment-pattern CSV
# path_source_data is also set per user but is not referenced in the cells shown here.
current_user = os.getlogin()  # looked up once instead of once per branch

if current_user == "WylieTimmerman":
    path_working = r"C:\OD\OneDrive - Foursquare ITP\Projects\WMATA_AVL"
    path_sp = r"C:\Users\WylieTimmerman\Documents\projects_local\wmata_avl_local"
    path_source_data = os.path.join(path_sp,"data","00-raw")
    path_processed_data = os.path.join(path_sp, "data","02-processed")
    path_segments = os.path.join(path_working,"data","02-processed")
elif current_user == "abibeka":
    path_working = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\Github\WMATA_AVL"
    path_source_data = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\WMATA-AVL\Data"
    path_processed_data = os.path.join(path_source_data, "ProcessedData")
    path_segments = path_processed_data
    # NOTE(review): path_sp is not defined for this user, but the cell that loads
    # "wmata_schedule_data_q_jump_routes.csv" requires it and will raise NameError.
    # TODO: set path_sp to the folder that holds that CSV on this machine.
elif current_user == "E048374":
    # Working Paths
    path_working = r"C:\Users\E048374\OneDrive - WMATA\rawnav_rachel_fork\WMATA_AVL"
    path_source_data = r"\\l-600730\RawNavArchive"
    path_sp = r"C:\Users\E048374\Documents\RawNav"
    path_processed_data = os.path.join(path_sp, "data", "02-processed")
    path_segments = path_processed_data
    # Alternative location for the segment files:
    #     path_segments = os.path.join(path_working, "data", "02-processed")
else:
    # Built from adjacent literals so the message has no embedded run of spaces, and it
    # lists the variables this notebook actually needs.
    raise FileNotFoundError(
        "Define path_working, path_source_data, path_sp, path_processed_data "
        "and path_segments in a new elif block for user '{}'".format(current_user)
    )

# Common to every user: work from the repository root and make it importable.
os.chdir(path_working)
sys.path.append(path_working)

In [275]:
# Globals
# Routes to analyse. `analysis_routes` is the very same list object as
# `q_jump_route_list`, not a copy.
q_jump_route_list = ['52']
analysis_routes = q_jump_route_list
# Other route selections used previously:
#   ['S1']
#   ['S1', 'S9', 'H4', 'G8', '64']
#   ['S2','S4','H1','H2','H3','79','W47']

# Days to analyse. Sunday is left out because it doesn't exist in the test data (BAM).
analysis_days = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
]
# Shorter selection used previously: ['Wednesday','Thursday','Friday']

In [5]:
# EPSG code for WMATA-area work
# Used below as the target CRS when the segment and rawnav geometries are reprojected.
wmata_crs = 2248
# 1.3 Import User-Defined Package
############################################
# NOTE: this import sits after the "Set Global Parameters" cell because that cell
# appends the repository root to sys.path; presumably the package is resolved from
# there - confirm before moving this line up to the main import cell.
import wmatarawnav as wr

In [293]:

# Read the segment geometries from the geojson in path_segments and reproject them to
# the WMATA-area CRS.
segments_path = os.path.join(path_segments, "pattern_5201_segment.geojson")
segments = gpd.read_file(segments_path).to_crs(wmata_crs)

In [277]:
# Inspect the reprojected segment table.
segments

Unnamed: 0,seg_name_id,name_str,geoid,stop_id,length,geometry
0,14th_22,14th Street Northwest,0,0,272.507588,"LINESTRING (1303138.895 458452.856, 1303140.67..."
1,14th_24,14th Street Northwest,0,0,78.552727,"LINESTRING (1303159.851 458181.155, 1303166.08..."
2,14th_25,14th Street Northwest,7243,19143,284.194639,"LINESTRING (1303166.082 458102.850, 1303168.09..."


In [292]:

# Read the segment-pattern crosswalk; `route` and `PATTERN_ID` are forced to strings,
# all other columns keep the dtype pandas infers.
seg_pattern_path = os.path.join(path_segments, "test_seg_pattern_5201.csv")
seg_pattern = pd.read_csv(seg_pattern_path, dtype={'route': str, 'PATTERN_ID': str})
seg_pattern

Unnamed: 0,seg_name_id,PATTERN_ID,route,pattern,direction,from_geoid,from_stop_seq,to_stop_seq,to_geoid,stop_id
0,16203-16662,5201,52,1,SOUTH,16203,16,17,16662,19066


In [279]:
# Load the WMATA schedule data from path_sp. The CSV's first column is read as the
# index and then dropped in favour of a fresh 0..n-1 index.
schedule_csv = os.path.join(path_sp, "wmata_schedule_data_q_jump_routes.csv")
wmata_schedule_dat = pd.read_csv(schedule_csv, index_col=0).reset_index(drop=True)

In [280]:
# 3 Merge Additional Geometry
####################################################################################################

# 3.1 Rawnav-Segment ########################

# Make Output Directory
# Both outputs are directories that the main loop writes partitioned parquet data into.
# Each one is removed and recreated so a rerun starts from an empty directory.
path_seg_summary = os.path.join(path_processed_data, "segment_summary.parquet")
path_seg_index = os.path.join(path_processed_data, "segment_index.parquet")

for output_dir in (path_seg_summary, path_seg_index):
    shutil.rmtree(output_dir, ignore_errors=True)
    # makedirs (unlike mkdir) also creates path_processed_data when it does not exist
    # yet. exist_ok is deliberately left at its default: if the old directory could not
    # be removed this still raises FileExistsError instead of silently reusing old output.
    os.makedirs(output_dir)

# The full loop at the end of the notebook has had issues, so the cells that follow
# walk through a single route / day / segment combination step by step.

In [281]:
# Step-by-step walkthrough: load the cleaned rawnav records for one route and one day,
# dropping columns that are not needed downstream.
analysis_route = '52'
analysis_day = 'Monday'
try:
    rawnav_dat = (
        wr.read_cleaned_rawnav(
           analysis_routes_ = analysis_route,
           analysis_days_ = analysis_day,
           path = os.path.join(path_processed_data, "rawnav_data.parquet"))
        .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
        )
except Exception as err:
    # Catch Exception rather than everything (a bare except also traps
    # KeyboardInterrupt) and show the real cause: a wrong path or a missing column
    # would otherwise be reported as "no data".
    print(f'No data on analysis route {analysis_route} for {analysis_day}')
    print(f'Reason: {err!r}')
    # Every following walkthrough cell needs rawnav_dat; continuing would either fail
    # later with a NameError or silently reuse a stale rawnav_dat from an earlier run.
    raise

In [282]:
# Preview the first rows of the loaded rawnav records.
rawnav_dat.head()

Unnamed: 0,index_loc,lat,long,heading,door_state,veh_state,odom_ft,sec_past_st,stop_window,row_before_apc,route_pattern,pattern,index_run_start,index_run_end,filename,start_date_time,route,wday
0,3801.0,38.955658,-77.033155,257.0,O,S,0.0,0.0,,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday
1,3802.0,38.955658,-77.033155,257.0,O,S,0.0,1.0,X-1,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday
2,3803.0,38.955658,-77.033155,257.0,O,S,0.0,1.0,E00,1.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday
3,3805.0,38.955655,-77.033183,257.0,C,S,8.0,32.0,,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday
4,3806.0,38.955655,-77.03321,277.0,C,M,18.0,188.0,,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday


In [283]:
# Load the run-level summary table for the same route and day.
rawnav_summary_dat = wr.read_cleaned_rawnav(
    analysis_routes_=analysis_route,
    analysis_days_=analysis_day,
    path=os.path.join(path_processed_data, "rawnav_summary.parquet"),
)

# Subset Rawnav Data to Records Desired: discard runs whose run_duration_from_sec is
# below 600 or whose dist_odom_mi is below 2. The expression is kept in its
# "not (... | ...)" form on purpose; rewriting it with >= would treat missing values
# differently.
rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

# A right merge on the run keys keeps the rawnav records of the retained runs only
# (one output row per matching record; a retained run without records yields a row of NaNs).
run_keys = ['filename', 'index_run_start']
rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_dat[run_keys], on=run_keys, how='right')

In [284]:
# Address Remaining Col Format issues
# Build point geometries from the long/lat columns (declared as EPSG:4326), attach them
# to the rawnav records, and reproject the result to the WMATA-area CRS.
rawnav_points = gpd.points_from_xy(rawnav_qjump_dat.long, rawnav_qjump_dat.lat)
rawnav_qjump_gdf = gpd.GeoDataFrame(
    rawnav_qjump_dat,
    geometry=rawnav_points,
    crs='EPSG:4326',
).to_crs(epsg=wmata_crs)

In [285]:
# Iterate over Pattern-Segment Combinations Applicable to Route
# Keep only the crosswalk rows of the route being analysed (previously every route's
# rows were kept, despite the heading). `route` is read as a string in seg_pattern, so
# it compares directly with the string in analysis_route.
xwalk_seg_pattern_subset = (
    seg_pattern
    .loc[seg_pattern['route'] == analysis_route, ['route', 'pattern', 'seg_name_id']]
    .copy()
)

In [286]:
# Inspect the route / pattern / segment crosswalk.
xwalk_seg_pattern_subset

Unnamed: 0,route,pattern,seg_name_id
0,52,1,14th_22
1,52,1,14th_24
2,52,1,14th_25


In [287]:
# Take the first segment id in the crosswalk as the one to walk through, and show it.
seg = xwalk_seg_pattern_subset['seg_name_id'].iloc[0]
seg

'14th_22'

In [288]:
# Show the segment row(s) whose seg_name_id equals the selected `seg`.
segments.loc[segments.seg_name_id == seg]

Unnamed: 0,seg_name_id,name_str,geoid,stop_id,length,geometry
0,14th_22,14th Street Northwest,0,0,272.507588,"LINESTRING (1303138.895 458452.856, 1303140.67..."


In [289]:
# Show the crosswalk row(s) for the selected `seg`, i.e. the route/pattern pairs it applies to.
xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]

Unnamed: 0,route,pattern,seg_name_id
0,52,1,14th_22


In [290]:
# Preview the projected rawnav points (note the added `geometry` column).
rawnav_qjump_gdf.head(3)

Unnamed: 0,index_loc,lat,long,heading,door_state,veh_state,odom_ft,sec_past_st,stop_window,row_before_apc,route_pattern,pattern,index_run_start,index_run_end,filename,start_date_time,route,wday,geometry
0,3801.0,38.955658,-77.033155,257.0,O,S,0.0,0.0,,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday,POINT (1302904.219 469435.789)
1,3802.0,38.955658,-77.033155,257.0,O,S,0.0,1.0,X-1,0.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday,POINT (1302904.219 469435.789)
2,3803.0,38.955658,-77.033155,257.0,O,S,0.0,1.0,E00,1.0,5201,1,3801.0,5405.0,rawnav07164191119.txt,2019-11-18 18:41:00,52,Monday,POINT (1302904.219 469435.789)


In [254]:
# Bind the walkthrough objects to the keyword-argument names that the main loop passes
# to wr.merge_rawnav_segment, so that call can be examined one step at a time.
rawnav_gdf_, rawnav_sum_dat_ = rawnav_qjump_gdf, rawnav_summary_dat

# Rows of the segment table and of the crosswalk that belong to the selected segment.
target_ = segments[segments.seg_name_id == seg]
patterns_by_seg_ = xwalk_seg_pattern_subset[xwalk_seg_pattern_subset.seg_name_id == seg]

In [255]:
# Length of the selected segment, measured after projecting to the WMATA-area CRS.
# Uses the shared wmata_crs constant instead of repeating the literal EPSG code, so the
# projection cannot drift from the one applied to the rawnav points.
seg_length = (
    target_
    .to_crs(wmata_crs)
    .geometry
    .length
    .iloc[0]  # return a float
)

# Subset segment shapes to current segment and add route identifier:
# the left merge on seg_name_id attaches the route and pattern columns of the crosswalk
# to the segment geometry (one row per applicable route/pattern).
seg_pattern_shape = target_.merge(
    patterns_by_seg_,
    on=['seg_name_id'],
    how="left",
)

In [256]:
# Prepare segment shape for merge
# wr.explode_first_last is a project helper; judging by the output displayed in the next
# cell it yields one row per segment end ('first' / 'last') with a POINT geometry -
# confirm against the helper's implementation.
seg_pattern_first_last = wr.explode_first_last(seg_pattern_shape)

In [257]:
# Inspect the first/last points produced for the segment.
seg_pattern_first_last

Unnamed: 0,seg_name_id,length,route,pattern,location,geometry
0,16203-16662,248.484,52,1,first,POINT (1303038.504 459653.332)
1,16203-16662,248.484,52,1,last,POINT (1303104.952 458842.234)


In [258]:
# Bind the walkthrough objects to the names used by the grouping and matching cells below.
target_dat = seg_pattern_first_last
# NOTE(review): this rebinds rawnav_dat, which earlier held the plain rawnav table, to
# the projected GeoDataFrame. Re-running the earlier merge cell after this one would
# therefore operate on the GeoDataFrame instead of the freshly loaded records.
rawnav_dat = rawnav_gdf_
# When True, the matching loop below does not print its "No target geometry found" message.
quiet = True

In [259]:
# Group the target points by route/pattern and the rawnav points by individual run, so
# each run can be matched against the targets of its own route and pattern.
run_group_keys = ['route', 'pattern', 'filename', 'index_run_start']
target_groups = target_dat.groupby(['route', 'pattern'])
rawnav_groups = rawnav_dat.groupby(run_group_keys)

In [260]:
# Inspect the target points that the rawnav runs will be matched against.
target_dat

Unnamed: 0,seg_name_id,length,route,pattern,location,geometry
0,16203-16662,248.484,52,1,first,POINT (1303038.504 459653.332)
1,16203-16662,248.484,52,1,last,POINT (1303104.952 458842.234)


In [261]:
# Check the column dtypes of the target points (route and pattern are the grouping keys).
target_dat.dtypes

seg_name_id      object
length           object
route            object
pattern          object
location         object
geometry       geometry
dtype: object

In [262]:
# Check the column dtypes of the rawnav points, for comparison with the target dtypes.
rawnav_dat.dtypes

index_loc                 float64
lat                       float64
long                      float64
heading                   float64
door_state                 object
veh_state                  object
odom_ft                   float64
sec_past_st               float64
stop_window                object
row_before_apc            float64
route_pattern              object
pattern                     int32
index_run_start           float64
index_run_end             float64
filename                   object
start_date_time    datetime64[ns]
route                      object
wday                       object
geometry                 geometry
dtype: object

In [263]:
# Empty starting frame for the nearest-point results gathered in the next cell.
nearest_rawnav_point_to_target_dat = pd.DataFrame()

In [270]:
# For every rawnav run, look up the target points of the run's route/pattern and find
# the rawnav points nearest to them.
#
# Fixes relative to the earlier version of this cell:
# * the helper is called as wr.ckdnearest; the previous `ll.ckdnearest` used a name that
#   is not defined anywhere in the notebook as shown, and the resulting NameError was
#   swallowed by a bare `except:` (silently, since quiet is True), leaving the result empty;
# * only the "no such group" KeyError from get_group is treated as "no target geometry";
#   any other error now surfaces;
# * per-run results are collected in a list and concatenated once, which also makes the
#   cell safe to re-run without appending duplicates.
nearest_frames = []
for name, rawnav_group in rawnav_groups:
    print(name)
    # name is (route, pattern, filename, index_run_start); targets are keyed by the first two.
    route_pattern_key = (name[0], name[1])
    try:
        target_dat_relevant = target_groups.get_group(route_pattern_key)
    except KeyError:
        if not quiet:
            print("No target geometry found for {} - {}".format(name[0],name[1]))
        continue
    nearest_frames.append(wr.ckdnearest(target_dat_relevant, rawnav_group))

# An empty DataFrame is kept as the result when no run had matching target geometry.
nearest_rawnav_point_to_target_dat = (
    pd.concat(nearest_frames) if nearest_frames else pd.DataFrame()
)

('52', 1, 'rawnav07164191119.txt', 3801.0)
('52', 1, 'rawnav07164191119.txt', 8170.0)
('52', 1, 'rawnav07164191126.txt', 601.0)
('52', 2, 'rawnav07164191126.txt', 2098.0)
('52', 4, 'rawnav07164191119.txt', 5409.0)


In [272]:
# Pull a single run out of the grouped rawnav points to test the matching in isolation.
# The key is (route, pattern, filename, index_run_start).
rawnav_group = rawnav_groups.get_group(('52', 1, 'rawnav07164191119.txt', 3801.0))

In [269]:
# Target points for route '52', pattern 1 - the route/pattern of the run selected above.
target_dat_relevant = target_groups.get_group(('52',1))

In [273]:
# Call the nearest-point helper directly for the single run and display its result.
wr.ckdnearest(target_dat_relevant, rawnav_group)

Unnamed: 0,seg_name_id,length,route,pattern,location,geometry,filename,index_run_start,index_loc,odom_ft,sec_past_st,lat,long,dist_to_nearest_point
0,16203-16662,248.484,52,1,first,POINT (1303038.504 459653.332),rawnav07164191119.txt,3801.0,4274.0,10147.0,974.0,38.928787,-77.032615,16.270241
1,16203-16662,248.484,52,1,last,POINT (1303104.952 458842.234),rawnav07164191119.txt,3801.0,4324.0,10982.0,1102.0,38.926582,-77.032422,5.583182


In [265]:
# Inspect the nearest-point results accumulated by the matching loop.
nearest_rawnav_point_to_target_dat

In [291]:
# Iterate
# For every analysis route and day: load the rawnav records and run summaries, keep the
# runs of interest, project the records to points, then evaluate each segment applicable
# to the route and append the results to the two partitioned parquet outputs.
for analysis_route in analysis_routes:
    print("*" * 100)
    print(f'Processing analysis route {analysis_route}')

    # Iterate over Pattern-Segment Combinations Applicable to Route.
    # The crosswalk is restricted to the current route (previously it was not filtered,
    # so every route would have processed every segment) and is built once per route
    # because it does not depend on the day.
    xwalk_seg_pattern_subset = (
        seg_pattern
        .loc[seg_pattern['route'] == analysis_route, ['route', 'pattern', 'seg_name_id']]
        .copy()
    )

    for analysis_day in analysis_days:
        print(f'Processing analysis route {analysis_route} for {analysis_day}...')

        # Reload data
        try:
            rawnav_dat = (
                wr.read_cleaned_rawnav(
                   analysis_routes_ = analysis_route,
                   analysis_days_ = analysis_day,
                   path = os.path.join(path_processed_data, "rawnav_data.parquet"))
                .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
                )
        except Exception as err:
            # Best effort: skip this route/day, but catch Exception rather than
            # everything and show the cause so that a wrong path or a missing column is
            # not mistaken for an absence of data.
            print(f'No data on analysis route {analysis_route} for {analysis_day}')
            print(f'Reason: {err!r}')
            continue

        # Reload Data
        rawnav_summary_dat = (
            wr.read_cleaned_rawnav(
                analysis_routes_ = analysis_route,
                analysis_days_ = analysis_day,
                path = os.path.join(path_processed_data, "rawnav_summary.parquet")
            )
        )

        # Subset Rawnav Data to Records Desired
        rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')

        # Right merge on the run keys: keep the rawnav records of the retained runs only
        rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_dat[['filename', 'index_run_start']],
                                            on=['filename', 'index_run_start'],
                                            how='right')

        # Address Remaining Col Format issues
        rawnav_qjump_gdf = (
            gpd.GeoDataFrame(
                rawnav_qjump_dat,
                geometry = gpd.points_from_xy(
                    rawnav_qjump_dat.long,
                    rawnav_qjump_dat.lat
                ),
                crs='EPSG:4326')
            .to_crs(epsg=wmata_crs)
        )

        for seg in xwalk_seg_pattern_subset.seg_name_id.unique():
            print('Processing segment {} ...'.format(seg))

            # We pass the rawnav data and summary tables, check against a segment,
            # and use the patterns_by_seg to indicate which patterns should be examined
            index_run_segment_start_end, summary_run_segment = (
                wr.merge_rawnav_segment(
                    rawnav_gdf_=rawnav_qjump_gdf,
                    rawnav_sum_dat_=rawnav_summary_dat,
                    target_=segments.loc[segments.seg_name_id == seg],
                    patterns_by_seg_=xwalk_seg_pattern_subset.loc[xwalk_seg_pattern_subset.seg_name_id == seg]
                )
            )
            # Note that because seg_pattern_first_last is defined for route and pattern,
            # our summary will implicitly drop any runs that are on 'wrong' pattern(s) for
            # a route.

            # Tag both outputs with the day so it can serve as a partition column below
            index_run_segment_start_end['wday'] = analysis_day
            summary_run_segment['wday'] = analysis_day

            # The additional partitioning here is excessive, but it fits better in the
            # iterative/chunking process above
            pq.write_to_dataset(
                table = pa.Table.from_pandas(summary_run_segment),
                root_path = path_seg_summary,
                partition_cols = ['route','wday','seg_name_id']
            )

            pq.write_to_dataset(
                table = pa.Table.from_pandas(index_run_segment_start_end),
                root_path = path_seg_index,
                partition_cols = ['route','wday','seg_name_id']
            )

****************************************************************************************************
Processing analysis route 52
Processing analysis route 52 for Monday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Tuesday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Wednesday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Thursday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Friday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
Processing analysis route 52 for Saturday...
Processing segment 14th_22 ...
Processing segment 14th_24 ...
Processing segment 14th_25 ...
