In [7]:
# 0 Housekeeping. Clear variable space
########################################################################################################################
from IPython import get_ipython  # run magic commands
ipython = get_ipython()
ipython.magic("reset -f")
ipython = get_ipython()
#https://stackoverflow.com/questions/36572282/ipython-autoreload-magic-function-not-found
ipython.magic("load_ext autoreload")
ipython.magic("autoreload 2")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# 1 Import Libraries and Set Global Parameters
########################################################################################################################
# 1.1 Import Python Libraries
############################################
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import shutil
print("Run Section 1 Import Libraries and Set Global Parameters...")
begin_time = datetime.now()
import os, sys, pandas as pd, geopandas as gpd

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")  # Stop Pandas warnings

Run Section 1 Import Libraries and Set Global Parameters...


In [9]:
# 1.2 Set Global Parameters
############################################
if os.getlogin() == "WylieTimmerman":
    # Working Paths
    path_working = r"C:\OD\OneDrive - Foursquare ITP\Projects\WMATA_AVL"
    os.chdir(os.path.join(path_working))
    sys.path.append(r"C:\OD\OneDrive - Foursquare ITP\Projects\WMATA_AVL")
    path_sp = r"C:\Users\WylieTimmerman\Documents\projects_local\wmata_avl_local"
    path_source_data = os.path.join(path_sp,"data","00-raw")
    path_processed_data = os.path.join(path_sp, "data","02-processed")
elif os.getlogin() == "abibeka":
    # Working Paths
    path_working = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\Github\WMATA_AVL"
    os.chdir(os.path.join(path_working))
    sys.path.append(path_working)
    # Source data
    path_source_data = r"C:\Users\abibeka\OneDrive - Kittelson & Associates, Inc\Documents\WMATA-AVL\Data"
    # Processed data
    path_processed_data = os.path.join(path_source_data, "ProcessedData")
elif os.getlogin() == "E043868":
    # Working Paths
    path_working = r"C:\Users\e043868\OneDrive - WMATA\R Projects\WMATA_AVL"
    os.chdir(os.path.join(path_working))
    sys.path.append(r"C:\Users\e043868\OneDrive - WMATA\R Projects\WMATA_AVL")
    path_source_data = r"\\l-600730\RawNavArchive"
    path_sp = r"C:\Users\e043868\Documents\RawNav"
    path_processed_data = os.path.join(path_sp, "data", "02-processed")

elif os.getlogin() == "E048374":
    # Working Paths
    path_working = r"C:\Users\E048374\OneDrive - WMATA\rawnav_rachel_fork\WMATA_AVL"
    os.chdir(os.path.join(path_working))
    sys.path.append(r"C:\Users\E048374\OneDrive - WMATA\rawnav_rachel_fork\WMATA_AVL")
    path_source_data = r"\\l-600730\RawNavArchive"
    path_sp = r"C:\Users\E048374\Documents\RawNav"
    path_processed_data = os.path.join(path_sp, "data", "02-processed")
   
    
else:
    raise FileNotFoundError("Define the path_working, path_source_data, and"
                            " path_processed_data in a new elif block")

In [10]:
# Globals

q_jump_route_list = ['52']
analysis_routes = q_jump_route_list
analysis_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
wmata_crs = 2248

In [11]:
# 1.3 Import User-Defined Package
############################################
import wmatarawnav as wr

executionTime = str(datetime.now() - begin_time).split('.')[0]
print("Run Time Section 1 Import Libraries and Set Global Parameters : {}".format(executionTime))
print("*" * 100)

Run Time Section 1 Import Libraries and Set Global Parameters : 0:00:08
****************************************************************************************************


In [12]:
# 2 Read, analyze and summarize Schedule data
########################################################################################################################
print("Run Section 2: Read, analyze and summarize rawnav, WMATA schedule data...")
begin_time = datetime.now()
# Read the Wmata_Schedule data
wmata_schedule_dat = (
    pd.read_csv(
        os.path.join(path_sp, "wmata_schedule_data_q_jump_routes.csv"),
        index_col = 0
    )
    .reset_index(drop=True)
)

wmata_schedule_gdf = (
    gpd.GeoDataFrame(
        wmata_schedule_dat, 
        geometry = gpd.points_from_xy(wmata_schedule_dat.stop_lon,wmata_schedule_dat.stop_lat),
        crs='EPSG:4326'
    )
    .to_crs(epsg=wmata_crs)
)

# Make Output Directory
path_stop_summary = os.path.join(path_processed_data, "stop_summary.parquet")
if not os.path.isdir(path_stop_summary):
    os.mkdir(path_stop_summary)

path_stop_index = os.path.join(path_processed_data, "stop_index.parquet")
if not os.path.isdir(path_stop_index):
    os.mkdir(path_stop_index)

for analysis_route in analysis_routes:
    print("*" * 100)
    print('Processing analysis route {}'.format(analysis_route))
    for analysis_day in analysis_days:
        print('Processing analysis route {} for {}...'.format(analysis_route,analysis_day))
                
        # Reload data
        try:
            rawnav_dat = (
                wr.read_cleaned_rawnav(
                   analysis_routes_ = analysis_route,
                   analysis_days_ = analysis_day,
                   path = os.path.join(path_processed_data, "rawnav_data.parquet")
                )
                .drop(columns=['blank', 'lat_raw', 'long_raw', 'sat_cnt'])
            )
        except Exception as e:
            print(e)  # usually no data found or something similar
            continue
        else:

            rawnav_summary_dat = (
                wr.read_cleaned_rawnav(
                    analysis_routes_ = analysis_route,
                    analysis_days_ = analysis_day,
                    path = os.path.join(path_processed_data, "rawnav_summary.parquet")
                )
            )

            # Subset Rawnav Data to Records Desired
            rawnav_summary_dat = rawnav_summary_dat.query('not (run_duration_from_sec < 600 | dist_odom_mi < 2)')
            
            rawnav_summary_keys_col = rawnav_summary_dat[['filename', 'index_run_start']]
            
            rawnav_qjump_dat = rawnav_dat.merge(rawnav_summary_keys_col,
                                                on=['filename', 'index_run_start'],
                                                how='right')

            rawnav_qjump_gdf = (
                gpd.GeoDataFrame(
                    rawnav_qjump_dat,
                    geometry=gpd.points_from_xy(rawnav_qjump_dat.long, rawnav_qjump_dat.lat),
                    crs='EPSG:4326'
                )
                .to_crs(epsg=wmata_crs)
            )

        stop_summary, stop_index = (
            wr.merge_rawnav_wmata_schedule(
                analysis_route_=analysis_route,
                analysis_day_=analysis_day,
                rawnav_dat_=rawnav_qjump_gdf,
                rawnav_sum_dat_=rawnav_summary_dat,
                wmata_schedule_dat_=wmata_schedule_gdf
            )
        )
        
        if type(stop_summary) == type(None):
            print('No data on analysis route {} for {}'.format(analysis_route,analysis_day))
            continue
        
        # Write Summary Table 
        shutil.rmtree(
            os.path.join(
                path_stop_summary,
                "route={}".format(analysis_route),
                "wday={}".format(analysis_day)
            ),
            ignore_errors=True
        ) 
        
        pq.write_to_dataset(
            table=pa.Table.from_pandas(stop_summary),
            root_path=path_stop_summary,
            partition_cols=['route', 'wday']
        )
        
        # Write Index Table
        shutil.rmtree(
            os.path.join(
                path_stop_index,
                "route={}".format(analysis_route),
                "wday={}".format(analysis_day)
            ),
            ignore_errors=True
        ) 
        
        stop_index = wr.drop_geometry(stop_index)
        
        stop_index = stop_index.assign(wday=analysis_day)
                
        pq.write_to_dataset(
            table=pa.Table.from_pandas(stop_index),
            root_path=path_stop_index,
            partition_cols=['route', 'wday']
        )

executionTime = str(datetime.now() - begin_time).split('.')[0]
print(
      "Run Time Section Section 2: Read, analyze and summarize rawnav, WMATA schedule data : {}"
      .format(executionTime)
)
print("*" * 100)

Run Section 2: Read, analyze and summarize rawnav, WMATA schedule data...
****************************************************************************************************
Processing analysis route 52
Processing analysis route 52 for Monday...
deleted 2 rows of 233 rows with distance to the nearest stop > 100 ft. from index table
deleted 0 of 231 stops with incorrect order from index table
Processing analysis route 52 for Tuesday...
deleted 4 rows of 306 rows with distance to the nearest stop > 100 ft. from index table
deleted 0 of 302 stops with incorrect order from index table
Processing analysis route 52 for Wednesday...
deleted 21 rows of 663 rows with distance to the nearest stop > 100 ft. from index table
deleted 35 of 642 stops with incorrect order from index table
Processing analysis route 52 for Thursday...
deleted 62 rows of 539 rows with distance to the nearest stop > 100 ft. from index table
deleted 0 of 477 stops with incorrect order from index table
Processing analysis

In [13]:
partition_cols = ['route', 'wday']
df = pa.Table.from_pandas(stop_summary).to_pandas()
# partition_keys = [df[col] for col in partition_cols]
df

AttributeError: 'NoneType' object has no attribute 'columns'