In [1]:
# Notebook setup: standard/third-party imports, project-local imports, then the data directory.
import os
import pandas as pd
# Step up one directory before importing the project-local modules
# (presumably because ttools and gtfs live in the parent directory -- TODO confirm).
os.chdir('..')
import ttools # homemade module (project-local)
import gtfs # homemade module (project-local)
# All relative data paths used below (e.g. 'spark_parse/...') are resolved against this shared workspace.
os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share')

## 1. Get the sample of parsed AVL data.
Clean MTA prefixes from trip_id

In [2]:
# Load the sample of parsed AVL data.  Beware, large files take more time.
bustime = pd.read_csv('spark_parse/1203.txt',header=None)

# Beware!  Bonan is still working on organizing the extract files.  These columns may change.
# The file has no header row, so column names are assigned by position.
bustime.columns = ['ROUTE_ID','latitude','longitude','recorded_time','vehicle_id','TRIP_ID','trip_date','SHAPE_ID',
                   'STOP_ID','distance_stop','distance_shape','status']

# Keep only one record per vehicle and timestamp.
bustime.drop_duplicates(['vehicle_id','recorded_time'],inplace=True)
# Remove the agency tags 'MTA NYCT_' and 'MTABC_' from TRIP_ID.
bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTA NYCT_','')
bustime['TRIP_ID'] = bustime['TRIP_ID'].str.replace('MTABC_','')
# Index levels: 0=ROUTE_ID, 1=TRIP_ID, 2=trip_date, 3=vehicle_id.
bustime.set_index(['ROUTE_ID','TRIP_ID','trip_date','vehicle_id'],inplace=True,drop=True)

# For demonstration, use a subset. Just get data for one trip-date.
tripDateLookup = "2015-12-03" # this is a non-holiday Thursday
# Level 2 is trip_date; drop_level=False keeps it in the index after slicing.
bustime = bustime.xs((tripDateLookup),level=(2),drop_level=False)
bustime.sort_index(inplace=True)
# Reuse tripDateLookup (instead of repeating the literal date) so the date passed to the
# parser can never drift from the date used for slicing.
# NOTE(review): ttools.parseActualTime is project-local; presumably it combines each
# recorded_time value with tdate -- confirm against the ttools module.
bustime['recorded_time'] = bustime['recorded_time'].apply(ttools.parseActualTime,tdate=tripDateLookup)
# Parenthesised single-argument print behaves the same under Python 2 and Python 3.
print('Finished loading BusTime data and and slicing one day.')

Finished loading BusTime data and and slicing one day.


## 2. Convert distance fields to numeric and make a calculated column for "distance along shape"
Also show range of calculated value.

In [3]:
# Coerce both distance fields to numeric; values that cannot be parsed become NaN.
# pd.to_numeric replaces the deprecated Series.convert_objects(convert_numeric=True).
bustime['distance_shape'] = pd.to_numeric(bustime['distance_shape'],errors='coerce')
bustime['distance_stop'] = pd.to_numeric(bustime['distance_stop'],errors='coerce')
# Calculated "distance along shape" for each observation.
# NOTE(review): presumably distance_shape is the stop's distance along the shape and
# distance_stop is the vehicle's distance from that stop -- TODO confirm with the extract schema.
bustime['veh_dist_along_shape'] = bustime['distance_shape'] - bustime['distance_stop']

In [4]:
# Show the range of the calculated distance as a (min, max) tuple.
veh_dist_min = bustime['veh_dist_along_shape'].min()
veh_dist_max = bustime['veh_dist_along_shape'].max()
print ((veh_dist_min, veh_dist_max))

(-35705.919999999998, 63991.839999999997)


## 3. Check for multiple vehicles reported on the same trip
Usually there is a clear "winner."  Perhaps this is due to logging in/out of vehicles before the trip actually begins.
But sometimes there are many observations recorded from both vehicles.  For further exploration, see <a href='https://github.com/sarangof/Bus-Capstone/blob/master/unstable_trip_ids.ipynb'>this notebook</a>.

In [7]:
# Group on the full index (ROUTE_ID, TRIP_ID, trip_date, vehicle_id); the GroupBy object
# is kept around for convenience because most analysis is per trip and vehicle.
grouped = bustime.groupby(level=(0,1,2,3))
recorded_by_group = grouped['recorded_time']
dist_by_group = grouped['veh_dist_along_shape']

# One row per trip/vehicle combination: N is the number of observations in the group.
trip_veh_validation = pd.DataFrame(grouped.size(),columns=['N'])
# Span between the first and last observation time within each group.
trip_veh_validation['time_range'] = recorded_by_group.max() - recorded_by_group.min()
# Span between the largest and smallest calculated distance within each group.
trip_veh_validation['dist_range'] = dist_by_group.max() - dist_by_group.min()
trip_veh_validation.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,N,time_range,dist_range
ROUTE_ID,TRIP_ID,trip_date,vehicle_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MTA NYCT_B1,FB_D5-Weekday-SDon-047200_B49_15,2015-12-03,MTA NYCT_5125,4,00:06:54,1214.55
MTA NYCT_B1,FB_D5-Weekday-SDon-051000_B49_15,2015-12-03,MTA NYCT_4855,1,00:00:00,0.0
MTA NYCT_B1,FB_D5-Weekday-SDon-051000_B49_15,2015-12-03,MTA NYCT_5125,13,00:24:51,5291.86
MTA NYCT_B1,FB_D5-Weekday-SDon-051000_B49_15,2015-12-03,MTA NYCT_7146,2,00:04:14,434.46
MTA NYCT_B1,FB_D5-Weekday-SDon-051200_B49_21,2015-12-03,MTA NYCT_7146,5,00:06:22,701.41
MTA NYCT_B1,UP_D5-Weekday-SDon-006000_B1_1,2015-12-03,MTA NYCT_4877,40,00:40:23,10789.66
MTA NYCT_B1,UP_D5-Weekday-SDon-009800_B1_1,2015-12-03,MTA NYCT_4877,56,00:47:00,10863.65
MTA NYCT_B1,UP_D5-Weekday-SDon-010000_B1_2,2015-12-03,MTA NYCT_4893,52,00:45:10,9604.2
MTA NYCT_B1,UP_D5-Weekday-SDon-013800_B1_2,2015-12-03,MTA NYCT_4893,51,00:41:10,10913.47
MTA NYCT_B1,UP_D5-Weekday-SDon-014000_B1_1,2015-12-03,MTA NYCT_4877,42,00:33:08,10713.65


## 4. Check if stop-distance values are persistent for routes, or for shape.
First, look at the values by ROUTE_ID.  There are clearly multiple distance values reported for stops along each ROUTE_ID, with a significant number of records associated with each value.

In [25]:
# Count records for every (ROUTE_ID, STOP_ID, distance_shape) combination.
# ROUTE_ID is an index level, so the index is flattened into columns before grouping.
route_dist_keys = ['ROUTE_ID','STOP_ID','distance_shape']
bustime_flat = bustime.reset_index()
route_dist_grouped = bustime_flat.groupby(route_dist_keys).size()
route_dist_grouped.head(50)

ROUTE_ID     STOP_ID     distance_shape
MTA NYCT_B1  MTA_300000  0.44              258
             MTA_300002  288.75             25
                         289.31            275
             MTA_300003  426.97              1
                         427.52             23
             MTA_300004  560.27             20
             MTA_300006  845.34              2
                         845.90             37
             MTA_300007  1001.74             1
                         1002.30            29
             MTA_300008  1115.80             3
                         1116.36            42
             MTA_300009  1322.50             3
                         1323.06            66
             MTA_300010  1505.48            47
             MTA_300011  1659.30             4
                         1659.86            53
             MTA_300012  1837.44             9
                         1838.00           219
             MTA_300013  2066.28             5
                    

Now do the same for SHAPE_ID.  Only a few SHAPE_ID-STOP_ID combinations have multiple distance values.  The first three examples have 0.00 distance for one of the duplicates.

In [26]:
# Count records for every (SHAPE_ID, STOP_ID, distance_shape) combination.
# All three keys are regular columns here, so no index reset is needed.
shape_dist_keys = ['SHAPE_ID','STOP_ID','distance_shape']
shape_dist_grouped = bustime.groupby(shape_dist_keys).size()
shape_dist_grouped.head(50)

SHAPE_ID      STOP_ID     distance_shape
MTA_B1000027  MTA_300788  0.00                2
              MTA_350159  834.35              7
MTA_B1000031  MTA_300788  0.00                1
                          1987.73             2
              MTA_303920  1405.31             2
              MTA_308452  11.75               1
              MTA_350217  585.11              2
MTA_B1000057  MTA_300226  4629.62           102
              MTA_300227  4743.25            51
              MTA_300788  0.00              109
              MTA_308197  1779.88            78
              MTA_350019  0.00                1
                          7226.94           219
              MTA_350060  3972.00            89
              MTA_350061  4180.54            76
              MTA_350086  6225.50            35
              MTA_350088  5707.47            27
              MTA_350091  5467.73            23
              MTA_350093  5943.34            26
              MTA_350094  5226.29            22

Examine only the SHAPE_ID-STOP_ID combinations with multiple distance values

In [63]:
# Number of distinct distance_shape values reported for each SHAPE_ID-STOP_ID pair
# (levels 0 and 1 of shape_dist_grouped's index).
shape_dist_dupes = shape_dist_grouped.groupby(level=(0,1)).size()
# Name the series so that it becomes a labelled column when turned into a DataFrame.
shape_dist_dupes.name = 'duplicate_count'
# Keep the filter as the LAST expression so the cell actually displays the pairs with more
# than one distance value; as a mid-cell expression its result was silently discarded.
shape_dist_dupes[shape_dist_dupes>1]

Looking at the various values, it seems that one of the distance values is always 0 or very close to 0.  This could be due to errors in the Bus Time real-time inferences, or perhaps due to loop routes where the same stop is both the last and first stop.

In [66]:
# Keep only the SHAPE_ID-STOP_ID pairs that report more than one distance value.
multi_dist_pairs = pd.DataFrame(shape_dist_dupes[shape_dist_dupes>1])
# Move distance_shape out of the index so each reported distance becomes its own row;
# the unnamed record counts end up in a column labelled 0.
dist_record_counts = shape_dist_grouped.reset_index(level=2)
# Attach every reported distance (and its record count) to the duplicated pairs.
dupe_summary = multi_dist_pairs.merge(dist_record_counts,left_index=True,right_index=True,how='left')
dupe_summary = dupe_summary.rename(columns={0:'record_count'})
dupe_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,duplicate_count,distance_shape,record_count
SHAPE_ID,STOP_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MTA_B1000031,MTA_300788,2,0.00,1
MTA_B1000031,MTA_300788,2,1987.73,2
MTA_B1000057,MTA_350019,2,0.00,1
MTA_B1000057,MTA_350019,2,7226.94,219
MTA_B1000058,MTA_300788,2,0.00,26
MTA_B1000058,MTA_300788,2,6821.57,356
MTA_B1030109,MTA_303011,2,0.00,13
MTA_B1030109,MTA_303011,2,18660.56,312
MTA_B1030116,MTA_307840,2,73.51,20
MTA_B1030116,MTA_307840,2,11800.30,275
