In [1]:
import os
import pandas as pd
import numpy as np
import sys
sys.path.append('/gpfs2/projects/project-bus_capstone_2016/workspace/mu529/Bus-Capstone')
import ttools #homemade module
import gtfs #homemade module
os.chdir('/gpfs2/projects/project-bus_capstone_2016/workspace/share')
%matplotlib inline

## 1. Get the sample of parsed AVL data.
Clean MTA prefixes from trip_id

In [2]:
# Load the sample of parsed AVL data.  Beware: large files take longer to read.
bustime = pd.read_csv('20151203_parsed.csv')
bustime.columns = [
    'vehicle_id', 'route', 'timestamp', 'lat', 'lon', 'trip_id', 'trip_date',
    'shape_id', 'next_stop_id', 'est_arrival', 'dist_from_stop',
    'stop_dist_on_trip', 'presentable_dist', 'response',
]

# Strip the agency prefixes from trip_id (route and vehicle_id keep theirs).
bustime['trip_id'] = (bustime['trip_id']
                      .str.replace('MTA NYCT_', '')
                      .str.replace('MTABC_', ''))

# Keep the first record per (vehicle, timestamp), then index every ping by the
# identity of the trip it belongs to.
bustime = (bustime
           .drop_duplicates(subset=['vehicle_id', 'timestamp'])
           .set_index(['route', 'trip_id', 'trip_date', 'vehicle_id'], drop=True))

In [5]:
# For demonstration, work on a subset: one route (B41) on one trip date.
tripDateLookup = "2015-12-03"  # a non-holiday weekday (Thursday)
# Levels 0 and 2 of the index are route and trip_date; drop_level=False keeps
# them in the result so the index layout is unchanged.
bustime = (bustime
           .xs(('MTA NYCT_B41', tripDateLookup), level=(0, 2), drop_level=False)
           .sort_index())
print('Finished loading BusTime data and and slicing one day, one route (B41).')

Finished loading BusTime data and and slicing one day, one route (B41).


In [19]:
# Parse every raw timestamp string with ttools.parseActualTime.  The reference
# date comes from tripDateLookup (set when the one-day subset was selected)
# rather than a second copy of the '2015-12-03' literal, so the slice and the
# parsing can never disagree about the date.
# NOTE(review): assumes parseActualTime's tdate is the trip/service date -- confirm in ttools.
bustime['ts_parsed'] = bustime['timestamp'].apply(ttools.parseActualTime, tdate=tripDateLookup)
# Append the raw timestamp as the innermost index level and re-sort, so the pings
# of each (route, trip_id, trip_date, vehicle_id) group are ordered by timestamp.
bustime.set_index('timestamp', append=True, inplace=True)
bustime.sort_index(inplace=True)

## 2. Convert distance and time fields to numeric
#### also make a calculated column for "distance along trip"
#### also calculate speed and duration between each reported location, based on distance fields.

In [88]:
# Coerce the distance columns to numeric; values that cannot be parsed become NaN.
# pd.to_numeric replaces Series.convert_objects(convert_numeric=True), which is
# deprecated and absent from later pandas releases.
for distance_col in ('stop_dist_on_trip', 'dist_from_stop'):
    bustime[distance_col] = pd.to_numeric(bustime[distance_col], errors='coerce')

# Vehicle position along the trip: the stop's distance along the trip minus the
# vehicle's distance from that stop.
# NOTE(review): presumably both columns refer to next_stop_id -- confirm against the feed spec.
bustime['veh_dist_along_trip'] = bustime['stop_dist_on_trip'] - bustime['dist_from_stop']

In [89]:
# Show the (min, max) range of veh_dist_along_trip as a sanity check.  The B41 route
# runs the entire length of Flatbush Avenue, approximately 8.5 miles.
# NOTE(review): the distance units are not stated in this notebook -- presumably meters; confirm.
print (bustime['veh_dist_along_trip'].min(),bustime['veh_dist_along_trip'].max())

(2.0900000000000318, 12651.950000000001)


In [90]:
# Group pings by the first four index levels (route, trip_id, trip_date, vehicle_id)
# so that differences are only taken between consecutive pings of the same trip.
bg = bustime.groupby(level=(0,1,2,3))

# Seconds elapsed since the previous ping of the trip (NaN for the first ping).
one_second = ttools.datetime.timedelta(seconds=1)
elapsed_seconds = bg['ts_parsed'].diff() / one_second
# Change in distance along the trip since the previous ping.
distance_covered = bg['veh_dist_along_trip'].diff()

bustime['seg_speed'] = distance_covered / elapsed_seconds
bustime['seg_duration'] = elapsed_seconds

## 3. Label interruptions 
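An interruption is currently defined as a run of consecutive pings whose segment speed (`seg_speed`, the average speed since the previous ping of the same trip) is not greater than 0.4. Pings with an undefined speed, such as the first ping of a trip, are counted as part of an interruption too.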

In [91]:
# Pre-create the label column as all-NaN; index_interrupts only writes a number on
# pings that belong to an interruption, so every other ping stays NaN.
# NOTE(review): the column is added in place, presumably so the existing `bg`
# groupby (built from this same frame) sees it -- confirm before rebinding `bustime`.
bustime['interruption_index'] = np.nan

In [92]:
def index_interrupts(df, speed_threshold=0.4):
    """Number the interruptions found in one trip's sequence of pings.

    A ping is "stalled" when its ``seg_speed`` is NOT greater than
    ``speed_threshold``.  That includes negative speeds and NaN (for example the
    first ping of a trip, which has no previous ping to difference against).
    Each maximal run of consecutive stalled pings is one interruption; runs are
    numbered 1, 2, 3, ... in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Pings of a single trip, already in time order, with a ``seg_speed``
        column.  It is not modified.
    speed_threshold : float, optional
        Speeds strictly above this value count as moving.  Defaults to 0.4,
        the value previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``interruption_index`` column holds the
        interruption number on stalled pings.  Other pings keep their existing
        ``interruption_index`` value, or NaN if the column did not exist.
    """
    moving = (df['seg_speed'] > speed_threshold).values
    stalled = ~moving

    # A new interruption begins at every stalled ping that is not directly
    # preceded by another stalled ping; the running count of those starts is
    # the interruption number.
    preceded_by_stall = np.zeros_like(stalled)
    preceded_by_stall[1:] = stalled[:-1]
    run_number = np.cumsum(stalled & ~preceded_by_stall)

    if 'interruption_index' in df.columns:
        untouched = df['interruption_index'].values
    else:
        untouched = np.nan

    # Work on a copy: mutating the group object handed over by groupby.apply
    # is not supported by pandas and can give unexpected results.
    labelled = df.copy()
    labelled['interruption_index'] = np.where(stalled, run_number, untouched)
    return labelled
# Run index_interrupts separately on each group of `bg` (pings sharing route,
# trip_id, trip_date and vehicle_id) and collect the labelled pings in a new frame.
bustime_interru = bg.apply(index_interrupts)

Show ALL the data from one trip that contains a negative segment speed (`seg_speed` below -1).  We want to see how the index_interrupts function worked on that trip's data.

In [93]:
# Trip keys (route, trip_id, trip_date, vehicle_id) of every ping whose segment
# speed is below -1; the first four index levels identify the trip.
negative_speed_pings = bustime_interru.query('seg_speed < -1')
i_tups = [ping_key[:4] for ping_key in negative_speed_pings.index]
# Display every ping of the first such trip.
bustime_interru.xs(i_tups[0],level=(0,1,2,3))

Unnamed: 0_level_0,lat,lon,shape_id,next_stop_id,est_arrival,dist_from_stop,stop_dist_on_trip,presentable_dist,response,veh_dist_along_trip,ts_parsed,seg_speed,seg_duration,interruption_index
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015-12-03T04:20:10.075-05:00,40.609202,-73.921613,MTA_B410308,,,,,,2015-12-03T04:20:20.891-05:00,,04:20:10,,,1
2015-12-03T04:21:44.965-05:00,40.611432,-73.924141,MTA_B410308,MTA_300163,2015-12-03T04:22:00.617-05:00,0.00,327.11,at stop,2015-12-03T04:21:59.617-05:00,327.11,04:21:44,,94,1
2015-12-03T04:22:16.111-05:00,40.611432,-73.924141,MTA_B410308,MTA_300163,2015-12-03T04:22:49.807-05:00,0.00,327.11,at stop,2015-12-03T04:22:48.807-05:00,327.11,04:22:16,0.000000,32,1
2015-12-03T04:23:19.346-05:00,40.612795,-73.925824,MTA_B410308,MTA_303218,2015-12-03T04:23:46.800-05:00,121.72,656.78,approaching,2015-12-03T04:23:37.829-05:00,535.06,04:23:19,3.300794,63,
2015-12-03T04:24:22.860-05:00,40.613579,-73.926879,MTA_B410308,MTA_303219,2015-12-03T04:25:02.511-05:00,240.91,902.81,< 1 stop away,2015-12-03T04:24:26.983-05:00,661.90,04:24:22,2.013333,63,
2015-12-03T04:24:54.054-05:00,40.614076,-73.927421,MTA_B410308,MTA_303219,2015-12-03T04:25:21.894-05:00,169.15,902.81,< 1 stop away,2015-12-03T04:25:16.396-05:00,733.66,04:24:54,2.242500,32,
2015-12-03T04:25:57.404-05:00,40.614250,-73.927621,MTA_B410308,MTA_303219,2015-12-03T04:26:21.026-05:00,143.52,902.81,approaching,2015-12-03T04:26:05.611-05:00,759.29,04:25:57,0.406825,63,
2015-12-03T04:26:28.604-05:00,40.615473,-73.928988,MTA_B410308,MTA_306273,2015-12-03T04:26:56.003-05:00,204.30,1142.00,< 1 stop away,2015-12-03T04:26:54.950-05:00,937.70,04:26:28,5.755161,31,
2015-12-03T04:27:32.579-05:00,40.617887,-73.931688,MTA_B410308,MTA_303222,2015-12-03T04:28:10.016-05:00,229.35,1519.16,< 1 stop away,2015-12-03T04:27:44.301-05:00,1289.81,04:27:32,5.501719,64,
2015-12-03T04:28:04.619-05:00,40.620145,-73.934209,MTA_B410308,MTA_303223,2015-12-03T04:28:40.835-05:00,194.28,1813.20,< 1 stop away,2015-12-03T04:28:34.171-05:00,1618.92,04:28:04,10.284687,32,


## 4. Summarize the interruptions
For now, just show the earliest ping and latest ping within each interruption

In [94]:
# Earliest ping time of each interruption.  interruption_index is appended as index
# level 5 (after timestamp at level 4), then pings are grouped by trip key plus
# interruption number.
pings_by_interruption = (bustime_interru
                         .set_index('interruption_index', append=True)
                         .groupby(level=(0,1,2,3,5)))
interruptions = pings_by_interruption['ts_parsed'].min().to_frame()

In [95]:
# Latest ping time of each interruption, grouped the same way as the earliest times.
latest_ping = (bustime_interru
               .set_index('interruption_index', append=True)
               .groupby(level=(0,1,2,3,5))['ts_parsed']
               .max())
# Both sides are named ts_parsed, so the suffixes label them as begin / end.
interruptions.join(latest_ping, lsuffix='_begin', rsuffix='_end')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ts_parsed_begin,ts_parsed_end
route,trip_id,trip_date,vehicle_id,interruption_index,Unnamed: 5_level_1,Unnamed: 6_level_1
MTA NYCT_B41,FB_D5-Weekday-SDon-004500_B41_1,2015-12-03,MTA NYCT_7354,1,00:45:25,00:45:25
MTA NYCT_B41,FB_D5-Weekday-SDon-004500_B41_1,2015-12-03,MTA NYCT_7354,2,00:50:37,00:51:40
MTA NYCT_B41,FB_D5-Weekday-SDon-004500_B41_1,2015-12-03,MTA NYCT_7354,3,00:55:19,00:56:23
MTA NYCT_B41,FB_D5-Weekday-SDon-004500_B41_1,2015-12-03,MTA NYCT_7354,4,01:24:05,01:29:17
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,1,01:05:09,01:05:09
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,2,01:08:18,01:10:22
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,3,01:27:17,01:28:21
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,4,01:36:51,01:36:51
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,5,01:38:26,01:38:26
MTA NYCT_B41,FB_D5-Weekday-SDon-006300_B41_2,2015-12-03,MTA NYCT_4612,6,01:46:25,01:59:32
