In [1]:
import geopandas as gpd
import pandas as pd
import shapely
import haversine
import importlib

In [2]:
simplify_dir = 'GTFS/stop_snapping/GIS Simplify/'
gtfs_dir = 'GTFS/TTC_2016-10-03/'

In [3]:
stop_times_simple = pd.read_csv(gtfs_dir + 'stop_times_simple.csv')
trips = pd.read_csv(gtfs_dir + 'trips.txt')
stops = pd.read_csv(gtfs_dir + 'stops.txt')
routes = pd.read_csv(gtfs_dir + 'routes.txt')

In [4]:
route_trips = trips.merge(routes)[['route_short_name', 'trip_id']].drop_duplicates()
route_trips

Unnamed: 0,route_short_name,trip_id
0,1,32830005
1,1,32830006
2,1,32830007
3,1,32830008
4,1,32830009
...,...,...
137666,93,32966940
137667,93,32966941
137668,307,32949577
137669,307,32949578


In [5]:
route_stop = stop_times_simple.merge(route_trips)[['route_short_name', 'stop_id']].drop_duplicates()

In [6]:
route_stop['route_short_name'] = route_stop['route_short_name'].astype(int)

In [7]:
# define routes not in the 300 series

route_stop = route_stop[~route_stop['route_short_name'].between(299,399)]

In [8]:
maj = gpd.read_file(simplify_dir + 'major_capture.geojson')[['INT_ID', 'stop_id']]
maj_min = gpd.read_file(simplify_dir + 'int_maj_min_capture.geojson')[['INT_ID', 'stop_id']]
min_min = gpd.read_file(simplify_dir + 'int_min_min_capture.geojson')[['INT_ID', 'stop_id']]
maj_col = gpd.read_file(simplify_dir + 'int_maj_col_capture.geojson')[['INT_ID', 'stop_id']]
voronoi = gpd.read_file(simplify_dir + 'int_remaining_capture_new.geojson')[['INT_ID', 'stop_id']]

In [9]:
int_stop = maj.append(maj_min).append(maj_col).append(min_min).append(voronoi)
int_stop = int_stop.dropna(subset = ['stop_id'])

In [10]:
#manual_additions = int_stop[int_stop['INT_ID'].isin([13465757])]

In [11]:
int_route = int_stop.merge(route_stop)[['INT_ID','route_short_name']]
int_stop_full = int_stop.copy()

In [12]:
#int_route = int_route.drop_duplicates() 
#decided to count 3 or more route-stops in intersection instead of counting more than 1 route

In [13]:
int_route = int_route.groupby('INT_ID').count().reset_index()

In [14]:
int_stop = int_stop.merge(int_route, how = 'inner')[['INT_ID', 'stop_id']] 

# Adding subway stations to the dataset

In [15]:
stn = gpd.read_file(simplify_dir + 'station_capture.geojson').drop_duplicates()
stn = stn.merge(pd.read_csv('GIS/stations.csv'))

In [16]:
int_stop = int_stop.append(stn[['INT_ID', 'stop_id']])

In [17]:
# int_stop = int_stop.append(manual_additions)

In [18]:
int_geo = gpd.read_file('/Volumes/Data2/RST/notebook/GIS/intersection-file-wgs84/CENTRELINE_INTERSECTION_WGS84.shp')
int_geo = int_geo[['INT_ID', 'geometry']]

In [19]:
int_geo['int_lon'] = int_geo['geometry'].x
int_geo['int_lat'] = int_geo['geometry'].y

In [20]:
int_geo = int_geo[['INT_ID', 'int_lon', 'int_lat']]
int_geo = int_geo.append(stn.rename(columns = {'longitude':'int_lon', 'latitude':'int_lat'})
                         [['INT_ID', 'int_lon', 'int_lat']].drop_duplicates())

In [21]:
int_stop = int_stop.merge(int_geo[['INT_ID', 'int_lon', 'int_lat']], how = 'left')
int_stop = int_stop.merge(stops[['stop_id', 'stop_code','stop_lon', 'stop_lat', 'stop_name']], how = 'left')

In [22]:
int_stop

Unnamed: 0,INT_ID,stop_id,int_lon,int_lat,stop_code,stop_lon,stop_lat,stop_name
0,13454752,270.0,-79.338874,43.720501,14258,-79.339096,43.720669,DON MILLS RD AT EGLINTON AVE EAST
1,13454752,271.0,-79.338874,43.720501,9125,-79.338799,43.720899,DON MILLS RD AT EGLINTON AVE NORTH SIDE
2,13454752,2888.0,-79.338874,43.720501,2262,-79.339256,43.720195,EGLINTON AVE EAST AT DON MILLS RD
3,13454752,4150.0,-79.338874,43.720501,2263,-79.338511,43.720740,EGLINTON AVE EAST AT DON MILLS RD
4,13454752,5682.0,-79.338874,43.720501,9126,-79.338947,43.720011,DON MILLS RD AT EGLINTON AVE EAST SOUTH SIDE
...,...,...,...,...,...,...,...,...
9877,304,14287.0,-79.257285,43.774648,14768,-79.257735,43.774373,SCARBOROUGH CENTRE STATION
9878,304,14288.0,-79.257285,43.774648,14769,-79.257944,43.774322,SCARBOROUGH CENTRE STATION
9879,304,14542.0,-79.257285,43.774648,14119,-79.257285,43.774648,SCARBOROUGH CENTRE STATION - WESTBOUND PLATFORM
9880,304,14551.0,-79.257285,43.774648,14118,-79.257285,43.774648,SCARBOROUGH CENTRE STATION - EASTBOUND PLATFORM


In [48]:
def hs(df):
    return haversine.haversine((df['int_lon'], df['int_lat']), (df['stop_lon'], df['stop_lat']), unit=haversine.Unit.METERS)

In [49]:
int_stop['dist'] = int_stop.apply(hs, axis = 1)
int_stop

Unnamed: 0,INT_ID,stop_id,int_lon,int_lat,stop_code,stop_lon,stop_lat,stop_name,dist
9519,110,14410.0,-79.396989,43.697748,13798,-79.396989,43.697748,DAVISVILLE STATION - SOUTHBOUND PLATFORM,0.0
9502,205,14473.0,-79.475991,43.652048,13776,-79.475991,43.652048,RUNNYMEDE STATION - EASTBOUND PLATFORM,0.0
9503,205,14524.0,-79.475991,43.652048,13775,-79.475991,43.652048,RUNNYMEDE STATION - WESTBOUND PLATFORM,0.0
9731,214,14515.0,-79.403889,43.667249,13851,-79.403889,43.667249,SPADINA STATION - WESTBOUND PLATFORM,0.0
9129,208,14521.0,-79.452723,43.657023,13770,-79.452723,43.657023,DUNDAS WEST STATION - WESTBOUND PLATFORM,0.0
...,...,...,...,...,...,...,...,...,...
7122,13466991,4029.0,-79.404650,43.648831,,,,,
8321,13455144,8296.0,-79.326552,43.717740,,,,,
8792,13456011,9226.0,-79.324497,43.712363,,,,,
8793,13456011,14652.0,-79.324497,43.712363,,,,,


In [25]:
# exceptions = [13442436,
# 13467839,
# 14036377,
# 13466686,
# 13459228,
# 13452531,
# 14228449,
# 14644292,
# 13445390,
# 13448300,
# 13450748,
# 13450019,
# 13469479,
# 14204030,
# 13468519,
# 13468252,
# 13468572,
# 13468683,
# 13468811,
# 13462790,
# 13448816,
# 13448411,
# 13448504,
# 13461060,
# 14644292,
# 13449180]

In [26]:
# int_stop = int_stop[~int_stop['INT_ID'].isin(exceptions)]
int_stop = int_stop.drop_duplicates()

In [27]:
int_stop

Unnamed: 0,INT_ID,stop_id,int_lon,int_lat,stop_code,stop_lon,stop_lat,stop_name,dist
0,13454752,270.0,-79.338874,43.720501,14258,-79.339096,43.720669,DON MILLS RD AT EGLINTON AVE EAST,0.024940
1,13454752,271.0,-79.338874,43.720501,9125,-79.338799,43.720899,DON MILLS RD AT EGLINTON AVE NORTH SIDE,0.011680
2,13454752,2888.0,-79.338874,43.720501,2262,-79.339256,43.720195,EGLINTON AVE EAST AT DON MILLS RD,0.042953
3,13454752,4150.0,-79.338874,43.720501,2263,-79.338511,43.720740,EGLINTON AVE EAST AT DON MILLS RD,0.040649
4,13454752,5682.0,-79.338874,43.720501,9126,-79.338947,43.720011,DON MILLS RD AT EGLINTON AVE EAST SOUTH SIDE,0.012947
...,...,...,...,...,...,...,...,...,...
9877,304,14287.0,-79.257285,43.774648,14768,-79.257735,43.774373,SCARBOROUGH CENTRE STATION,0.050361
9878,304,14288.0,-79.257285,43.774648,14769,-79.257944,43.774322,SCARBOROUGH CENTRE STATION,0.073588
9879,304,14542.0,-79.257285,43.774648,14119,-79.257285,43.774648,SCARBOROUGH CENTRE STATION - WESTBOUND PLATFORM,0.000000
9880,304,14551.0,-79.257285,43.774648,14118,-79.257285,43.774648,SCARBOROUGH CENTRE STATION - EASTBOUND PLATFORM,0.000000


In [28]:
int_stop = int_stop.sort_values(by = 'dist').drop_duplicates(subset = 'stop_id', keep = 'first')

# correcting in case mistakes were made in matching process

In [29]:
int_stop[['INT_ID', 'int_lon', 'int_lat', 'stop_id', 'stop_code' ,'stop_lon', 'stop_lat', 'stop_name']].drop_duplicates().to_csv(simplify_dir + 'int_stop.csv', index = False)

In [30]:
stop_times_simple.head()

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min
0,32830004,14420,1,6,3
1,32830004,14421,2,6,4
2,32830004,14422,3,6,6
3,32830004,14423,4,6,7
4,32830004,14424,5,6,8


In [31]:
stop_times_reduced = stop_times_simple.merge(int_stop[['INT_ID', 'dist', 'stop_id']], how = 'left')
stop_times_reduced

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min,INT_ID,dist
0,32830004,14420,1,6,3,100.0,0.188435
1,32830004,14421,2,6,4,151.0,0.000000
2,32830004,14422,3,6,6,152.0,0.000000
3,32830004,14423,4,6,7,153.0,0.000000
4,32830004,14424,5,6,8,154.0,0.000000
...,...,...,...,...,...,...,...
4917775,32966941,742,12,25,7,13457248.0,0.019761
4917776,32966941,4769,13,25,8,13457254.0,0.066596
4917777,32966941,5702,14,25,8,13457239.0,0.025071
4917778,32966941,9875,15,25,9,13457403.0,0.035364


In [32]:
stop_times_reduced = stop_times_reduced.dropna(subset =['INT_ID'
                                  ]).sort_values(by = 'dist', ascending = True).drop_duplicates(subset = [
    'trip_id', 'INT_ID'], keep = 'first').sort_values(by = ['trip_id', 'stop_sequence'])

In [33]:
stop_times_simple[stop_times_simple['hr'].isin([7,8,9])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min
207,32830010,14435,32,7,1
237,32830011,14433,30,7,0
238,32830011,14434,31,7,3
239,32830011,14435,32,7,6
266,32830012,14430,27,7,0
...,...,...,...,...,...
4917119,32966900,742,12,9,55
4917120,32966900,4769,13,9,56
4917121,32966900,5702,14,9,56
4917122,32966900,9875,15,9,57


In [34]:
stop_times_reduced[stop_times_reduced['hr'].isin([7,8,9])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min,INT_ID,dist
962091,32828691,14260,48,7,8,225.0,0.024418
964283,32828692,8673,2,9,9,13467129.0,0.043711
964284,32828692,2958,3,9,10,13466778.0,0.008143
964285,32828692,8763,4,9,11,13466640.0,0.011871
964286,32828692,8999,5,9,12,13466531.0,0.025368
...,...,...,...,...,...,...,...
659396,32970403,1021,49,9,0,13448840.0,0.022608
659398,32970403,6801,51,9,1,13448292.0,0.009699
659399,32970403,9993,52,9,2,13447884.0,0.023180
659401,32970403,664,54,9,4,13447218.0,0.033030


In [35]:
g = stop_times_reduced.groupby('trip_id', as_index=False)
stop_times_reduced['stop_sequence_new'] = g.cumcount() + 1
stop_times_reduced

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min,INT_ID,dist,stop_sequence_new
961948,32828689,8763,1,5,15,13466640.0,0.011871,1
961949,32828689,8999,2,5,16,13466531.0,0.025368,2
961950,32828689,2243,3,5,16,13466677.0,0.048383,3
961951,32828689,7506,4,5,17,13466754.0,0.014747,4
961952,32828689,8412,5,5,18,13466699.0,0.022632,5
...,...,...,...,...,...,...,...,...
4202033,32970576,1268,8,25,56,13461602.0,0.001252,7
4202034,32970576,6656,9,25,57,13461912.0,0.007984,8
4202035,32970576,4443,10,25,57,13462284.0,0.009921,9
4202036,32970576,4650,11,25,58,13462427.0,0.031216,10


In [36]:
import numpy as np

In [37]:
stop_times_reduced['hr'] = np.where(stop_times_reduced['min'] == 60, 
                                    stop_times_reduced['hr'] + 1, stop_times_reduced['hr'])
stop_times_reduced['min'] = np.where(stop_times_reduced['min'] == 60, 
                                    0, stop_times_reduced['min'])

In [38]:
stop_times_reduced[['trip_id', 'stop_sequence_new', 'stop_sequence','hr', 'min', 'INT_ID']].to_csv(gtfs_dir + 'stop_times_reduced.csv', index = False)

# Evaluation of Size

In [39]:
stop_times_simple[stop_times_simple['hr'].isin([7,8,9])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min
207,32830010,14435,32,7,1
237,32830011,14433,30,7,0
238,32830011,14434,31,7,3
239,32830011,14435,32,7,6
266,32830012,14430,27,7,0
...,...,...,...,...,...
4917119,32966900,742,12,9,55
4917120,32966900,4769,13,9,56
4917121,32966900,5702,14,9,56
4917122,32966900,9875,15,9,57


In [40]:
stop_times_reduced[stop_times_reduced['hr'].isin([7,8,9])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min,INT_ID,dist,stop_sequence_new
962091,32828691,14260,48,7,8,225.0,0.024418,45
964283,32828692,8673,2,9,9,13467129.0,0.043711,1
964284,32828692,2958,3,9,10,13466778.0,0.008143,2
964285,32828692,8763,4,9,11,13466640.0,0.011871,3
964286,32828692,8999,5,9,12,13466531.0,0.025368,4
...,...,...,...,...,...,...,...,...
659396,32970403,1021,49,9,0,13448840.0,0.022608,46
659398,32970403,6801,51,9,1,13448292.0,0.009699,47
659399,32970403,9993,52,9,2,13447884.0,0.023180,48
659401,32970403,664,54,9,4,13447218.0,0.033030,49


In [41]:
stop_times_simple[stop_times_simple['hr'].isin([27,28,29])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min
79720,32836186,10262,21,27,0
79721,32836186,1579,22,27,1
79722,32836186,8478,23,27,2
79723,32836186,3566,24,27,3
79724,32836186,4516,25,27,4
...,...,...,...,...,...
4843338,32966103,1181,45,27,9
4843339,32966103,5847,46,27,10
4843340,32966103,9450,47,27,10
4843341,32966103,1277,48,27,11


In [42]:
stop_times_reduced[stop_times_reduced['hr'].isin([27,28,29])]

Unnamed: 0,trip_id,stop_id,stop_sequence,hr,min,INT_ID,dist,stop_sequence_new
79720,32836186,10262,21,27,0,13448931.0,0.027636,17
79721,32836186,1579,22,27,1,13448751.0,0.008672,18
79722,32836186,8478,23,27,2,13448413.0,0.030170,19
79723,32836186,3566,24,27,3,20015401.0,0.005012,20
79724,32836186,4516,25,27,4,13447680.0,0.057398,21
...,...,...,...,...,...,...,...,...
4201384,32970562,4902,55,27,56,13466754.0,0.043676,51
4201385,32970562,1195,56,27,57,13466677.0,0.034584,52
4201386,32970562,5094,57,27,57,13466531.0,0.064618,53
4201387,32970562,5074,58,27,58,13466258.0,0.054643,54


In [43]:
int_stop

Unnamed: 0,INT_ID,stop_id,int_lon,int_lat,stop_code,stop_lon,stop_lat,stop_name,dist
9519,110,14410.0,-79.396989,43.697748,13798,-79.396989,43.697748,DAVISVILLE STATION - SOUTHBOUND PLATFORM,0.0
9502,205,14473.0,-79.475991,43.652048,13776,-79.475991,43.652048,RUNNYMEDE STATION - EASTBOUND PLATFORM,0.0
9503,205,14524.0,-79.475991,43.652048,13775,-79.475991,43.652048,RUNNYMEDE STATION - WESTBOUND PLATFORM,0.0
9731,214,14515.0,-79.403889,43.667249,13851,-79.403889,43.667249,SPADINA STATION - WESTBOUND PLATFORM,0.0
9129,208,14521.0,-79.452723,43.657023,13770,-79.452723,43.657023,DUNDAS WEST STATION - WESTBOUND PLATFORM,0.0
...,...,...,...,...,...,...,...,...,...
7122,13466991,4029.0,-79.404650,43.648831,,,,,
8321,13455144,8296.0,-79.326552,43.717740,,,,,
8792,13456011,9226.0,-79.324497,43.712363,,,,,
8793,13456011,14652.0,-79.324497,43.712363,,,,,


In [45]:
int_stop[int_stop['stop_name'].isna()]

Unnamed: 0,INT_ID,stop_id,int_lon,int_lat,stop_code,stop_lon,stop_lat,stop_name,dist
4647,13468727,463.0,-79.418657,43.635338,,,,,
4652,13468727,15016.0,-79.418657,43.635338,,,,,
4857,13455363,801.0,-79.325139,43.716334,,,,,
4880,14063912,7734.0,-79.409826,43.636237,,,,,
4881,14063912,10399.0,-79.409826,43.636237,,,,,
5669,14255084,1910.0,-79.406841,43.636393,,,,,
5670,14255084,12913.0,-79.406841,43.636393,,,,,
5671,14255084,12919.0,-79.406841,43.636393,,,,,
5814,13467444,3684.0,-79.352482,43.645312,,,,,
5816,13467444,8237.0,-79.352482,43.645312,,,,,
