In [1]:
import pandas as pd
import datetime as dt

## GET STOPS SEQUENCE ALONG EACH LINE

In [2]:
# Load the GTFS stop_times table; every later cell reads from this frame.
stop_times = pd.read_csv('../GTFS_nyc_Subway/stop_times.txt')

In [3]:
# trip_id looks like "<service_id>_<sub_trip_id>_<route id>", e.g.
# "A20170625SUN_001150_7..S97R"; split it once and reuse the parts.
trip_id_parts = stop_times['trip_id'].str.split('_')
stop_times['service_id'] = trip_id_parts.str[0]
stop_times['sub_trip_id'] = trip_id_parts.str[1]
stop_times['train+direction'] = trip_id_parts.str[2]

# Drop the GTFS columns this notebook never uses. errors='ignore' keeps the
# cell re-runnable: the previous `del` statements raised KeyError when the
# cell was executed a second time.
stop_times = stop_times.drop(['stop_headsign', 'pickup_type',
                              'drop_off_type', 'shape_dist_traveled'],
                             axis=1, errors='ignore')

# Line name = text before the first '.' of the route id ("7..S97R" -> "7").
stop_times['train'] = stop_times['train+direction'].str.split('.').str[0]
# Day type = last three characters of the service id (e.g. SUN, WKD).
stop_times['day'] = stop_times['service_id'].str[-3:]

In [4]:
# Preview the first five rows after the column clean-up.
stop_times.head(n=5)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,service_id,sub_trip_id,train+direction,train,day
0,A20170625SUN_001150_7..S97R,00:11:30,00:11:30,701S,1,A20170625SUN,1150,7..S97R,7,SUN
1,A20170625SUN_001150_7..S97R,00:14:00,00:14:00,702S,2,A20170625SUN,1150,7..S97R,7,SUN
2,A20170625SUN_001150_7..S97R,00:15:30,00:15:30,705S,3,A20170625SUN,1150,7..S97R,7,SUN
3,A20170625SUN_001150_7..S97R,00:16:30,00:16:30,706S,4,A20170625SUN,1150,7..S97R,7,SUN
4,A20170625SUN_001150_7..S97R,00:17:30,00:17:30,707S,5,A20170625SUN,1150,7..S97R,7,SUN


In [5]:
def find_the_longest(line, data=None):
    """
    Find the longest weekday route of a line, separately for each direction.

    Some lines have different longest routes per direction, e.g. the A train:
    north bound max 59 stops, south bound max 58 stops.

    Parameters
    ----------
    line : str
        Line name as stored in the 'train' column (e.g. 'A', '7', 'GS').
    data : pandas.DataFrame, optional
        Frame with the columns 'train', 'day', 'train+direction',
        'stop_sequence' and 'service_id'. Defaults to the notebook-level
        `stop_times` frame.

    Returns
    -------
    tuple
        (N_length, N_train_direction, S_length, S_train_direction, service_id)
        where the lengths are the maximum stop_sequence per direction, the
        directions are the route ids reaching that length, and service_id is
        a weekday service id under which the north-bound route runs.

    Raises
    ------
    IndexError
        If the line has no weekday trips in one of the two directions.
    """
    frame = stop_times if data is None else data
    weekday_line = frame[(frame['train'] == line) & (frame['day'] == 'WKD')]
    route_ids = weekday_line['train+direction']

    def longest(bound):
        # The 4th character of a route id is the direction letter (N / S).
        # Filtering rows directly avoids building a filter() object, which on
        # Python 3 is an iterator that is exhausted after its first use.
        trips = weekday_line[route_ids.str[3] == bound]
        length = trips['stop_sequence'].max()
        candidates = trips.loc[trips['stop_sequence'] == length,
                               'train+direction']
        # sorted() makes the pick deterministic; list(set(...))[0] was not.
        return length, sorted(set(candidates))[0]

    N_length, N_train_direction = longest('N')
    S_length, S_train_direction = longest('S')

    # Take the service id from the chosen north-bound route itself, so that
    # filtering stop_times on (route id, service id) is guaranteed to match.
    on_longest_north = ((route_ids == N_train_direction) &
                        (weekday_line['stop_sequence'] == N_length))
    service_id = sorted(set(weekday_line.loc[on_longest_north, 'service_id']))[0]
    return N_length, N_train_direction, S_length, S_train_direction, service_id

In [6]:
# Longest weekday route (both directions) for every line, keyed by line name.
train = {line: find_the_longest(line) for line in set(stop_times['train'])}
train

### H: the A / S (shuttle) train serving the JFK area
### GS: the shuttle between Times Sq-42 St and Grand Central
### FS: the shuttle near Prospect Park in Brooklyn

{'1': (37, '1..N03R', 37, '1..S03R', 'A20170625WKD'),
 '2': (61, '2..N08R', 61, '2..S08R', 'A20170625WKD'),
 '3': (34, '3..N01R', 34, '3..S01R', 'A20170625WKD'),
 '4': (54, '4..N13R', 54, '4..S13R', 'A20170625WKD'),
 '5': (39, '5..N60R', 36, '5..S03R', 'A20170625WKD'),
 '6': (38, '6..N01R', 38, '6..S01R', 'A20170625WKD'),
 '7': (22, '7..N97R', 22, '7..S97R', 'A20170625WKD'),
 'A': (59, 'A..N09R', 58, 'A..S74R', 'B20170625WKD'),
 'B': (37, 'B..N45R', 37, 'B..S45R', 'B20170625WKD'),
 'C': (40, 'C..N04R', 40, 'C..S04R', 'B20170625WKD'),
 'D': (41, 'D..N05R', 41, 'D..S05R', 'B20170625WKD'),
 'E': (32, 'E..N05R', 32, 'E..S04R', 'B20170625WKD'),
 'F': (45, 'F..N69R', 45, 'F..S69R', 'B20170625WKD'),
 'FS': (4, 'FS.N01R', 4, 'FS.S01R', 'B20170625WKD'),
 'G': (21, 'G..N14R', 21, 'G..S14R', 'B20170625WKD'),
 'GS': (2, 'GS.N01R', 2, 'GS.S03R', 'A20170625WKD'),
 'H': (5, 'H..N21R', 5, 'H..S21R', 'B20170625WKD'),
 'J': (30, 'J..N12R', 30, 'J..S12R', 'B20170625WKD'),
 'L': (24, 'L..N01R', 24, 'L..S0

In [7]:
# Stop sequence of the longest weekday route of every line, both directions.
route_columns = ['train', 'bound', 'stop_sequence', 'stop_id',
                 'service_id', 'route_id']

route_frames = []
for line, (n_stops, n_route, s_stops, s_route, service_id) in train.items():
    for route_id, n_route_stops in ((n_route, n_stops), (s_route, s_stops)):
        on_route = ((stop_times['train+direction'] == route_id) &
                    (stop_times['train'] == line) &
                    (stop_times['service_id'] == service_id))
        # Assumes the rows of one trip are contiguous in stop_times and that
        # the first matching trip is a full-length one -- TODO confirm.
        trip_stops = stop_times.loc[on_route,
                                    ['stop_id', 'stop_sequence']].iloc[:n_route_stops]
        # assign() returns a new frame, so nothing is written into a slice of
        # stop_times (the old column assignments raised SettingWithCopyWarning).
        route_frames.append(trip_stops.assign(train=line,
                                              bound=route_id[3],  # 'N' or 'S'
                                              service_id=service_id,
                                              route_id=route_id))

# Concatenate once instead of growing df_1 inside the loop.
df_1 = (pd.concat(route_frames)[route_columns] if route_frames
        else pd.DataFrame(columns=route_columns))
    

In [8]:
# Order the stops per line and direction, renumber the rows, then attach the
# stop name and coordinates from stops.txt.
stops = pd.read_csv('../GTFS_nyc_Subway/stops.txt')
df_1 = (
    df_1
    .sort_values(['train','bound','stop_sequence'])
    .reset_index(drop=True)
    .merge(stops[['stop_id','stop_name','stop_lat','stop_lon']],
           on=['stop_id'], how='left')
)

In [9]:
# Persist the longest-route stop sequences (the row index is written too).
df_1.to_csv('../cleaned_data/subway_stops_sequence_weekday(longest_route)')

In [10]:
def find_the_common(line, data=None):
    """
    Find the most common weekday route of a line, separately per direction.

    "Most common" means the route id with the largest number of weekday rows
    having stop_sequence == 1, i.e. the route that starts most often.

    Parameters
    ----------
    line : str
        Line name as stored in the 'train' column (e.g. 'A', '7', 'GS').
    data : pandas.DataFrame, optional
        Frame with the columns 'train', 'day', 'train+direction',
        'stop_sequence' and 'service_id'. Defaults to the notebook-level
        `stop_times` frame.

    Returns
    -------
    tuple
        (N_length, North, S_length, South, service_id) where North / South
        are the most common route ids per direction, the lengths are their
        maximum stop_sequence, and service_id is a weekday service id under
        which the north-bound route runs.

    Raises
    ------
    IndexError
        If the line has no weekday trips in one of the two directions.
    """
    frame = stop_times if data is None else data
    weekday_line = frame[(frame['train'] == line) & (frame['day'] == 'WKD')]
    route_ids = weekday_line['train+direction']

    # value_counts() orders route ids from most to least frequent.
    routes_by_frequency = list(
        route_ids[weekday_line['stop_sequence'] == 1].value_counts().index)

    # List comprehensions instead of filter(...)[0]: on Python 3 filter()
    # returns an iterator that cannot be subscripted.
    North = [route for route in routes_by_frequency if route[3] == 'N'][0]
    South = [route for route in routes_by_frequency if route[3] == 'S'][0]

    # The two directions of a line can differ in length, so measure each.
    N_length = weekday_line.loc[route_ids == North, 'stop_sequence'].max()
    S_length = weekday_line.loc[route_ids == South, 'stop_sequence'].max()

    # Take the service id from the chosen north-bound route itself, and pick
    # deterministically (list(set(...))[0] returned an arbitrary element).
    on_north_end = (route_ids == North) & (weekday_line['stop_sequence'] == N_length)
    service_id = sorted(set(weekday_line.loc[on_north_end, 'service_id']))[0]
    return N_length, North, S_length, South, service_id

In [11]:
# Most common weekday route (both directions) for every line, keyed by line name.
train2 = {line: find_the_common(line) for line in set(stop_times['train'])}
train2

### H: the A / S (shuttle) train serving the JFK area
### GS: the shuttle between Times Sq-42 St and Grand Central
### FS: the shuttle near Prospect Park in Brooklyn

{'1': (37, '1..N03R', 37, '1..S03R', 'A20170625WKD'),
 '2': (49, '2..N01R', 49, '2..S01R', 'A20170625WKD'),
 '3': (34, '3..N01R', 34, '3..S01R', 'A20170625WKD'),
 '4': (28, '4..N06R', 28, '4..S06R', 'A20170625WKD'),
 '5': (36, '5..N66R', 36, '5..S03R', 'A20170625WKD'),
 '6': (38, '6..N01R', 38, '6..S01R', 'A20170625WKD'),
 '7': (22, '7..N97R', 22, '7..S97R', 'A20170625WKD'),
 'A': (30, 'A..N54R', 30, 'A..S54X010', 'B20170625WKD'),
 'B': (27, 'B..N46R', 27, 'B..S46R', 'B20170625WKD'),
 'C': (40, 'C..N04R', 40, 'C..S04R', 'B20170625WKD'),
 'D': (36, 'D..N07R', 36, 'D..S07R', 'B20170625WKD'),
 'E': (20, 'E..N66R', 20, 'E..S71R', 'B20170625WKD'),
 'F': (45, 'F..N69R', 45, 'F..S69R', 'B20170625WKD'),
 'FS': (4, 'FS.N01R', 4, 'FS.S01R', 'B20170625WKD'),
 'G': (21, 'G..N14R', 21, 'G..S14R', 'B20170625WKD'),
 'GS': (2, 'GS.N03R', 2, 'GS.S01R', 'A20170625WKD'),
 'H': (5, 'H..N21R', 5, 'H..S21R', 'B20170625WKD'),
 'J': (30, 'J..N12R', 30, 'J..S12R', 'B20170625WKD'),
 'L': (24, 'L..N01R', 24, 'L.

In [41]:
# Stop sequence of the most common weekday route of every line, both directions.
route_columns = ['train', 'bound', 'stop_sequence', 'stop_id',
                 'service_id', 'route_id']

route_frames = []
for line, (n_stops, n_route, s_stops, s_route, service_id) in train2.items():
    for route_id, n_route_stops in ((n_route, n_stops), (s_route, s_stops)):
        on_route = ((stop_times['train+direction'] == route_id) &
                    (stop_times['train'] == line) &
                    (stop_times['service_id'] == service_id))
        # Assumes the rows of one trip are contiguous in stop_times and that
        # the first matching trip is a full-length one -- TODO confirm.
        trip_stops = stop_times.loc[on_route,
                                    ['stop_id', 'stop_sequence']].iloc[:n_route_stops]
        # assign() returns a new frame, so nothing is written into a slice of
        # stop_times (the old column assignments raised SettingWithCopyWarning).
        route_frames.append(trip_stops.assign(train=line,
                                              bound=route_id[3],  # 'N' or 'S'
                                              service_id=service_id,
                                              route_id=route_id))

# Concatenate once instead of growing df_1 inside the loop.
df_1 = (pd.concat(route_frames)[route_columns] if route_frames
        else pd.DataFrame(columns=route_columns))
    

In [42]:
# Rows around the hand-over from the north-bound to the south-bound A route.
df_1.loc[df_1['train'] == 'A'].iloc[29:33]

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train
313263,N,A..N54R,B20170625WKD,A02N,30,A
313299,S,A..S54X010,B20170625WKD,A02S,1,A
313300,S,A..S54X010,B20170625WKD,A03S,2,A
313301,S,A..S54X010,B20170625WKD,A05S,3,A


#### This only shows one route on one branch of the A line; it does not include the branch to JFK

### Add the JFK branch of the A line

In [14]:
# Which route ids call at stop H01N, and how often.
stop_times.loc[stop_times['stop_id'] == "H01N", 'train+direction'].value_counts()

A..N55R    164
A..N09R     66
A..N65R      5
A..N85R      2
A..N18R      2
A..N58R      1
Name: train+direction, dtype: int64

In [17]:
# First 11 stop_times rows of route A..N55R (stop id and sequence only).
stop_times.loc[stop_times['train+direction'] == 'A..N55R',
               ['stop_id', 'stop_sequence']].iloc[:11]

Unnamed: 0,stop_id,stop_sequence
313677,H11N,1
313678,H10N,2
313679,H09N,3
313680,H08N,4
313681,H07N,5
313682,H06N,6
313683,H04N,7
313684,H03N,8
313685,H02N,9
313686,H01N,10


In [43]:
# North-bound A branch: first 11 stop_times rows of route A..N55R, laid out
# with the same columns as df_1 and labelled as train 'AS'.
branch_columns = ['bound', "route_id", 'service_id',
                  'stop_id', 'stop_sequence', 'train']
north_branch_stops = stop_times.loc[stop_times['train+direction'] == 'A..N55R',
                                    ['stop_id', 'stop_sequence']].iloc[:11]
branch_N = pd.concat([pd.DataFrame(columns=branch_columns), north_branch_stops])
branch_N = branch_N.assign(bound='N',
                           route_id='A..N55R',
                           service_id='B20170625WKD',
                           train='AS')

In [44]:
# Display the north-bound branch frame built in the previous cell.
branch_N

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train
313677,N,A..N55R,B20170625WKD,H11N,1,AS
313678,N,A..N55R,B20170625WKD,H10N,2,AS
313679,N,A..N55R,B20170625WKD,H09N,3,AS
313680,N,A..N55R,B20170625WKD,H08N,4,AS
313681,N,A..N55R,B20170625WKD,H07N,5,AS
313682,N,A..N55R,B20170625WKD,H06N,6,AS
313683,N,A..N55R,B20170625WKD,H04N,7,AS
313684,N,A..N55R,B20170625WKD,H03N,8,AS
313685,N,A..N55R,B20170625WKD,H02N,9,AS
313686,N,A..N55R,B20170625WKD,H01N,10,AS


#### SOUTH BOUND

In [18]:
# Which route ids call at stop H02S, and how often.
stop_times.loc[stop_times['stop_id'] == "H02S", 'train+direction'].value_counts()

A..S55X009    169
A..S74R        68
A..S87R         4
A..S16R         3
A..S56R         2
A..S78R         1
Name: train+direction, dtype: int64

In [20]:
# Tail (from the 27th row on) of the south-bound A stops currently in df_1.
df_1.loc[(df_1['train'] == 'A') & (df_1['bound'] == 'S')].iloc[26:]

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train
313325,S,A..S54X010,B20170625WKD,A61S,27,A
313326,S,A..S54X010,B20170625WKD,A63S,28,A
313327,S,A..S54X010,B20170625WKD,A64S,29,A
313328,S,A..S54X010,B20170625WKD,A65S,30,A


In [21]:
# Rows 26-36 of route A..S55X009: where it leaves the stops already in df_1.
stop_times.loc[stop_times['train+direction'] == 'A..S55X009'].iloc[26:37]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,service_id,sub_trip_id,train+direction,train,day
313667,B20170625WKD_078600_A..S55X009,14:14:00,14:14:00,A61S,27,B20170625WKD,78600,A..S55X009,A,WKD
313668,B20170625WKD_078600_A..S55X009,14:18:00,14:18:00,H02S,28,B20170625WKD,78600,A..S55X009,A,WKD
313669,B20170625WKD_078600_A..S55X009,14:20:00,14:20:00,H03S,29,B20170625WKD,78600,A..S55X009,A,WKD
313670,B20170625WKD_078600_A..S55X009,14:26:30,14:26:30,H04S,30,B20170625WKD,78600,A..S55X009,A,WKD
313671,B20170625WKD_078600_A..S55X009,14:30:30,14:34:30,H06S,31,B20170625WKD,78600,A..S55X009,A,WKD
313672,B20170625WKD_078600_A..S55X009,14:36:00,14:40:00,H07S,32,B20170625WKD,78600,A..S55X009,A,WKD
313673,B20170625WKD_078600_A..S55X009,14:42:00,14:42:00,H08S,33,B20170625WKD,78600,A..S55X009,A,WKD
313674,B20170625WKD_078600_A..S55X009,14:43:30,14:43:30,H09S,34,B20170625WKD,78600,A..S55X009,A,WKD
313675,B20170625WKD_078600_A..S55X009,14:45:30,14:45:30,H10S,35,B20170625WKD,78600,A..S55X009,A,WKD
313676,B20170625WKD_078600_A..S55X009,14:47:00,14:47:00,H11S,36,B20170625WKD,78600,A..S55X009,A,WKD


In [45]:
# South-bound A branch: rows 27-35 of route A..S55X009 (the H-prefixed stops
# after A61S), laid out with the same columns as df_1 and labelled 'AS'.
branch_columns = ['bound', "route_id", 'service_id',
                  'stop_id', 'stop_sequence', 'train']
south_branch_stops = stop_times.loc[stop_times['train+direction'] == 'A..S55X009',
                                    ['stop_id', 'stop_sequence']].iloc[27:36]
branch_S = pd.concat([pd.DataFrame(columns=branch_columns), south_branch_stops])
branch_S = branch_S.assign(bound='S',
                           route_id='A..S55X009',
                           service_id='B20170625WKD',
                           train='AS')

In [46]:
# Display the south-bound branch frame built in the previous cell.
branch_S

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train
313668,S,A..S55X009,B20170625WKD,H02S,28,AS
313669,S,A..S55X009,B20170625WKD,H03S,29,AS
313670,S,A..S55X009,B20170625WKD,H04S,30,AS
313671,S,A..S55X009,B20170625WKD,H06S,31,AS
313672,S,A..S55X009,B20170625WKD,H07S,32,AS
313673,S,A..S55X009,B20170625WKD,H08S,33,AS
313674,S,A..S55X009,B20170625WKD,H09S,34,AS
313675,S,A..S55X009,B20170625WKD,H10S,35,AS
313676,S,A..S55X009,B20170625WKD,H11S,36,AS


In [47]:
# Append the two hand-built A-line branch frames to the per-line stop sequences.
# NOTE(review): re-running this cell appends the branches a second time.
df_1= pd.concat([df_1, branch_N,branch_S])

In [48]:
# Order the stops per line and direction, renumber the rows, then attach the
# stop name and coordinates from stops.txt.
stops = pd.read_csv('../GTFS_nyc_Subway/stops.txt')
df_1 = (
    df_1
    .sort_values(['train','bound','stop_sequence'])
    .reset_index(drop=True)
    .merge(stops[['stop_id','stop_name','stop_lat','stop_lon']],
           on=['stop_id'], how='left')
)

In [49]:
# Preview the first five rows of the merged stop sequences.
df_1.head(n=5)

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train,stop_name,stop_lat,stop_lon
0,N,1..N03R,A20170625WKD,139N,1,1,Rector St,40.707513,-74.013783
1,N,1..N03R,A20170625WKD,138N,2,1,Cortlandt St,40.711835,-74.012188
2,N,1..N03R,A20170625WKD,137N,3,1,Chambers St,40.715478,-74.009266
3,N,1..N03R,A20170625WKD,136N,4,1,Franklin St,40.719318,-74.006886
4,N,1..N03R,A20170625WKD,135N,5,1,Canal St,40.722854,-74.006277


In [50]:
### On its most common route the D line does not stop at DeKalb Av; on its longest route it does.
df_1.loc[df_1['train'] == 'D'].loc[716:718]

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train,stop_name,stop_lat,stop_lon
716,N,D..N07R,B20170625WKD,R31N,15,D,Atlantic Av - Barclays Ctr,40.683666,-73.97881
717,N,D..N07R,B20170625WKD,D22N,16,D,Grand St,40.718267,-73.993753
718,N,D..N07R,B20170625WKD,D21N,17,D,Broadway-Lafayette St,40.725297,-73.996204


In [51]:
# Persist the most-common-route stop sequences (the row index is written too).
df_1.to_csv('../cleaned_data/subway_stops_sequence_weekday2(most_common_route)')

## GET DURATION BETWEEN STOPS

In [33]:
# Show the most-common-route lookup again; the duration cells below read it.
train2

{'1': (37, '1..N03R', 37, '1..S03R', 'A20170625WKD'),
 '2': (49, '2..N01R', 49, '2..S01R', 'A20170625WKD'),
 '3': (34, '3..N01R', 34, '3..S01R', 'A20170625WKD'),
 '4': (28, '4..N06R', 28, '4..S06R', 'A20170625WKD'),
 '5': (36, '5..N66R', 36, '5..S03R', 'A20170625WKD'),
 '6': (38, '6..N01R', 38, '6..S01R', 'A20170625WKD'),
 '7': (22, '7..N97R', 22, '7..S97R', 'A20170625WKD'),
 'A': (30, 'A..N54R', 30, 'A..S54X010', 'B20170625WKD'),
 'B': (27, 'B..N46R', 27, 'B..S46R', 'B20170625WKD'),
 'C': (40, 'C..N04R', 40, 'C..S04R', 'B20170625WKD'),
 'D': (36, 'D..N07R', 36, 'D..S07R', 'B20170625WKD'),
 'E': (20, 'E..N66R', 20, 'E..S71R', 'B20170625WKD'),
 'F': (45, 'F..N69R', 45, 'F..S69R', 'B20170625WKD'),
 'FS': (4, 'FS.N01R', 4, 'FS.S01R', 'B20170625WKD'),
 'G': (21, 'G..N14R', 21, 'G..S14R', 'B20170625WKD'),
 'GS': (2, 'GS.N03R', 2, 'GS.S01R', 'A20170625WKD'),
 'H': (5, 'H..N21R', 5, 'H..S21R', 'B20170625WKD'),
 'J': (30, 'J..N12R', 30, 'J..S12R', 'B20170625WKD'),
 'L': (24, 'L..N01R', 24, 'L.

In [34]:
# Travel time in seconds between consecutive stops of the most common weekday
# route of every line, both directions. 'from_stop' / 'to_stop' (stop names)
# are left empty here and filled in by a later cell.
duration_columns = ['train', 'service_id', 'route_id', 'bound',
                    'from_stop_id', 'from_stop', 'from_stop_sequence',
                    'to_stop_id', 'to_stop', 'to_stop_sequence', 'duration']

segment_frames = []
for line, (n_stops, n_route, s_stops, s_route, service_id) in train2.items():
    for bound, route_id, n_route_stops in (('N', n_route, n_stops),
                                           ('S', s_route, s_stops)):
        # Filter on the weekday service id as well as the route id. Matching
        # on the route alone picked the first trip in the file, which can be
        # a weekend trip, while the rows were still labelled with the weekday
        # service id.
        trip = stop_times[(stop_times['train+direction'] == route_id) &
                          (stop_times['service_id'] == service_id)].iloc[:n_route_stops]

        # GTFS times may run past 24:00:00 for trips crossing midnight;
        # to_timedelta parses those, to_datetime does not.
        arrivals = pd.to_timedelta(trip['arrival_time'])
        # diff() is positional, so it does not depend on the index labels
        # being consecutive the way df.loc[j - 1] did.
        seconds = arrivals.diff().dt.total_seconds().iloc[1:].astype(int)

        stop_ids = trip['stop_id'].values
        stop_seqs = trip['stop_sequence'].values
        segment_frames.append(pd.DataFrame({
            'duration': seconds.values,
            'route_id': route_id,
            'service_id': service_id,
            'bound': bound,
            'train': line,
            'from_stop_sequence': stop_seqs[:-1],
            'to_stop_sequence': stop_seqs[1:],
            'from_stop_id': stop_ids[:-1],
            'to_stop_id': stop_ids[1:],
        }))

# Concatenate once instead of growing `duration` inside the loop.
duration = (pd.concat(segment_frames).reindex(columns=duration_columns)
            if segment_frames else pd.DataFrame(columns=duration_columns))

#### add north bound branch of A line

In [57]:
# Empty frame for the north-bound A branch, same columns as `duration`.
duration_N = pd.DataFrame(columns=[
    'train', 'service_id', 'route_id', 'bound',
    'from_stop_id', 'from_stop', 'from_stop_sequence',
    'to_stop_id', 'to_stop', 'to_stop_sequence',
    'duration',
])



In [75]:
# Pair every stop of the north-bound branch with the stop that follows it.
north_stop_ids = branch_N['stop_id']
north_stop_seqs = branch_N['stop_sequence']
duration_N['from_stop_id'] = north_stop_ids[:-1].values
duration_N['from_stop_sequence'] = north_stop_seqs[:-1].values
duration_N['to_stop_id'] = north_stop_ids[1:].values
duration_N['to_stop_sequence'] = north_stop_seqs[1:].values
# NOTE(review): branch_N labels these stops train 'AS', here it is 'A' -- confirm.
duration_N['train'] = 'A'
duration_N['service_id'] = 'B20170625WKD'
duration_N['route_id'] = 'A..N55R'
duration_N['bound'] = 'N'

In [80]:
stops_1 = 11                 # number of rows taken for the north-bound branch
route_id_1 = 'A..N55R'       # route id of the north-bound branch
service_id = 'B20170625WKD'

# Same slice that was used to build branch_N. copy() so the column below is
# written to an independent frame, not to a slice of stop_times.
df = stop_times[stop_times['train+direction'] == route_id_1].iloc[:stops_1].copy()
# GTFS times may run past 24:00:00; to_timedelta parses those, to_datetime
# does not.
df['arrival_time'] = pd.to_timedelta(df['arrival_time'])

# Seconds between consecutive arrivals. diff() is positional, so it does not
# rely on the index labels being consecutive integers.
duration_li = [int(gap) for gap in
               df['arrival_time'].diff().dt.total_seconds().iloc[1:]]

duration_N.duration = duration_li

#### add south bound branch of A line

In [86]:
# Empty frame for the south-bound A branch, same columns as `duration`.
duration_S = pd.DataFrame(columns=[
    'train', 'service_id', 'route_id', 'bound',
    'from_stop_id', 'from_stop', 'from_stop_sequence',
    'to_stop_id', 'to_stop', 'to_stop_sequence',
    'duration',
])



In [87]:
# Show the south-bound branch stops that the duration cells below pair up.
branch_S

Unnamed: 0,bound,route_id,service_id,stop_id,stop_sequence,train
313668,S,A..S55X009,B20170625WKD,H02S,28,AS
313669,S,A..S55X009,B20170625WKD,H03S,29,AS
313670,S,A..S55X009,B20170625WKD,H04S,30,AS
313671,S,A..S55X009,B20170625WKD,H06S,31,AS
313672,S,A..S55X009,B20170625WKD,H07S,32,AS
313673,S,A..S55X009,B20170625WKD,H08S,33,AS
313674,S,A..S55X009,B20170625WKD,H09S,34,AS
313675,S,A..S55X009,B20170625WKD,H10S,35,AS
313676,S,A..S55X009,B20170625WKD,H11S,36,AS


In [88]:
# Pair every stop of the south-bound branch with the stop that follows it.
south_stop_ids = branch_S['stop_id']
south_stop_seqs = branch_S['stop_sequence']
duration_S['from_stop_id'] = south_stop_ids[:-1].values
duration_S['from_stop_sequence'] = south_stop_seqs[:-1].values
duration_S['to_stop_id'] = south_stop_ids[1:].values
duration_S['to_stop_sequence'] = south_stop_seqs[1:].values
# NOTE(review): branch_S labels these stops train 'AS', here it is 'A' -- confirm.
duration_S['train'] = 'A'
duration_S['service_id'] = 'B20170625WKD'
duration_S['route_id'] = 'A..S55X009'
duration_S['bound'] = 'S'

In [97]:
route_id_1 = 'A..S55X009'    # route id of the south-bound branch
service_id = 'B20170625WKD'

# Same slice that was used to build branch_S. copy() so the column below is
# written to an independent frame, not to a slice of stop_times.
df = stop_times[stop_times['train+direction'] == route_id_1].iloc[27:36].copy()
# GTFS times may run past 24:00:00; to_timedelta parses those, to_datetime
# does not.
df['arrival_time'] = pd.to_timedelta(df['arrival_time'])

# Seconds between consecutive arrivals. diff() is positional, so it does not
# rely on the index labels being consecutive integers.
duration_li = [int(gap) for gap in
               df['arrival_time'].diff().dt.total_seconds().iloc[1:]]

duration_S.duration = duration_li

In [101]:
# Append the two A-line branch duration frames to the per-line durations.
# NOTE(review): re-running this cell appends the branches a second time.
duration = pd.concat([duration,duration_N,duration_S])

In [102]:
# Fill in the human-readable stop names for both ends of every segment.
stops = pd.read_csv('../GTFS_nyc_Subway/stops.txt')
duration = duration.reset_index(drop=True)

# One lookup table instead of scanning `stops` twice per row. Keeping the
# first row per stop_id mirrors the previous `.values[0]` behaviour.
stop_name_by_id = stops.drop_duplicates('stop_id').set_index('stop_id')['stop_name']
duration['from_stop'] = duration['from_stop_id'].map(stop_name_by_id)
duration['to_stop'] = duration['to_stop_id'].map(stop_name_by_id)

# The row-by-row lookup raised IndexError for a stop id missing from
# stops.txt; keep failing loudly instead of silently writing NaN names.
unmatched = duration[duration['from_stop'].isnull() | duration['to_stop'].isnull()]
if len(unmatched) > 0:
    raise IndexError('stop ids missing from stops.txt: %s' % sorted(
        set(unmatched['from_stop_id']) | set(unmatched['to_stop_id'])))

In [103]:
# Persist the between-stop durations (the row index is written too).
duration.to_csv('../cleaned_data/subway_duration_between_stops_weekday(common_route)')

## shapes.txt lists the ordered polyline points of each route; use it to draw the path of any train route on a map

In [104]:
# Load the GTFS shapes table and preview its first rows.
shapes = pd.read_csv('../GTFS_nyc_Subway/shapes.txt')
shapes.head()

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
0,1..N03R,40.702068,-74.013664,0,
1,1..N03R,40.703199,-74.014792,1,
2,1..N03R,40.703226,-74.01482,2,
3,1..N03R,40.703253,-74.014846,3,
4,1..N03R,40.70328,-74.01487,4,


In [105]:
### Polyline points of the S shuttle between Times Sq-42 St and Grand Central
shapes.loc[shapes['shape_id'] == 'GS.N01R']

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled
88597,GS.N01R,40.752769,-73.979189,0,
88598,GS.N01R,40.755522,-73.985728,1,
88599,GS.N01R,40.755545,-73.985777,2,
88600,GS.N01R,40.75557,-73.985823,3,
88601,GS.N01R,40.755595,-73.985867,4,
88602,GS.N01R,40.755622,-73.985909,5,
88603,GS.N01R,40.755649,-73.985949,6,
88604,GS.N01R,40.755678,-73.985987,7,
88605,GS.N01R,40.755708,-73.986022,8,
88606,GS.N01R,40.755738,-73.986055,9,
