In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
def process_trip(single_trip):
    
    single_trip = single_trip[single_trip['timint']<2]
    arrivals = single_trip[['station_char']].drop_duplicates()
    
    arrival_max = single_trip.sort_values(by = 'request_date', ascending = False)
    arrival_max = arrival_max.drop_duplicates(subset = ['station_char'])[['station_char', 'estimated_arrival']]


    arrivals_at_station = single_trip[single_trip['timint']==0]
    arrivals_at_station = arrivals_at_station.sort_values(by = 'estimated_arrival')
    arrivals_at_station  = arrivals_at_station.drop_duplicates(subset = ['station_char'])[['station_char', 'estimated_arrival']]

    arrivals = arrivals.merge(arrival_max, how = 'left')

    arrivals_at_station = arrivals_at_station.rename(columns = {'estimated_arrival':'at_station_time'})
    arrivals = arrivals.merge(arrivals_at_station, how = 'left')
    arrivals['arrival'] = True
    
    arrivals['arrival_time'] = np.where(arrivals['at_station_time'].isna(),
                                        arrivals['estimated_arrival'], arrivals['at_station_time'])



    arrivals = arrivals.sort_values(by = 'estimated_arrival').reset_index(drop = True)
    
    arrivals = arrivals[['station_char', 'arrival', 'arrival_time']].rename(columns =
                                                                        {'arrival_time':'estimated_arrival'})
    if len(arrivals) != 0:
        single_trip = single_trip.merge(arrivals)
    else:
        return -1 # no eligible arrivals, continue to next trip

    single_trip = single_trip.sort_values(by = 'estimated_arrival')
    
    single_trip = single_trip[['station_char', 'subwayline', 'timint','stationid', 'traindirection', 'trip','trainid',
                 'estimated_arrival', 'arrival']]
    
    single_trip = single_trip.reset_index().merge(single_trip.shift(-1).reset_index()[['station_char',
                                                                                       'estimated_arrival','stationid',
                                                                                       'index']], 
                                left_on = 'index', right_on = 'index', suffixes = ['_o', '_d'])
    single_trip = single_trip.dropna()
    return single_trip

In [11]:
df = pd.read_csv('cleaned_subway/raw_subway_2019-11-29_PM.csv')

In [12]:
df['request_date'] = pd.to_datetime(df['request_date'])

In [13]:
df = df.sort_values(by = 'timint')

In [14]:
stn_lookup = pd.read_csv('train_arrival_stations_lookup.csv')

In [15]:
df['platformid'] = df['station_char'].str[3]

In [16]:
df

Unnamed: 0,requestid,id,station_char,subwayline,timint,traindirection,trainid,train_message,stationid,lineid,create_date,pollid,request_date,date,hour,dow,period,platformid
115970,738032,34288841325,QUN2,YUS,0.000000,South,196,AtStation,18,1,2019-11-29 20:59:43,10417,2019-11-29 20:59:43.688017-05:00,2019-11-29,20,4,PM,2
42287,687227,34285606344,YIE2,SHEP,0.000000,West,814,Delayed,30,1,2019-11-29 18:28:44,9705,2019-11-29 18:28:43.612801-05:00,2019-11-29,18,4,PM,2
42284,687227,34285606050,SHP1,YUS,0.000000,North,115,AtStation,30,1,2019-11-29 18:28:44,9705,2019-11-29 18:28:43.612801-05:00,2019-11-29,18,4,PM,1
118399,740796,34288146754,OSS1,BD,0.000000,East,222,AtStation,44,2,2019-11-29 20:24:52,10454,2019-11-29 20:24:51.302882-05:00,2019-11-29,20,4,PM,1
131594,750754,34287205166,PAP1,BD,0.000000,East,241,AtStation,55,2,2019-11-29 19:41:49,10592,2019-11-29 19:41:51.291830-05:00,2019-11-29,19,4,PM,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36233,684203,34287938682,KEN1,BD,17.334796,East,206,Arriving,63,2,2019-11-29 20:14:49,9663,2019-11-29 20:14:51.294143-05:00,2019-11-29,20,4,PM,1
15455,668422,34282700263,WIL1,YUS,17.460351,North,158,Arriving,2,1,2019-11-29 16:15:43,9439,2019-11-29 16:15:43.598633-05:00,2019-11-29,16,4,PM,1
115411,737942,34282381405,WIL1,YUS,17.460351,North,152,Arriving,2,1,2019-11-29 16:01:42,10416,2019-11-29 16:01:43.904510-05:00,2019-11-29,16,4,PM,1
38955,685945,34282764416,WIL1,YUS,17.715351,North,159,Arriving,2,1,2019-11-29 16:18:42,9688,2019-11-29 16:18:43.638384-05:00,2019-11-29,16,4,PM,1


# Filtering for only the most recent data in a request

Since each request has multiple train arrival times.

We're filtering on `station_char` and `requestid`. This might cause an issue with terminus, but we'll deal with it later.

In [17]:
df_newest = df.drop_duplicates(subset = ['requestid', 'station_char'])

In [23]:
train_list = []
for line in list(df_newest['subwayline'].drop_duplicates()):
    df_line = df_newest[df_newest['subwayline'] == line]
    
    train_ids = df_line['trainid'].drop_duplicates()
    
    for train_id in train_ids:
        
        single_train = df_line[df_line['trainid'] == train_id]
        
        single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')

        single_train = single_train.sort_values(by = 'request_date')

        single_train['trip'] = ((single_train['traindirection'] != single_train['traindirection'].shift(1))
                               | ((single_train['request_date'] - single_train['request_date'].shift(1)).dt.seconds > 1800)
                               ).cumsum() - 1        
        trip_num = single_train['trip'].max()
        
        
        arrival_list = []
        arrival_limit = None
        for i in range(trip_num + 1):
            if i == 0:
                current_arrivals = process_trip(single_train[single_train['trip'] == i])
                if isinstance(current_arrivals, int):
                    print('cont')
                    continue

            elif isinstance(current_arrivals, int): # when the previous current arrivals does not return a result
                current_arrivals = process_trip(single_train[single_train['trip'] == i])
                if isinstance(current_arrivals, int): # if the successives trip still does not return a result
                    print('cont')
                    continue
            else:
                
                if i < trip_num:
                    next_arrivals = process_trip(single_train[single_train['trip'] == i + 1])
                try:
                    arrival_limit = next_arrivals.head(1)[['estimated_arrival_d']].iloc[0,0] - datetime.timedelta(seconds = 45)

                except: # if the next trip doesn't return a result, then the arrival limit id not used

                    if len(current_arrivals['station_char_d']) == 0:
                        pass
                    else:
                        arrival_limit = current_arrivals.tail(1)[['estimated_arrival_d']].iloc[0,0] + datetime.timedelta(seconds = 45)

                if len(current_arrivals['station_char_d']) == 1:
                    pass
                elif len(current_arrivals['station_char_d']) == 0:
                    pass

                elif i < trip_num:

                    current_arrivals = current_arrivals[current_arrivals['estimated_arrival_d'] < arrival_limit]
                else:
                    pass
                if i < trip_num:
                    past_arrivals = current_arrivals
                    current_arrivals = next_arrivals
                else:
                    pass
            try:
                arrival_list.append(past_arrivals.copy())
            except:
                arrival_list.append(current_arrivals.copy())





        if len(arrival_list) == 0:
            continue
        else:
            arrival_trip = pd.concat(arrival_list)
        
        if arrival_trip['estimated_arrival_d'].max().hour < 11:
            if arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'BD':
                cutoff = arrival_trip[arrival_trip['stationid_d'] == 55]['estimated_arrival_d'].max()

            elif (arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'YUS') & (arrival_trip.tail(1)['stationid_d'].iloc[0] == 80):
                cutoff = arrival_trip[arrival_trip['stationid_d'] == 80]['estimated_arrival_d'].max()

            elif arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'YUS':
                cutoff = arrival_trip[arrival_trip['stationid_d'] == 1]['estimated_arrival_d'].max()
            else:
                cutoff = arrival_trip['estimated_arrival_d'].max()
        else:
            cutoff = arrival_trip['estimated_arrival_d'].max()

        arrival_trip = arrival_trip[arrival_trip['estimated_arrival_d'] <= cutoff]


        arrival_trip = arrival_trip.reset_index(drop = True)
        
        arrival_trip = arrival_trip.merge(stn_lookup[['station_char', 'sequence']], left_on = ['station_char_o'], 
                   right_on = ['station_char']).drop(columns = ['station_char']).rename(
            columns = {'sequence':'sequence_o'})

        arrival_trip = arrival_trip.merge(stn_lookup[['station_char', 'sequence']], left_on = ['station_char_d'], 
                           right_on = ['station_char']).drop(columns = ['station_char']).rename(
            columns = {'sequence':'sequence_d'})
        
        train_list.append(arrival_trip)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

cont


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

cont


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [24]:
train_df = pd.concat(train_list)

In [25]:
len(train_df)

12380

In [26]:
train_df['cost'] = (train_df['estimated_arrival_d'] - train_df['estimated_arrival_o']).dt.seconds/60
train_df = train_df[train_df['cost']<60]
len(train_df)

12380

In [27]:
abs(train_df['sequence_o'] - train_df['sequence_d'])

0    1
1    1
2    1
3    1
4    1
    ..
3    2
4    1
5    1
6    2
7    1
Length: 12380, dtype: int64

In [30]:
train_df[train_df['trainid'] == 196]

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,sequence_o,sequence_d,cost
0,0,STC2,YUS,0.387822,25,South,0,196,2019-11-29 20:51:06.956848840-05:00,True,SUM2,2019-11-29 20:52:43.444845-05:00,24.0,31,30,1.6
1,1,SUM2,YUS,0.0,24,South,0,196,2019-11-29 20:52:43.444845-05:00,True,ROS2,2019-11-29 20:53:43.988218-05:00,23.0,30,29,1.0
2,2,ROS2,YUS,0.0,23,South,0,196,2019-11-29 20:53:43.988218-05:00,True,BLO2,2019-11-29 20:55:43.685313-05:00,22.0,29,28,1.983333
3,3,BLO2,YUS,0.0,22,South,0,196,2019-11-29 20:55:43.685313-05:00,True,WEL2,2019-11-29 20:56:43.574773-05:00,21.0,28,27,0.983333
4,4,WEL2,YUS,0.0,21,South,0,196,2019-11-29 20:56:43.574773-05:00,True,COL2,2019-11-29 20:57:43.346175-05:00,20.0,27,26,0.983333
5,5,COL2,YUS,0.0,20,South,0,196,2019-11-29 20:57:43.346175-05:00,True,DUN2,2019-11-29 20:58:43.276039-05:00,19.0,26,25,0.983333
6,6,DUN2,YUS,0.0,19,South,0,196,2019-11-29 20:58:43.276039-05:00,True,QUN2,2019-11-29 20:59:43.688017-05:00,18.0,25,24,1.0


In [28]:
train_df[abs(train_df['sequence_o'] - train_df['sequence_d']) == 0]

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,sequence_o,sequence_d,cost
96,32,VMC2,YUS,0.964922,80,South,0,115,2019-11-29 17:07:48.690675140-05:00,True,VMC1,2019-11-29 17:09:48.695254140-05:00,80.0,1,1,2.000000
97,32,VMC2,YUS,0.964922,80,South,0,115,2019-11-29 17:07:48.690675140-05:00,True,VMC1,2019-11-29 17:09:48.695254140-05:00,80.0,1,1,2.000000
99,37,VMC2,YUS,0.937352,80,South,3,123,2019-11-29 20:16:39.753829860-05:00,True,VMC1,2019-11-29 20:16:39.753829860-05:00,80.0,1,1,0.000000
92,36,VMC1,YUS,0.937352,80,South,2,171,2019-11-29 19:19:40.165872860-05:00,True,VMC2,2019-11-29 19:22:43.545539-05:00,80.0,1,1,3.050000
2,37,VMC1,YUS,0.937352,80,South,3,133,2019-11-29 20:40:39.843660860-05:00,True,VMC2,2019-11-29 20:40:50.715629-05:00,80.0,1,1,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0,KIP2,BD,0.254518,33,East,3,29,2019-11-29 18:12:58.729533419-05:00,True,KIP1,2019-11-29 18:13:43.597100-05:00,33.0,39,39,0.733333
30,0,KIP1,BD,1.265017,33,West,2,231,2019-11-29 18:18:06.673903200-05:00,True,KIP2,2019-11-29 18:18:06.673903200-05:00,33.0,39,39,0.000000
32,0,KIP2,BD,0.254518,33,East,3,231,2019-11-29 18:17:59.154223419-05:00,True,KIP1,2019-11-29 18:18:43.773433-05:00,33.0,39,39,0.733333
0,0,KEN2,BD,0.566031,63,East,0,34,2019-11-29 19:09:25.243803979-05:00,True,KEN1,2019-11-29 19:09:51.283696-05:00,63.0,69,69,0.433333


In [36]:
train_df[abs(train_df['sequence_o'] - train_df['sequence_d']) > 1].reset_index(drop = True)

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,sequence_o,sequence_d,cost
0,0,KEN2,BD,0.0,63,West,4,225,2019-12-02 11:08:44.520436-05:00,True,VPK2,2019-12-02 11:16:26.695520500-05:00,61.0,69,67,7.7
1,1,KIP1,BD,0.0,33,East,2,336,2019-12-02 10:13:37.619817-05:00,True,SHE1,2019-12-02 10:41:37.347513-05:00,51.0,39,57,27.983333
2,0,KIP1,BD,0.0,33,East,4,202,2019-12-02 11:14:44.532431-05:00,True,OML1,2019-12-02 11:16:53.980977-05:00,36.0,39,42,2.15
3,1,RYK1,BD,0.0,35,East,6,241,2019-12-02 11:14:44.531661-05:00,True,KIP1,2019-12-02 11:16:37.502858-05:00,33.0,41,39,1.866667
4,2,VPK2,BD,0.686247,61,West,3,16,2019-12-02 11:11:26.696050500-05:00,True,WDB2,2019-12-02 11:14:44.522808-05:00,59.0,67,65,3.283333
5,4,COX2,BD,0.0,58,West,3,16,2019-12-02 11:15:45.521792-05:00,True,MST2,2019-12-02 11:18:02.999888540-05:00,60.0,64,66,2.283333
6,5,MST2,BD,0.291297,60,West,3,16,2019-12-02 11:18:02.999888540-05:00,True,GWD2,2019-12-02 11:23:28.944792760-05:00,57.0,66,63,5.416667
7,12,BAT1,BD,1.021368,46,East,4,19,2019-12-02 09:50:46.634486140-05:00,True,SGL1,2019-12-02 09:53:44.633731840-05:00,48.0,52,54,2.95
8,17,SHE1,BD,1.119917,51,East,4,19,2019-12-02 09:58:52.548972720-05:00,True,BRD1,2019-12-02 10:02:00.758450720-05:00,53.0,57,59,3.133333
9,4,WDB2,BD,1.555969,59,West,1,32,2019-12-02 11:41:18.884871060-05:00,True,PAP2,2019-12-02 11:47:45.529033-05:00,55.0,65,61,6.433333


In [35]:
train_df.sort_values(by = 'cost', ascending = False).head(50)

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,sequence_o,sequence_d,cost
11,11,LAW1,YUS,0.0,28,North,0,198,2019-12-02 07:21:52.730326-05:00,True,YKM1,2019-12-02 08:02:22.190776460-05:00,29.0,34,35,40.483333
60,37,VMC2,YUS,1.565961,80,South,1,112,2019-12-02 09:35:11.776734720-05:00,True,VMC1,2019-12-02 10:15:35.239404120-05:00,80.0,1,1,40.383333
3,30,STC1,YUS,0.0,25,North,2,101,2019-12-02 09:52:37.401913-05:00,True,NYC1,2019-12-02 10:26:37.731992-05:00,31.0,31,37,34.0
30,1,KIP1,BD,0.0,33,East,2,336,2019-12-02 10:13:37.619817-05:00,True,SHE1,2019-12-02 10:41:37.347513-05:00,51.0,39,57,27.983333
6,6,DON2,BD,0.0,56,West,0,337,2019-12-02 09:26:45.332796-05:00,True,PAP2,2019-12-02 09:53:45.350688-05:00,55.0,62,61,27.0
5,5,GWD2,BD,0.0,57,West,1,346,2019-12-02 09:29:45.334005-05:00,True,DON2,2019-12-02 09:54:45.345193-05:00,56.0,63,62,25.0
5,5,COX2,BD,0.0,58,West,0,432,2019-12-02 09:32:45.341695-05:00,True,GWD2,2019-12-02 09:56:45.346491-05:00,57.0,64,63,24.0
11,5,GWD2,BD,0.0,57,West,3,332,2019-12-02 09:32:45.335164-05:00,True,DON2,2019-12-02 09:55:50.099046580-05:00,56.0,63,62,23.066667
109,34,YKM1,YUS,0.0,29,North,2,167,2019-12-02 10:00:37.349153-05:00,True,NYC1,2019-12-02 10:23:37.433371-05:00,31.0,35,37,23.0
15,30,STC1,YUS,0.793473,25,North,2,109,2019-12-02 10:09:25.259083040-05:00,True,YKM1,2019-12-02 10:31:37.434956-05:00,29.0,31,35,22.2


In [78]:
train_df[(train_df['cost'] > 0.4) & (train_df['cost'] < 15)]

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,cost
0,0,BYV1,SHEP,0.000000,65,East,0,463,2019-11-28 07:01:51.417028-05:00,True,BSS1,2019-11-28 07:03:45.173927240-05:00,66.0,1.883333
1,1,BSS1,SHEP,0.895973,66,East,0,463,2019-11-28 07:03:45.173927240-05:00,True,LES1,2019-11-28 07:04:51.416831-05:00,67.0,1.100000
2,2,LES1,SHEP,0.000000,67,East,0,463,2019-11-28 07:04:51.416831-05:00,True,DML1,2019-11-28 07:07:31.888566640-05:00,68.0,2.666667
4,0,DML2,SHEP,0.000000,68,West,1,463,2019-11-28 07:07:51.419204-05:00,True,LES2,2019-11-28 07:12:51.427952-05:00,67.0,5.000000
5,1,LES2,SHEP,0.000000,67,West,1,463,2019-11-28 07:12:51.427952-05:00,True,BSS2,2019-11-28 07:14:46.391402980-05:00,66.0,1.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,1,PVL1,YUS,0.227492,78,North,0,52,2019-11-28 10:54:57.147737420-05:00,True,YUN1,2019-11-28 10:56:43.655887-05:00,77.0,1.766667
2,2,YUN1,YUS,0.000000,77,North,0,52,2019-11-28 10:56:43.655887-05:00,True,FIW1,2019-11-28 10:58:43.494615-05:00,76.0,1.983333
3,3,FIW1,YUS,0.000000,76,North,0,52,2019-11-28 10:58:43.494615-05:00,True,DNP1,2019-11-28 11:01:43.995897-05:00,75.0,3.000000
4,4,DNP1,YUS,0.000000,75,North,0,52,2019-11-28 11:01:43.995897-05:00,True,SHW1,2019-11-28 11:04:35.538133500-05:00,1.0,2.850000


In [79]:
train_df[(train_df['cost'] < 0.4)].sort_values(by = 'cost')

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,cost
3,3,DML1,SHEP,0.674497,68,East,0,463,2019-11-28 07:07:31.888566640-05:00,True,DML2,2019-11-28 07:07:31.888566640-05:00,68.0,0.000000
46,29,KIP2,BD,0.862818,33,West,3,203,2019-11-28 08:29:43.113328720-05:00,True,KIP1,2019-11-28 08:29:43.113328720-05:00,33.0,0.000000
31,30,KIP1,BD,0.862818,33,West,1,236,2019-11-28 07:54:43.099116720-05:00,True,KIP2,2019-11-28 07:54:43.099116720-05:00,33.0,0.000000
0,0,KIP1,BD,0.604911,33,West,0,213,2019-11-28 07:01:20.270808140-05:00,True,KIP2,2019-11-28 07:01:20.270808140-05:00,33.0,0.000000
75,31,KEN1,BD,0.820667,63,East,2,201,2019-11-28 09:17:40.594182240-05:00,True,KEN2,2019-11-28 09:17:40.594182240-05:00,63.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1,YIE1,SHEP,0.000000,64,East,16,462,2019-11-28 09:46:51.367945-05:00,True,YIE2,2019-11-28 09:47:13.844385300-05:00,30.0,0.366667
91,0,YIE2,SHEP,0.503356,64,East,23,461,2019-11-28 11:09:20.804812300-05:00,True,YIE1,2019-11-28 11:09:43.666333-05:00,30.0,0.366667
115,0,YIE2,SHEP,0.503356,64,East,26,462,2019-11-28 11:37:20.823004300-05:00,True,YIE1,2019-11-28 11:37:43.725591-05:00,30.0,0.366667
52,0,YIE2,SHEP,0.503356,64,East,12,462,2019-11-28 09:03:21.549763300-05:00,True,YIE1,2019-11-28 09:03:43.842750-05:00,30.0,0.366667


In [80]:
train_df[(train_df['station_char_o'].str[0:3] == train_df['station_char_d'].str[0:3])]

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,cost
3,3,DML1,SHEP,0.674497,68,East,0,463,2019-11-28 07:07:31.888566640-05:00,True,DML2,2019-11-28 07:07:31.888566640-05:00,68.0,0.000000
7,0,YIE2,SHEP,0.503356,64,East,2,463,2019-11-28 07:19:21.630879300-05:00,True,YIE1,2019-11-28 07:19:43.566730-05:00,30.0,0.350000
11,0,DML1,SHEP,0.418792,68,West,3,463,2019-11-28 07:29:16.561617760-05:00,True,DML2,2019-11-28 07:29:51.434776-05:00,68.0,0.566667
16,5,YIE2,SHEP,1.006711,30,West,3,463,2019-11-28 07:40:51.850606540-05:00,True,YIE1,2019-11-28 07:40:51.850606540-05:00,30.0,0.000000
21,4,DML2,SHEP,0.551678,68,East,4,463,2019-11-28 07:50:24.412125120-05:00,True,DML1,2019-11-28 07:50:51.310334-05:00,68.0,0.433333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,36,VMC1,YUS,0.961779,80,South,2,135,2019-11-28 10:01:48.822732120-05:00,True,VMC2,2019-11-28 10:04:43.825634-05:00,80.0,2.916667
66,36,VMC1,YUS,0.961779,80,South,1,139,2019-11-28 10:12:41.229193120-05:00,True,VMC2,2019-11-28 10:15:43.817663-05:00,80.0,3.033333
40,9,VMC2,YUS,0.961779,80,South,5,123,2019-11-28 09:43:41.252144120-05:00,True,VMC1,2019-11-28 09:45:05.184776820-05:00,80.0,1.383333
17,17,VMC1,YUS,0.357577,80,South,0,514,2019-11-28 11:05:21.052241820-05:00,True,VMC2,2019-11-28 11:07:43.441701-05:00,80.0,2.366667


In [81]:
train_df[(train_df['station_char_o'].str[0:3] != train_df['station_char_d'].str[0:3]) & (train_df['cost'] < 0.4 )].reset_index(drop = True)

Unnamed: 0,index,station_char_o,subwayline,timint,stationid_o,traindirection,trip,trainid,estimated_arrival_o,arrival,station_char_d,estimated_arrival_d,stationid_d,cost
0,1,BSS2,SHEP,0.916107,66,West,11,463,2019-11-28 09:05:46.313604980-05:00,True,BYV2,2019-11-28 09:05:51.348379-05:00,65.0,0.083333
1,2,BYV1,SHEP,1.278523,65,East,11,461,2019-11-28 09:03:08.058430400-05:00,True,BSS1,2019-11-28 09:03:08.984957920-05:00,66.0,0.0
2,25,RUN2,BD,0.801858,38,West,2,213,2019-11-28 08:38:31.581716720-05:00,True,RYK2,2019-11-28 08:38:43.977257-05:00,35.0,0.2
3,15,MUS2,YUS,0.0,11,South,2,151,2019-11-28 09:39:43.558247-05:00,True,KNG2,2019-11-28 09:39:56.879535740-05:00,17.0,0.216667
4,19,STA2,YUS,0.667184,15,South,2,151,2019-11-28 09:44:23.789823900-05:00,True,DUP2,2019-11-28 09:44:43.719085-05:00,8.0,0.316667
5,12,BLO2,YUS,1.94381,50,South,2,158,2019-11-28 09:57:40.222264380-05:00,True,DUP2,2019-11-28 09:57:46.118908740-05:00,8.0,0.083333
6,20,GCN2,YUS,1.119987,5,South,2,158,2019-11-28 10:07:50.962539260-05:00,True,STA2,2019-11-28 10:08:11.148869620-05:00,15.0,0.333333
7,21,OSG2,YUS,1.181165,14,South,2,158,2019-11-28 10:08:54.327205860-05:00,True,LWW2,2019-11-28 10:09:12.044801320-05:00,4.0,0.283333
8,14,BLO2,YUS,0.0,22,South,1,134,2019-11-28 08:53:43.477477-05:00,True,SPA2,2019-11-28 08:53:49.356477840-05:00,47.0,0.083333
9,15,DUP2,YUS,0.745118,8,South,1,134,2019-11-28 08:55:28.079760820-05:00,True,WEL2,2019-11-28 08:55:43.339253-05:00,21.0,0.25


In [82]:
train_df[[
    'index', 'subwayline','station_char_o', 'stationid_o', 'traindirection', 'trip', 'trainid', 'estimated_arrival_o',
    'station_char_d',  'stationid_d', 'estimated_arrival_d', 'cost'
]].to_csv('processed_subway/train-arrival_2019-11-28_AM.csv', index = False)

In [92]:
train_df[train_df['subw'[['station_char_o', 'stationid_o']].drop_duplicates().sort_values(by = 'stationid_o').iloc[100:150]

Unnamed: 0,station_char_o,stationid_o
11,BSP1,47
7,SPA1,47
53,SPA2,47
75,BSP2,47
80,SGU2,48
111,SGU1,48
106,SGL1,48
14,SGL2,48
14,BAU2,49
46,BAU1,49
