In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
def process_trip(single_trip):

    arrivals = single_trip[['station_char']].drop_duplicates()
    arrival_max = single_trip[single_trip['timint']!=0]
    arrival_max = arrival_max.sort_values(by = 'request_date', ascending = False)
    arrival_max = arrival_max.drop_duplicates(subset = ['station_char'])[['station_char', 'estimated_arrival']]

    arrivals_at_station = single_trip[single_trip['timint']==0]
    arrivals_at_station = arrivals_at_station.sort_values(by = 'estimated_arrival')
    arrivals_at_station  = arrivals_at_station.drop_duplicates(subset = ['station_char'])[['station_char',
                                                                                           'estimated_arrival']]
    arrivals = arrivals.merge(arrival_max, how = 'left')

    arrivals_at_station = arrivals_at_station.rename(columns = {'estimated_arrival':'at_station_time'})
    arrivals = arrivals.merge(arrivals_at_station, how = 'left')
    arrivals['arrival'] = True

    arrivals['estimated_arrival'] = np.where(arrivals['estimated_arrival'].isna(), 
                                             arrivals['at_station_time'], arrivals['estimated_arrival'])

    arrivals['estimated_arrival'] = np.where(arrivals['estimated_arrival']>arrivals['at_station_time'], 
                                             arrivals['at_station_time'], arrivals['estimated_arrival'])

    arrivals = arrivals.sort_values(by = 'estimated_arrival').reset_index(drop = True)
    
    departures = single_trip[['station_char']].drop_duplicates()
    departures_max = single_trip[single_trip['timint'] == 0]
    departures_max = departures_max.sort_values(by = 'estimated_arrival', ascending = False)
    departures_max = departures_max.drop_duplicates(subset = ['station_char'])[['station_char', 'estimated_arrival']]

    departures = departures.merge(departures_max, how = 'left')
    departures['departure'] = True

    departures = departures.merge(arrivals.rename(columns = {'estimated_arrival':'before_station_time'}))

    departures['estimated_arrival'] = np.where(departures['estimated_arrival'].isna(), departures['before_station_time'], departures['estimated_arrival'])
    departures = departures[['station_char', 'estimated_arrival', 'departure']]
    departures = departures.sort_values(by = 'estimated_arrival').reset_index(drop = True)
    arrivals['previous_depart'] = departures['estimated_arrival'].shift(1)


    arrivals['estimated_arrival'] = np.where(arrivals['estimated_arrival'] < arrivals['previous_depart'], 
                                         arrivals['at_station_time'], arrivals['estimated_arrival'])
    
    arrivals = arrivals[['station_char', 'estimated_arrival', 'arrival']]
    
    single_trip = single_trip.merge(arrivals, how = 'left')
    single_trip = single_trip.merge(departures, how = 'left')

    single_trip = single_trip.sort_values(by = 'estimated_arrival')
    
    single_trip = single_trip[['station_char', 'subwayline', 'timint', 'traindirection', 'trip','trainid',
                 'stationid', 'estimated_arrival', 'arrival', 'departure']]
    
    return single_trip[~single_trip['departure'].isna()].reset_index(drop = True), single_trip[~single_trip['arrival'].isna()].reset_index(drop = True)

In [4]:
df = pd.read_csv('cleaned_subway/raw_subway_2019-12-02_AM.csv')

In [5]:
df['request_date'] = pd.to_datetime(df['request_date'])

In [6]:
df = df.sort_values(by = 'timint')

In [7]:
df['platformid'] = df['station_char'].str[3]

In [8]:
df

Unnamed: 0,requestid,id,station_char,subwayline,timint,traindirection,trainid,train_message,stationid,lineid,create_date,pollid,request_date,date,hour,dow,period,platformid
101304,713150,34353816578,PAP1,BD,0.000000,East,225,AtStation,55,2,2019-12-02 10:52:44,10264,2019-12-02 10:52:44.519723-05:00,2019-12-02,10,0,AM,1
19350,653171,34353340655,EGL1,YUS,0.000000,North,113,AtStation,27,1,2019-12-02 10:32:40,9431,2019-12-02 10:32:37.309955-05:00,2019-12-02,10,0,AM,1
111745,719939,34350533251,SPA1,YUS,0.000000,North,155,AtStation,9,1,2019-12-02 08:36:42,10359,2019-12-02 08:36:37.646941-05:00,2019-12-02,8,0,AM,1
69092,687075,34350435874,GWD1,BD,0.000000,East,237,AtStation,57,2,2019-12-02 08:32:42,9901,2019-12-02 08:32:45.307863-05:00,2019-12-02,8,0,AM,1
19332,653168,34353340462,SUM1,YUS,0.000000,North,119,AtStation,24,1,2019-12-02 10:32:39,9431,2019-12-02 10:32:37.308882-05:00,2019-12-02,10,0,AM,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46298,669839,34352171727,SPA1,YUS,32.599441,North,123,Arriving,47,2,2019-12-02 09:43:46,9661,2019-12-02 09:43:45.348904-05:00,2019-12-02,9,0,AM,1
46035,669795,34352168218,SPA1,YUS,32.599441,North,123,Arriving,9,1,2019-12-02 09:43:38,9661,2019-12-02 09:43:37.816021-05:00,2019-12-02,9,0,AM,1
46310,669840,34352171730,SGU1,YUS,33.777479,North,123,Arriving,48,2,2019-12-02 09:43:46,9661,2019-12-02 09:43:45.348516-05:00,2019-12-02,9,0,AM,1
46047,669796,34352168221,SGU1,YUS,33.777479,North,123,Arriving,10,1,2019-12-02 09:43:38,9661,2019-12-02 09:43:37.816409-05:00,2019-12-02,9,0,AM,1


# Filtering for only the most recent data in a request

Since each request has multiple train arrival times.

We're filtering on `station_char` and `requestid`. This might cause an issue with terminus, but we'll deal with it later.

In [9]:
df_newest = df.drop_duplicates(subset = ['requestid', 'station_char'])

In [10]:
train_list = []
for line in list(df_newest['subwayline'].drop_duplicates()):
    df_line = df_newest[df_newest['subwayline'] == line]
    
    train_ids = df_line['trainid'].drop_duplicates()
    
    for train_id in train_ids:
        
        single_train = df_line[df_line['trainid'] == train_id]
        
        single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')

        single_train = single_train.sort_values(by = 'request_date')

        single_train['trip'] = (single_train['traindirection'] != single_train['traindirection'].shift(1)).cumsum() - 1
        
        trip_num = single_train['trip'].max()
        
        arrival_list = []
        departure_list = []
        for i in range(trip_num + 1):
            if i == 0:
                current_departures, current_arrivals = process_trip(single_train[single_train['trip'] == i])

            if i < trip_num:
                next_departures, next_arrivals = process_trip(single_train[single_train['trip'] == i + 1])

            arrival_limit = next_arrivals.head(1)[['estimated_arrival']].iloc[0,0] - datetime.timedelta(seconds = 45)

            if len(current_arrivals['station_char']) == 1:
                pass

            elif i < trip_num:

                current_departures = current_departures[current_departures['estimated_arrival'] < arrival_limit]
                current_arrivals = current_arrivals[current_arrivals['estimated_arrival'] < arrival_limit]
            else:
                pass

            arrival_list.append(current_arrivals.copy())
            departure_list.append(current_departures.copy())

            current_arrivals = next_arrivals
            current_departures = next_departures


        arrival_trip = pd.concat(arrival_list)
        departure_trip = pd.concat(departure_list)        

        if arrival_trip['estimated_arrival'].max().hour < 11:
            if arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'BD':
                cutoff = arrival_trip[arrival_trip['stationid'] == 55]['estimated_arrival'].max()

            elif (arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'YUS') & (arrival_trip.tail(1)['stationid'].iloc[0] == 80):
                cutoff = arrival_trip[arrival_trip['stationid'] == 80]['estimated_arrival'].max()

            elif arrival_trip['subwayline'].drop_duplicates().iloc[0] == 'YUS':
                cutoff = arrival_trip[arrival_trip['stationid'] == 1]['estimated_arrival'].max()
            else:
                cutoff = arrival_trip['estimated_arrival'].max()
        else:
            cutoff = arrival_trip['estimated_arrival'].max()

        arrival_trip = arrival_trip[arrival_trip['estimated_arrival'] <= cutoff]
        departure_trip = departure_trip[departure_trip['estimated_arrival'] <= cutoff]  

        arrival_trip = arrival_trip.reset_index(drop = True).reset_index()

        departure_trip = departure_trip.reset_index(drop = True).reset_index()
        departure_trip['index'] = departure_trip['index'] + 1

        departure_trip = departure_trip.rename(columns = {'station_char':'dep_stn_char', 'estimated_arrival':'dep_time'})
        arrival_trip = arrival_trip.rename(columns = {'station_char':'arr_stn_char', 'estimated_arrival':'arr_time'})

        train = departure_trip.merge(arrival_trip[['index', 'arr_stn_char', 'arr_time']]).rename(
            columns = {'index':'sequence'})[['sequence','dep_stn_char', 'arr_stn_char', 'subwayline', 'trainid',
                                                 'trip', 'dep_time', 'arr_time']]
        
        
        train_list.append(train)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  single_train['estimated_arrival'] = single_train['request_date'] + pd.to_timedelta(single_train['timint'], 'm')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [11]:
train_df = pd.concat(train_list)

In [12]:
train_df['cost'] = (train_df['arr_time'] - train_df['dep_time']).dt.seconds/60

In [20]:
train_df.sort_values(by = 'cost', ascending = False).head(50)

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
91,92,GCN2,LWW2,YUS,104,3,2019-12-02 11:47:37.559395-05:00,2019-12-02 11:45:37.561797-05:00,1438.0
34,35,MST2,COX2,BD,16,3,2019-12-02 11:18:02.999888540-05:00,2019-12-02 11:15:45.521792-05:00,1437.7
80,81,LAW1,YKM1,YUS,108,2,2019-12-02 10:31:37.434549-05:00,2019-12-02 10:28:37.532990-05:00,1437.0
103,104,WIL2,DNP2,YUS,165,2,2019-12-02 11:29:37.727817-05:00,2019-12-02 11:26:37.403865-05:00,1436.983333
99,100,SHW2,DNP2,YUS,171,3,2019-12-02 11:49:37.709224-05:00,2019-12-02 11:45:45.541204-05:00,1436.116667
98,99,WIL2,SHW2,YUS,171,3,2019-12-02 11:47:37.558173-05:00,2019-12-02 11:43:37.734541-05:00,1436.0
58,59,VMC2,PVL2,YUS,112,1,2019-12-02 09:35:11.776734720-05:00,2019-12-02 09:30:37.472344-05:00,1435.416667
9,10,SGL2,YNG2,BD,230,0,2019-12-02 07:25:44.733108-05:00,2019-12-02 07:20:37.342157-05:00,1434.866667
87,88,SHW2,DNP2,YUS,168,3,2019-12-02 11:47:37.557542-05:00,2019-12-02 11:39:37.585256-05:00,1432.0
11,12,CHE2,GWD2,BD,244,1,2019-12-02 07:49:44.729326-05:00,2019-12-02 07:41:44.722315-05:00,1431.983333


In [14]:
train_df

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
0,1,BRD2,CFK2,BD,225,0,2019-12-02 07:00:44.731187-05:00,2019-12-02 07:02:08.400114280-05:00,1.383333
1,2,CFK2,SHE2,BD,225,0,2019-12-02 07:04:45.730701-05:00,2019-12-02 07:05:44.737062-05:00,0.983333
2,3,SHE2,YNG2,BD,225,0,2019-12-02 07:05:44.737062-05:00,2019-12-02 07:07:37.582704-05:00,1.866667
3,4,YNG2,BAU2,BD,225,0,2019-12-02 07:07:44.732029-05:00,2019-12-02 07:08:33.293692-05:00,0.800000
4,5,BAU2,SGL2,BD,225,0,2019-12-02 07:08:44.732580-05:00,2019-12-02 07:09:44.732858-05:00,1.000000
...,...,...,...,...,...,...,...,...,...
123,124,YIE1,BYV1,SHEP,464,27,2019-12-02 11:49:45.526567-05:00,2019-12-02 11:52:45.525570-05:00,2.983333
124,125,BYV1,BSS1,SHEP,464,27,2019-12-02 11:52:45.525570-05:00,2019-12-02 11:54:03.162404920-05:00,1.283333
125,126,BSS1,LES1,SHEP,464,27,2019-12-02 11:54:03.162404920-05:00,2019-12-02 11:55:38.438471700-05:00,1.583333
126,127,LES1,DML1,SHEP,464,27,2019-12-02 11:55:45.526001-05:00,2019-12-02 11:58:17.625520120-05:00,2.533333


In [15]:
train_df[(train_df['cost'] > 0.4) & (train_df['cost'] < 15)]

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
0,1,BRD2,CFK2,BD,225,0,2019-12-02 07:00:44.731187-05:00,2019-12-02 07:02:08.400114280-05:00,1.383333
1,2,CFK2,SHE2,BD,225,0,2019-12-02 07:04:45.730701-05:00,2019-12-02 07:05:44.737062-05:00,0.983333
2,3,SHE2,YNG2,BD,225,0,2019-12-02 07:05:44.737062-05:00,2019-12-02 07:07:37.582704-05:00,1.866667
3,4,YNG2,BAU2,BD,225,0,2019-12-02 07:07:44.732029-05:00,2019-12-02 07:08:33.293692-05:00,0.800000
4,5,BAU2,SGL2,BD,225,0,2019-12-02 07:08:44.732580-05:00,2019-12-02 07:09:44.732858-05:00,1.000000
...,...,...,...,...,...,...,...,...,...
121,122,BYV2,YIE2,SHEP,464,26,2019-12-02 11:45:07.993526640-05:00,2019-12-02 11:48:05.458278900-05:00,2.950000
123,124,YIE1,BYV1,SHEP,464,27,2019-12-02 11:49:45.526567-05:00,2019-12-02 11:52:45.525570-05:00,2.983333
124,125,BYV1,BSS1,SHEP,464,27,2019-12-02 11:52:45.525570-05:00,2019-12-02 11:54:03.162404920-05:00,1.283333
125,126,BSS1,LES1,SHEP,464,27,2019-12-02 11:54:03.162404920-05:00,2019-12-02 11:55:38.438471700-05:00,1.583333


In [16]:
train_df[(train_df['cost'] < 0.4)].sort_values(by = 'cost')

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
51,52,KEN1,KEN1,BD,328,2,2019-12-02 10:50:03.273416420-05:00,2019-12-02 10:50:03.273416420-05:00,0.000000
27,28,YIE2,YIE2,SHEP,463,6,2019-12-02 08:02:14.926577300-05:00,2019-12-02 08:02:14.926577300-05:00,0.000000
22,23,DML1,DML2,SHEP,463,5,2019-12-02 07:51:21.120314160-05:00,2019-12-02 07:51:21.120314160-05:00,0.000000
17,18,YIE2,YIE1,SHEP,463,4,2019-12-02 07:40:15.928511300-05:00,2019-12-02 07:40:15.928511300-05:00,0.000000
12,13,DML1,DML2,SHEP,463,3,2019-12-02 07:28:58.047373680-05:00,2019-12-02 07:28:58.047373680-05:00,0.000000
...,...,...,...,...,...,...,...,...,...
85,86,STP2,QPK2,YUS,113,3,2019-12-02 11:45:51.271307500-05:00,2019-12-02 11:46:02.756189220-05:00,0.183333
29,30,BYV1,BSS1,SHEP,464,7,2019-12-02 08:13:31.395878940-05:00,2019-12-02 08:13:44.724500-05:00,0.216667
26,27,OSG1,STA1,YUS,135,1,2019-12-02 08:01:00.903982260-05:00,2019-12-02 08:01:16.372397440-05:00,0.250000
56,57,DNP2,PVL2,YUS,117,1,2019-12-02 09:18:42.987971340-05:00,2019-12-02 09:18:59.131839340-05:00,0.266667


In [17]:
train_df[(train_df['dep_stn_char'].str[0:3] == train_df['arr_stn_char'].str[0:3])]

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
51,52,KEN1,KEN1,BD,328,2,2019-12-02 10:50:03.273416420-05:00,2019-12-02 10:50:03.273416420-05:00,0.000000
30,31,KIP2,KIP1,BD,336,2,2019-12-02 10:13:09.007220500-05:00,2019-12-02 10:13:09.007220500-05:00,0.000000
46,47,KEN2,KEN1,BD,336,3,2019-12-02 11:07:45.518561-05:00,2019-12-02 11:10:04.761684360-05:00,2.316667
47,48,KEN2,KEN2,BD,336,4,2019-12-02 11:10:04.761684360-05:00,2019-12-02 11:10:04.761684360-05:00,0.000000
67,68,KIP2,KIP1,BD,323,3,2019-12-02 11:20:53.083569520-05:00,2019-12-02 11:20:53.083569520-05:00,0.000000
...,...,...,...,...,...,...,...,...,...
103,104,YIE2,YIE1,SHEP,464,23,2019-12-02 11:05:15.719709300-05:00,2019-12-02 11:05:15.719709300-05:00,0.000000
108,109,DML1,DML2,SHEP,464,24,2019-12-02 11:15:09.646093760-05:00,2019-12-02 11:15:09.646093760-05:00,0.000000
113,114,YIE2,YIE2,SHEP,464,25,2019-12-02 11:26:55.153922480-05:00,2019-12-02 11:26:55.153922480-05:00,0.000000
122,123,YIE2,YIE1,SHEP,464,27,2019-12-02 11:48:05.458278900-05:00,2019-12-02 11:48:05.458278900-05:00,0.000000


In [25]:
train_df[(train_df['dep_stn_char'].str[0:3] != train_df['arr_stn_char'].str[0:3]) & (train_df['cost'] < 0.4 )].reset_index(drop = True)

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
0,3,MST2,VPK2,BD,334,1,2019-12-02 08:24:44.726902-05:00,2019-12-02 08:24:47.994538860-05:00,0.05
1,86,STP2,QPK2,YUS,113,3,2019-12-02 11:45:51.271307500-05:00,2019-12-02 11:46:02.756189220-05:00,0.183333
2,57,DNP2,PVL2,YUS,117,1,2019-12-02 09:18:42.987971340-05:00,2019-12-02 09:18:59.131839340-05:00,0.266667
3,109,GCN2,YKD2,YUS,102,3,2019-12-02 11:47:31.939129180-05:00,2019-12-02 11:47:37.558609-05:00,0.083333
4,89,STP2,QPK2,YUS,160,2,2019-12-02 10:53:51.074948500-05:00,2019-12-02 10:54:02.629922220-05:00,0.183333
5,76,WIL2,SHW2,YUS,140,2,2019-12-02 10:27:29.606386360-05:00,2019-12-02 10:27:37.628821-05:00,0.133333
6,34,GCN1,CVL1,YUS,151,1,2019-12-02 08:18:37.473425-05:00,2019-12-02 08:18:57.484025640-05:00,0.333333
7,27,OSG1,STA1,YUS,135,1,2019-12-02 08:01:00.903982260-05:00,2019-12-02 08:01:16.372397440-05:00,0.25
8,30,BYV1,BSS1,SHEP,464,7,2019-12-02 08:13:31.395878940-05:00,2019-12-02 08:13:44.724500-05:00,0.216667


In [24]:
train_df[(train_df['cost'] > 1000)].reset_index(drop = True)

Unnamed: 0,sequence,dep_stn_char,arr_stn_char,subwayline,trainid,trip,dep_time,arr_time,cost
0,35,MST2,COX2,BD,16,3,2019-12-02 11:18:02.999888540-05:00,2019-12-02 11:15:45.521792-05:00,1437.7
1,12,CHE2,GWD2,BD,244,1,2019-12-02 07:49:44.729326-05:00,2019-12-02 07:41:44.722315-05:00,1431.983333
2,13,SHE2,DON2,BD,244,1,2019-12-02 08:03:44.729879-05:00,2019-12-02 07:43:45.723984-05:00,1420.0
3,14,COX2,CHE2,BD,244,1,2019-12-02 10:14:45.359936-05:00,2019-12-02 07:48:45.729156-05:00,1294.0
4,15,GWD2,BRD2,BD,244,1,2019-12-02 10:15:45.354373-05:00,2019-12-02 07:50:45.729902-05:00,1295.0
5,16,DON2,CFK2,BD,244,1,2019-12-02 10:16:37.795481-05:00,2019-12-02 08:01:37.601848-05:00,1304.983333
6,17,PAP2,SHE2,BD,244,1,2019-12-02 10:17:56.227915940-05:00,2019-12-02 08:03:44.729879-05:00,1305.8
7,18,BRD2,YNG2,BD,244,1,2019-12-02 10:20:45.363246-05:00,2019-12-02 08:06:37.609386-05:00,1305.866667
8,19,CFK2,BAU2,BD,244,1,2019-12-02 10:21:45.514388-05:00,2019-12-02 08:08:44.732261-05:00,1306.983333
9,10,SGL2,YNG2,BD,230,0,2019-12-02 07:25:44.733108-05:00,2019-12-02 07:20:37.342157-05:00,1434.866667
