In [1]:
import pandas as pd
import numpy as np

import os

In [7]:
df_list = []
departures = []

for file in os.listdir('cleaned_scarborough'):
    
    if file[0] == '.':
        continue
    
    period = file.split('_')[3].split('.')[0]
    date = file.split('_')[2].split('.')[0]

    raw = pd.read_csv('cleaned_scarborough/' + file)

    raw['platform1_end_time'] = pd.DatetimeIndex(raw['platform1_end_time'], tz = 'America/Toronto')
    raw['platform2_start_time'] = pd.DatetimeIndex(raw['platform2_start_time'], tz = 'America/Toronto')
    raw = raw.sort_values(by = 'platform2_start_time')
    if (period == 'EV') | (period == 'PM') :

        test_left = raw[(raw['station2'] == 'lawrence_east') & (raw['station1'] == 'kennedy')].copy()
        station1 = 'kennedy'
        station2 = 'lawrence_east'
    else:
        test_left = raw[(raw['station1'] == 'lawrence_east') & (raw['station2'] == 'kennedy')].copy()
        station2 = 'kennedy'
        station1 = 'lawrence_east'
    test_left = test_left.reset_index(drop = True).reset_index()
    test_right = test_left.copy()
    test_right['index'] = test_right['index'] - 1


    test = test_left.merge(test_right[['index', 'platform1_end_time', 'platform2_start_time']], left_on = ['index'], right_on = ['index'], suffixes = ['_left', '_right'])

    test['gap'] = (test['platform2_start_time_right'] - test['platform2_start_time_left']).dt.seconds

    test['train_id'] = (test['gap'] > 90).cumsum() # minimum gap needed to indicate a new train is approaching

    train_times = []

    for train_id in list(test['train_id'].drop_duplicates()):
        station1 = station1
        station2 = station2
        depart_time = test[test['train_id'] == train_id]['platform1_end_time_left'].median()
        arrival_time = test[test['train_id'] == train_id]['platform2_start_time_left'].median()

        train_times.append([station1, station2, depart_time, arrival_time])

    df_list.append(test[test['gap']> 450])
    departures.append(pd.DataFrame.from_records(train_times, columns = ['station1', 'station2', 'depart_time', 'arrival_time']))

In [8]:
gaps = pd.concat(df_list)
departures_df = pd.concat(departures)

In [9]:
gaps.to_csv('processed_scarborough/gaps.csv', index = False)
departures_df.to_csv('processed_scarborough/srt_departures.csv', index = False)

In [10]:
gaps.sort_values(by = 'gap', ascending = False).head(50)

Unnamed: 0,index,station1,station2,platform1_end_time_left,platform2_start_time_left,year,month,day,hour,date,dow,period,platform1_end_time_right,platform2_start_time_right,gap,train_id
57,57,lawrence_east,kennedy,2019-11-27 10:36:12.536000-05:00,2019-11-27 10:41:48.892000-05:00,2019,11,27,10,2019-11-27,2,AM,2019-11-27 11:09:34.343000-05:00,2019-11-27 11:15:46.243000-05:00,2037,14
48,48,kennedy,lawrence_east,2019-12-06 20:59:17.589000-05:00,2019-12-06 21:11:52.463000-05:00,2019,12,7,20,2019-12-06,4,EV,2019-12-06 21:25:43.355000-05:00,2019-12-06 21:35:03.076000-05:00,1390,13
98,98,kennedy,lawrence_east,2019-12-02 23:07:21.696000-05:00,2019-12-02 23:14:30.547000-05:00,2019,12,3,23,2019-12-02,0,EV,2019-12-02 23:25:59.899000-05:00,2019-12-02 23:37:12.939000-05:00,1362,32
112,112,kennedy,lawrence_east,2019-12-11 00:29:11.271000-05:00,2019-12-11 00:35:14.731000-05:00,2019,12,11,0,2019-12-10,1,EV,2019-12-11 00:49:16.951000-05:00,2019-12-11 00:57:30.392000-05:00,1335,42
24,24,kennedy,lawrence_east,2019-12-27 20:54:16.953000-05:00,2019-12-27 21:04:55.082000-05:00,2019,12,28,20,2019-12-27,4,EV,2019-12-27 21:19:37.821000-05:00,2019-12-27 21:26:12.607000-05:00,1277,11
114,114,kennedy,lawrence_east,2019-12-17 00:30:11.335000-05:00,2019-12-17 00:41:35.323000-05:00,2019,12,17,0,2019-12-16,0,EV,2019-12-17 00:56:18.979000-05:00,2019-12-17 01:02:51.334000-05:00,1276,42
111,111,kennedy,lawrence_east,2019-12-05 23:50:11.107000-05:00,2019-12-05 23:56:50.684000-05:00,2019,12,6,23,2019-12-05,3,EV,2019-12-06 00:10:18.051000-05:00,2019-12-06 00:17:26.712000-05:00,1236,37
6,6,kennedy,lawrence_east,2019-12-17 20:22:53.783000-05:00,2019-12-17 20:27:56.350000-05:00,2019,12,18,20,2019-12-17,1,EV,2019-12-17 20:41:44.895000-05:00,2019-12-17 20:48:21.897000-05:00,1225,4
212,212,kennedy,lawrence_east,2019-12-17 20:22:53.783000-05:00,2019-12-17 20:27:56.350000-05:00,2019,12,18,20,2019-12-17,1,PM,2019-12-17 20:41:44.895000-05:00,2019-12-17 20:48:21.897000-05:00,1225,56
33,33,kennedy,lawrence_east,2019-12-04 21:22:29.201000-05:00,2019-12-04 21:27:57.881000-05:00,2019,12,5,21,2019-12-04,2,EV,2019-12-04 21:39:29.905000-05:00,2019-12-04 21:48:20.958000-05:00,1223,17
