In [1]:
import pandas as pd
import networkx as nx
import numpy as np
import sys
import os

In [2]:
def period_lookup(period):
    """Return the hour window and departure hour for a time-period code.

    Parameters
    ----------
    period : str
        One of 'AM', 'MD', 'PM', 'EV' or 'EM'.

    Returns
    -------
    hr_list : list of int
        The five clock hours associated with the period, in order.  The 'EV'
        window runs past midnight and therefore ends with hour 0.
    departure_hour : list of int
        Single-element list holding the first hour of the window.

    Raises
    ------
    ValueError
        If ``period`` is not a known code.  Previously an unknown code fell
        through every branch and failed at the ``return`` with an
        UnboundLocalError that did not mention the offending value.
    """
    windows = {
        'AM': ([7, 8, 9, 10, 11], [7]),
        'MD': ([11, 12, 13, 14, 15], [11]),
        'PM': ([17, 18, 19, 20, 21], [17]),
        'EV': ([20, 21, 22, 23, 0], [20]),
        'EM': ([4, 5, 6, 7, 8], [4]),
    }

    if period not in windows:
        raise ValueError(
            'Unknown period {!r}; expected one of {}'.format(period, sorted(windows)))

    hr_list, departure_hour = windows[period]

    # Hand back fresh lists so callers may mutate them freely.
    return list(hr_list), list(departure_hour)

In [3]:
# Station lookup table. stop_times_od_func reads its 'station_char' and
# 'INT_ID' columns to attach an INT_ID to each subway station code.
station = pd.read_csv('train_arrival_stations_lookup.csv')

In [4]:
# Walking links between zones ('gta06') and intersections ('INT_ID').
walk_links = pd.read_csv('int_tts_walk_time19.csv')

# 'duration' is divided by 60 and rounded to a whole number
# (presumably seconds -> minutes; TODO confirm the units of the input file).
duration_rounded = (walk_links['duration'] / 60).round()
walk_links['duration'] = duration_rounded.astype(int)

# Zone ids are offset by 1000 (presumably so they cannot clash with INT_ID
# values when both are used as graph node ids; TODO confirm).
walk_links['gta06'] += 1000

# Surface

In [5]:
def _add_cost_node_link_columns(od, trip_col, direction):
    """Add the 'cost', 'node_o', 'node_d' and 'link' columns to ``od`` in place.

    Parameters
    ----------
    od : pandas.DataFrame
        One row per leg between two stops.  Must hold the columns 'hr_o',
        'min_o', 'hr_d', 'min_d', 'INT_ID_o', 'INT_ID_d', 'route' and
        ``trip_col``.
    trip_col : str
        Name of the column whose value is written into the link name.
    direction : str or pandas.Series
        Direction token written into the link name; either one string for
        every row or a Series aligned with ``od``.

    Returns
    -------
    pandas.DataFrame
        The same ``od`` object.  'hr_*' become strings and 'min_*' become
        zero-padded two-character strings as a side effect.
    """
    # Minutes are zero-padded and hours turned into strings so the two can be
    # joined into a clock string that parses with '%H:%M'.
    for col in ('min_o', 'min_d'):
        od[col] = od[col].apply(lambda x: '{0:0>2}'.format(x))
    for col in ('hr_o', 'hr_d'):
        od[col] = od[col].astype(str)

    o_time = pd.to_datetime(od['hr_o'] + ':' + od['min_o'], format='%H:%M')
    d_time = pd.to_datetime(od['hr_d'] + ':' + od['min_d'], format='%H:%M')

    # Timedelta ``.dt.seconds`` is never negative, so a leg that crosses
    # midnight (e.g. 23:58 -> 0:03) still yields a small positive cost.
    od['cost'] = ((d_time - o_time).dt.seconds) / 60

    # Node name: <INT_ID>-<route>-<hour>-<minute>; the minute is written
    # without its zero padding.
    route = od['route'].astype(int).astype(str)
    od['node_o'] = (od['INT_ID_o'].astype(int).astype(str) + '-' + route + '-'
                    + od['hr_o'] + '-' + od['min_o'].astype(int).astype(str))
    od['node_d'] = (od['INT_ID_d'].astype(int).astype(str) + '-' + route + '-'
                    + od['hr_d'] + '-' + od['min_d'].astype(int).astype(str))

    od['link'] = ('0' + '-' + route + '-' + od[trip_col].astype(str) + '-'
                  + direction + '_' + od['node_o'] + '_' + od['node_d'])

    # Costs are stored as whole minutes.
    od['cost'] = od['cost'].astype(int)
    return od


def stop_times_od_func(hr_list, departure_hour, date, period):
    """Build the stop-to-stop travel links for one date and period.

    Reads the processed surface, subway and SRT files for ``date`` /
    ``period``, turns each into one row per leg between consecutive stops and
    stacks the three tables.  The subway part also reads the module-level
    ``station`` table to map station codes to INT_IDs.

    Parameters
    ----------
    hr_list : list of int
        Hours to keep from the surface and SRT tables.
    departure_hour : list of int
        Not used by this function; kept so existing calls stay valid.
    date : str
        Date token used in the surface and subway file names.
    period : str
        Period code used in all three file names.

    Returns
    -------
    pandas.DataFrame
        Columns 'trip_id', 'stop_sequence_new', 'INT_ID_o', 'hr_o', 'min_o',
        'INT_ID_d', 'hr_d', 'min_d', 'cost', 'node_o', 'node_d', 'link'.

    Raises
    ------
    SystemExit
        If a trip_id occurs in more than one of the three tables.
    """
    od_columns = ['trip_id', 'stop_sequence_new', 'INT_ID_o', 'hr_o', 'min_o',
                  'INT_ID_d', 'hr_d', 'min_d', 'cost', 'node_o', 'node_d', 'link']

    # inputs
    surface = pd.read_csv('processed_surface/processed-surface_' + date + '_' + period + '.csv')
    subway = pd.read_csv('processed_subway/train-arrival_' + date + '_' + period + '.csv')
    srt = pd.read_csv('processed_scarborough/GTFS/srt_' + period + '.csv')

    # surface
    surface = surface.drop(columns='sequence')

    # .copy() so the column assignment below works on an independent frame
    # rather than on a slice (avoids SettingWithCopyWarning).
    surface = surface[surface['hr'].isin(hr_list)].copy()

    # NOTE(review): this wrap runs after the hr_list filter, so it only has an
    # effect when hr_list itself holds values above 23 - confirm whether hours
    # of 24+ in the input were meant to be wrapped before filtering.
    surface['hr'] = np.where(surface['hr'] > 23, surface['hr'] - 24, surface['hr'])

    # Shifting the sequence number of the destination copy down by one pairs
    # every stop (suffix _o) with the next stop of the same trip (suffix _d).
    surface_next = surface.copy()
    surface_next['stop_sequence_new'] = surface_next['stop_sequence_new'] - 1

    stop_times_od_surf = surface.merge(surface_next,
                                       on=['id', 'stop_sequence_new', 'route'],
                                       suffixes=('_o', '_d'))
    stop_times_od_surf = _add_cost_node_link_columns(stop_times_od_surf, 'id', 'dir')
    stop_times_od_surf = stop_times_od_surf.rename(columns={'id': 'trip_id'})[od_columns]

    # subway
    # NOTE(review): unlike surface and SRT, the subway table is not filtered
    # by hr_list - presumably the input file is already limited to the period.
    station_ids = station[['station_char', 'INT_ID']]

    subway = subway.merge(station_ids, left_on=['station_char_o'],
                          right_on=['station_char']).rename(columns={'INT_ID': 'INT_ID_o'})
    subway = subway.merge(station_ids, left_on=['station_char_d'],
                          right_on=['station_char']).rename(columns={'INT_ID': 'INT_ID_d'})

    # Route number per subway line; any other line stays NaN and makes the
    # integer cast in the helper fail.
    subway['route'] = np.nan
    subway['route'] = np.where(subway['subwayline'] == 'YUS', 1, subway['route'])
    subway['route'] = np.where(subway['subwayline'] == 'BD', 2, subway['route'])
    subway['route'] = np.where(subway['subwayline'] == 'SHEP', 4, subway['route'])

    subway['trip_id'] = subway['trainid'] * 1000 + subway['trip']
    subway['estimated_arrival_o'] = pd.DatetimeIndex(subway['estimated_arrival_o'])
    subway['estimated_arrival_d'] = pd.DatetimeIndex(subway['estimated_arrival_d'])

    subway['hr_d'] = subway['estimated_arrival_d'].dt.hour
    subway['hr_o'] = subway['estimated_arrival_o'].dt.hour
    subway['min_d'] = subway['estimated_arrival_d'].dt.minute
    subway['min_o'] = subway['estimated_arrival_o'].dt.minute

    subway = _add_cost_node_link_columns(subway, 'trip_id', subway['traindirection'])
    subway = subway.rename(columns={'index': 'stop_sequence_new'})[od_columns]

    # srt
    srt = srt.rename(columns={'hour': 'hr', 'minute': 'min'})

    # Round to the nearest minute, carrying into the hour when needed.
    srt['min'] = np.where(srt['seconds'] > 29, srt['min'] + 1, srt['min'])
    srt['hr'] = np.where(srt['min'] == 60, srt['hr'] + 1, srt['hr'])
    srt['min'] = np.where(srt['min'] == 60, 0, srt['min'])
    srt['route'] = 3

    srt = srt.drop(columns=['seconds'])

    # Same filter-then-wrap order as the surface table (see the note there).
    srt = srt[srt['hr'].isin(hr_list)].copy()
    srt['hr'] = np.where(srt['hr'] > 23, srt['hr'] - 24, srt['hr'])

    srt_next = srt.copy()
    srt_next['stop_sequence'] = srt_next['stop_sequence'] - 1

    srt_od = srt.merge(srt_next, on=['trip_id', 'stop_sequence', 'route'],
                       suffixes=('_o', '_d'))
    srt_od = _add_cost_node_link_columns(srt_od, 'trip_id', 'dir')
    srt_od = srt_od.rename(columns={'stop_sequence': 'stop_sequence_new'})[od_columns]

    # append
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the three
    # tables the same way (original row labels are kept).
    stop_times_od = pd.concat([stop_times_od_surf, srt_od, subway])

    # A trip_id shared between two tables would make the combined count of
    # distinct ids smaller than the sum of the per-table counts.
    append_len = len(stop_times_od[['trip_id']].drop_duplicates())
    df_len = (len(srt_od[['trip_id']].drop_duplicates())
              + len(stop_times_od_surf[['trip_id']].drop_duplicates())
              + len(subway[['trip_id']].drop_duplicates()))

    if df_len != append_len:
        sys.exit('ERROR: trip_id values are shared between the surface, SRT and subway '
                 'tables (' + str(df_len) + ' ids separately, ' + str(append_len)
                 + ' after combining)')

    return stop_times_od

In [6]:
# Minutes charged between a stop node and a vehicle node when boarding and
# when alighting.
BOARDING_MINUTES = 2
ALIGHTING_MINUTES = 2


def _shift_clock(hr, minute, delta):
    """Return (hour, minute) moved by ``delta`` minutes, wrapping at midnight."""
    total = (hr * 60 + minute + delta) % (24 * 60)
    return total // 60, total % 60


def _time_node(place, hr, minute):
    """Name of the route-0 node '<place>-0-<hour>-<minute>'."""
    return '{}-0-{}-{}'.format(place, hr, minute)


# Sorted so files are processed in a reproducible order.
file_list = sorted(os.listdir('cleaned_surface'))

for file in file_list:

    # The third and fourth '_'-separated tokens of the file name are read as
    # date and period; names with fewer tokens are skipped instead of raising
    # an IndexError.
    name_parts = file.split('_')
    if len(name_parts) < 4:
        continue

    date = name_parts[2]
    period = name_parts[3].split('.csv')[0]

    if period not in ('MD', 'EV', 'EM'):
        continue

    hr_list, departure_hour = period_lookup(period)

    stop_times_od = stop_times_od_func(hr_list, departure_hour, date, period)

    # Clock hours (0-23) of the window and of the departure hour.
    hours = [hr_i % 24 for hr_i in hr_list]
    departure_hours = [hr_i % 24 for hr_i in departure_hour]

    G = nx.DiGraph()

    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    nodes = list(pd.concat([stop_times_od['INT_ID_o'],
                            stop_times_od['INT_ID_d']]).drop_duplicates())

    # node times: one route-0 node per intersection and minute of the window
    node_list = [_time_node(int(node), hr, minute)
                 for node in nodes
                 for hr in hours
                 for minute in range(60)]

    G.add_nodes_from(node_list)

    # boarding and alighting nodes

    boarding_nodes = list(stop_times_od['node_o'].drop_duplicates())
    G.add_nodes_from(boarding_nodes)

    alighting_nodes = list(stop_times_od['node_d'].drop_duplicates())
    G.add_nodes_from(alighting_nodes)

    # travel links
    # int() keeps the edge attribute a plain Python integer for the GraphML writer.
    link_list = [(node_o, node_d, {'cost': int(cost)})
                 for node_o, node_d, cost in zip(stop_times_od['node_o'],
                                                 stop_times_od['node_d'],
                                                 stop_times_od['cost'])]

    G.add_edges_from(link_list)

    # waiting links: each route-0 node leads to the same place one minute later.
    # The following minute is derived from the current time on every step, so
    # 23:59 leads to 0:00 and the minutes of hour 0 stay in hour 0 (a carried-
    # over hour value used to send them to non-existent hour-24 nodes).
    transfer_list = []
    for node in nodes:
        node = int(node)
        for hr in hours:
            for minute in range(60):
                next_hr, next_minute = _shift_clock(hr, minute, 1)
                transfer_list.append((_time_node(node, hr, minute),
                                      _time_node(node, next_hr, next_minute),
                                      {'cost': 1}))
    G.add_edges_from(transfer_list)

    # boarding and alighting links

    # A boarding node is reached from the route-0 node of the same
    # intersection BOARDING_MINUTES earlier (wrapping to hour 23 before midnight).
    boarding_link = []
    for node in boarding_nodes:
        node_split = node.split('-')
        stop_hr, stop_minute = _shift_clock(int(node_split[2]), int(node_split[3]),
                                            -BOARDING_MINUTES)
        stop_node = _time_node(node_split[0], stop_hr, stop_minute)
        boarding_link.append((stop_node, node, {'cost': BOARDING_MINUTES}))

    G.add_edges_from(boarding_link)

    # An alighting node leads to the route-0 node of the same intersection
    # ALIGHTING_MINUTES later (wrapping to hour 0 after 23:59).
    alighting_link = []
    for node in alighting_nodes:
        node_split = node.split('-')
        stop_hr, stop_minute = _shift_clock(int(node_split[2]), int(node_split[3]),
                                            ALIGHTING_MINUTES)
        stop_node = _time_node(node_split[0], stop_hr, stop_minute)
        alighting_link.append((node, stop_node, {'cost': ALIGHTING_MINUTES}))

    G.add_edges_from(alighting_link)

    # tts zone origins and destinations

    tts_zones = list(walk_links['gta06'].drop_duplicates())

    node_stationary_list = [_time_node(int(zone), hr, minute)
                            for zone in tts_zones
                            for hr in hours
                            for minute in range(60)]

    G.add_nodes_from(node_stationary_list)

    # One time-independent destination node per zone, marked with 99-99.
    zone_destinations = [str(int(zone)) + '-0-99-99' for zone in tts_zones]

    G.add_nodes_from(zone_destinations)

    # walking links
    walk_link_outbound = []
    walk_link_inbound = []
    for zone, node, cost in zip(walk_links['gta06'], walk_links['INT_ID'],
                                walk_links['duration']):
        zone = int(zone)
        node = int(node)
        cost = int(cost)

        # from tts to intersections: leave the zone at any minute of the
        # departure hour and arrive ``cost`` minutes later.  _shift_clock also
        # handles walks of an hour or more, which previously produced a
        # minute value of 60 or above.
        for hr in departure_hours:
            for minute in range(60):
                arrive_hr, arrive_minute = _shift_clock(hr, minute, cost)
                walk_link_outbound.append((_time_node(zone, hr, minute),
                                           _time_node(node, arrive_hr, arrive_minute),
                                           {'cost': cost}))

        # from intersections to tts: every minute of the window leads to the
        # zone's destination node.
        zone_sink = str(zone) + '-0-99-99'
        for hr in hours:
            for minute in range(60):
                walk_link_inbound.append((_time_node(node, hr, minute),
                                          zone_sink,
                                          {'cost': cost}))

    G.add_edges_from(walk_link_outbound)

    G.add_edges_from(walk_link_inbound)

    # writing

    # Create the output folder if needed so the write cannot fail on a
    # missing directory after the graph has been built.
    os.makedirs('networks', exist_ok=True)

    nx.write_graphml(G, 'networks/' + date + '_' + period + '.graphml')
    stop_times_od.to_csv('networks/' + date + '_' + period + '.csv', index=False)