# Route planning: finding the top K routes

_Now that we can compute a single path, we want to offer several alternatives. We achieve this by keeping an array of arrival times (instead of a single one) for the final stop._

In [1]:
%%configure
{"conf": {
    "spark.app.name": "group100_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9346,application_1589299642358_3926,pyspark,idle,Link,Link,
9363,application_1589299642358_3943,pyspark,idle,Link,Link,
9369,application_1589299642358_3949,pyspark,idle,Link,Link,
9376,application_1589299642358_3956,pyspark,idle,Link,Link,
9379,application_1589299642358_3959,pyspark,idle,Link,Link,
9380,application_1589299642358_3960,pyspark,idle,Link,Link,
9382,application_1589299642358_3962,pyspark,idle,Link,Link,
9383,application_1589299642358_3963,pyspark,idle,Link,Link,
9384,application_1589299642358_3964,pyspark,idle,Link,Link,
9385,application_1589299642358_3965,pyspark,idle,Link,Link,


In [2]:
# Initialization

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9402,application_1589299642358_3985,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# User name inserted into the "/user/<username>/..." paths of the ORC files read below.
username = 'mjouve'

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from datetime import time, datetime, timedelta
from collections import defaultdict
import numpy as np

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Loading previously obtained dataframes

In [5]:
# Stops DataFrame (read from ORC); its stop_id column is used later to initialize the times dictionary.
stops = spark.read.orc("/user/{}/zurich_stops.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Grouped reachable pairs (read from ORC); compute_footpaths_dict reads the id_1 and destinations fields of each row.
reachable_pair_grouped = spark.read.orc("/user/{}/reachable_pair_grouped.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Filtered stop times (read from ORC).
# NOTE(review): not referenced anywhere else in the visible part of this notebook - confirm it is still needed.
stop_times = spark.read.orc("/user/{}/stop_times_filtered.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Connexions DataFrame (read from ORC); sort_connexions reads stop_id_1, departure_time_1,
# stop_id_2, arrival_time_2 and trip_id from each row.
connexions = spark.read.orc("/user/{}/connexions.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Helper methods

In [9]:
def compute_footpaths_dict(reachable_pair_df):
    """
    Build the footpaths dictionary used by the routing algorithm.

    Collects every row of the grouped reachable-pairs pyspark DataFrame on the
    driver and maps each row's id_1 to its destinations. If the same id_1
    appears in several rows, the last collected row wins.
    """
    footpaths_by_origin = {}

    for pair in reachable_pair_df.collect():
        footpaths_by_origin[pair.id_1] = pair.destinations

    return footpaths_by_origin

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
def to_datetime(str_time):
    """
    Convert a time string of the form 'H:M:s' (H: hour, M: minute, s: second) to a datetime object.

    Each field is converted to int and wrapped with a modulo (24 for the hour,
    60 for minute and second), so out-of-range values such as '25:00:00' wrap
    around instead of raising. The date part is a dummy (2020-01-01).
    """
    raw_hour, raw_minute, raw_second = str_time.split(':')

    # year, month and day are placeholders; only the time of day is meaningful
    return datetime(2020, 1, 1,
                    int(raw_hour) % 24,
                    int(raw_minute) % 60,
                    int(raw_second) % 60)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
def sort_connexions(connexions_df, departure = True):
    """
    Collect a pyspark DataFrame of connexions and return them as a sorted list of dictionaries.

    Each dictionary has the keys 'departure_location', 'departure_time',
    'arrival_location', 'arrival_time' and 'trip_id'; both times are converted
    with to_datetime. The list is in ascending order of departure time when
    departure is True, otherwise in descending order of arrival time.
    """
    legs = []

    for row in connexions_df.collect():
        legs.append({'departure_location': row.stop_id_1,
                     'departure_time': to_datetime(row.departure_time_1),
                     'arrival_location': row.stop_id_2,
                     'arrival_time': to_datetime(row.arrival_time_2),
                     'trip_id': row.trip_id})

    # list.sort is stable, like sorted(): ties keep their collected order
    if departure:
        legs.sort(key = (lambda leg: leg['departure_time']))
    else:
        legs.sort(key = (lambda leg: leg['arrival_time']), reverse = True)

    return legs

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Main algorithms

## Updating arrival times

In [12]:
def _build_leg(departure_location, departure_time, arrival_location, arrival_time, trip_id):
    """Build the connexion record stored next to an arrival time in the times dictionary."""
    return {'departure_location': departure_location,
            'departure_time': departure_time,
            'arrival_location': arrival_location,
            'arrival_time': arrival_time,
            'trip_id': trip_id}


def _record_arrival(times, location, arrival_time, leg, final_location, K):
    """
    Offer a new way of reaching `location` at `arrival_time` to the times dictionary.

    For final_location, arrivals are appended until K entries are stored.
    Otherwise (any other stop, or a full final array) the last entry, i.e. the
    latest stored arrival, is overwritten when the new arrival is strictly earlier.
    The array is re-sorted by arrival time after every change.
    """
    labels = times[location]

    if location == final_location and len(labels) < K:
        labels.append((arrival_time, leg))
    elif arrival_time < labels[-1][0]:
        labels[-1] = (arrival_time, leg)
    else:
        # neither room for another solution nor an improvement: nothing to do
        return

    labels.sort(key = (lambda label: label[0]))


def _relax_footpaths(times, footpaths, origin, origin_time, final_location, K):
    """Record the arrival obtained by walking from `origin` (reached at `origin_time`) to each footpath destination."""
    reachable_stops_walking = footpaths.get(origin, None)

    if not reachable_stops_walking:
        return

    for destination in reachable_stops_walking:

        # each destination is indexed as (stop_id, walk duration in seconds)
        location = destination[0]
        walking_time = float(destination[1])

        new_arrival_time = origin_time + timedelta(seconds = walking_time)

        leg = _build_leg(origin, origin_time, location, new_arrival_time,
                         'Walking during {s} seconds'.format(s = walking_time))

        _record_arrival(times, location, new_arrival_time, leg, final_location, K)


def updates_times_dict_given_departure_top_K(times, sorted_connexions, footpaths, departure_location, departure_time, final_location, K, min_transfer_seconds = 120):
    """
    Scan the connexions once and update the times dictionary in place,
    keeping up to K arrivals for final_location and one for every other stop.

    Parameters:
        times: dict mapping each stop_id to a list of (arrival time, connexion record or None)
               tuples, initialized with one placeholder entry per stop
        sorted_connexions: list of connexion dicts (keys 'departure_location', 'departure_time',
               'arrival_location', 'arrival_time', 'trip_id') in ascending order of departure time
        footpaths: dict mapping a stop_id to an iterable of (stop_id, walking seconds) destinations
        departure_location: stop_id (str) where the journey starts
        departure_time: datetime at which the traveller is at departure_location
        final_location: stop_id for which K solutions are stored
        K: number of arrivals kept for final_location
        min_transfer_seconds: minimum delay, in seconds, between the recorded arrival at a stop
               and the departure of a connexion boarded there (default 120, the value
               previously hard-coded)

    Returns None; the result is the mutated times dictionary.

    NOTE(review): nothing prevents a stored solution from passing through final_location and
    coming back to it (for instance through a footpath out of the final stop and back), so
    the K solutions may contain such detours.
    """

    # the departure stop is reached at departure_time without any connexion
    times[departure_location][0] = (departure_time, None)

    min_transfer = timedelta(seconds = min_transfer_seconds)

    # For each trip already boarded, the first (stop, time) at which it could be boarded.
    # A plain dict with .get() is used so that looking up an unknown trip does not insert a key.
    boarded_at = {}

    # Iterate over connexions in sorted order
    for c in sorted_connexions:

        trip_id = c['trip_id']
        connexion_departure_location = c['departure_location']
        connexion_departure_time = c['departure_time']
        connexion_arrival_location = c['arrival_location']
        connexion_arrival_time = c['arrival_time']

        boarding = boarded_at.get(trip_id)

        if boarding is None:

            # the connexion is only usable if its departure stop is reached early enough
            if times[connexion_departure_location][0][0] + min_transfer > connexion_departure_time:
                continue

            boarding = (connexion_departure_location, connexion_departure_time)
            boarded_at[trip_id] = boarding

        # the stored record describes the whole ride: from the boarding stop to this arrival
        leg = _build_leg(boarding[0], boarding[1],
                         connexion_arrival_location, connexion_arrival_time, trip_id)

        _record_arrival(times, connexion_arrival_location, connexion_arrival_time, leg, final_location, K)

        # then try every stop reachable by walking from the arrival stop
        _relax_footpaths(times, footpaths, connexion_arrival_location, connexion_arrival_time, final_location, K)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Constructing the paths

In [13]:
def backward_K(departure_stop, arrival_stop, times, K):
    """
    Rebuild readable routes from departure_stop to arrival_stop out of the computed times dictionary.

    Parameters:
        departure_stop: stop_id where the journey starts
        arrival_stop: stop_id whose stored arrivals are turned into routes
        times: dict mapping each stop_id to a list of (arrival time, connexion record or None)
        K: maximum number of stored arrivals of arrival_stop to consider

    Returns a list of at most K strings, one per route, each made of one line per leg
    in travel order. Stored entries that carry no connexion (placeholders left when
    fewer than K solutions were found) and chains that end before reaching
    departure_stop are skipped instead of raising.
    """

    routes = []

    # slicing tolerates an arrival array holding fewer than K entries
    for _, last_leg in times[arrival_stop][:K]:

        # placeholder entry: no route behind it
        if last_leg is None:
            continue

        legs = []
        leg = last_leg

        # walk backwards, leg by leg, until the departure stop is reached
        while leg is not None:

            origin = leg['departure_location']
            arrival_location = leg['arrival_location']
            trip = leg['trip_id']

            if 'Walking' in trip:
                description = trip + ' from {d} to {a}'.format(d = origin, a = arrival_location)

            else:
                description = 'From {d_l} (at {d_t}) to {a_l} (at {a_t}) using trip: {t}'.format(
                    d_l = origin,
                    d_t = leg['departure_time'].time(),
                    a_l = arrival_location,
                    a_t = leg['arrival_time'].time(),
                    t = trip)

            legs.append(description)

            if origin == departure_stop:
                break

            # continue from the best known way of reaching the origin of this leg
            leg = times[origin][0][1]

        else:
            # the chain stopped on a stop without a recorded connexion: incomplete route
            continue

        # legs were collected from the arrival backwards
        routes.append("\n".join(reversed(legs)))

    return routes

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Running the algorithm

In [14]:
# Collect the grouped reachable pairs into a dictionary mapping each id_1 to its destinations.
footpaths = compute_footpaths_dict(reachable_pair_grouped)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Collect the connexions as a list of dictionaries, in ascending order of departure time (departure defaults to True).
sorted_connexions = sort_connexions(connexions)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# initialize time dictionary
times_top_K = dict(((row.stop_id, [(datetime(year=2020, month=1, day=6, hour=23, minute=59, second = 59), None)])
                    for row in stops.select(stops.stop_id).collect()))


departure_stop = '8503000'
arrival_stop = '8591049'
departure_time = '12:00:00'
K = 4


hour, minute, second = departure_time.split(':')
hour = int(hour)
minute = int(minute)
second = int(second)
    
departure_time_datetime = datetime(year=2020, month=1, day=1, hour=hour, minute=minute, second=second)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Run the scan: times_top_K is updated in place (the function returns nothing),
# storing up to K arrivals for arrival_stop.
updates_times_dict_given_departure_top_K(times_top_K, sorted_connexions, footpaths, departure_stop, departure_time_datetime, arrival_stop, K)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Turn the arrivals stored for arrival_stop into printable route descriptions (one string per route).
routes = backward_K(departure_stop, arrival_stop, times_top_K, K)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Print every route under a 1-based heading, followed by a blank separator
for number, route in enumerate(routes, start = 1):
    print('Route {nb}:'.format(nb = number))
    print(route)
    print('\n')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Route 1:
From 8503000 (at 12:05:00) to 8503006 (at 12:11:00) using trip: 32.TA.80-159-Y-j19-1.8.H
Walking during 72.0 seconds from 8503006 to 8580449
From 8580449 (at 12:15:00) to 8591049 (at 12:24:00) using trip: 1914.TA.26-11-A-j19-1.27.R


Route 2:
From 8503000 (at 12:07:00) to 8503310 (at 12:17:00) using trip: 20.TA.26-9-A-j19-1.2.H
Walking during 70.0 seconds from 8503310 to 8590620
From 8590620 (at 12:23:00) to 8591049 (at 12:29:00) using trip: 168.TA.26-12-A-j19-1.2.H


Route 3:
From 8503000 (at 12:05:00) to 8503006 (at 12:11:00) using trip: 32.TA.80-159-Y-j19-1.8.H
Walking during 72.0 seconds from 8503006 to 8580449
From 8580449 (at 12:15:00) to 8591049 (at 12:24:00) using trip: 1914.TA.26-11-A-j19-1.27.R
Walking during 176.0 seconds from 8591049 to 8591297
Walking during 176.0 seconds from 8591297 to 8591049


Route 4:
From 8503000 (at 12:05:00) to 8503006 (at 12:11:00) using trip: 32.TA.80-159-Y-j19-1.8.H
Walking during 72.0 seconds from 8503006 to 8580449
From 8580449 (at 12

# Next step:

_Now that our algorithm can find multiple routes, we wanted to include a dummy confidence estimation using some predefined probabilities while we were developing our predictive model. You can see this algorithm in the notebook `route_planning_proba.ipynb`._