# Route planning with probability

_While constructing our predictive model, we first made a version of the route planning with dummy probabilities._

In [1]:
%%configure
{"conf": {
    "spark.app.name": "group100_final"
}}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9204,application_1589299642358_3771,pyspark,idle,Link,Link,
9205,application_1589299642358_3772,pyspark,idle,Link,Link,
9216,application_1589299642358_3783,pyspark,idle,Link,Link,
9217,application_1589299642358_3784,pyspark,idle,Link,Link,
9226,application_1589299642358_3793,pyspark,busy,Link,Link,
9230,application_1589299642358_3797,pyspark,idle,Link,Link,
9232,application_1589299642358_3799,pyspark,idle,Link,Link,
9236,application_1589299642358_3804,pyspark,idle,Link,Link,
9237,application_1589299642358_3805,pyspark,busy,Link,Link,
9239,application_1589299642358_3808,pyspark,busy,Link,Link,


In [2]:
# HDFS user name: the ORC inputs loaded below are read from /user/<username>/
username = 'mjouve'

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9272,application_1589299642358_3843,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from datetime import time, datetime, timedelta
from collections import defaultdict
import numpy as np

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Loading previously obtained dataframes

In [4]:
# Stops table; in this notebook only its `stop_id` column is read, to initialise the `times` dictionary
stops = spark.read.orc("/user/{}/zurich_stops.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Walkable stop pairs grouped by origin; compute_footpaths_dict reads its `id_1` and `destinations` columns
reachable_pair_grouped = spark.read.orc("/user/{}/reachable_pair_grouped.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# NOTE(review): `stop_times` is not referenced anywhere else in the visible notebook -- confirm this load is still needed
stop_times = spark.read.orc("/user/{}/stop_times_filtered.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Connexions table; sort_connexions reads its `stop_id_1`, `departure_time_1`, `stop_id_2`,
# `arrival_time_2` and `trip_id` columns
connexions = spark.read.orc("/user/{}/connexions.orc".format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Helper methods

In [8]:
def compute_footpaths_dict(reachable_pair_df):
    """
    Builds the footpaths dictionary used by our algorithm from the pyspark DataFrame
    of grouped reachable pairs.

    Every collected row contributes one entry: its `id_1` value is the key and its
    `destinations` value is stored unchanged. If the same `id_1` appears several times,
    the last collected row wins.
    """
    footpaths_by_stop = {}

    # collect() brings all the rows back as a local list before the dictionary is filled
    for pair in reachable_pair_df.collect():
        footpaths_by_stop[pair.id_1] = pair.destinations

    return footpaths_by_stop

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
def to_datetime(str_time):
    """
    Converts a time given as a string 'H:M:s' (H: hour, M: minute, s: second) to a datetime object.

    Each field is wrapped into its valid range with a modulo (hours % 24, minutes % 60,
    seconds % 60), so for example '25:61:61' becomes 01:01:01.
    The date part of the result is always the dummy date 2020-01-01.
    """
    raw_hour, raw_minute, raw_second = str_time.split(':')

    # NOTE(review): wrapping the hour puts times such as '24:30:00' early on the same
    # dummy day instead of after midnight -- confirm this is the intended handling.
    wrapped_fields = [int(raw_value) % upper_bound
                      for raw_value, upper_bound in ((raw_hour, 24), (raw_minute, 60), (raw_second, 60))]

    # positional arguments are year, month, day, then hour, minute, second
    return datetime(2020, 1, 1, *wrapped_fields)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
def sort_connexions(connexions_df, departure = True):
    """
    Collects a pyspark DataFrame of connexions and returns them as a sorted list of dictionaries.

    Each dictionary has the keys 'departure_location', 'departure_time', 'arrival_location',
    'arrival_time' and 'trip_id'; both times are converted with to_datetime.
    If departure is true the list is in ascending order of departure time,
    otherwise it is in descending order of arrival time.
    """
    connexion_records = []

    for row in connexions_df.collect():
        connexion_records.append({'departure_location': row.stop_id_1,
                                  'departure_time': to_datetime(row.departure_time_1),
                                  'arrival_location': row.stop_id_2,
                                  'arrival_time': to_datetime(row.arrival_time_2),
                                  'trip_id': row.trip_id})

    # The sort is stable in both directions, so ties keep the order in which rows were collected
    if departure:
        connexion_records.sort(key = lambda connexion: connexion['departure_time'])
    else:
        connexion_records.sort(key = lambda connexion: connexion['arrival_time'], reverse = True)

    return connexion_records

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Dummy method for returning a probability given a lambda and a time_left

In [12]:
# for lambdaa we pass the arrival_delay median
def proba_trip(lambdaa, time_left): #time left in seconds
    """
    Dummy probability of catching a connexion.

    A negative lambdaa yields the constant 0.95.
    Otherwise the result is 1 - exp(-lambdaa * time_left), i.e. the value at time_left
    of the cumulative distribution function of an exponential law of rate lambdaa.
    """
    if lambdaa < 0:
        return 0.95

    exponent = - lambdaa * time_left
    return 1 - np.exp(exponent)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Main algorithms

## Updating arrival times

In [13]:
def _transfer_probability(times, lambda_dict, boarding_stop, boarding_time):
    """
    Returns the pair (old_proba, new_proba) for boarding a trip at boarding_stop at boarding_time.

    old_proba is the probability currently stored in times for boarding_stop.
    new_proba is the probability, given by proba_trip, of catching the trip after the
    connexion that was used to reach boarding_stop.
    """
    previous_connexion = times[boarding_stop][2]

    # No connexion was needed to be at this stop, so there is no transfer to miss
    if previous_connexion is None:
        return 1, 1

    old_proba = times[boarding_stop][1]
    previous_arrival_time = previous_connexion['arrival_time']

    # When the stop was reached on foot, go back to the moment the walk started
    walking = previous_connexion.get('walking', None)
    if walking:
        previous_arrival_time = previous_arrival_time - timedelta(seconds = walking)

    lambdaa = lambda_dict[(boarding_stop, previous_arrival_time)]

    # seconds available between the previous arrival and the departure of the boarded trip
    time_left = (boarding_time - previous_arrival_time).seconds

    return old_proba, proba_trip(lambdaa, time_left)


def _relax_footpaths(times, footpaths, arrival_location, arrival_time, proba, trip_id):
    """
    Updates times for every stop that can be reached earlier by walking from arrival_location.
    """
    for destination in footpaths.get(arrival_location, None) or ():

        # each destination is indexed as (stop_id, walk duration); the duration is converted to float
        location = destination[0]
        walking_time = float(destination[1])

        new_arrival_time = arrival_time + timedelta(seconds = walking_time)

        # if it improves the current best arrival time, we update our dictionary
        if new_arrival_time < times[location][0]:
            times[location] = (new_arrival_time, proba, {'departure_location': arrival_location,
                                                         'departure_time': arrival_time,
                                                         'arrival_location': location,
                                                         'arrival_time': new_arrival_time,
                                                         'trip_id': trip_id,
                                                         'walking': walking_time})


def updates_times_dict_given_departure_with_proba(times, sorted_connexions, footpaths, lambda_dict, departure_location, departure_time, desired_probability, min_transfer_seconds = 120):
    """
    Scans the connexions once and updates, in place, the best known arrival time of every stop
    together with the probability of the journey leading to it.

    times: dictionary stop_id -> (arrival time, probability, connexion used to get there or None).
           It must already contain an entry for every stop that appears in the connexions
           and in the footpaths. It is modified in place.
    sorted_connexions: list of connexion dictionaries with the keys 'departure_location',
           'departure_time', 'arrival_location', 'arrival_time' and 'trip_id',
           expected in ascending order of departure time.
    footpaths: dictionary stop_id -> iterable of destinations, each indexed as
           (stop_id, walking duration in seconds).
    lambda_dict: mapping (stop_id, arrival time at that stop) -> lambda handed to proba_trip.
    departure_location: stop_id (str) where the journey starts.
    departure_time: datetime object at which the journey starts.
    desired_probability: a connexion is only used when the probability of catching its trip
           is strictly greater than this value.
    min_transfer_seconds: minimum number of seconds required at a stop before boarding a trip.

    Returns nothing.

    The probability of catching a trip is always computed at the stop and time where the trip
    was boarded, also for the later connexions of the same trip.

    NOTE(review): the threshold is compared with the probability of the last transfer only,
    while the cumulative probability is what gets stored -- confirm this is intended.
    """
    min_transfer = timedelta(seconds = min_transfer_seconds)

    times[departure_location] = (departure_time, 1, None)

    # For each trip already taken, the stop and the time where it was first boarded
    trips_taken = {}

    # Iterate over connexions in sorted order
    for c in sorted_connexions:

        trip_id = c['trip_id']
        arrival_location = c['arrival_location']
        arrival_time = c['arrival_time']

        boarding = trips_taken.get(trip_id)
        already_on_trip = boarding is not None

        if not already_on_trip:

            # the trip can only be boarded here if we are at the stop early enough
            if times[c['departure_location']][0] + min_transfer > c['departure_time']:
                continue

            boarding = (c['departure_location'], c['departure_time'])
            trips_taken[trip_id] = boarding

        # nothing to do unless this connexion improves the arrival time of its arrival stop
        if not arrival_time < times[arrival_location][0]:
            continue

        old_proba, new_proba = _transfer_probability(times, lambda_dict, boarding[0], boarding[1])

        if not new_proba > desired_probability:
            continue

        proba = old_proba * new_proba

        if already_on_trip:
            # describe the whole ride, from the stop where the trip was boarded
            connexion_used = {'departure_location': boarding[0],
                              'departure_time': boarding[1],
                              'arrival_location': arrival_location,
                              'arrival_time': arrival_time,
                              'trip_id': trip_id}
        else:
            connexion_used = c

        times[arrival_location] = (arrival_time, proba, connexion_used)

        _relax_footpaths(times, footpaths, arrival_location, arrival_time, proba, trip_id)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Constructing the paths

In [14]:
def backward_proba(departure_stop, arrival_stop, times):
    """
    Returns the list of paths taken from departure_stop to arrival_stop, in travel order,
    given the times dictionary computed.

    times maps a stop_id to (arrival time, probability, connexion used to get there or None).
    The route is rebuilt backwards, following the 'departure_location' of each stored connexion
    from arrival_stop until departure_stop is reached. Each step is described by one string.

    Returns an empty list when departure_stop and arrival_stop are the same stop.
    Raises ValueError when the chain of connexions stops before departure_stop (no route was
    found) or when it comes back to a stop already visited.
    """
    paths = []
    visited_stops = set()
    current_stop = arrival_stop

    while current_stop != departure_stop:

        # a consistent chain never visits a stop twice; stop instead of looping forever
        if current_stop in visited_stops:
            raise ValueError('Inconsistent times dictionary: stop {s} is visited twice'.format(s = current_stop))
        visited_stops.add(current_stop)

        current_stop_data = times[current_stop]
        proba = current_stop_data[1]
        current_connexion = current_stop_data[2]

        # a stop without connexion that is not the departure stop was never reached
        if current_connexion is None:
            raise ValueError('No route found from {d} to {a}'.format(d = departure_stop, a = arrival_stop))

        previous_stop = current_connexion['departure_location']
        arrival_location = current_connexion['arrival_location']

        walking = current_connexion.get('walking', None)

        # the presence of the 'walking' key marks a footpath, even for a zero-second walk
        if walking is not None:
            path = 'Walking during {s}s'.format(s = int(walking)) + ' from {d} to {a}'.format(d = previous_stop, a = arrival_location)

        else:
            path = 'From {d_l} (at {d_t}) to {a_l} (at {a_t}) using trip: {t}. Current probability = {p}'.format(d_l = previous_stop,
                                                                                      d_t = current_connexion['departure_time'].time(),
                                                                                      a_l = arrival_location,
                                                                                      a_t = current_connexion['arrival_time'].time(),
                                                                                      t = current_connexion['trip_id'],
                                                                                      p = proba)

        paths.append(path)

        current_stop = previous_stop

    # the steps were collected from the arrival back to the departure
    return paths[::-1]

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Running the algorithm

In [15]:
# Collect the grouped reachable pairs into a plain dictionary: origin stop id -> destinations
footpaths = compute_footpaths_dict(reachable_pair_grouped)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Collect every connexion and sort by ascending departure time (departure = True is the default)
sorted_connexions = sort_connexions(connexions)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### defaultdict storing the lambda parameters for the prediction

_Because it will always return -1, each probability returned by the method `proba_trip` will be 0.95._

In [17]:
# Any key looked up returns lambda = -1, the negative value for which proba_trip returns the constant 0.95
lambda_dict = defaultdict(lambda: -1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Every stop starts as "not reached yet": the placeholder arrival time (2020-01-06 23:59:59) is later
# than any time on the dummy date 2020-01-01 used for the connexions and for the departure below,
# so the first real arrival always compares as earlier. Probability starts at 1, with no connexion.
times = {
    row.stop_id: (datetime(2020, 1, 6, 23, 59, 59), 1, None)
    for row in stops.select(stops.stop_id).collect()
}


# Parameters of the query
departure_stop = '8503000'
arrival_stop = '8591049'
departure_time = '12:00:00'
desired_proba = 0.9


# Split 'H:M:s' into its three integer fields
hour, minute, second = map(int, departure_time.split(':'))

# Same dummy date as the one to_datetime gives to the connexions
departure_time_datetime = datetime(2020, 1, 1, hour, minute, second)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Run the scan. `times` is modified in place and nothing is returned, so re-run the cell
# that initialises `times` before launching another query.
updates_times_dict_given_departure_with_proba(times, sorted_connexions, footpaths, lambda_dict, 
                                              departure_stop,
                                              departure_time_datetime,
                                              desired_proba)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Rebuild the route for the same pair of stops the scan was run for, instead of repeating
# the stop ids by hand (they would silently go stale when the query cell is edited)
paths = backward_proba(departure_stop, arrival_stop, times)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Show the route, one step per line, from the departure stop to the arrival stop
for path in paths:
    print(path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

From 8503000 (at 12:05:00) to 8503006 (at 12:11:00) using trip: 32.TA.80-159-Y-j19-1.8.H. Current probability = 1
Walking during 72s from 8503006 to 8580449
From 8580449 (at 12:15:00) to 8591049 (at 12:24:00) using trip: 1914.TA.26-11-A-j19-1.27.R. Current probability = 0.95

# Next step:

_Now that our algorithm is ready to use probabilities, let's try to build our predictive model. You can see our thought process in the `predective_model.ipynb` notebook._