# ROBOUST ROUTE PLANNER

### Team: SAFJ

### Members: Serif Soner Serbest - Jelena Banjac - Fatine Benhsain -  Asli Yorusun

This notebook calculates possible routes according to the system inputs

### TABLE OF CONTENTS

#### 1. Data
In this section we import modify the dataset 
#### 2. Confidence Calculation
In this section we build necessary functions and dataframes to calculate confidence for a given route
#### 3. Connection Graph
In this section we calculate the connection graph that shows the reachability of any station pairs.
#### 4. Timetable
In this section we create timetable that provide information about departure and arrival stations and times for a given day
#### 5. Route
In this section we calculate each possible routes by using our timetable and connection graph and provide confidence level of the route with confidence calculation

### System Inputs

In [1]:
date = '2018-06-26'
hour = '15:00:00'
departure_station = 'Zürich HB'
arrival_station = 'Zürich Flughafen'
# min_confidence_level = 0.9

### System Parameters

In [2]:
# change from one station to another in mins
transfer_delay = 5 

# average walking speed is assumed to be 4.5 km/h
# ref: https://www.quora.com/What-is-the-average-walking-speed-of-a-human
# in minutes
max_walking_time = 5

# m/min which corresponds to 4.5 km/h
human_speed = 75 

# as meters
max_walking_distance = human_speed * max_walking_time 

### Dependencies

In [3]:
import pickle
import socket
import getpass
import os

import ast
import numpy as np
from scipy import stats
import pandas as pd

import networkx as nx
from datetime import datetime, timedelta

import math
from math import sin, cos, sqrt, atan2, radians

import random
import json
import matplotlib.patches as mpatches

import matplotlib.pyplot as plt
%matplotlib inline

Change Layout to be able to see spark dataframes

In [4]:
plt.rcParams['figure.figsize'] = (10,6)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')

def fix_layout(width:int=95):
    from IPython.core.display import display, HTML
    display(HTML('<style>.container { width:' + str(width) + '% !important; }</style>'))
    
fix_layout()

### Spark Setup

In [5]:
username = getpass.getuser()

SPARK_LOCAL = False

# on the laptop
if not 'iccluster' in socket.gethostname():
    # set this to the base spark directory on your system
    SPARK_LOCAL = True
    
    if username == "fatine":
        spark_home = '/home/fatine/spark-2.4.1-bin-hadoop2.7'
        
        try:
            import findspark
            findspark.init(spark_home)
        except ModuleNotFoundError as e:
            print('Info: {}'.format(e))
    elif username == "soner":
        spark_home = '/home/soner/Desktop/DSLAB2019/spark-2.4.1-bin-hadoop2.7'
        
        try:
            import findspark
            findspark.init(spark_home)
        except ModuleNotFoundError as e:
            print('Info: {}'.format(e))
            
    elif username == "jelena":
        pass
        
    
        
# cluster
if username == "jbanjac":
    ROOT_PATH = "/home/jbanjac/robust-journey-planning"
    os.environ['PYSPARK_PYTHON'] = '/opt/anaconda3/bin/python'
# local
elif username == "jelena":
    ROOT_PATH = os.getcwd()
    os.environ['PYSPARK_PYTHON'] = '/home/jelena/anaconda3/bin/python'

In [6]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import unix_timestamp, udf, desc
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import VectorAssembler,Normalizer,  PCA,VectorIndexer
from pyspark.ml.clustering import KMeans, BisectingKMeans, GaussianMixture

In [7]:
if username == "jelena":
    spark = (SparkSession \
                .builder \
                .appName('sbb-{0}'.format(getpass.getuser())) \
                .master('local[4]') \
                .config('spark.driver.memory', '10g') \
                .config('spark.executor.memory', '4g') \
                .config('spark.executor.instances', '5') \
                .config('spark.port.maxRetries', '100') \
                .getOrCreate())
    CLUSTER_URL = "hdfs://iccluster042.iccluster.epfl.ch:8020"

elif SPARK_LOCAL:
    spark = SparkSession \
                .builder \
                .master("local") \
                .appName("roboust-journey-planing") \
                .config("spark.driver.host", "localhost") \
                .getOrCreate()
    CLUSTER_URL = ""
else:
    spark = SparkSession \
                .builder \
                .master("yarn") \
                .appName('sbb-{0}'.format(getpass.getuser())) \
                .config('spark.executor.memory', '4g') \
                .config('spark.executor.instances', '5') \
                .config('spark.port.maxRetries', '100') \
                .getOrCreate()
    CLUSTER_URL = ""

In [8]:
sc = spark.sparkContext
spark

## DATA

Read the dataframe of given day

In [22]:
departure_day = datetime.strptime(date, '%Y-%m-%d')

if SPARK_LOCAL:
    if username == "fatine":
        df_departure_day = spark.read.csv(f"/home/fatine/Documents/Cours Semestre Printemps/Lab in DS/{departure_day.strftime('%Y-%m')}/{departure_day.strftime('%Y-%m-%d')}istdaten.csv", header=True, sep=';').cache()
    elif username == "soner":
        df_departure_day = spark.read.csv(f"/home/soner/Desktop/DSLAB2019/Project/{departure_day.strftime('%Y-%m')}/{departure_day.strftime('%Y-%m-%d')}istdaten.csv", header=True, sep=';').cache()
    elif username == "jelena":
        df_departure_day = spark.read.csv(f"{CLUSTER_URL}/datasets/sbb/{str(departure_day.year)}/{str(departure_day.month).zfill(2)}/{departure_day.strftime('%Y-%m-%d')}istdaten.csv.bz2", sep=';', header=True).cache()

else:
    df_departure_day = spark.read.csv(f"{CLUSTER_URL}/datasets/sbb/{str(departure_day.year)}/{str(departure_day.month).zfill(2)}/{departure_day.strftime('%Y-%m-%d')}istdaten.csv.bz2", sep=';', header=True).cache()


To make the work easier for us, we translated our column names to English.

In [23]:
df_departure_day = df_departure_day.withColumnRenamed("BETRIEBSTAG", "TRIP_DATE")\
                                    .withColumnRenamed("FAHRT_BEZEICHNER", "TRIP_ID")\
                                    .withColumnRenamed("BETREIBER_ID", "OPERATOR_ID")\
                                    .withColumnRenamed("BETREIBER_ABK", "OPERATOR_ABK")\
                                    .withColumnRenamed("BETREIBER_NAME", "OPERATOR_NAME")\
                                    .withColumnRenamed("PRODUKT_ID", "TRANSPORT_TYPE")\
                                    .withColumnRenamed("LINIEN_ID", "TRAIN_ID")\
                                    .withColumnRenamed("LINIEN_TEXT", "TRAIN_NAME")\
                                    .withColumnRenamed("UMLAUF_ID", "CIRCULATING_ID")\
                                    .withColumnRenamed("VERKEHRSMITTEL_TEXT", "SERVICE_TYPE")\
                                    .withColumnRenamed("ZUSATZFAHRT_TF", "ADDITIONAL_DRIVING")\
                                    .withColumnRenamed("FAELLT_AUS_TF", "FAILED")\
                                    .withColumnRenamed("BPUIC", "STATION_ID")\
                                    .withColumnRenamed("HALTESTELLEN_NAME", "STATION_NAME")\
                                    .withColumnRenamed("ANKUNFTSZEIT", "SCHEDULE_ARRIVE_TIME")\
                                    .withColumnRenamed("AN_PROGNOSE", "ACTUAL_ARRIVE_TIME")\
                                    .withColumnRenamed("AN_PROGNOSE_STATUS", "ACTUAL_ARRIVE_TIME_STATUS")\
                                    .withColumnRenamed("ABFAHRTSZEIT", "SCHEDULE_DEPART_TIME")\
                                    .withColumnRenamed("AB_PROGNOSE", "ACTUAL_DEPART_TIME")\
                                    .withColumnRenamed("AB_PROGNOSE_STATUS", "ACTUAL_DEPART_TIME_STATUS")\
                                    .withColumnRenamed("DURCHFAHRT_TF", "PASSES_BY")

In [24]:
df_departure_day.printSchema()

root
 |-- TRIP_DATE: string (nullable = true)
 |-- TRIP_ID: string (nullable = true)
 |-- OPERATOR_ID: string (nullable = true)
 |-- OPERATOR_ABK: string (nullable = true)
 |-- OPERATOR_NAME: string (nullable = true)
 |-- TRANSPORT_TYPE: string (nullable = true)
 |-- TRAIN_ID: string (nullable = true)
 |-- TRAIN_NAME: string (nullable = true)
 |-- CIRCULATING_ID: string (nullable = true)
 |-- SERVICE_TYPE: string (nullable = true)
 |-- ADDITIONAL_DRIVING: string (nullable = true)
 |-- FAILED: string (nullable = true)
 |-- STATION_ID: string (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- SCHEDULE_ARRIVE_TIME: string (nullable = true)
 |-- ACTUAL_ARRIVE_TIME: string (nullable = true)
 |-- ACTUAL_ARRIVE_TIME_STATUS: string (nullable = true)
 |-- SCHEDULE_DEPART_TIME: string (nullable = true)
 |-- ACTUAL_DEPART_TIME: string (nullable = true)
 |-- ACTUAL_DEPART_TIME_STATUS: string (nullable = true)
 |-- PASSES_BY: string (nullable = true)



Filter Data by stations around Zurich obtained in Distance Analysis notebook

In [25]:
# load Zurich stations set
with open('distance/zurich_stations_set.pickle', 'rb') as handle:
    zurich_stations_set = pickle.load(handle)
    
df_zurich = df_departure_day.where(F.col("STATION_ID").isin(zurich_stations_set)).cache()

Drop rows with no arrival time and departure time information

In [26]:
# We check the null values in the departure and arrival times : 
df_zurich.select([pyspark.sql.functions.count(pyspark.sql.functions.when(F.col(c).isNull(), c)).alias(c) for c in ["SCHEDULE_ARRIVE_TIME", "SCHEDULE_DEPART_TIME"]]).show()

# For now we drop the connections with a null time
df_zurich = df_zurich.na.drop(subset=["SCHEDULE_ARRIVE_TIME", "SCHEDULE_DEPART_TIME"])

+--------------------+--------------------+
|SCHEDULE_ARRIVE_TIME|SCHEDULE_DEPART_TIME|
+--------------------+--------------------+
|               16399|               16413|
+--------------------+--------------------+



Filter Data based on the decisions taken in Data_Analysis notebook

In [27]:
df_zurich = df_zurich.filter("TRANSPORT_TYPE is not null")
df_zurich = df_zurich.filter(df_zurich.ADDITIONAL_DRIVING==False)
df_zurich = df_zurich.filter(df_zurich.FAILED==False)

@F.udf
def fix_station_name(station_name):
    fixed_station_name = station_name.replace("�", "ü")
    return fixed_station_name

df_zurich = df_zurich.withColumn('STATION_NAME', fix_station_name(df_zurich.STATION_NAME))
df_zurich = df_zurich.filter(df_zurich.PASSES_BY == False)

df_zurich = df_zurich.filter(df_zurich.ACTUAL_ARRIVE_TIME_STATUS != "UNBEKANNT")
df_zurich = df_zurich.filter(df_zurich.ACTUAL_DEPART_TIME_STATUS != "UNBEKANNT")

df_zurich = df_zurich.filter("ACTUAL_ARRIVE_TIME is not null and SCHEDULE_ARRIVE_TIME is not null")
df_zurich = df_zurich.filter("ACTUAL_ARRIVE_TIME is not null and SCHEDULE_ARRIVE_TIME is not null")
df_zurich = df_zurich.filter("not(ACTUAL_DEPART_TIME is null and SCHEDULE_DEPART_TIME is null)")
df_zurich = df_zurich.filter("ACTUAL_DEPART_TIME is not null and SCHEDULE_DEPART_TIME is not null")

## CONFIDENCE CALCULATION

create helper functions

In [28]:
delay_arrive = F.unix_timestamp('ACTUAL_ARRIVE_TIME', format='dd.MM.yyyy HH:mm:ss') - F.unix_timestamp('SCHEDULE_ARRIVE_TIME', format='dd.MM.yyyy HH:mm')

delay_depart = F.unix_timestamp('ACTUAL_DEPART_TIME', format='dd.MM.yyyy HH:mm:ss') - F.unix_timestamp('SCHEDULE_DEPART_TIME', format='dd.MM.yyyy HH:mm')

@F.udf
def convert_to_min(delay_sec):
    minutes = math.ceil(delay_sec/60)
    return minutes

@F.udf
def convert_to_weekday_1(date):
    return str(datetime.strptime(date, '%d.%m.%Y').strftime('%w'))
@F.udf
def convert_to_weekday_2(date):
    return str(datetime.strptime(date, '%d.%m.%Y %H:%M:%S').strftime('%w'))
@F.udf
def convert_to_hour(date):
    return str(datetime.strptime(date, '%d.%m.%Y %H:%M:%S').hour)
@F.udf
def convert_to_month_1(date):
    return str(datetime.strptime(date, '%d.%m.%Y').month)
@F.udf
def convert_to_month_2(date):
    return str(datetime.strptime(date, '%d.%m.%Y %H:%M:%S').month)


create dataframe spesifically for delays

In [29]:
df_zurich_delays = df_zurich.withColumn("delay_arrive", delay_arrive).cache()
df_zurich_delays = df_zurich_delays.withColumn("delay_depart", delay_depart)
df_zurich_delays = df_zurich_delays.withColumn('delay_arrive', convert_to_min(df_zurich_delays.delay_arrive))
df_zurich_delays = df_zurich_delays.withColumn('delay_depart', convert_to_min(df_zurich_delays.delay_depart))
df_zurich_delays = df_zurich_delays.withColumn("TRIP_DATE_month",convert_to_month_1(df_zurich_delays['TRIP_DATE']))\
                            .withColumn('TRIP_DATE_week_day', convert_to_weekday_1(df_zurich_delays['TRIP_DATE']))
    
df_zurich_delays = df_zurich_delays.withColumn("ACTUAL_ARRIVE_TIME_month",convert_to_month_2(df_zurich_delays['ACTUAL_ARRIVE_TIME']))\
                            .withColumn('ACTUAL_ARRIVE_TIME_week_day', convert_to_weekday_2(df_zurich_delays['ACTUAL_ARRIVE_TIME']))\
                            .withColumn("ACTUAL_ARRIVE_TIME_hour",convert_to_hour(df_zurich_delays['ACTUAL_ARRIVE_TIME']))

df_zurich_delays = df_zurich_delays.withColumn("ACTUAL_DEPART_TIME_month",convert_to_month_2(df_zurich_delays['ACTUAL_DEPART_TIME']))\
                            .withColumn('ACTUAL_DEPART_TIME_week_day', convert_to_weekday_2(df_zurich_delays['ACTUAL_DEPART_TIME']))\
                            .withColumn("ACTUAL_DEPART_TIME_hour",convert_to_hour(df_zurich_delays['ACTUAL_DEPART_TIME']))


 

import clusters obtained in Data Analysis notebook

In [30]:
def get_cluster_num(s, clusters):
    for k, v in clusters.items():
        if s in clusters[k]:
            return k
    return None

def read_clusters(file_name):
    with open('clusters/' + file_name, 'r') as f:
        clusters = json.load(f)
    return clusters

cluster_SERVICE_TYPE = read_clusters('SERVICE_TYPE.json')
cluster_OPERATOR_ID = read_clusters('OPERATOR_ID.json')
cluster_STATION_ID = read_clusters('STATION_ID.json')

cluster_ACTUAL_ARRIVE_TIME_month = read_clusters('ACTUAL_ARRIVE_TIME_month.json')
cluster_ACTUAL_ARRIVE_TIME_hour = read_clusters('ACTUAL_ARRIVE_TIME_hour.json')
cluster_ACTUAL_ARRIVE_TIME_week_day = read_clusters('ACTUAL_ARRIVE_TIME_week_day.json')

cluster_ACTUAL_DEPART_TIME_month = read_clusters('ACTUAL_DEPART_TIME_month.json')
cluster_ACTUAL_DEPART_TIME_hour = read_clusters('ACTUAL_DEPART_TIME_hour.json')
cluster_ACTUAL_DEPART_TIME_week_day = read_clusters('ACTUAL_DEPART_TIME_week_day.json')

In [31]:
def arrive_distribution(arrive = df_zurich_full, date=None, time=None, trip_id=None, service_type=None, operator_id=None, transport_type=None, station_id=None):
    """ Get arrival delay probability and arrival delay distribution coefficients
    
    This method calculates the arrival delay probability and it calculates the exponential distribution coefficients
    
    Parameters
    ----------
    arrive: spark dataframe
        As default we get the full SBB dataset to calculate the probability
    date: string
        Date in format %d.%m.%Y
    time: string
        Time in format %H:%M
    trip_id: string
        Id of the trip
    service_type: string
        Type of service of the trip
    operator_id: string
        Operator id of the trip
    transport_type: string
        Transport type of the trip
    station_id: string
        Station ID
        
    Returns
    -------
    arrive_delay_distribution: tuple
        Coefficients of the exponential distribution
    arrive_delay_probability: float
        Probability of this arrival setting to be delayed
    """
    
    # extract date information
    if date:
        week_day = str(datetime.strptime(date, '%d.%m.%Y').strftime('%w'))
        month = str(datetime.strptime(date, '%d.%m.%Y').month)
    if time:
        hour = str(datetime.strptime(time, '%H:%M').hour)

    if trip_id:
        arrive = arrive.filter(arrive.TRIP_ID==trip_id).cache()
    if transport_type:
        arrive = arrive.filter(arrive.TRANSPORT_TYPE==transport_type).cache()
    
    oi_cn = get_cluster_num(operator_id, cluster_OPERATOR_ID)
    if oi_cn: arrive = arrive.where(arrive.OPERATOR_ID.isin(cluster_OPERATOR_ID[oi_cn])).cache()

    st_cn = get_cluster_num(service_type, cluster_SERVICE_TYPE)
    if st_cn: arrive = arrive.where(arrive.SERVICE_TYPE.isin(cluster_SERVICE_TYPE[st_cn])).cache()

        
    si_cn = get_cluster_num(station_id, cluster_STATION_ID)
    if si_cn: arrive = arrive.where(arrive.STATION_ID.isin(cluster_STATION_ID[si_cn])).cache()
    
    aat_m_cn = get_cluster_num(month, cluster_ACTUAL_ARRIVE_TIME_month)
    if aat_m_cn: arrive = arrive.where(arrive.ACTUAL_ARRIVE_TIME_month.isin(cluster_ACTUAL_ARRIVE_TIME_month[aat_m_cn])).cache()
        
    aat_h_cn = get_cluster_num(hour, cluster_ACTUAL_ARRIVE_TIME_hour)
    if aat_h_cn: arrive = arrive.where(arrive.ACTUAL_ARRIVE_TIME_hour.isin(cluster_ACTUAL_ARRIVE_TIME_hour[aat_h_cn])).cache()
        
    aat_wd_cn = get_cluster_num(week_day, cluster_ACTUAL_ARRIVE_TIME_week_day)
    if aat_wd_cn: arrive = arrive.where(arrive.ACTUAL_ARRIVE_TIME_week_day.isin(cluster_ACTUAL_ARRIVE_TIME_week_day[aat_wd_cn])).cache()

    # sum of all arrival trips
    sum_arrive_trips = arrive.count()

    # sum of all delayed trips
    arrive = arrive.filter('delay_arrive > 0')
    sum_delay_arrive_trips = arrive.count()

    # get probabilities of arrival delay
    arrive_delay_probability = sum_delay_arrive_trips/sum_arrive_trips
    data = arrive.select('delay_arrive').toPandas()
    arrive_delay_distribution = stats.expon.fit(np.array(list(map(int, data['delay_arrive'].values))), floc=0, scale=1)

    return arrive_delay_distribution, arrive_delay_probability

### Probability of catching the next trip

Here we are calculating the probability of catching the next trip. We know the distribution of arrival delays, as well as the time difference between this and the next trip that make a connection. Also, we use the probability of arrival delay to be delayed. After the exploring following papers: 
- [Stochastic Modelling of Train Delays and Delay Propagation in Stations](https://repository.tudelft.nl/islandora/object/uuid:caa72522-26b1-4088-afc0-59c6e5c346f6/datastream/OBJ/download)
- [Adi Botea, Stefano Braghin, "Contingent versus Deterministic Plans in Multi-Modal Journey Planning". ICAPS 2015: 268-272](https://dl.acm.org/citation.cfm?id=3038699)
- [Mathematical modeling and methods for rescheduling
trains under disrupted operations](https://tel.archives-ouvertes.fr/tel-00453640/document)
- [Adi Botea, Stefano Braghin, "Contingent versus Deterministic Plans in Multi-Modal Journey Planning". ICAPS 2015: 268-272.](https://dl.acm.org/citation.cfm?id=3038699)
- [Adi Botea, Evdokia Nikolova, Michele Berlingerio, "Multi-Modal Journey Planning in the Presence of Uncertainty". ICAPS 2013.](https://www.aaai.org/ocs/index.php/ICAPS/ICAPS13/paper/view/6023)

we decided to use the next way of calculating the probability:

In [32]:
def calculate_probability_to_catch(arrive_delay_distribution, timediff, arrive_delay_probability):
    """ Calculating the probability of catching the next trip
    
    Parameters
    ----------
    arrive_delay_distribution: tuple
        Coefficients of arrive delay exponential distribution
    timediff: float
        Time difference between this and next trip
    arrive_delay_probability: float
        Probability of arrival delay
    
    Returns
    -------
    p: float
        Probability of catching the next trip
    """
    timediff = int(timediff)
    quantile = np.arange (0, timediff + 1, 1) 
    R = stats.expon.cdf(quantile, loc = 0, scale = arrive_delay_distribution[1])
    
    if arrive_delay_probability == 0:
        p = 1
    else:
        p = (1 - arrive_delay_probability) + arrive_delay_probability * R[timediff]

    return p

In [33]:
def calculate_confidence(connections_info):
    """ Calculate the confidence of the whole route
    
    Parameters
    ----------
    connections_info: dict
        Information of how one found route is connected
        
    Returns
    -------
    p: float
        Confidence of this connection to succeed
    """
    p = 1.0
    
    for idx in range(len(connections_info)-1):
        
        arrive_delay_distribution, arrive_delay_probability = None, None

        if connections_info[idx]['transport_type'] != 'walk' and connections_info[idx+1]['transport_type'] != 'walk':

            arrive_delay_distribution, arrive_delay_probability = arrive_distribution(date=connections_info[idx]['arrive_date'].strftime('%d.%m.%Y'),
                                                          time=connections_info[idx]['arrive_date'].strftime('%H:%M'),
                                                          trip_id=connections_info[idx]['trip_id'],
                                                          service_type=connections_info[idx]['service_type'],
                                                          operator_id=connections_info[idx]['operator_id'],
                                                          transport_type=connections_info[idx]['transport_type'],
                                                          station_id=connections_info[idx]['arrive_station_id'])

            timediff = int((connections_info[idx+1]['depart_date']- connections_info[idx]['arrive_date']).seconds)/60

            p *= calculate_probability_to_catch(arrive_delay_distribution, timediff, arrive_delay_probability)
            
        elif connections_info[idx]['transport_type'] == 'walk' and connections_info[idx+1]['transport_type'] != 'walk':
            if idx == 0:
                arrive_delay_probability = 0
                arrive_delay_distribution = stats.expon.fit(np.zeros(1000), floc=0, scale=1)
            
            timediff = float((connections_info[idx+1]['depart_date'] - connections_info[idx]['arrive_date']).seconds)/60
            
            p *= calculate_probability_to_catch(arrive_delay_distribution, timediff, arrive_delay_probability)
        
        elif connections_info[idx]['transport_type'] != 'walk' and connections_info[idx+1]['transport_type'] == 'walk':
            arrive_delay_distribution, arrive_delay_probability = arrive_distribution(date=connections_info[idx]['arrive_date'].strftime('%d.%m.%Y'),
                                                          time=connections_info[idx]['arrive_date'].strftime('%H:%M'),
                                                          trip_id=connections_info[idx]['trip_id'],
                                                          service_type=connections_info[idx]['service_type'],
                                                          operator_id=connections_info[idx]['operator_id'],
                                                          transport_type=connections_info[idx]['transport_type'],
                                                          station_id=connections_info[idx]['arrive_station_id'])
            timediff = float((connections_info[idx+1]['depart_date']- connections_info[idx]['arrive_date']).seconds)/60

            p *= calculate_probability_to_catch(arrive_delay_distribution, timediff, arrive_delay_probability)
        
    return p

# CONNECTION GRAPH

In [34]:
from pyspark.sql.types import TimestampType, IntegerType, StructType, ArrayType, StructField

In [35]:
df_zurich.printSchema()

root
 |-- TRIP_DATE: string (nullable = true)
 |-- TRIP_ID: string (nullable = true)
 |-- OPERATOR_ID: string (nullable = true)
 |-- OPERATOR_ABK: string (nullable = true)
 |-- OPERATOR_NAME: string (nullable = true)
 |-- TRANSPORT_TYPE: string (nullable = true)
 |-- TRAIN_ID: string (nullable = true)
 |-- TRAIN_NAME: string (nullable = true)
 |-- CIRCULATING_ID: string (nullable = true)
 |-- SERVICE_TYPE: string (nullable = true)
 |-- ADDITIONAL_DRIVING: string (nullable = true)
 |-- FAILED: string (nullable = true)
 |-- STATION_ID: string (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- SCHEDULE_ARRIVE_TIME: string (nullable = true)
 |-- ACTUAL_ARRIVE_TIME: string (nullable = true)
 |-- ACTUAL_ARRIVE_TIME_STATUS: string (nullable = true)
 |-- SCHEDULE_DEPART_TIME: string (nullable = true)
 |-- ACTUAL_DEPART_TIME: string (nullable = true)
 |-- ACTUAL_DEPART_TIME_STATUS: string (nullable = true)
 |-- PASSES_BY: string (nullable = true)



In [36]:
# create mappings f of stations from id to index and index to id
#station_index_to_id = df_zurich.select("STATION_ID").distinct().rdd.flatMap(lambda x: x).collect()
station_index_to_id = list(df_zurich.select('STATION_ID').distinct().toPandas()['STATION_ID'])

In [37]:
station_id_to_index = {}
for index, station_id in enumerate(station_index_to_id):
    station_id_to_index[station_id] = index

## Transportation Adjacency Matrix

Create the dataframe for trips by cleaning the dataframe for stations in Zurich

In [38]:
# define several udf
return_index = udf(lambda station_id: station_id_to_index[station_id], IntegerType())
return_datetime = udf(lambda date: datetime.strptime(date, "%d.%m.%Y %H:%M"), TimestampType())
#return_datetime = udf(lambda date: datetime.strptime(date, "%H:%M"), TimestampType())

# Ignore failed and additional trips
df_trips = df_zurich.filter(F.col('ADDITIONAL_DRIVING')=='false')\
                       .filter(F.col('PASSES_BY')=='false')\
                       .select(F.col('TRIP_ID'),
                             F.col('TRANSPORT_TYPE'),
                             F.col('STATION_ID'),
                             return_index('STATION_ID').astype('int').alias('STATION_INDEX'),
                             F.col('STATION_NAME'),
                             return_datetime(F.col('SCHEDULE_ARRIVE_TIME')).alias('SCHEDULE_ARRIVE_TIME'),
                             return_datetime(F.col('SCHEDULE_DEPART_TIME')).alias('SCHEDULE_DEPART_TIME')).cache()

#window by trip id and sort by arrival time
trip_id_window = Window.partitionBy('TRIP_ID').orderBy(F.asc('SCHEDULE_ARRIVE_TIME'))
#keep the order of stations in the trip
df_trips = df_trips.withColumn('TRIP_ORDER', F.rank().over(trip_id_window)).cache()

In [39]:
df_trips.printSchema()
#df_trips.show(5)

root
 |-- TRIP_ID: string (nullable = true)
 |-- TRANSPORT_TYPE: string (nullable = true)
 |-- STATION_ID: string (nullable = true)
 |-- STATION_INDEX: integer (nullable = true)
 |-- STATION_NAME: string (nullable = true)
 |-- SCHEDULE_ARRIVE_TIME: timestamp (nullable = true)
 |-- SCHEDULE_DEPART_TIME: timestamp (nullable = true)
 |-- TRIP_ORDER: integer (nullable = true)



Create all station pairs connected in a trip

In [40]:
# define udf and struct type to get all connected station pairs
schema = StructType([
    StructField("DEPART", ArrayType(IntegerType()), False),
    StructField("ARRIVE", ArrayType(IntegerType()), False)
])

#iterate over each trip and connect every stations in the trip
def return_all_connected_station_pairs(station_indices):
    depart = []
    arrive = []
    for i in range(len(station_indices)):
        for j in range(i+1, len(station_indices)):
            depart.append(station_indices[i])
            arrive.append(station_indices[j])
    return [depart, arrive]

return_all_connected_station_pairs_udf = F.udf(return_all_connected_station_pairs, schema)

# apply udf to df_trips
df_connections = df_trips.groupBy('TRIP_ID')\
                       .agg(F.collect_list('STATION_INDEX').alias('STATION_INDEX'))\
                       .withColumn('STATION_INDEX', return_all_connected_station_pairs_udf('STATION_INDEX'))\
                       .select("TRIP_ID", "STATION_INDEX.DEPART", "STATION_INDEX.ARRIVE")
df_connections.printSchema()
# df_connections.show(5)

root
 |-- TRIP_ID: string (nullable = true)
 |-- DEPART: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- ARRIVE: array (nullable = true)
 |    |-- element: integer (containsNull = true)



Build transportation matrix which is a adjacency matrix for a directed graph thath shows if a station is reachable from another station.

In [42]:
transportation_matrix = np.zeros((len(station_index_to_id),len(station_index_to_id)))

# collect all pairs as (depart, arrive)
depart = df_connections.select('DEPART').rdd.flatMap(lambda x: x).collect()
arrive = df_connections.select('ARRIVE').rdd.flatMap(lambda x: x).collect()

# fill conection matrix with pairs
depart_list = [item for trip in depart for item in trip]
arrive_list = [item for trip in arrive for item in trip]

for i in range(len(depart_list)):
    transportation_matrix[depart_list[i],arrive_list[i]] = 1

## Walk Adjcacency Matrix

In [44]:
# load distance dictionary
with open('distance/distance_dictionary.pickle', 'rb') as handle:
    distance_dictionary = pickle.load(handle)

In [45]:
transfer_delay = 5 # change from one station to another in mins

# average walking speed is assumed to be 4.5 km/h
# ref: https://www.quora.com/What-is-the-average-walking-speed-of-a-human
max_walking_time = 5 # min
human_speed = 75 # m/min which corresponds to 4.5 km/h
max_walking_distance = human_speed * max_walking_time # as meters

In [46]:
walk_matrix = np.zeros((len(station_index_to_id), len(station_index_to_id)))


for station_from, station_to in distance_dictionary:
    # check if stations are exist in this day
    if str(station_from) in  station_id_to_index and str(station_to) in station_id_to_index:
        # check if the distance is acceptable
        if distance_dictionary[(station_from, station_to)] <= max_walking_distance:
            walk_matrix[station_id_to_index[str(station_from)], station_id_to_index[str(station_to)]] = 1
    

## Connection Graph

Build connection graph by using two adjacency matrix created above

In [47]:
# merged conenction_matrix and wal_matrix into a one single adjcency matrix of all possible paths
connection_matrix = np.logical_or(transportation_matrix, walk_matrix).astype(int)
connection_graph = nx.DiGraph(connection_matrix)

In [48]:
for path in nx.all_simple_paths(connection_graph, 670, 122, 2):
    print(path)

debug the paths

In [49]:
# id to station name
station_index_to_id[160]
df_zurich.filter("STATION_ID = '8588078'").first().STATION_NAME

# station name to id
i = df_zurich.filter("STATION_NAME = 'Zürich HB'").first().STATION_ID
station_id_to_index[i]

'8503000'

In [53]:
# station name to id
i = df_zurich.filter("STATION_NAME = 'Zürich, Bahnhofplatz/HB'").first().STATION_ID
station_id_to_index[i]

776

In [54]:
# station name to id
i = df_zurich.filter("STATION_NAME = 'Zürich, Bahnhofstrasse/HB'").first().STATION_ID
station_id_to_index[i]

392

# TIMETABLE

Create timetable

In [55]:
df_trips = df_zurich.filter(F.col('ADDITIONAL_DRIVING')=='false')\
                       .filter(F.col('PASSES_BY')=='false')\
                       .select(F.col('OPERATOR_ID'), F.col('SERVICE_TYPE'),F.col('TRIP_ID'),
                             F.col('TRANSPORT_TYPE'),
                             F.col('STATION_ID'),
                             return_index('STATION_ID').astype('int').alias('STATION_INDEX'),
                             F.col('STATION_NAME'),
                             return_datetime(F.col('SCHEDULE_ARRIVE_TIME')).alias('SCHEDULE_ARRIVE_TIME'),
                             return_datetime(F.col('SCHEDULE_DEPART_TIME')).alias('SCHEDULE_DEPART_TIME'))

In [56]:
# create timetable of every possible connection by merging df_depart and df_arrive
df_depart = df_trips.withColumnRenamed('STATION_ID', 'departure_station_id')\
                                            .withColumnRenamed('STATION_NAME', 'departure_station_name')\
                                            .withColumnRenamed('TRIP_ORDER', 'departure_trip_order')\
                                            .withColumnRenamed('STATION_INDEX', 'departure_station_index')\
                                            .drop('SCHEDULE_ARRIVE_TIME')

df_arrive = df_trips.withColumnRenamed('STATION_ID', 'arrival_station_id')\
                                            .withColumnRenamed('STATION_NAME', 'arrival_station_name')\
                                            .withColumnRenamed('TRIP_ORDER', 'arrival_trip_order')\
                                            .withColumnRenamed('STATION_INDEX', 'arrival_station_index')\
                                            .drop('SCHEDULE_DEPART_TIME').drop('type', 'OPERATOR_ID', 'SERVICE_TYPE', 'TRANSPORT_TYPE')


timetable = df_depart.join(df_arrive, on=['TRIP_ID'], how='left_outer').drop('departure_trip_order').drop('arrival_trip_order')

# drop columns with the same departure and arrival stations
mask = timetable.departure_station_name == timetable.arrival_station_name
timetable = timetable[~mask]

# time reverse connections
mask = timetable.SCHEDULE_DEPART_TIME > timetable.SCHEDULE_ARRIVE_TIME
timetable = timetable[~mask]

timetable.printSchema()
timetable.show(10)

root
 |-- TRIP_ID: string (nullable = true)
 |-- OPERATOR_ID: string (nullable = true)
 |-- SERVICE_TYPE: string (nullable = true)
 |-- TRANSPORT_TYPE: string (nullable = true)
 |-- departure_station_id: string (nullable = true)
 |-- departure_station_index: integer (nullable = true)
 |-- departure_station_name: string (nullable = true)
 |-- SCHEDULE_DEPART_TIME: timestamp (nullable = true)
 |-- arrival_station_id: string (nullable = true)
 |-- arrival_station_index: integer (nullable = true)
 |-- arrival_station_name: string (nullable = true)
 |-- SCHEDULE_ARRIVE_TIME: timestamp (nullable = true)

+---------------+-----------+------------+--------------+--------------------+-----------------------+----------------------+--------------------+------------------+---------------------+--------------------+--------------------+
|        TRIP_ID|OPERATOR_ID|SERVICE_TYPE|TRANSPORT_TYPE|departure_station_id|departure_station_index|departure_station_name|SCHEDULE_DEPART_TIME|arrival_station_id

# ROUTE

find fastest connections for each route

In [57]:
# load location dictionary
with open('distance/location_dictionary.pickle', 'rb') as handle:
    location_dictionary = pickle.load(handle)

In [None]:
def filter_paths(connection_graph, departure_station_index, arrival_station_index):
    """ Filter paths we need
    
    Parameters
    ----------
    connection_graph: nx.DiGraph  
        Connection graph
    departure_station_index: int
        Index of the departure station
    arrival_station_index: int
        Index of the arrival station
    transportation_matrix: dict
        Transportation matrix
    walk_matrix: dict
        Walk matrix
    
    Returns
    -------
    filtered_paths: list
        Filtered paths
    """
    filtered_paths = []
    for path in nx.all_simple_paths(connection_graph, departure_station_index, arrival_station_index, 2):
        
        if len(path) == 3:
            # check if both connection is by walk or not
            depart_index, arrive_index = path[0], path[1]
            if walk_matrix[depart_index, arrive_index] == 1:
                depart_index, arrive_index = path[1], path[2]
                # if both connection is by walk, dont add to the paths
                if walk_matrix[depart_index, arrive_index] == 1:
                    continue
        
        filtered_paths.append(path)
            
    return filtered_paths

In [77]:
def calculate_routes(departure_station, arrival_station, date, hour):
    """ Calculate possible routes based on user input
    
    Parameters
    ----------
    departure_station, 
        Departure station ID
    arrival_station: string
        Arrival station ID
    date: string
        Date in format %Y-%m-%d
    hour: string
        Hour in format %HH:%MM:%SS
    
    Returns
    -------
    all_possible_connections: list of dicts
        List of all the possible connections
    all_probabilities: list
        List of probabilities of each of the connections
    """
    full_date = date + ' ' + hour
    # find indices of stations in the graph
    departure_station_index = station_id_to_index[departure_station]
    arrival_station_index = station_id_to_index[arrival_station]
    
    all_possible_connections = []
    all_probabilities = []
    
    # find all simple paths in the graph
    paths = filter_paths(connection_graph, departure_station_index, arrival_station_index)
    i = 0
    for path in paths:    
        print('route:', i)
        i = i + 1
        print('----------------')
        arrival_date = datetime.strptime(full_date, '%Y-%m-%d %H:%M:%S')

        connections_info = []

        route_failed = False
        for depart_index, arrive_index in zip(path[:-1], path[1:]):

            # check if the path is by walk or by transportation
            if walk_matrix[depart_index, arrive_index]:
                mins = distance_dictionary[(str(station_index_to_id[depart_index]), str(station_index_to_id[arrive_index]))] / human_speed

                departure_date = arrival_date 
                arrival_date = departure_date + timedelta(minutes=mins)

                connections_info.append({'depart_station_id': station_index_to_id[depart_index] , 
                                         'arrive_station_id': station_index_to_id[arrive_index]  , 
                                         'depart_date': departure_date,
                                         'arrive_date': arrival_date,
                                         'transport_type': 'walk',
                                         'trip_id': None,
                                         'operator_id':None,
                                         'service_type': None})


            else:
                
                # add walking time to change stations 
                arrival_date = arrival_date + timedelta(minutes=transfer_delay)
                
                # find earliest departure 
                #connection = timetable.filter((F.col('departure_station_index') == depart_index) & (F.col('arrival_station_index') == arrive_index) & (F.col('SCHEDULE_DEPART_TIME') >= arrival_date.strftime('%Y-%m-%d %H:%M:%S'))).first()
                connection = timetable.filter((F.col('departure_station_index') == depart_index) & (F.col('arrival_station_index') == arrive_index) & (F.col('SCHEDULE_DEPART_TIME') >= arrival_date)).first()
                if connection:
                    arrival_date = connection.SCHEDULE_ARRIVE_TIME

                    connections_info.append({'depart_station_id': connection.departure_station_id, 
                                             'arrive_station_id': connection.arrival_station_id, 
                                             'depart_date': connection.SCHEDULE_DEPART_TIME, 
                                             'arrive_date': arrival_date, 
                                             'transport_type': connection.TRANSPORT_TYPE,
                                             'trip_id': connection.TRIP_ID,
                                             'operator_id': connection.OPERATOR_ID,
                                             'service_type': connection.SERVICE_TYPE})
                else:
                    print('route failed')
                    route_failed = True
                    break

        if not route_failed:            
            p = calculate_confidence(connections_info)
        else: 
            p = 0
        
        all_possible_connections.append(connections_info)
        all_probabilities.append(p)

        for connection in connections_info:
            print('departure: ',connection['depart_station_id'], 'arrival: ',connection['arrive_station_id'], 'departure_time: ', connection['depart_date'].strftime('%Y-%m-%d %H:%M:%S'), 'arrival_time: ', connection['arrive_date'].strftime('%Y-%m-%d %H:%M:%S'),'transport_type: ', connection['transport_type'])
        
        
        print(f"confidence = {p}")
        print('----------------')
        
    # add Lon and Lat
    for ci in all_possible_connections:
        for trip in ci:
            trip['depart_station_id_LON'] =  location_dictionary[trip['depart_station_id']][0]#stations_df.filter(stations_df.station_id == trip['depart_station_id']).select('x_coordinate').collect()[0].x_coordinate
            trip['depart_station_id_LAT'] = location_dictionary[trip['depart_station_id']][1]#stations_df.filter(stations_df.station_id == trip['depart_station_id']).select('y_coordinate').collect()[0].y_coordinate
            trip['arrive_station_id_LON'] = location_dictionary[trip['arrive_station_id']][0]#stations_df.filter(stations_df.station_id == trip['arrive_station_id']).select('x_coordinate').collect()[0].x_coordinate
            trip['arrive_station_id_LAT'] = location_dictionary[trip['arrive_station_id']][1]#stations_df.filter(stations_df.station_id == trip['arrive_station_id']).select('y_coordinate').collect()[0].y_coordinate
     
    max_confidence_indices = np.argsort(all_probabilities)[::-1]
    
    all_possible_connections = np.asarray(all_possible_connections)
    all_possible_connections = all_possible_connections[max_confidence_indices]
    
    all_probabilities = np.asarray(all_probabilities)
    all_probabilities = all_probabilities[max_confidence_indices]
    
    return list(all_possible_connections), list(all_probabilities)

In [78]:
departure_station_id = df_zurich.filter("STATION_NAME = '" + departure_station + "'").first().STATION_ID
arrival_station_id = df_zurich.filter("STATION_NAME = '" + arrival_station + "'").first().STATION_ID
all_possible_connections, all_probabilities = calculate_routes(departure_station=departure_station_id, arrival_station=arrival_station_id, date=date, hour=hour)

route: 0
----------------
route failed
departure:  8503000 arrival:  8503202 departure_time:  2018-06-26 23:07:00 arrival_time:  2018-06-26 23:23:00 transport_type:  Zug
confidence = 0
----------------
route: 1
----------------
departure:  8503000 arrival:  8503006 departure_time:  2018-06-26 21:37:00 arrival_time:  2018-06-26 21:44:00 transport_type:  Zug
departure:  8503006 arrival:  8503016 departure_time:  2018-06-26 23:52:00 arrival_time:  2018-06-26 23:56:00 transport_type:  Zug
confidence = 1.0
----------------
route: 2
----------------
departure:  8503000 arrival:  8503307 departure_time:  2018-06-26 21:49:00 arrival_time:  2018-06-26 22:09:00 transport_type:  Zug
departure:  8503307 arrival:  8503016 departure_time:  2018-06-26 22:26:00 arrival_time:  2018-06-26 22:31:00 transport_type:  Zug
confidence = 0.9999880327084439
----------------
route: 3
----------------
route failed
departure:  8503000 arrival:  8503015 departure_time:  2018-06-26 23:44:00 arrival_time:  2018-06-26

In [79]:
all_possible_connections

[[{'depart_station_id': '8503000',
   'arrive_station_id': '8503016',
   'depart_date': datetime.datetime(2018, 6, 26, 17, 9),
   'arrive_date': datetime.datetime(2018, 6, 26, 17, 19),
   'transport_type': 'Zug',
   'trip_id': '85:11:2277:001',
   'operator_id': '85:11',
   'service_type': 'IR',
   'depart_station_id_LON': 8.540192,
   'depart_station_id_LAT': 47.378177,
   'arrive_station_id_LON': 8.562386,
   'arrive_station_id_LAT': 47.450383}],
 [{'depart_station_id': '8503000',
   'arrive_station_id': '8503006',
   'depart_date': datetime.datetime(2018, 6, 26, 21, 37),
   'arrive_date': datetime.datetime(2018, 6, 26, 21, 44),
   'transport_type': 'Zug',
   'trip_id': '85:11:18982:002',
   'operator_id': '85:11',
   'service_type': 'S',
   'depart_station_id_LON': 8.540192,
   'depart_station_id_LAT': 47.378177,
   'arrive_station_id_LON': 8.544115,
   'arrive_station_id_LAT': 47.411529},
  {'depart_station_id': '8503006',
   'arrive_station_id': '8503016',
   'depart_date': dateti

In [80]:
all_probabilities

[1.0,
 1.0,
 0.9999880327084439,
 0.9980695458637723,
 0.9740653123038807,
 0.961225792168278,
 0.8946007754381357,
 0.0,
 0.0,
 0.0]

# Visualization

To run the visualization, please run the whole notebook and open the: http://0.0.0.0:5000/

In [82]:
index_html = """
<!doctype html>
<html lang="en">
<head>
  <link rel="stylesheet" href="https://cdn.rawgit.com/openlayers/openlayers.github.io/master/en/v5.3.0/css/ol.css"
    type="text/css">
  <script src="https://cdn.rawgit.com/openlayers/openlayers.github.io/master/en/v5.3.0/build/ol.js"></script>
</head>

<body>
  <div style="position: absolute; top: 10px; right: 10px; z-index: 100; background: white; padding: 8px;">
  <input type="text" placeholder="Start Station ID..." id="start_station_id" />
  <input type="text" placeholder="End Station ID..." id="end_station_id" />
  <input type="text" placeholder="yyyy-mm-dd" id="in_date" />
  <input type="text" placeholder="HH:MM:SS" id="in_time" />
  <button id="search">Show</button>
  <div id="message"></div>
</div>

  <div id="map" style="width: 100%; height: 100%"></div>

  <script type="text/javascript">
    var features = [];
    var vectorLayer = new ol.layer.Vector({
      style: function (feature, resolution) {
        return feature.get('style');
      },
    });
    var map = new ol.Map({
      target: 'map',
      layers: [
        new ol.layer.Tile({
          source: new ol.source.OSM()
        }),
      ],
      view: new ol.View({
        center: ol.proj.fromLonLat([8.540192, 47.378177]),
        zoom: 14
      })
    });
    map.addLayer(vectorLayer);

    function addLine(start, final, color) {
      var lineString = new ol.geom.LineString([start, final]);
      lineString.transform('EPSG:4326', 'EPSG:3857');
      var feature = new ol.Feature({
        geometry: lineString,
      });
      feature.setStyle(new ol.style.Style({
        stroke: new ol.style.Stroke({
          color: color,
          width: 5
        })
      }));
      features.push(feature);
      vectorLayer.setSource(
        new ol.source.Vector({
          features: features,
        })
      );
    }

    document.getElementById('search').addEventListener('click', function() {
      var start_station_id = document.getElementById('start_station_id').value;
      var end_station_id = document.getElementById('end_station_id').value;
      var in_date = document.getElementById('in_date').value;
      var in_time = document.getElementById('in_time').value;

      var xhttp = new XMLHttpRequest();

      xhttp.onreadystatechange = function() {
        if (this.readyState == 4 && this.status == 200) {
          var data = JSON.parse(this.responseText);

          data.lines.forEach(function(line) {
            addLine(line.start, line.end, line.color);
          });
          document.getElementById("message").innerHTML = data.message;
        }
      };

      xhttp.open('GET', '/search?start=' + start_station_id + '&end=' + end_station_id + '&date=' + in_date + '&time=' + in_time, true);
      xhttp.send();
    });

    // addLine([8.540192, 47.378177], [8.540192, 47.478177], '#00ff00');
    // addLine([8.540192, 47.378177], [8.540192, 47.278177], '#ff0000');
  </script>
</body>
</html>
"""

In [85]:
from flask import Flask, request, jsonify


app = Flask(__name__)

def beautify_trip(trip):
    return f"take a {trip['transport_type']} from {trip['depart_station_id']} at {trip['depart_date'].strftime('%d.%m.%Y %H:%M')} to {trip['arrive_station_id']} at {trip['arrive_date'].strftime('%d.%m.%Y %H:%M')}"

def beautify_print(all_possible_connections, all_probabilities):
    colors = ["green", "red", "blue"]
    ret_val = ""
    for i, (p, ci) in enumerate(zip(all_probabilities, all_possible_connections)):
        ret_val += f"Connection {i+1} [{colors[i].upper()}] - Probability {p}:<br/>"
        for j, trip in enumerate(ci): 
            ret_val += f"{j+1}) {beautify_trip(trip)} <br/>"
        ret_val += "<br/><br/>"
    return ret_val

def make_lines(all_possible_connections):
    out_lines = []
    colors = ['#00ff00', '#ff0000', '#0000ff']
    for i, (p, ci) in enumerate(zip(all_probabilities, all_possible_connections)):
        for j, trip in enumerate(ci): 
            out_lines.append({ 'start': [ trip['depart_station_id_LON'], trip['depart_station_id_LAT']], 'end': [trip['arrive_station_id_LON'], trip['arrive_station_id_LAT']], 'color': colors[i] })
    return out_lines

@app.route('/', methods=['GET'])
def home():
    return index_html


@app.route('/search', methods=['GET'])
def search():
    print('here...')
    # Grab arguments here
    start_arg = request.args.get('start')
    end_arg = request.args.get('end')
    date_arg = request.args.get('date')
    time_arg = request.args.get('time')
    
    print(f'Ajax call executed! Start {start_arg}, end {end_arg}')
    
    # Do some processing
    #all_possible_connections, all_probabilities = find_route_with_probability(departure_station='8503000', arrival_station='8503016', date='2018-02-14', hour='00:00:00')
    all_possible_connections, all_probabilities = calculate_routes(departure_station=start_arg, arrival_station=end_arg, date=date_arg, hour=time_arg)
    all_possible_connections, all_probabilities = all_possible_connections[:3], all_probabilities[:3]
    status_message = beautify_print(all_possible_connections, all_probabilities)
    
    out_lines = make_lines(all_possible_connections)
    
    # Return data in corresponding format
    return jsonify({ 'message': status_message, 'lines': out_lines})


app.run(host='0.0.0.0', port=5000)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)


here...
Ajax call executed! Start 8503000, end 8503016
route: 0
----------------
departure:  8503000 arrival:  8503202 departure_time:  2018-06-26 14:21:00 arrival_time:  2018-06-26 14:38:00 transport_type:  Zug
departure:  8503202 arrival:  8503016 departure_time:  2018-06-26 23:22:00 arrival_time:  2018-06-26 23:56:00 transport_type:  Zug
confidence = 1.0
----------------
route: 1
----------------
departure:  8503000 arrival:  8503006 departure_time:  2018-06-26 05:49:00 arrival_time:  2018-06-26 05:56:00 transport_type:  Zug
departure:  8503006 arrival:  8503016 departure_time:  2018-06-26 08:22:00 arrival_time:  2018-06-26 08:26:00 transport_type:  Zug
confidence = 1.0
----------------
route: 2
----------------
departure:  8503000 arrival:  8503307 departure_time:  2018-06-26 05:49:00 arrival_time:  2018-06-26 06:09:00 transport_type:  Zug
departure:  8503307 arrival:  8503016 departure_time:  2018-06-26 13:56:00 arrival_time:  2018-06-26 14:01:00 transport_type:  Zug
confidence = 

127.0.0.1 - - [16/Jun/2019 15:27:12] "[37mGET /search?start=8503000&end=8503016&date=2018-02-14&time=00:00:00 HTTP/1.1[0m" 200 -


departure:  8503000 arrival:  8503016 departure_time:  2018-06-26 11:07:00 arrival_time:  2018-06-26 11:16:00 transport_type:  Zug
confidence = 1.0
----------------
