# Overview:
In this notebook, we train our delay prediction model on real-world SBB data. We use the following features of a trip to train the model:
1. Stop id
2. Product id (bus, tram, zug)
3. Time of the day
4. Day of the week
5. Week of the year

We tried two different models to predict the delays: 1. Linear regression model and 2. Logistic Regression model. \
We found that the logistic regression model performs better and decided to use it for our final algorithm. We put the delays into bins of <0 min, 0-1 min, 1-2 min, and so on up to 12 min. Our trained model then predicts, for a given trip, the probability that its delay lies in each bin. We use these probabilities in our overall algorithm to predict the probability of success of the overall trip.

In [1]:
%%local
import os
username = 'moiseev'
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final", "executorMemory":"8G", "executorCores":4, "numExecutors":10, "driverMemory": "8G" }' % username)

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
7137,application_1618324153128_6842,pyspark,idle,Link,Link,,
7155,application_1618324153128_6860,pyspark,idle,Link,Link,,
7156,application_1618324153128_6861,pyspark,idle,Link,Link,,
7159,application_1618324153128_6864,pyspark,idle,Link,Link,,
7161,application_1618324153128_6866,pyspark,idle,Link,Link,,
7165,application_1618324153128_6871,pyspark,idle,Link,Link,,
7166,application_1618324153128_6872,pyspark,idle,Link,Link,,
7168,application_1618324153128_6874,pyspark,shutting_down,Link,Link,,


In [2]:
%%send_to_spark -i username -t str -n username

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
7169,application_1618324153128_6875,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [3]:
import pyspark.sql.functions as F
from pyspark import SparkConf, SparkContext

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Read SBB data and expose the German source columns under English aliases.
column_aliases = [
    # Operating day and trip identifier
    "betriebstag as date",
    "fahrt_bezeichner as trip_id",
    # Operator
    "betreiber_id as operator_id",
    "betreiber_abk as operator_abbr",
    "betreiber_name as operator_name",
    # Product / line description and trip flags
    "produkt_id as product_id",
    "linien_id as line_id",
    "linien_text as line_text",
    "umlauf_id as circulation_id",
    "verkehrsmittel_text as transportation_text",
    "zusatzfahrt_tf as is_extra",
    "faellt_aus_tf as is_cancelled",
    "haltestellen_name as stop_name",
    # The bpuic corresponds to the stop_id in the sbb_stops from the geostops file
    "bpuic as stop_id",
    # Arrival: scheduled time, actual/forecast time and its status
    "ankunftszeit as scheduled_arrival_time",
    "an_prognose as actual_arrival_time",
    "an_prognose_status as arrival_forecast_status",
    # Departure: scheduled time, actual/forecast time and its status
    "abfahrtszeit as scheduled_departure_time",
    "ab_prognose as actual_departure_time",
    "ab_prognose_status as departure_forecast_status",
    "durchfahrt_tf as is_transit",
]
sbb_connections = spark.read.orc('/data/sbb/orc/istdaten').selectExpr(*column_aliases)
sbb_connections.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- date: string (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- operator_id: string (nullable = true)
 |-- operator_abbr: string (nullable = true)
 |-- operator_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- line_id: string (nullable = true)
 |-- line_text: string (nullable = true)
 |-- circulation_id: string (nullable = true)
 |-- transportation_text: string (nullable = true)
 |-- is_extra: string (nullable = true)
 |-- is_cancelled: string (nullable = true)
 |-- stop_name: string (nullable = true)
 |-- stop_id: string (nullable = true)
 |-- scheduled_arrival_time: string (nullable = true)
 |-- actual_arrival_time: string (nullable = true)
 |-- arrival_forecast_status: string (nullable = true)
 |-- scheduled_departure_time: string (nullable = true)
 |-- actual_departure_time: string (nullable = true)
 |-- departure_forecast_status: string (nullable = true)
 |-- is_transit: string (nullable = true)

In [5]:
# Read other relevant data: the stop and trip mapping tables stored under /user/<username>.
stop_map_path = "/user/{}/stop_map.parquet".format(username)
trip_map_path = "/user/{}/trip_map.parquet".format(username)
stop_map_df = spark.read.parquet(stop_map_path)
trip_map_df = spark.read.parquet(trip_map_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
def rename(df, rename_df):
    """Replace an identifier column of ``df`` with the ``new_id`` values of ``rename_df``.

    The key is the first column of ``rename_df``. ``df`` is inner-joined with
    ``rename_df`` on that key, the original key column is dropped, and the
    ``new_id`` column takes over the key's name. Because the join is an inner
    join, rows of ``df`` without a match in ``rename_df`` are not kept.
    """
    key = rename_df.columns[0]
    joined = df.join(rename_df, key, "inner")
    without_old_key = joined.drop(key)
    return without_old_key.withColumnRenamed("new_id", key)

# Training only on fraction of data due to time limitations and Spark cluster instability.
# In our experience, obtained model is still good enough.
# The sample is drawn without replacement using a fixed seed, so the same input
# data yields the same training subset (and therefore the same fitted models)
# on every run of the notebook.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42
conns = rename(sbb_connections, stop_map_df).sample(False, SAMPLE_FRACTION, SAMPLE_SEED)
# Lower-case the product ids so that differently-cased spellings collapse to one value.
conns = conns.withColumn("product_id", F.lower(F.col("product_id")))
conns.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- date: string (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- operator_id: string (nullable = true)
 |-- operator_abbr: string (nullable = true)
 |-- operator_name: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- line_id: string (nullable = true)
 |-- line_text: string (nullable = true)
 |-- circulation_id: string (nullable = true)
 |-- transportation_text: string (nullable = true)
 |-- is_extra: string (nullable = true)
 |-- is_cancelled: string (nullable = true)
 |-- stop_name: string (nullable = true)
 |-- scheduled_arrival_time: string (nullable = true)
 |-- actual_arrival_time: string (nullable = true)
 |-- arrival_forecast_status: string (nullable = true)
 |-- scheduled_departure_time: string (nullable = true)
 |-- actual_departure_time: string (nullable = true)
 |-- departure_forecast_status: string (nullable = true)
 |-- is_transit: string (nullable = true)
 |-- stop_id: long (nullable = true)

In [7]:
# Identify distinct product ids present in data
product_ids = conns.select(F.col("product_id")).distinct().toPandas()["product_id"].tolist()

# Integer code assigned to each (lower-cased) product id; the empty string is a
# category of its own.
product_id_map = {
    u'': 0,
    u'bus': 1,
    u'tram': 2,
    u'zug': 3
}

# Fail fast on the driver: the map is later indexed with `product_id_map[x]`
# inside a UDF, where an unmapped id would only surface as a KeyError raised on
# the executors.
unmapped_product_ids = [pid for pid in product_ids if pid not in product_id_map]
if unmapped_product_ids:
    raise ValueError(
        "product ids without an integer code in product_id_map: %r" % (unmapped_product_ids,)
    )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Implement one-hot encoding for product_id and stop_id
from pyspark.sql.types import LongType
from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StandardScaler

# product_id is a string, so it is first mapped to its integer code.
product_id_to_int = F.udf(lambda x: product_id_map[x], LongType())
conns = conns.withColumn("product_id_int", product_id_to_int(F.col("product_id")))

# Encode both index columns in turn, keeping every category (dropLast=False).
# NOTE(review): the width of the encoded vectors presumably depends on the ids
# present in the data - confirm it matches what the prediction side expects.
for source_col, encoded_col in (("product_id_int", "product_id_oh"), ("stop_id", "stop_id_oh")):
    ohe = OneHotEncoder(inputCol=source_col, outputCol=encoded_col, dropLast=False)
    conns = ohe.transform(conns)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Some helper functions
from datetime import datetime
from pyspark.sql.types import DoubleType, BooleanType

def day_of_week(x):
    """Return the weekday of a ``dd.mm.YYYY`` date string as a float (Monday is 0.0, Sunday is 6.0)."""
    return float(datetime.strptime(x, "%d.%m.%Y").weekday())

def week_of_year(x):
    """Return the ISO calendar week number of a ``dd.mm.YYYY`` date string as a float."""
    _iso_year, iso_week, _iso_weekday = datetime.strptime(x, "%d.%m.%Y").isocalendar()
    return float(iso_week)

def to_minutes(x):
    """Convert a timestamp string to the number of minutes since 01.01.1900 00:00.

    Accepted formats are ``dd.mm.YYYY HH:MM`` and ``dd.mm.YYYY HH:MM:SS``.

    Args:
        x: the timestamp string.

    Returns:
        The elapsed minutes as a float (seconds contribute the fractional part).

    Raises:
        ValueError: with ``x`` as its argument, if ``x`` matches neither format
            (this includes non-string input such as ``None``).
    """
    reference = datetime(1900, 1, 1)
    for fmt in ("%d.%m.%Y %H:%M", "%d.%m.%Y %H:%M:%S"):
        try:
            date_time = datetime.strptime(x, fmt)
        except (ValueError, TypeError):
            # Only parsing failures move on to the next format; anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            continue
        return (date_time - reference).total_seconds() / 60
    raise ValueError(x)

def minute_of_day(minutes):
    """Reduce a minute count to its remainder modulo the 1440 minutes of a day."""
    minutes_per_day = 24 * 60
    return minutes % minutes_per_day

# Quick sanity check of the helpers: 05:52 is minute 352 of the day.
print(minute_of_day(to_minutes("02.02.2020 05:52")))

# Wrap the helpers as Spark UDFs; all of them return doubles.
to_minutes_udf = F.udf(to_minutes, DoubleType())
minute_of_day_udf = F.udf(minute_of_day, DoubleType())
week_of_year_udf = F.udf(week_of_year, DoubleType())
day_of_week_udf = F.udf(day_of_week, DoubleType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

352.0

In [10]:
# Remove unwanted rows (those having an empty date or empty scheduled/actual arrival time).
# Introduce columns for week of the year, day of the week and time of the day, plus
# the delay (actual minus scheduled arrival, in minutes) and the squared / absolute
# distance of the scheduled arrival from midday.
half_day_minutes = 24 * 60 / 2
conns = (
    conns
    .filter("date !=''")
    .filter("scheduled_arrival_time != ''")
    .filter("actual_arrival_time != ''")
    .withColumn("scheduled_arrival_minutes", to_minutes_udf(F.col("scheduled_arrival_time")))
    .withColumn("actual_arrival_minutes", to_minutes_udf(F.col("actual_arrival_time")))
    .withColumn("week_of_year", week_of_year_udf(F.col("date")))
    .withColumn("day_of_week", day_of_week_udf(F.col("date")))
    .withColumn("delay", F.col("actual_arrival_minutes") - F.col("scheduled_arrival_minutes"))
    .withColumn("minute_of_day", minute_of_day_udf(F.col("scheduled_arrival_minutes")))
    .withColumn("square_from_midday", (half_day_minutes - F.col("minute_of_day")) ** 2)
    .withColumn("abs_from_midday", F.col("square_from_midday") ** 0.5)
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Assemble the features defined above into a single vector column
from pyspark.ml.feature import VectorAssembler

# The order of this list fixes the layout of the assembled feature vector.
feature_columns = [
    'minute_of_day',
    'square_from_midday',
    'abs_from_midday',
    'product_id_oh',
    'stop_id_oh',
    'week_of_year',
    'day_of_week',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Only the feature vector and the regression target are kept.
data = assembler.transform(conns).select('features', 'delay')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Scale the features: centre each one (withMean) and scale it by its standard deviation (withStd)
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(data)

# Cache the scaled data; it is the input of both models trained below.
scaled = scaler_model.transform(data)
data = scaled.cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Save the scaler (estimator) and the fitted scaler model to HDFS.
# `username` is the value already passed into the Spark session by the
# `%%send_to_spark` cell near the top of the notebook; it is deliberately not
# re-assigned here so that one setting determines every /user/<username> path.
scaler.write().overwrite().save('/user/{}/delays_scaler'.format(username))
scaler_model.write().overwrite().save('/user/{}/delays_scaler_model'.format(username))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### 1. Linear regression model

In [14]:
from pyspark.ml.regression import LinearRegression

# Regress the delay on the scaled feature vector.
lr = LinearRegression(
    featuresCol='scaled_features',
    labelCol='delay',
)
# Any 'prediction' column is dropped before fitting (none is added to `data`
# in the cells above).
lr_training_data = data.drop("prediction")
model = lr.fit(lr_training_data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Output locations under /user/<username> for the linear regression estimator
# (lr_path) and its fitted model (model_path).
lr_path = '/user/{}/delays_lr'.format(username)
model_path = '/user/{}/delays_lr_model'.format(username)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Persist the linear regression estimator and the fitted model, overwriting
# anything already saved at those paths.
lr.write().overwrite().save(lr_path)
model.write().overwrite().save(model_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### 2. Logistic Regression

In [17]:
# Transform delay values to bins
def to_label(x):
    """Map a delay ``x`` to its bin label, an integer from 0 to 11.

    The label is the smallest integer ``b`` in 0..10 with ``x <= b``: label 0
    holds every delay up to and including 0, label 1 holds (0, 1], and so on up
    to label 10 for (9, 10]. Any value for which no such ``b`` exists (delays
    above 10, or NaN) gets label 11.
    """
    overflow_label = 11
    return next((upper for upper in range(overflow_label) if x <= upper), overflow_label)
        
# Spark UDF wrapper around to_label; the bin label is returned as a long.
to_label_udf = F.udf(to_label, LongType())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
# Add the classification target: the bin label of each row's delay.
# Any 'prediction' column is dropped as well.
log_data = data.withColumn("delay_bin", to_label_udf(F.col('delay'))).drop("prediction")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
from pyspark.ml.classification import LogisticRegression

# Classify the delay bin from the scaled feature vector.
log = LogisticRegression(
    featuresCol='scaled_features',
    labelCol='delay_bin',
)
log_model = log.fit(log_data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
# Output locations under /user/<username> for the logistic regression estimator
# (log_path) and its fitted model (log_model_path).
log_path = '/user/{}/delays_log'.format(username)
log_model_path = '/user/{}/delays_log_model'.format(username)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Persist the logistic regression estimator and the fitted model, overwriting
# anything already saved at those paths.
log.write().overwrite().save(log_path)
log_model.write().overwrite().save(log_model_path)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Apply the fitted logistic regression model to the data it was trained on.
log_data_pred = log_model.transform(log_data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…