# Airport weekly penalty for arrivals and departures delays 

In this notebook we compute a weekly "**penalty**" score for each airport that depends on both its incoming and outgoing
flights. The score adds `0.5` for each incoming flight that is more than _15 minutes_ late, and `1` for
each outgoing flight that is more than _15 minutes_ late.

Formally speaking, let $w \in \{1, 2, \ldots, 52 \}$ be the _week number_ in a year $y$, and let $f$ be a flight _leaving from_ or _arriving at_ $airport$ in week $w$ of year $y$. Then we compute the weekly penalties as follows:

$$ ArrivalPenalty(f, y, w) = \begin{cases} 0.5, & \mbox{if } ArrDelay(f) > 15\mbox{ minutes}\\
                                           0, & \mbox{otherwise}
                                 \end{cases}
$$

$$ DeparturePenalty(f, y, w) = \begin{cases} 1, & \mbox{if } DepDelay(f) > 15\mbox{ minutes}\\
                                             0, & \mbox{otherwise}
                                   \end{cases}
$$

$$ WeeklyPenalty(airport, y, w) = \sum_{f \in Flights(airport, y, w)} \left( ArrivalPenalty(f, y, w) + DeparturePenalty(f, y, w) \right)$$

### Initialize PySpark

In [19]:
# Find Apache Spark on this machine.
# This cell runs before the first `pyspark` import in the notebook.
import findspark
findspark.init()

In [20]:
from pyspark.sql import SparkSession

# Development switch: keep it False to run the analytics on the real data
DEV = False

# Number of local threads backing the Spark workers
spark_local_threads = 4

# Create (or reuse) the Spark SQL session every DataFrame below is built on
appName = 'Airport Weekly Penalty'
master = 'local[{}]'.format(spark_local_threads)
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

### Load Data
Try to load the optimized *parquet* format data set. If *parquet* data set is not found, load full compressed data sets, reduce and save them.

In [21]:
from preprocessing_utils import *

# Run the preprocessing that matches the run mode selected by DEV
if not DEV:
    # Production preprocessing
    perform_dataset_preprocessing(spark)
else:
    # DEV preprocessing
    perform_DEV_dataset_preprocessing(spark)

Starting preprocessing of ../dataset/*.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset.parquet



In [22]:
# Pick the loader that matches the run mode (DEV vs. production dataset),
# then read the preprocessed parquet data through it.
_load_dataset = load_DEV_preprocessed_dataset if DEV else load_preprocessed_dataset
df = _load_dataset(spark)

Peprocessed dataset loaded.
../dataset/preprocessed_dataset.parquet


In [23]:
# Project the flight records down to the columns the penalty analysis uses
needed_columns = ['Year', 'Month', 'DayofMonth', 'DepTime', 'ArrTime',
                  'ArrDelay', 'DepDelay', 'Origin', 'Dest']
df = df.select(*needed_columns)

# Inspect the resulting schema and the first rows
df.printSchema()
df.show(10)

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)

+----+-----+----------+-------+-------+--------+--------+------+----+
|Year|Month|DayofMonth|DepTime|ArrTime|ArrDelay|DepDelay|Origin|Dest|
+----+-----+----------+-------+-------+--------+--------+------+----+
|2007|    1|         1|   1232|   1341|       1|       7|   SMF| ONT|
|2007|    1|         1|   1918|   2043|       8|      13|   SMF| PDX|
|2007|    1|         1|   2206|   2334|      34|      36|   SMF| PDX|
|2007|    1|         1|   1230|   1356|      26|      30|   SMF| PDX|
|2007|    1|         1|    831|    957|      -3|       1|   SMF| PDX|
|2007|    1|         1|   1430|   1553|       3|      10|   SMF| PDX|
|2007|    1|

### Compute weekly-penalty analytics

In [24]:
# Discard flights whose departure or arrival time is missing
df = df.na.drop(subset=['DepTime', 'ArrTime'])

import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType


def _build_datetime(year, month, day):
    """Combine year, month and day into a datetime at midnight."""
    return datetime.datetime(year, month, day)


def _iso_week_number(date):
    """Return the ISO week number (second field of `isocalendar()`) of `date`."""
    return date.isocalendar()[1]


# Spark UDF wrappers around the two helpers above
make_date = F.udf(_build_datetime, TimestampType())
week_year = F.udf(_iso_week_number, IntegerType())

# Step 1: replace the Year/Month/DayofMonth triple with a single 'Date' column
df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'),
               'DepTime', 'ArrTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest')
# Step 2: attach the week number and drop DepTime/ArrTime, which are not
# needed once the rows with missing times have been removed
df = df.select('Date', week_year('Date').alias('WeekYear'),
               'ArrDelay', 'DepDelay', 'Origin', 'Dest')

In [25]:
# 'WeekYear' holds the ISO week number, and ISO weeks belong to the ISO year,
# which differs from the calendar year around New Year: late-December days can
# fall in ISO week 1 of the following year, and early-January days in ISO week
# 52/53 of the previous one. Grouping by the calendar year would merge those
# days into the wrong (Year, WeekYear) bucket, so the ISO year is used instead.
# As a consequence 'Year' may contain the year before/after the dataset range
# for the few days at its boundaries.
iso_year = F.udf(lambda date: date.isocalendar()[0], IntegerType())
flights = df.withColumn('Year', iso_year('Date'))

# Flights that are more than 15 minutes late
left_late = flights.filter(flights['DepDelay'] > 15)
arrived_late = flights.filter(flights['ArrDelay'] > 15)

# Number of times per (ISO year, ISO week) an airport had an arrival or a
# departure more than 15 minutes late. Arrivals are attributed to the
# destination airport, departures to the origin airport.
incoming_late = arrived_late \
    .groupBy('Year', 'WeekYear', arrived_late['Dest'].alias('Airport')) \
    .count()
outgoing_late = left_late \
    .groupBy('Year', 'WeekYear', left_late['Origin'].alias('Airport')) \
    .count()

In [26]:
# Weight of one late arrival / one late departure in the airport score
incoming_factor = 0.5
outgoing_factor = 1.0

# Turn the weekly late-flight counts into weighted penalties
incoming_penalty = incoming_late.select(
    'Year', 'WeekYear', 'Airport',
    (F.col('count') * incoming_factor).alias('Penalty'))
outgoing_penalty = outgoing_late.select(
    'Year', 'WeekYear', 'Airport',
    (F.col('count') * outgoing_factor).alias('Penalty'))

In [27]:
# Stack arrival and departure penalties, then add them up per airport and week
all_penalties = incoming_penalty.unionAll(outgoing_penalty)
penalties = all_penalties \
    .groupBy('Year', 'WeekYear', 'Airport') \
    .agg(F.sum('Penalty').alias('WeeklyPenalty'))

In [28]:
# Store output Dataframe (or load it if already existing).
# Path is imported here explicitly: the notebook never imports it itself, so
# without this the cell only works if a wildcard import happens to provide it.
from pathlib import Path

final_dataset = '../dataset/penalty_analitics.parquet'

# The result is computed and written only when no cached copy exists; delete
# the directory to force a recomputation after changing the analytics above.
path = Path(final_dataset)
if not path.is_dir():
    penalties.write.mode('overwrite').save(final_dataset, format='parquet')

# Always continue from the stored copy; the format is stated explicitly so the
# read does not depend on the session's default data source.
penalties = spark.read.load(final_dataset, format='parquet')

In [29]:
# Bring the penalties to the driver as plain tuples of schema:
# ('Year', 'WeekYear', 'Airport', 'WeeklyPenalty')
penalty_data = [tuple(row) for row in penalties.collect()]
print(penalty_data[:100])

# Alphabetically sorted list of the distinct airport codes
airport_rows = penalties.select('Airport').distinct().orderBy('Airport').collect()
airports = [row[0] for row in airport_rows]

[(2007, 2, 'DTW', 784.5), (2007, 3, 'TUL', 209.0), (2007, 2, 'PSP', 96.0), (2007, 3, 'GRB', 67.0), (2007, 2, 'GTF', 12.5), (2007, 5, 'MLU', 27.0), (2007, 3, 'SJT', 13.5), (2007, 5, 'FAI', 16.0), (2007, 7, 'DAY', 148.0), (2007, 8, 'DAB', 30.0), (2007, 7, 'ROA', 50.0), (2007, 8, 'PFN', 28.5), (2007, 6, 'RFD', 5.5), (2007, 6, 'TRI', 3.0), (2007, 12, 'ABQ', 187.5), (2007, 12, 'OKC', 140.5), (2007, 10, 'CHA', 18.0), (2007, 11, 'PWM', 59.0), (2007, 10, 'CPR', 9.5), (2007, 10, 'SWF', 52.5), (2007, 17, 'IAD', 481.0), (2007, 17, 'CAE', 61.0), (2007, 14, 'IDA', 6.5), (2007, 14, 'IYK', 1.5), (2007, 16, 'APF', 2.5), (2007, 21, 'BTR', 56.0), (2007, 22, 'TLH', 12.0), (2007, 19, 'OGG', 32.0), (2007, 20, 'EKO', 1.5), (2007, 22, 'PIH', 2.0), (2007, 24, 'BWI', 1060.0), (2007, 26, 'PHX', 1712.5), (2007, 24, 'LGA', 1122.5), (2007, 25, 'FAR', 29.5), (2007, 26, 'SIT', 6.0), (2007, 26, 'KTN', 18.5), (2007, 30, 'BWI', 1145.0), (2007, 31, 'DAY', 88.0), (2007, 31, 'GSO', 109.0), (2007, 28, 'MYR', 71.5), (2007, 

## Data Visualization

Analytics for airports weekly penalty are reported below.

A **line plot** is used to display penalties as a time series. On the `x` axis the week number is reported, while on the `y` axis we show the weekly penalty.

Moreover, a **bar plot** is chosen to display the *yearly average* weekly-penalty of a given airport.

In [62]:
# Hide warnings if there are any.
# NOTE: this installs a global 'ignore' filter, so warnings raised by every
# later cell are suppressed as well, including ones that may signal real problems.
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### Weekly penalties over a year analytics 

In [89]:
def get_pd_dataframe(airport, years, df, nb_weeks=52):
    """Build a week-by-year table of the weekly penalties of one airport.

    Parameters
    ----------
    airport : str
        Airport code to select from the 'Airport' column.
    years : iterable of int
        Years to include. They do not need to be contiguous or start at any
        particular value; each year gets its own column, in the given order.
    df : Spark DataFrame
        Penalties with columns 'Year', 'WeekYear', 'Airport', 'WeeklyPenalty'.
    nb_weeks : int, optional
        Number of week rows in the result (default 52). Pass 53 to keep the
        53rd week of long ISO years, which is otherwise dropped.

    Returns
    -------
    pandas.DataFrame
        `nb_weeks` rows indexed 1..nb_weeks and one column per year (named
        with the year as a string). Cells without a matching row in `df`
        are 0.
    """
    years = list(years)
    rows = df.filter(F.col('Airport') == airport) \
             .filter(F.col('Year').isin(*years)) \
             .select('Year', 'WeekYear', 'WeeklyPenalty') \
             .orderBy('Year', 'WeekYear') \
             .collect()

    # Look the column up by year instead of using `year - years[0]`, which is
    # only valid when `years` is an ascending, gap-free sequence.
    column_of = {year: position for position, year in enumerate(years)}

    data = np.zeros((nb_weeks, len(years)))
    for row in rows:
        column = column_of.get(row[0])
        week = row[1] - 1  # week numbers are 1-based, array rows 0-based
        # Skip rows that do not fit the table (unknown year, week out of range)
        if column is None or not 0 <= week < nb_weeks:
            continue
        data[week, column] = row[2]

    columns = [str(y) for y in years]
    indices = range(1, nb_weeks + 1)
    return pd.DataFrame(data=data, columns=columns, index=indices)

def plot_penalty_time_series(airport, years, df, ax):
    """Draw the weekly penalties of `airport` as one line per year on `ax`.

    Parameters
    ----------
    airport : str
        Airport code to plot.
    years : iterable of int
        Years to plot, one line each.
    df : Spark DataFrame
        Penalties table passed through to `get_pd_dataframe`.
    ax : matplotlib Axes
        Axes to draw on.

    Prints 'No data for airport <code>' instead of plotting when no penalty
    is available for the selection.
    """
    weekly = get_pd_dataframe(airport, years, df)
    title = '{} Airport - Weekly penalty'.format(airport)
    # get_pd_dataframe returns a zero-filled table even when nothing matched,
    # so `.empty` alone never detects missing data; an all-zero table is
    # treated as "no data" as well.
    if weekly.empty or not weekly.values.any():
        print('No data for airport {}'.format(airport))
    else:
        weekly.plot(title=title, grid=True, xticks=range(0, 53, 4), ax=ax)

#### Average penalty in a year

In [90]:
def get_average_df(airport, years, df):
    """Compute the average weekly penalty of one airport for each year.

    Parameters
    ----------
    airport : str
        Airport code to select from the 'Airport' column.
    years : iterable of int
        Years to include. They do not need to be contiguous or start at any
        particular value.
    df : Spark DataFrame
        Penalties with columns 'Year', 'Airport', 'WeeklyPenalty'.

    Returns
    -------
    pandas.DataFrame
        One row per requested year (index) and a single column named after
        the airport. Years without any matching row get 0.

    Notes
    -----
    The average is taken over the rows present in `df` for that year.
    NOTE(review): weeks with no late flight presumably have no row and so do
    not lower the average - confirm this is the intended definition.
    """
    years = list(years)
    rows = df.filter(F.col('Airport') == airport) \
             .filter(F.col('Year').isin(*years)) \
             .groupBy('Year') \
             .avg('WeeklyPenalty') \
             .withColumnRenamed('avg(WeeklyPenalty)', 'AveragePenalty') \
             .select('Year', 'AveragePenalty') \
             .collect()

    # Look the slot up by year instead of using `year - years[0]`, which is
    # only valid when `years` is an ascending, gap-free sequence.
    position_of = {year: position for position, year in enumerate(years)}

    data = np.zeros(len(years))
    for row in rows:
        position = position_of.get(row[0])
        if position is None:
            continue
        data[position] = row[1]
    return pd.DataFrame({airport: data}, index=years)

def plot_average_penalty(airport, years, df, ax):
    """Draw the yearly average weekly penalty of `airport` as bars on `ax`.

    Parameters
    ----------
    airport : str
        Airport code to plot.
    years : iterable of int
        Years to plot, one bar each.
    df : Spark DataFrame
        Penalties table passed through to `get_average_df`.
    ax : matplotlib Axes
        Axes to draw on.

    Prints 'No data for airport <code>' instead of plotting when no penalty
    is available for the selection.
    """
    averages = get_average_df(airport, years, df)
    title = '{} Airport - Average Penalties'.format(airport)
    # get_average_df returns one zero-filled row per requested year even when
    # nothing matched, so `.empty` alone never detects missing data; an
    # all-zero table is treated as "no data" as well.
    if averages.empty or not averages.values.any():
        print('No data for airport {}'.format(airport))
    else:
        averages.plot.bar(y=airport, title=title, rot=90, ax=ax)

In [91]:
def ui_callback(airport, years, df):
    """Redraw both penalty plots for the current widget selection.

    `years` is a (first, last) pair; every year from first to last inclusive
    is plotted. The left panel shows the weekly time series, the right panel
    the yearly averages.
    """
    plt.figure(figsize=(10, 4))
    plt.clf()

    selected_years = range(years[0], years[1] + 1)

    # Left panel: weekly penalties over the year
    left_ax = plt.subplot(1, 2, 1)
    plot_penalty_time_series(airport, selected_years, df, left_ax)

    # Right panel: average weekly penalty per year
    right_ax = plt.subplot(1, 2, 2)
    plot_average_penalty(airport, selected_years, df, right_ax)

    plt.show()

# Year range slider: each option is a (label, value) pair for 1994..2008,
# initially spanning the first three years
years = [(str(year), year) for year in range(1994, 2009)]
years_w = widgets.SelectionRangeSlider(
    description='Years',
    options=years,
    index=(0, 2),
    continuous_update=False,
)

# Airport drop-down, preselecting the first airport of the list
airports_w = widgets.Dropdown(
    description='Airport',
    options=airports,
    value=airports[0],
)

# Lay both controls out side by side
controls_row = [airports_w, years_w]
ui = widgets.HBox(controls_row)

In [92]:
# Bind the widgets to the callback arguments; the penalties DataFrame is
# passed unchanged on every call
callback_args = {
    'airport': airports_w,
    'years': years_w,
    'df': widgets.fixed(penalties),
}
out = widgets.interactive_output(ui_callback, callback_args)
display(ui, out)

HBox(children=(Dropdown(description='Airport', options=('ABE', 'ABI', 'ABQ', 'ABY', 'ACK', 'ACT', 'ACV', 'ACY'…

Output()