## Initialize PySpark

In [3]:
# Find Apache Spark on this machine.
# This must run before the first `pyspark` import in the notebook (see the next cell).
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession

# Development switch: set to False when performing the real analytics
DEV = True

# Settings for the Spark SQL session that backs every DataFrame below
master = 'local[2]'
appName = 'Cancelled flights percentages'

# Create the session (or pick up the one that is already running)
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

## Load data

In [5]:
from preprocessing_utils import *

# Choose the preprocessing routine that matches the current mode
# (DEV dataset vs. production dataset), then run it on the session.
_preprocess = perform_DEV_dataset_preprocessing if DEV else perform_dataset_preprocessing
_preprocess(spark)

--------- DEV mode ON ---------
Starting preprocessing of ../dataset/1994.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset_1994.parquet



In [6]:
# Load the parquet dataset that matches the current mode:
# the DEV loader when DEV is set, the production loader otherwise.
_load = load_DEV_preprocessed_dataset if DEV else load_preprocessed_dataset
df = _load(spark)

--------- DEV mode ON ---------
Peprocessed dataset loaded.
../dataset/preprocessed_dataset_1994.parquet


In [7]:
# Project the dataset down to the date fields and the cancellation flag
df = df.select('Year', 'Month', 'DayofMonth', 'Cancelled')

# Inspect the schema and a first sample of rows
df.printSchema()
df.show(10)

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)

+----+-----+----------+---------+
|Year|Month|DayofMonth|Cancelled|
+----+-----+----------+---------+
|1994|    1|         7|        0|
|1994|    1|         8|        0|
|1994|    1|        10|        0|
|1994|    1|        11|        0|
|1994|    1|        12|        0|
|1994|    1|        13|        1|
|1994|    1|        14|        0|
|1994|    1|        15|        0|
|1994|    1|        17|        0|
|1994|    1|        18|        0|
+----+-----+----------+---------+
only showing top 10 rows



In [8]:
# Summary statistics (count, mean, stddev, min, max) of the cancellation flag
df.select('Cancelled').describe().show()

+-------+--------------------+
|summary|           Cancelled|
+-------+--------------------+
|  count|             5180048|
|   mean|0.012884050495284986|
| stddev|  0.1127743507776497|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [9]:
# Helpers for turning (year, month, day) columns into timestamps
import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType


def _to_datetime(year, month, day):
    """Combine year, month and day numbers into a datetime.datetime."""
    return datetime.datetime(year, month, day)


# Spark UDF wrapper around the plain-Python helper; it returns a timestamp column
make_date = F.udf(_to_datetime, TimestampType())


# The date conversion is currently disabled; the original columns are kept instead.
#df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), 'Cancelled')
df = df.select('Year', 'Month', 'DayofMonth', 'Cancelled')

#df.show(10)

In [10]:
# Rows for flights that were cancelled (flag equal to 1)
cancelled_flights = df.where(df.Cancelled == 1)

# Flights per calendar day: once over every flight, once over the cancelled ones only
all_flights_count = df.groupBy('Year', 'Month', 'DayOfMonth').count()
cancelled_flights_count = cancelled_flights.groupBy('Year', 'Month', 'DayOfMonth').count()

# Preview both aggregates
all_flights_count.show(10)
cancelled_flights_count.show(10)

+----+-----+----------+-----+
|Year|Month|DayOfMonth|count|
+----+-----+----------+-----+
|1994|   10|        28|14847|
|1994|   12|        26|14751|
|1994|    2|        11|14242|
|1994|   12|        10|13409|
|1994|    4|        13|14450|
|1994|    6|        23|14527|
|1994|    9|        26|14680|
|1994|   12|        11|14158|
|1994|   11|        24|11524|
|1994|   12|         4|14088|
+----+-----+----------+-----+
only showing top 10 rows

+----+-----+----------+-----+
|Year|Month|DayOfMonth|count|
+----+-----+----------+-----+
|1994|   10|        28|   37|
|1994|   12|        26|  159|
|1994|    2|        11| 3649|
|1994|   12|        10|   80|
|1994|    4|        13|  368|
|1994|    6|        23|   88|
|1994|    9|        26|   59|
|1994|   12|        11|   76|
|1994|   11|        24|   14|
|1994|   12|         4|   80|
+----+-----+----------+-----+
only showing top 10 rows



In [11]:
# Store output Dataframe (or load it if already existing)
# NOTE: everything between the triple quotes below is a string literal, not
# executed code. The caching of the two per-day count DataFrames is therefore
# disabled, and the only effect of this cell is to echo the string as its output.
# The disabled code refers to `Path`, which this cell does not import.
'''
all_flights_count_dataset = '../dataset/all_flights_count_dataset.parquet'

path= Path(all_flights_count_dataset)
if not path.is_dir():
    all_flights_count.write.mode('overwrite').save(all_flights_count_dataset, format='parquet')

all_flights_count = spark.read.load(all_flights_count_dataset)
    
# Store output Dataframe (or load it if already existing)
cancelled_flights_count_dataset = '../dataset/cancelled_flights_count_dataset.parquet'

path= Path(cancelled_flights_count_dataset)
if not path.is_dir():
    cancelled_flights_count.write.mode('overwrite').save(cancelled_flights_count_dataset, format='parquet')
cancelled_flights_count = spark.read.load(cancelled_flights_count_dataset)
'''

"\nall_flights_count_dataset = '../dataset/all_flights_count_dataset.parquet'\n\npath= Path(all_flights_count_dataset)\nif not path.is_dir():\n    all_flights_count.write.mode('overwrite').save(all_flights_count_dataset, format='parquet')\n\nall_flights_count = spark.read.load(all_flights_count_dataset)\n    \n# Store output Dataframe (or load it if already existing)\ncancelled_flights_count_dataset = '../dataset/cancelled_flights_count_dataset.parquet'\n\npath= Path(cancelled_flights_count_dataset)\nif not path.is_dir():\n    cancelled_flights_count.write.mode('overwrite').save(cancelled_flights_count_dataset, format='parquet')\ncancelled_flights_count = spark.read.load(cancelled_flights_count_dataset)\n"

In [12]:
# Give the two `count` columns distinct names so they can live side by side after the join
all_flights_count = all_flights_count.withColumnRenamed('count', 'total_count')
cancelled_flights_count = cancelled_flights_count.withColumnRenamed('count', 'canceled_count')

In [13]:
# Join the tables on the calendar day.
# A day on which no flight was cancelled has no row in `cancelled_flights_count`,
# so an inner join would silently drop that day and inflate any later average of
# the daily percentages. A left join keeps every day that had flights, and the
# missing cancellation count is filled with 0.
unified_dataset = (
    all_flights_count
    .join(cancelled_flights_count, ['Year', 'Month', 'DayOfMonth'], 'left')
    .fillna(0, subset=['canceled_count'])
)

unified_dataset.show(10)

+----+-----+----------+-----------+--------------+
|Year|Month|DayOfMonth|total_count|canceled_count|
+----+-----+----------+-----------+--------------+
|1994|   10|        28|      14847|            37|
|1994|   12|        26|      14751|           159|
|1994|    2|        11|      14242|          3649|
|1994|   12|        10|      13409|            80|
|1994|    4|        13|      14450|           368|
|1994|    6|        23|      14527|            88|
|1994|    9|        26|      14680|            59|
|1994|   12|        11|      14158|            76|
|1994|   11|        24|      11524|            14|
|1994|   12|         4|      14088|            80|
+----+-----+----------+-----------+--------------+
only showing top 10 rows



In [14]:
# Share of each day's flights that were cancelled, expressed as a percentage
_cancel_ratio = F.col("canceled_count") / F.col("total_count")
unified_dataset = unified_dataset.withColumn("DailyCanceledFlightsPercentage", _cancel_ratio * 100)
unified_dataset.show(10)

+----+-----+----------+-----------+--------------+------------------------------+
|Year|Month|DayOfMonth|total_count|canceled_count|DailyCanceledFlightsPercentage|
+----+-----+----------+-----------+--------------+------------------------------+
|1994|   10|        28|      14847|            37|           0.24920859432882064|
|1994|   12|        26|      14751|           159|             1.077893024201749|
|1994|    2|        11|      14242|          3649|             25.62140148855498|
|1994|   12|        10|      13409|            80|            0.5966142143336565|
|1994|    4|        13|      14450|           368|             2.546712802768166|
|1994|    6|        23|      14527|            88|             0.605768568871756|
|1994|    9|        26|      14680|            59|           0.40190735694822893|
|1994|   12|        11|      14158|            76|            0.5367989829071903|
|1994|   11|        24|      11524|            14|           0.12148559527941688|
|1994|   12|    

In [17]:
# Keep only the calendar-day keys and the computed percentage
unified_dataset = unified_dataset.select(
    F.col('Year'),
    F.col('Month'),
    F.col('DayOfMonth'),
    F.col('DailyCanceledFlightsPercentage'),
)

In [18]:
# Store output Dataframe (or load it if already existing)
# `Path` is imported here explicitly so that this cell does not depend on the
# name being leaked into the namespace by a star import elsewhere.
from pathlib import Path

final_dataset = '../dataset/canceled_analitics.parquet'

# NOTE: an existing parquet directory is reused as-is and is never refreshed,
# even if it was written from a different input (for example a DEV run).
# Delete the directory to force the result to be recomputed and rewritten.
path = Path(final_dataset)
if not path.is_dir():
    unified_dataset.write.mode('overwrite').save(final_dataset, format='parquet')

# Always continue from the stored copy
unified_dataset = spark.read.load(final_dataset)

In [19]:
# Bring the result to the driver as a list of plain tuples with the schema:
# (Year, Month, DayOfMonth, DailyCanceledFlightsPercentage)
cancel_data = [tuple(row) for row in unified_dataset.collect()]
print(cancel_data[:100])

[(1994, 4, 19, 0.3580773998071891), (1994, 4, 27, 1.6102280580511403), (1994, 4, 3, 0.5050135402181073), (1994, 10, 23, 0.2274827610720125), (1994, 10, 30, 0.1984689537850865), (1994, 11, 14, 0.46694186912093116), (1994, 2, 3, 0.7247906551263106), (1994, 3, 1, 1.5824581322736304), (1994, 3, 19, 0.5396949550254204), (1994, 6, 1, 0.25403364229316855), (1994, 9, 3, 0.20145668681233533), (1994, 2, 1, 0.6602768903088392), (1994, 5, 20, 0.19831771866238118), (1994, 7, 25, 0.9267461669505962), (1994, 8, 16, 0.35321287868496126), (1994, 10, 10, 0.24772362078200322), (1994, 10, 16, 0.06376647300552643), (1994, 2, 10, 8.757021460463775), (1994, 3, 31, 0.19327673086215227), (1994, 6, 20, 0.71280276816609), (1994, 8, 28, 0.30164854437039634), (1994, 10, 25, 0.2633889376646181), (1994, 11, 10, 0.7276931447225246), (1994, 4, 11, 0.9873258175197815), (1994, 5, 6, 0.2835604122000138), (1994, 5, 11, 0.2623584645125656), (1994, 1, 20, 3.1078742428612633), (1994, 6, 6, 0.5723348503654668), (1994, 6, 9, 0

# Data Visualization

In [58]:
# Hide warnings if there are any
# (this silences every warning category for the rest of the session)
import warnings
warnings.filterwarnings('ignore')

# Use the ipympl backend so matplotlib figures are rendered as interactive widgets
%matplotlib ipympl

# Widget toolkit used for the year-range slider further down
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Numeric / tabular / plotting stack used by the helper functions below
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [59]:
def get_pd_dataframe(years, df):
    """Build a day-of-year by year table of daily canceled-flight percentages.

    Parameters
    ----------
    years : indexable sequence of consecutive ints (e.g. a ``range``)
        Years to include. ``years[0]`` is used as the offset that maps a year
        to its column position, so the sequence must be contiguous and ascending.
    df : Spark DataFrame
        Must expose the columns ``Year``, ``Month``, ``DayOfMonth`` and
        ``DailyCanceledFlightsPercentage``.

    Returns
    -------
    pandas.DataFrame
        365 rows indexed 1..365 (day of the year) and one column per year,
        labelled with the year as a string. Days without data stay at 0.

    Notes
    -----
    The table has a fixed height of 365 days, so day 366 of a leap year
    (December 31st) is left out.
    """
    # Local import keeps the function independent of notebook cell order.
    import datetime

    rows = df.filter(F.col('Year').isin(*years)) \
             .select('Year', 'Month', 'DayOfMonth', 'DailyCanceledFlightsPercentage') \
             .orderBy('Year', 'Month', 'DayOfMonth') \
             .collect()

    nb_years = len(years)
    nb_days = 365
    data = np.zeros((nb_days, nb_years))
    for row in rows:
        year_idx = row[0] - years[0]
        # Build the date from its numeric parts. Concatenating the unpadded
        # digits is ambiguous: "1994111" is both January 11th and November 1st.
        day_of_year = datetime.date(row[0], row[1], row[2]).timetuple().tm_yday

        # Only the leap day overflow (day 366) falls outside the table.
        if day_of_year > nb_days:
            continue
        # day_of_year is 1-based while the array rows are 0-based.
        data[day_of_year - 1, year_idx] = row[3]

    columns = [str(y) for y in years]
    indices = range(1, nb_days + 1)
    res = pd.DataFrame(data=data, columns=columns, index=indices)
    return res

def plot_canceled_time_series(date, df,ax):
    """Plot the daily canceled-flight percentage, one line per year.

    Parameters
    ----------
    date : sequence of ints
        The years to plot (forwarded unchanged to ``get_pd_dataframe``).
    df : Spark DataFrame
        Source data, forwarded unchanged to ``get_pd_dataframe``.
    ax : matplotlib Axes
        Axes to draw on.

    Prints ``No data`` instead of plotting when the resulting table is empty.
    """
    daily = get_pd_dataframe(date, df)
    title = 'Daily canceled flights percentage'
    if daily.empty:
        print('No data')
    else:
        # The x axis is the day of the year (1..365), so the ticks must span
        # that whole range; one tick every four weeks lands exactly on day 365.
        axes = daily.plot(title=title, grid=True, xticks=range(1, 366, 28), ax=ax)
        axes.set_xlabel('Day of year')
        axes.set_ylabel('Canceled flights (%)')

In [60]:
def get_average_df(years, df):
    """Return the per-year mean of the daily canceled-flight percentage.

    ``years`` is an indexable sequence of consecutive years; ``years[0]`` is the
    offset used to place each year in the result. ``df`` is a Spark DataFrame
    with ``Year`` and ``DailyCanceledFlightsPercentage`` columns.

    The result is a pandas DataFrame indexed by ``years`` with a single
    ``Canceled flights`` column; years without any row keep the value 0.
    """
    selected = df.filter(F.col('Year').isin(*years))
    yearly_means = (
        selected
        .groupBy('Year')
        .avg('DailyCanceledFlightsPercentage')
        .withColumnRenamed('avg(DailyCanceledFlightsPercentage)', 'AverageDailyCanceledFlightsPercentage')
        .select('Year', 'AverageDailyCanceledFlightsPercentage')
        .collect()
    )

    averages = np.zeros(len(years))
    for year_value, mean_pct in yearly_means:
        # Position of this year relative to the first requested year
        averages[year_value - years[0]] = mean_pct

    return pd.DataFrame({'Canceled flights': averages}, index=years)

def plot_average_canceled_flights(years, df, ax):
    """Draw a bar chart of the per-year average canceled-flight percentage on ``ax``.

    ``years`` and ``df`` are forwarded to ``get_average_df``. When the resulting
    table is empty, ``No data`` is printed and nothing is drawn.
    """
    averages = get_average_df(years, df)
    if averages.empty:
        print('No data')
        return
    averages.plot.bar(title='Average canceled flights percentage', rot=0, ax=ax)


In [61]:
def ui_callback( years, df):
    """Redraw both charts for the selected span of years.

    ``years`` is a ``(first, last)`` pair and both ends are included in the
    plotted range. ``df`` is the Spark DataFrame handed on to the two plotting
    helpers. The time series goes in the upper panel and the yearly averages
    in the lower one.
    """
    first_year, last_year = years[0], years[1]

    plt.figure(figsize=(15, 12))
    plt.clf()

    # Upper panel: daily percentages
    upper_ax = plt.subplot(2, 1, 1)
    plot_canceled_time_series(range(first_year, last_year + 1), df, upper_ax)

    # Lower panel: per-year averages
    lower_ax = plt.subplot(2, 1, 2)
    plot_average_canceled_flights(range(first_year, last_year + 1), df, lower_ax)

    plt.subplots_adjust(
        top=0.92, bottom=0.08, left=0.10, right=0.95,
        hspace=0.25, wspace=0.35,
    )
    plt.show()

# Year range slider: one (label, value) option for each year from 1994 to 2008
years = [(str(y), y) for y in range(1994, 2009)]
years_w = widgets.SelectionRangeSlider(
    options=years,
    index=(0, 2),
    description='Years',
    continuous_update=False,
)
ui = widgets.HBox([years_w])

In [62]:
# Wire the slider value and the fixed Spark DataFrame to the callback, then show both widgets
_callback_inputs = {'years': years_w, 'df': widgets.fixed(unified_dataset)}
out = widgets.interactive_output(ui_callback, _callback_inputs)
display(ui, out)

FigureCanvasNbAgg()

HBox(children=(SelectionRangeSlider(continuous_update=False, description='Years', index=(0, 2), options=(('199…

Output()