## Initialize PySpark

In [1]:
# Find Apache Spark on this machine.
# This runs before the first `pyspark` import in the notebook
# (presumably findspark.init() makes the local Spark installation importable — confirm against the findspark docs).
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Dev mode switch: keep it False when performing real analytics
DEV = False

# Settings of the Spark SQL session used for every DataFrame below
master = 'local[2]'
appName = 'Cancelled flights percentages'

# Reuse an already running session if there is one, otherwise create it
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

## Load data

In [3]:
from preprocessing_utils import *

# Pick the preprocessing routine matching the current mode (DEV vs production)
# and run it on the Spark session; only the selected name is looked up.
(perform_DEV_dataset_preprocessing if DEV else perform_dataset_preprocessing)(spark)

Starting preprocessing of ../dataset/*.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset.parquet



In [4]:
# Load the parquet dataset: the DEV one in dev mode, the production one otherwise
df = (load_DEV_preprocessed_dataset if DEV else load_preprocessed_dataset)(spark)

Peprocessed dataset loaded.
../dataset/preprocessed_dataset.parquet


In [5]:
# Project the dataset onto the only dimensions this analysis needs
df = df.select('Year', 'Month', 'DayofMonth', 'Cancelled')

# Quick look at the schema and at the first rows
df.printSchema()
df.show(10)

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)

+----+-----+----------+---------+
|Year|Month|DayofMonth|Cancelled|
+----+-----+----------+---------+
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
|2007|    1|         1|        0|
+----+-----+----------+---------+
only showing top 10 rows



In [6]:
# Summary statistics, one table per column of interest
for column_name in ('Cancelled', 'Year'):
    df.describe(column_name).show()

+-------+--------------------+
|summary|           Cancelled|
+-------+--------------------+
|  count|            91469371|
|   mean|0.021325903727926587|
| stddev|  0.1444683695010413|
|    min|                   0|
|    max|                   1|
+-------+--------------------+

+-------+------------------+
|summary|              Year|
+-------+------------------+
|  count|          91469371|
|   mean|2001.5266289411786|
| stddev| 4.332419506702069|
|    min|              1994|
|    max|              2008|
+-------+------------------+



In [7]:
# Parse dates to datetime format
import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType


def _to_timestamp(year, month, day):
    """Combine integer year/month/day parts into a datetime.datetime."""
    return datetime.datetime(year, month, day)


# Spark UDF wrapping the helper above; it yields a TimestampType column
make_date = F.udf(_to_timestamp, TimestampType())

# The conversion to a single 'Date' column is currently disabled: the separate
# date parts are kept as they are.
#df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), 'Cancelled')
df = df.select('Year', 'Month', 'DayofMonth', 'Cancelled')

#df.show(10)

In [8]:
# Grouping key: one bucket per calendar day
day_key = ['Year', 'Month', 'DayOfMonth']

# Only the rows flagged as cancelled
cancelled_flights = df.where(df['Cancelled'] == 1)

# Number of flights per day: all flights, then cancelled flights only
all_flights_count = df.groupBy(day_key).count()
cancelled_flights_count = cancelled_flights.groupBy(day_key).count()

for daily_counts in (all_flights_count, cancelled_flights_count):
    daily_counts.show(10)

+----+-----+----------+-----+
|Year|Month|DayOfMonth|count|
+----+-----+----------+-----+
|2007|    1|        18|20815|
|2007|    1|        19|20867|
|2007|    9|        22|17136|
|2006|    2|         3|19592|
|2006|    8|        23|20386|
|2004|    8|        14|18184|
|2005|    1|        15|16826|
|2005|   12|        25|16618|
|2005|   12|         1|19246|
|1995|    3|        20|15186|
+----+-----+----------+-----+
only showing top 10 rows

+----+-----+----------+-----+
|Year|Month|DayOfMonth|count|
+----+-----+----------+-----+
|2007|    1|        18|  650|
|2007|    1|        19|  511|
|2007|    9|        22|  150|
|2006|    2|         3|  413|
|2006|    8|        23|  233|
|2004|    8|        14|  863|
|2005|    1|        15|  315|
|2005|   12|         1|  286|
|2005|   12|        25|  177|
|1995|    3|        20|  232|
+----+-----+----------+-----+
only showing top 10 rows



In [9]:
# Give each 'count' column a distinct name so that both tables can be joined
all_flights_count = all_flights_count.withColumnRenamed('count', 'total_count')
cancelled_flights_count = cancelled_flights_count.withColumnRenamed('count', 'canceled_count')

In [10]:
# Join the tables on the calendar day.
# A LEFT join is required: a day without any cancelled flight has no row in
# `cancelled_flights_count`, and an inner join would silently drop that day
# (biasing every later average upwards). Such days get a cancelled count of 0.
unified_dataset = all_flights_count \
                .join(cancelled_flights_count, ['Year', 'Month', 'DayOfMonth'], 'left') \
                .fillna(0, subset=['canceled_count'])

unified_dataset.show(10)

+----+-----+----------+-----------+--------------+
|Year|Month|DayOfMonth|total_count|canceled_count|
+----+-----+----------+-----------+--------------+
|1994|   10|        28|      14847|            37|
|1994|   12|        26|      14751|           159|
|1995|    3|        20|      15186|           232|
|1995|    5|        24|      14860|           199|
|1996|    1|        19|      14799|          1049|
|1996|    3|        25|      14910|           583|
|1996|    6|         1|      13474|           244|
|1996|   11|        28|      11529|            54|
|1996|   12|         1|      14554|            83|
|1997|    2|        16|      14335|           216|
+----+-----+----------+-----------+--------------+
only showing top 10 rows



In [11]:
# Share of cancelled flights for each day, expressed as a percentage
cancelled_share = F.col("canceled_count") / F.col("total_count")
unified_dataset = unified_dataset.withColumn("DailyCanceledFlightsPercentage", cancelled_share * 100)


In [12]:
# Keep the day key and the percentage only; the raw counts are no longer needed
output_columns = ['Year', 'Month', 'DayOfMonth', 'DailyCanceledFlightsPercentage']
unified_dataset = unified_dataset.select(*output_columns)
unified_dataset.show(10)

+----+-----+----------+------------------------------+
|Year|Month|DayOfMonth|DailyCanceledFlightsPercentage|
+----+-----+----------+------------------------------+
|1994|   10|        28|           0.24920859432882064|
|1994|   12|        26|             1.077893024201749|
|1995|    3|        20|             1.527722902673515|
|1995|    5|        24|             1.339165545087483|
|1996|    1|        19|             7.088316778160686|
|1996|    3|        25|             3.910127431254192|
|1996|    6|         1|             1.810895057147098|
|1996|   11|        28|             0.468384074941452|
|1996|   12|         1|            0.5702899546516421|
|1997|    2|        16|            1.5068015347052668|
+----+-----+----------+------------------------------+
only showing top 10 rows



In [111]:
# Days on which more than 80% of the flights were cancelled
unified_dataset.where(F.col('DailyCanceledFlightsPercentage') > 80).show()

+----+-----+----------+------------------------------+
|Year|Month|DayOfMonth|DailyCanceledFlightsPercentage|
+----+-----+----------+------------------------------+
|2001|    9|        11|             85.48248871622008|
|2001|    9|        13|             91.76825794690669|
|2001|    9|        12|             99.99429744525547|
+----+-----+----------+------------------------------+



In [82]:
# Store output Dataframe (or load it if already existing)
# `Path` is imported here explicitly: no cell of this notebook imports it, so
# the check below must not depend on the name leaking in from a star import.
from pathlib import Path

final_dataset = '../dataset/canceled_analitics.parquet'

# NOTE: an already existing parquet directory is reused as-is; delete it to
# force the result to be recomputed and written again.
path = Path(final_dataset)
if not path.is_dir():
    unified_dataset.write.mode('overwrite').save(final_dataset, format='parquet')
unified_dataset = spark.read.load(final_dataset, format='parquet')

In [83]:
# Bring the result to the driver as a list of plain tuples of schema:
# (Year, Month, DayOfMonth, DailyCanceledFlightsPercentage)
cancel_data = [tuple(row) for row in unified_dataset.collect()]
print(cancel_data[:100])

[(1994, 1, 20, 3.1078742428612633), (1994, 6, 6, 0.5723348503654668), (1994, 6, 9, 0.18612987729215497), (1994, 9, 23, 0.649395037254768), (1994, 10, 31, 1.561974011984111), (1995, 8, 14, 1.4915434811559463), (1996, 1, 2, 6.72579453067258), (1996, 11, 26, 1.6956980733702824), (1997, 3, 15, 1.4456593527555588), (1998, 1, 21, 2.581629944252721), (1998, 5, 20, 2.2865142323845076), (1999, 12, 27, 1.1671924290220819), (2000, 2, 20, 1.9454201567144016), (2000, 9, 3, 1.591450757284539), (2000, 9, 20, 2.0229222771967184), (2000, 10, 2, 2.3398015848256066), (2001, 3, 1, 2.9759872538978036), (2001, 5, 7, 2.5134751773049646), (2001, 5, 21, 2.6542649727767693), (2001, 10, 19, 0.39799629451725793), (2002, 1, 13, 1.0114335971855761), (2002, 2, 7, 0.9258624237020779), (2002, 9, 24, 1.0722956568653899), (2002, 11, 20, 0.799448656099242), (2003, 7, 8, 4.127447798333874), (2003, 8, 27, 1.408757138971988), (2004, 6, 15, 1.9102822580645162), (2005, 12, 13, 0.9238126290620584), (2005, 12, 21, 1.01358304177

# Data Visualization

In [84]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [85]:
# Leap years management:
# every non-leap year gets an artificial "29th of February" row with a 0%
# value, so that all the years span the same 366 day slots.
leap_years = range(1996, 2009, 4)

# Build all the padding rows at once. The percentage is a float (0.0) and the
# schema is taken from `unified_dataset`, so the union below does not rely on
# inferred column names/types being implicitly widened.
feb29_padding_rows = [(year, 2, 29, 0.0)
                      for year in range(1994, 2009)
                      if year not in leap_years]
feb29_padding = spark.createDataFrame(feb29_padding_rows, schema=unified_dataset.schema)

# De-duplicating on the day key keeps this cell idempotent: running it again
# does not pile up additional copies of the padding rows.
unified_dataset = unified_dataset \
    .union(feb29_padding) \
    .dropDuplicates(['Year', 'Month', 'DayOfMonth'])

In [101]:
def get_pd_dataframe(years, df):
    """Build the per-day table of cancelled flights percentages.

    Parameters
    ----------
    years : sequence of int
        Calendar years to extract (e.g. ``range(1994, 1997)``).
    df : Spark DataFrame
        Must expose the columns 'Year', 'Month', 'DayOfMonth' and
        'DailyCanceledFlightsPercentage'.

    Returns
    -------
    pandas.DataFrame
        366 rows indexed 1..366 (day number inside a leap year) and one column
        per requested year, labelled ``str(year)``. Day slots for which no row
        exists keep the value 0.

    Notes
    -----
    Every date is placed on the slot it has in a *leap* year, for every year:
    index 60 is always the 29th of February and index 61 is always the 1st of
    March. This keeps the same calendar day aligned across leap and non-leap
    years, and prevents the 29th of February from sharing a slot with (and
    being overwritten by) a day of March.
    """
    rows = df.filter(F.col('Year').isin(*years)) \
             .select('Year', 'Month', 'DayOfMonth', 'DailyCanceledFlightsPercentage') \
             .orderBy(['Year', 'Month', 'DayOfMonth']) \
             .collect()

    # Any leap year works as a reference to turn (month, day) into a slot
    reference_leap_year = 2000

    year_list = list(years)
    # Column position of each requested year
    column_of = {year: position for position, year in enumerate(year_list)}

    nb_years = len(year_list)
    nb_days = 366
    data = np.zeros((nb_days, nb_years))
    for row in rows:
        year, month, day_of_month, percentage = row[0], row[1], row[2], row[3]
        # 0-based day slot inside the reference leap year:
        # Jan 1st -> 0, Feb 29th -> 59, Mar 1st -> 60, Dec 31st -> 365
        slot = pd.Timestamp(year=reference_leap_year,
                            month=int(month),
                            day=int(day_of_month)).dayofyear - 1
        data[slot, column_of[year]] = percentage

    columns = [str(y) for y in year_list]
    indices = range(1, nb_days + 1)
    res = pd.DataFrame(data=data, columns=columns, index=indices)
    return res

def plot_canceled_time_series(years, df,ax):
    """Draw, on `ax`, one line per year with the daily canceled flights percentage.

    `years` and `df` are forwarded to `get_pd_dataframe`; when the resulting
    table is empty, 'No data' is printed and nothing is drawn.
    """
    daily_table = get_pd_dataframe(years, df)
    if daily_table.empty:
        print('No data')
        return
    daily_table.plot(title='Daily canceled flights percentage',
                     grid=True,
                     xticks=range(0, 367, 10),
                     ax=ax)

In [102]:
def get_average_df(years, df):
    """Compute the yearly average of the daily cancelled flights percentage.

    Parameters
    ----------
    years : sequence of int
        Calendar years to aggregate (e.g. ``range(1994, 1997)``).
    df : Spark DataFrame
        Must expose the columns 'Year' and 'DailyCanceledFlightsPercentage'.

    Returns
    -------
    pandas.DataFrame
        One row per requested year (indexed by `years`) with a single column
        'Canceled flights'. Years without any row keep the value 0.

    Notes
    -----
    Years that are not in the notebook-level `leap_years` carry an artificial
    0% row for the 29th of February, so their Spark average is taken over 366
    rows; it is rescaled by 366/365 to obtain the average over 365 real days.
    The test is made on the *calendar year*: testing the column position
    instead would rescale every year, leap years included.
    """
    rows = df.filter(F.col('Year').isin(*years)) \
             .groupBy('Year') \
             .avg('DailyCanceledFlightsPercentage') \
             .withColumnRenamed('avg(DailyCanceledFlightsPercentage)', 'AverageDailyCanceledFlightsPercentage') \
             .select('Year', 'AverageDailyCanceledFlightsPercentage') \
             .collect()

    # Position of each requested year inside the output array
    position_of = {year: position for position, year in enumerate(years)}

    nb_years = len(position_of)
    data = np.zeros(nb_years)
    for row in rows:
        calendar_year, avg_pen = row[0], row[1]
        # Non-leap year: compensate for the padded 0% row of February 29th
        if calendar_year not in leap_years:
            avg_pen = avg_pen * (366.0 / 365.0)
        data[position_of[calendar_year]] = avg_pen
    res = pd.DataFrame({'Canceled flights': data}, index=years)
    return res

def plot_average_canceled_flights(years, df, ax):
    """Draw, on `ax`, a bar chart of the yearly average canceled flights percentage.

    `years` and `df` are forwarded to `get_average_df`; when the resulting
    table is empty, 'No data' is printed and nothing is drawn.
    """
    yearly_averages = get_average_df( years, df)
    if yearly_averages.empty:
        print('No data')
        return
    yearly_averages.plot.bar(title='Average canceled flights percentage', rot=0, ax=ax)


In [103]:
def ui_callback(years, df):
    """Redraw both charts for the selected range of years.

    `years` is indexed as a (first, last) pair and the range is inclusive on
    both ends; `df` is handed over unchanged to the two plotting helpers.
    """
    first_year, last_year = years[0], years[1]
    selected_years = range(first_year, last_year + 1)

    plt.figure(figsize=(15,12))
    plt.clf()

    # Upper panel: daily time series, one line per year
    daily_ax = plt.subplot(211)
    plot_canceled_time_series(selected_years, df, daily_ax)

    # Lower panel: one bar per year with the yearly average
    average_ax = plt.subplot(212)
    plot_average_canceled_flights(selected_years, df, average_ax)

    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95, hspace=0.25, wspace=0.35)
    plt.show()

# Years selection range: (label, value) pairs for every year from 1994 to 2008
years = [(str(y), y) for y in range(1994, 2009)]

# Range slider; the callback only fires once the handle is released
years_w = widgets.SelectionRangeSlider(
    options=years,
    index=(0, 2),
    description='Years',
    continuous_update=False,
)
ui = widgets.HBox([years_w])

In [105]:
# Bind the slider value and the (fixed) dataset to the callback arguments
callback_inputs = {
    'years': years_w,
    'df': widgets.fixed(unified_dataset),
}
out = widgets.interactive_output(ui_callback, callback_inputs)
display(ui, out)

HBox(children=(SelectionRangeSlider(continuous_update=False, description='Years', index=(0, 2), options=(('199…

Output()

NOTE: Day number 60 is the 29th of February, so the value plotted there is meaningful only for leap years; for all other years the percentage is set to 0. The yearly average takes this into account: it is computed over 366 days for leap years and over 365 days for all other years.