## Initialize PySpark

In [1]:
# Find Apache Spark on this machine.
# The SPARK_HOME environment variable takes precedence so the notebook is not
# tied to one developer's home directory; the original path is the fallback.
import os

import findspark

findspark.init(os.environ.get('SPARK_HOME', '/Users/giacomogregori/spark'))

In [2]:
from pyspark.sql import SparkSession

# Dev mode switch: set to False when performing real analytics
DEV = True

# Spark SQL session used for every DataFrame in this notebook
master = 'local[2]'
appName = 'Cancelled flights percentages'
spark = (
    SparkSession.builder
    .appName(appName)
    .master(master)
    .getOrCreate()
)

## Load data

In [3]:
from preprocessing_utils import *

# Run the preprocessing routine that matches the current mode
if not DEV:
    # Production preprocessing
    perform_dataset_preprocessing(spark)
else:
    # DEV preprocessing
    perform_DEV_dataset_preprocessing(spark)

--------- DEV mode ON ---------
Starting preprocessing of ../dataset/1994.csv.bz2
Preprocessing NOT performed.
Preprocessed dataset already exists: ../dataset/preprocessed_dataset_1994.parquet



In [4]:
# Load the preprocessed parquet dataset: DEV loader in DEV mode,
# production loader otherwise
df = (
    load_DEV_preprocessed_dataset(spark)
    if DEV
    else load_preprocessed_dataset(spark)
)

--------- DEV mode ON ---------
Peprocessed dataset loaded.
../dataset/preprocessed_dataset_1994.parquet


In [5]:
# Project the DataFrame down to the four columns this analysis uses
df = df.select('Year', 'Month', 'DayofMonth', 'Cancelled')

# Inspect the schema and the first rows
df.printSchema()
df.show(10)

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)

+----+-----+----------+---------+
|Year|Month|DayofMonth|Cancelled|
+----+-----+----------+---------+
|1994|    1|         7|        0|
|1994|    1|         8|        0|
|1994|    1|        10|        0|
|1994|    1|        11|        0|
|1994|    1|        12|        0|
|1994|    1|        13|        1|
|1994|    1|        14|        0|
|1994|    1|        15|        0|
|1994|    1|        17|        0|
|1994|    1|        18|        0|
+----+-----+----------+---------+
only showing top 10 rows



In [6]:
# Summary statistics (count, mean, stddev, min, max) of the Cancelled flag
df.select('Cancelled').describe().show()

+-------+--------------------+
|summary|           Cancelled|
+-------+--------------------+
|  count|             5180048|
|   mean|0.012884050495284986|
| stddev|  0.1127743507776497|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [7]:
# Parse dates to datetime format
import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType


def make_date(year, month, day):
    """Build a timestamp Column (midnight of the given day) from date parts.

    Implemented with Spark's built-in column functions rather than a Python
    UDF, so rows do not have to be shipped to a Python worker one at a time.

    Args:
        year, month, day: Columns (or column names) holding the integer
            year, month and day of month.

    Returns:
        A timestamp Column parsed from the text 'year-month-day'.
    """
    # 'yyyy-M-d' accepts one- or two-digit months and days (e.g. '1994-1-7').
    return F.to_timestamp(F.concat_ws('-', year, month, day), 'yyyy-M-d')


df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), 'Cancelled')
#df.show(10)

In [8]:
# Keep only the rows whose Cancelled flag equals 1
cancelled_flights = df.where(df.Cancelled == 1)

# Per-day row counts: all flights, then cancelled flights only
all_flights_count = df.groupBy('Date').count()
cancelled_flights_count = cancelled_flights.groupBy('Date').count()

# Preview both aggregates
all_flights_count.show(10)
cancelled_flights_count.show(10)

+-------------------+-----+
|               Date|count|
+-------------------+-----+
|1994-10-26 00:00:00|14838|
|1994-10-20 00:00:00|14800|
|1994-12-02 00:00:00|14815|
|1994-07-09 00:00:00|13614|
|1994-12-16 00:00:00|14653|
|1994-02-24 00:00:00|14121|
|1994-12-01 00:00:00|14510|
|1994-02-07 00:00:00|14126|
|1994-01-19 00:00:00|13776|
|1994-04-25 00:00:00|14389|
+-------------------+-----+
only showing top 10 rows

+-------------------+-----+
|               Date|count|
+-------------------+-----+
|1994-10-26 00:00:00|   36|
|1994-10-20 00:00:00|  212|
|1994-12-02 00:00:00|   99|
|1994-07-09 00:00:00|  136|
|1994-12-16 00:00:00|   88|
|1994-02-24 00:00:00|  471|
|1994-12-01 00:00:00|  163|
|1994-02-07 00:00:00|  213|
|1994-01-19 00:00:00|  973|
|1994-04-25 00:00:00|  122|
+-------------------+-----+
only showing top 10 rows



In [9]:
# Store the output DataFrames (or load them if they already exist on disk)
from pathlib import Path


def _store_or_load(result_df, dataset_path):
    """Write `result_df` as parquet to `dataset_path`, or load what is stored there.

    If `dataset_path` is already a directory, the DataFrame stored there is
    read back with the global `spark` session and returned in place of
    `result_df`. Otherwise `result_df` is written in parquet format and
    returned unchanged.

    NOTE(review): an existing directory is reused without checking that it was
    produced from the same input (e.g. DEV vs. production) -- confirm this is
    the intended caching behaviour.
    """
    if Path(dataset_path).is_dir():
        return spark.read.load(dataset_path)
    result_df.write.mode('overwrite').save(dataset_path, format='parquet')
    return result_df


all_flights_count_dataset = '../dataset/all_flights_count_dataset.parquet'
all_flights_count = _store_or_load(all_flights_count, all_flights_count_dataset)

cancelled_flights_count_dataset = '../dataset/cancelled_flights_count_dataset.parquet'
cancelled_flights_count = _store_or_load(cancelled_flights_count, cancelled_flights_count_dataset)

In [10]:
# Give each 'count' column a distinct name so the two tables can be joined
all_flights_count = all_flights_count.withColumnRenamed('count', 'total_count')
cancelled_flights_count = cancelled_flights_count.withColumnRenamed('count', 'canceled_count')

In [11]:
# Join the tables on Date.
# A left join keeps every day that has flights; a day with no cancelled
# flights has no row in cancelled_flights_count, so its missing
# canceled_count is filled with 0 instead of the day being dropped.
unified_dataset = (
    all_flights_count
    .join(cancelled_flights_count, ["Date"], "left")
    .fillna(0, subset=["canceled_count"])
)

unified_dataset.show(10)

+-------------------+-----------+--------------+
|               Date|total_count|canceled_count|
+-------------------+-----------+--------------+
|1994-01-15 00:00:00|      12540|           151|
|1994-01-16 00:00:00|      13013|           623|
|1994-04-21 00:00:00|      14299|            28|
|1994-02-06 00:00:00|      13391|            79|
|1994-02-11 00:00:00|      14242|          3649|
|1994-07-04 00:00:00|      13844|            62|
|1994-08-12 00:00:00|      14722|            36|
|1994-02-05 00:00:00|      12687|            86|
|1994-03-15 00:00:00|      14554|            45|
|1994-07-17 00:00:00|      14160|            28|
+-------------------+-----------+--------------+
only showing top 10 rows



In [12]:
# Cancelled flights as a percentage of all flights, per day
unified_dataset = unified_dataset.withColumn(
    "DailyCanceledFlightsPercentage",
    unified_dataset["canceled_count"] / unified_dataset["total_count"] * 100,
)
unified_dataset.show(10)

+-------------------+-----------+--------------+------------------------------+
|               Date|total_count|canceled_count|DailyCanceledFlightsPercentage|
+-------------------+-----------+--------------+------------------------------+
|1994-01-15 00:00:00|      12540|           151|              1.20414673046252|
|1994-01-16 00:00:00|      13013|           623|             4.787520172135556|
|1994-04-21 00:00:00|      14299|            28|           0.19581788936289252|
|1994-02-06 00:00:00|      13391|            79|            0.5899484728549026|
|1994-02-11 00:00:00|      14242|          3649|             25.62140148855498|
|1994-07-04 00:00:00|      13844|            62|            0.4478474429355677|
|1994-08-12 00:00:00|      14722|            36|           0.24453199293574243|
|1994-02-05 00:00:00|      12687|            86|             0.677859225979349|
|1994-03-15 00:00:00|      14554|            45|            0.3091933489075168|
|1994-07-17 00:00:00|      14160|       

In [13]:
# Keep only the date and its cancellation percentage
unified_dataset = unified_dataset[['Date', 'DailyCanceledFlightsPercentage']]

In [14]:
# Persist the final DataFrame as parquet, or reuse the stored copy if present
final_dataset = '../dataset/canceled_analitics.parquet'

path = Path(final_dataset)
if path.is_dir():
    # Already on disk: read it back instead of writing
    unified_dataset = spark.read.load(final_dataset)
else:
    unified_dataset.write.mode('overwrite').save(final_dataset, format='parquet')

In [15]:
# Collect the result as a list of tuples with schema:
# ('Date', 'Percentage')
cancel_data = [tuple(row) for row in unified_dataset.collect()]
print(cancel_data[:100])

[(datetime.datetime(1994, 1, 15, 0, 0), 1.20414673046252), (datetime.datetime(1994, 1, 16, 0, 0), 4.787520172135556), (datetime.datetime(1994, 4, 21, 0, 0), 0.19581788936289252), (datetime.datetime(1994, 2, 6, 0, 0), 0.5899484728549026), (datetime.datetime(1994, 2, 11, 0, 0), 25.62140148855498), (datetime.datetime(1994, 7, 4, 0, 0), 0.4478474429355677), (datetime.datetime(1994, 8, 12, 0, 0), 0.24453199293574243), (datetime.datetime(1994, 2, 5, 0, 0), 0.677859225979349), (datetime.datetime(1994, 3, 15, 0, 0), 0.3091933489075168), (datetime.datetime(1994, 7, 17, 0, 0), 0.19774011299435026), (datetime.datetime(1994, 10, 28, 0, 0), 0.24920859432882064), (datetime.datetime(1994, 11, 24, 0, 0), 0.12148559527941688), (datetime.datetime(1994, 12, 10, 0, 0), 0.5966142143336565), (datetime.datetime(1994, 2, 14, 0, 0), 0.937742367623211), (datetime.datetime(1994, 6, 13, 0, 0), 0.9539725645846389), (datetime.datetime(1994, 6, 15, 0, 0), 0.44778175806007164), (datetime.datetime(1994, 8, 11, 0, 0), 

# Data Visualization

In [16]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [17]:
def get_pd_dataframe(years, df):
    rows = df.filter(F.col('Date').date.year.isin(*years)) \
             .select('Date', 'DailyCanceledFlightsPercentage') \
             .orderBy('Datw') \
             .collect()
    
    nb_years = len(years)
    nb_days = 365
    data = np.zeros((nb_days, nb_years))
    for row in rows:
        year = row[0].date.year - years[0]
        day = row[0].date.timetuple().yday - 1
        
        pen = row[1]

        if day > 364: continue
        data[day, year] = pen
    columns = [str(y) for y in years]
    indices = range(1, 366)
    res = pd.DataFrame(data=data, columns=columns, index=indices)
    return res
#Check sul controllo di anni bisestili

def plot_canceled_time_series(date, df):
    """Plot the daily cancelled-flights percentage, one line per year.

    Args:
        date: Years to plot; passed straight to `get_pd_dataframe` as its
            `years` argument.
        df: Spark DataFrame passed to `get_pd_dataframe`.

    Prints 'No data' instead of plotting when the resulting frame is empty.
    """
    df = get_pd_dataframe(date, df)
    # The frame is indexed by day of year (1..365), so label it as daily data
    # and spread the ticks over the whole year rather than over 52 weeks.
    title = 'Daily canceled flights percentage'
    if df.empty:
        print('No data')
    else:
        df.plot(title=title, grid=True, xticks=range(0, 366, 30), colormap='tab20c')

In [18]:
# NOTE(review): this cell holds disabled code wrapped in a string literal, so
# none of the functions below are defined; the cell only evaluates to (and
# displays) the string itself.
# The disabled helpers filter on 'Year' and average 'WeeklyWeatherDelays',
# columns that `unified_dataset` (Date, DailyCanceledFlightsPercentage) does
# not contain -- they would need adapting before being re-enabled.
'''
def get_average_df(years, df):
    rows = df.filter(F.col('Year').isin(*years)) \
             .groupBy('Year') \
             .avg('WeeklyWeatherDelays') \
             .withColumnRenamed('avg(WeeklyWeatherDelays)', 'AverageWeatherDelaysPercentage') \
             .select('Year', 'AverageWeatherDelaysPercentage') \
             .collect()
    
    nb_years = len(years)
    data = np.zeros(nb_years)
    for row in rows:
        year = row[0] - years[0]
        avg_pen = row[1]
        data[year] = avg_pen 
    res = pd.DataFrame({'Weather delays': data}, index=years)
    return res

def plot_average_canceled_flights(years, df):
    df = get_average_df( years, df)
    title = 'Average canceled flights percentage'
    if df.empty:
        print('No data ')
    else:
        df.plot.bar( title=title, rot=0)
'''

"\ndef get_average_df(years, df):\n    rows = df.filter(F.col('Year').isin(*years))              .groupBy('Year')              .avg('WeeklyWeatherDelays')              .withColumnRenamed('avg(WeeklyWeatherDelays)', 'AverageWeatherDelaysPercentage')              .select('Year', 'AverageWeatherDelaysPercentage')              .collect()\n    \n    nb_years = len(years)\n    data = np.zeros(nb_years)\n    for row in rows:\n        year = row[0] - years[0]\n        avg_pen = row[1]\n        data[year] = avg_pen \n    res = pd.DataFrame({'Weather delays': data}, index=years)\n    return res\n\ndef plot_average_canceled_flights(years, df):\n    df = get_average_df( years, df)\n    title = 'Average canceled flights percentage'\n    if df.empty:\n        print('No data ')\n    else:\n        df.plot.bar( title=title, rot=0)\n"

In [19]:
def ui_callback(years, df):
    """Plot the time series for the inclusive (first, last) year pair in `years`."""
    first_year, last_year = years[0], years[1]
    selected_years = range(first_year, last_year + 1)
    plot_canceled_time_series(selected_years, df)
    # plot_average_canceled_flights(selected_years, df)

# Years selection range: (label, value) options for 1994 through 2008
years = [(str(year), year) for year in range(1994, 2009)]
years_w = widgets.SelectionRangeSlider(
    options=years,
    index=(0, 2),
    description='Years',
    continuous_update=False,
)
ui = widgets.HBox([years_w])

In [20]:
# Bind the slider value and the (fixed) result DataFrame to the plotting
# callback, then show the controls together with the output area
out = widgets.interactive_output(
    ui_callback,
    {
        'years': years_w,
        'df': widgets.fixed(unified_dataset),
    },
)
display(ui, out)

AnalysisException: "Can't extract value from Date#233: need struct type but got timestamp;"

HBox(children=(SelectionRangeSlider(continuous_update=False, description='Years', index=(0, 2), options=(('199…

Output()