In [1]:
# Find Apache Spark on this machine
import findspark
findspark.init('/Users/giacomogregori/spark')

In [2]:
from pyspark.sql import SparkSession

# Build a Spark SQL Session for DataFrames
master = 'local[2]'
appName = 'Weather delays percentages'
spark = SparkSession \
    .builder \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

In [3]:
from pathlib import Path
full_data = '../dataset/*.csv.bz2'
full_data_parquet = '../dataset/RITA_1994-2008.parquet'

path = Path(full_data_parquet)
# If reduced dataset is not found, load the full compressed dataset and reduce it.
# This is going to take lot of time. Just wait.
if not path.is_dir():
    df = spark.read.csv(full_data, inferSchema=True, header=True, sep=',')
    df.replace('NA', None) \
    .write \
    .save(full_data_parquet, format='parquet')

# Load the reduced dataset
df = spark.read.load(full_data_parquet, format='parquet')

In [4]:
df.columns

['Year',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'UniqueCarrier',
 'FlightNum',
 'TailNum',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiIn',
 'TaxiOut',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

In [12]:
# Drop cancelled flights
df = df.drop(df['Cancelled'] == 1)


# Parse dates to datetime format
import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType

make_date = lambda year, month, day : datetime.datetime(year, month, day) 
make_date = F.udf(make_date, TimestampType())

week_year = lambda date : date.isocalendar()[1]
week_year = F.udf(week_year, IntegerType())

In [None]:
df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), \
               'DayOfWeek', 'CarrierDelay', 'WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay')
df = df.select('Date', week_year('Date').alias('WeekYear'), 'CarrierDelay', 'WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay')
df.show(10)

In [9]:
# Flights that have a WeatherDelay
weather_delayed_flights = df.filter(df['WeatherDelay'] > 0)

# Flights that have a Delay
delayed_flights = df.filter((df['CarrierDelay'] > 0) | (df['WeatherDelay'] > 0) | (df['NASDelay'] > 0) | (df['SecurityDelay'] > 0) | (df['LateAircraftDelay'] > 0))

# Number of times per week flights had a weather delay or a general delay  
weather_delays = weather_delayed_flights.groupBy([F.year('Date').alias('Year'), 'WeekYear']).count()
general_delays = delayed_flights.groupBy([F.year('Date').alias('Year'), 'WeekYear']).count()
                            
weather_delays = weather_delays.select('Year', 'WeekYear', weather_delays['count'].alias('weather_count'))
general_delays = general_delays.select('Year', 'WeekYear', general_delays['count'].alias('general_count'))

                            
weather_delays.show(10)
general_delays.show(10)

+----+--------+-------------+
|Year|WeekYear|weather_count|
+----+--------+-------------+
|2008|      35|         1413|
|2005|      29|         3018|
|2005|      49|         3513|
|2007|       6|         2165|
|2007|      52|         3271|
|2005|       5|         1267|
|2007|      28|         2940|
|2005|      51|         1584|
|2005|      25|         1653|
|2008|      28|         3249|
+----+--------+-------------+
only showing top 10 rows

+----+--------+-------------+
|Year|WeekYear|general_count|
+----+--------+-------------+
|2008|      35|        19977|
|2005|      29|        37064|
|2005|      49|        34575|
|2007|       6|        31743|
|2007|      52|        47949|
|2005|       5|        19279|
|2007|      28|        42737|
|2005|      51|        32616|
|2005|      25|        26598|
|2008|      28|        39097|
+----+--------+-------------+
only showing top 10 rows



In [31]:
# Join the tables
percentage_weather_delays = weather_delays \
                .join(general_delays, ["Year","WeekYear"])
    
percentage_weather_delays.show(10)

+----+--------+-------------+-------------+
|Year|WeekYear|weather_count|general_count|
+----+--------+-------------+-------------+
|2008|      35|         1413|        19977|
|2005|      29|         3018|        37064|
|2005|      49|         3513|        34575|
|2005|       5|         1267|        19279|
|2007|       6|         2165|        31743|
|2007|      52|         3271|        47949|
|2007|      28|         2940|        42737|
|2005|      51|         1584|        32616|
|2005|      25|         1653|        26598|
|2005|      22|         2670|        26607|
+----+--------+-------------+-------------+
only showing top 10 rows



In [32]:
percentage_weather_delays = percentage_weather_delays.withColumn("WeeklyWeatherDelays", (F.col("weather_count") / F.col("general_count"))*100)
#['WeeklyWeatherDelays']= percentage_weather_delays['weather_count'] / percentage_weather_delays['general_count']
percentage_weather_delays.show(10)

+----+--------+-------------+-------------+--------------------+
|Year|WeekYear|weather_count|general_count| WeeklyWeatherDelays|
+----+--------+-------------+-------------+--------------------+
|2008|      35|         1413|        19977| 0.07073134104219853|
|2005|      29|         3018|        37064| 0.08142672134685949|
|2005|      49|         3513|        34575| 0.10160520607375272|
|2005|       5|         1267|        19279|   0.065719176305825|
|2007|       6|         2165|        31743| 0.06820401348328765|
|2007|      52|         3271|        47949|  0.0682183152933325|
|2007|      28|         2940|        42737| 0.06879284928750264|
|2005|      51|         1584|        32616| 0.04856512141280353|
|2005|      25|         1653|        26598|0.062147529889465376|
|2005|      22|         2670|        26607| 0.10034953207802458|
+----+--------+-------------+-------------+--------------------+
only showing top 10 rows



In [33]:
percentage_weather_delays.select('Year','WeekYear','WeeklyWeatherDelays')

DataFrame[Year: int, WeekYear: int, WeeklyWeatherDelays: double]

In [4]:
# Store output Dataframe (or load it if already existing)
final_dataset = '../dataset/weather_analitics.parquet'

path= Path(final_dataset)
if not path.is_dir():
    percentage_weather_delays.write.mode('overwrite').save(final_dataset, format='parquet')
else:
    percentage_weather_delays = spark.read.load(final_dataset)

In [35]:
# Output a list of tuples of schema:
# ('Year', 'WeekYear', 'Percentage')
weather_data = percentage_weather_delays.rdd.map(tuple).collect()
print(weather_data[:100])

[(2008, 35, 1413, 19977, 0.07073134104219853), (2005, 29, 3018, 37064, 0.08142672134685949), (2005, 49, 3513, 34575, 0.10160520607375272), (2005, 5, 1267, 19279, 0.065719176305825), (2007, 6, 2165, 31743, 0.06820401348328765), (2007, 52, 3271, 47949, 0.0682183152933325), (2007, 28, 2940, 42737, 0.06879284928750264), (2005, 51, 1584, 32616, 0.04856512141280353), (2005, 25, 1653, 26598, 0.062147529889465376), (2005, 22, 2670, 26607, 0.10034953207802458), (2005, 28, 6317, 47030, 0.1343185200935573), (2008, 28, 3249, 39097, 0.08310100519221424), (2005, 19, 1216, 21523, 0.05649770013473958), (2005, 23, 3223, 37790, 0.08528711299285525), (2006, 24, 2135, 28653, 0.074512267476355), (2008, 45, 512, 17228, 0.029719061992105875), (2004, 44, 1928, 25645, 0.07518034704620784), (2008, 8, 3173, 41191, 0.07703139035226142), (2003, 50, 2617, 28252, 0.09263061022228515), (2006, 46, 2542, 34278, 0.07415835229593325), (2003, 32, 2467, 26424, 0.09336209506509234), (2004, 6, 3462, 33331, 0.1038672707089496

# Data Visualization

In [13]:
# Hide warnings if there are any
import warnings
warnings.filterwarnings('ignore')

%matplotlib ipympl

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
def get_pd_dataframe( years, df):
    rows = df.filter(F.col('Year').isin(*years)) \
             .select('Year', 'WeekYear', 'WeeklyWeatherDelays') \
             .orderBy('Year', 'WeekYear') \
             .collect()
    
    nb_years = len(years)
    nb_weeks = 52
    data = np.zeros((nb_weeks, nb_years))
    for row in rows:
        year = row[0] - years[0]
        week = row[1] - 1
        per = row[2]

        if week > 51: continue
        data[week, year] = per
    columns = [str(y) for y in years]
    indices = range(1, 53)
    res = pd.DataFrame(data=data, columns=columns, index=indices)
    return res

def plot_weather_time_series(years, df):
    df = get_pd_dataframe(years, df)
    title = 'Weekly weather delays percentage'
    if df.empty:
        print('No data')
    else:
        df.plot(title=title, grid=True, xticks=range(0, 53, 4), colormap='tab20c')

In [15]:
def get_average_df(years, df):
    rows = df.filter(F.col('Year').isin(*years)) \
             .groupBy('Year') \
             .avg('WeeklyWeatherDelays') \
             .withColumnRenamed('avg(WeeklyWeatherDelays)', 'AverageWeatherDelaysPercentage') \
             .select('Year', 'AverageWeatherDelaysPercentage') \
             .collect()
    
    nb_years = len(years)
    data = np.zeros(nb_years)
    for row in rows:
        year = row[0] - years[0]
        avg_pen = row[1]
        data[year] = avg_pen 
    res = pd.DataFrame({'Weather delays': data}, index=years)
    return res

def plot_average_weather_delays(years, df):
    df = get_average_df( years, df)
    title = 'Average weather delays percentage'
    if df.empty:
        print('No data ')
    else:
        df.plot.bar( title=title, rot=0)

In [16]:
def ui_callback( years, df):
    plot_weather_time_series(range(years[0], years[1] + 1), df)
    plot_average_weather_delays(range(years[0], years[1] + 1), df)

# Years selection range
years = range(1994, 2009)
years = [(str(y), y) for y in years]
years_w = widgets.SelectionRangeSlider(options=years,
                                       index=(0, 2),
                                       description='Years',
                                       continuous_update=False)
#We could improve the visualization allowing to select the delay type percentage. 
#Now can be visualized only the weather delays percentage

ui = widgets.HBox([years_w])

In [17]:
out = widgets.interactive_output(ui_callback, {'years': years_w, 'df': widgets.fixed(percentage_weather_delays)})
display(ui, out)

FigureCanvasNbAgg()

FigureCanvasNbAgg()

HBox(children=(SelectionRangeSlider(continuous_update=False, description='Years', index=(0, 2), options=(('199…

Output()