In [1]:
# Find Apache Spark on this machine.
# The SPARK_HOME environment variable is consulted first so the notebook is not
# tied to a single machine; the original hard-coded location is kept only as a
# fallback for the setup the notebook was written on.
import os

import findspark

findspark.init(os.environ.get('SPARK_HOME', '/Users/giacomogregori/spark'))

In [2]:
from pyspark.sql import SparkSession

# Settings of the Spark SQL session used for the DataFrames in this notebook
master = 'local[4]'
appName = 'Weather delays percentages'

# Fetch the existing session if there is one, otherwise create it
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

In [7]:
from pathlib import Path

full_data = '../dataset/*.csv.bz2'
full_data_parquet = '../dataset/RITA_1994-2008.parquet'
path = Path(full_data_parquet)

# Create the Parquet copy only when its directory is not on disk yet: read all
# the compressed CSV files, turn the literal string 'NA' into null and store
# the result. This is going to take a lot of time. Just wait.
if not path.is_dir():
    df = spark.read.csv(full_data, sep=',', header=True, inferSchema=True)
    df.replace('NA', None).write.format('parquet').save(full_data_parquet)

# From here on always work on the Parquet copy
df = spark.read.format('parquet').load(full_data_parquet)

In [8]:
# Display the column names of the DataFrame loaded from the Parquet copy
df.columns

['Year',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'ArrTime',
 'CRSArrTime',
 'UniqueCarrier',
 'FlightNum',
 'TailNum',
 'ActualElapsedTime',
 'CRSElapsedTime',
 'AirTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiIn',
 'TaxiOut',
 'Cancelled',
 'CancellationCode',
 'Diverted',
 'CarrierDelay',
 'WeatherDelay',
 'NASDelay',
 'SecurityDelay',
 'LateAircraftDelay']

In [9]:
import datetime

import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType

# Drop cancelled flights.
# `DataFrame.drop` removes *columns*: handed the boolean expression
# `Cancelled == 1` it matches no column and silently keeps every row, so the
# rows have to be removed with `filter`. Rows whose 'Cancelled' value is null
# do not satisfy the condition and are filtered out as well.
df = df.filter(df['Cancelled'] != 1)

# UDF building a timestamp (midnight) out of the year / month / day columns
make_date = F.udf(lambda year, month, day: datetime.datetime(year, month, day), TimestampType())

# UDF returning the ISO week number, i.e. the second item of `isocalendar()`
week_year = F.udf(lambda date: date.isocalendar()[1], IntegerType())

# Delay columns carried through both projections
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

# Done in two steps because 'WeekYear' is computed from the new 'Date' column
df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), *delay_columns)
df = df.select('Date', week_year('Date').alias('WeekYear'), *delay_columns)
df.show(10)

+-------------------+--------+------------+------------+--------+-------------+-----------------+
|               Date|WeekYear|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+-------------------+--------+------------+------------+--------+-------------+-----------------+
|2007-01-01 00:00:00|       1|           0|           0|       0|            0|                0|
|2007-01-01 00:00:00|       1|           0|           0|       0|            0|                0|
|2007-01-01 00:00:00|       1|           3|           0|       0|            0|               31|
|2007-01-01 00:00:00|       1|          23|           0|       0|            0|                3|
|2007-01-01 00:00:00|       1|           0|           0|       0|            0|                0|
|2007-01-01 00:00:00|       1|           0|           0|       0|            0|                0|
|2007-01-01 00:00:00|       1|          46|           0|       0|            0|                1|
|2007-01-01 00:00:00

In [None]:
# Flights that have a WeatherDelay
weather_delayed_flights = df.filter(df['WeatherDelay'] > 0)

# Flights that have a delay of any kind.
# The original expression never closed the `df.filter(` parenthesis, which made
# the whole cell a SyntaxError.
delayed_flights = df.filter(
    (df['CarrierDelay'] > 0)
    | (df['WeatherDelay'] > 0)
    | (df['NASDelay'] > 0)
    | (df['SecurityDelay'] > 0)
    | (df['LateAircraftDelay'] > 0)
)

# Number of flights per (year, week) that had a weather delay or a general delay.
# NOTE(review): 'Year' is the calendar year of 'Date' while 'WeekYear' is an ISO
# week number, so days around New Year may be counted in a week of the
# neighbouring ISO year -- confirm whether that matters for the analysis.
weather_delays = weather_delayed_flights.groupBy([F.year('Date').alias('Year'), 'WeekYear']).count()
general_delays = delayed_flights.groupBy([F.year('Date').alias('Year'), 'WeekYear']).count()

weather_delays.show(10)
general_delays.show(10)

+----+--------+-----+
|Year|WeekYear|count|
+----+--------+-----+
|2008|      35| 1413|
|2005|      29| 3018|
|2005|      49| 3513|
|2007|       6| 2165|
|2007|      52| 3271|
|2005|       5| 1267|
|2007|      28| 2940|
|2005|      51| 1584|
|2005|      25| 1653|
|2008|      28| 3249|
+----+--------+-----+
only showing top 10 rows

