## Initialize PySpark

In [1]:
# Find Apache Spark on this machine.
# The SPARK_HOME environment variable takes precedence so the notebook is not
# tied to a single machine; the original hard-coded install path is kept as
# the fallback so the previous behaviour is unchanged where SPARK_HOME is unset.
import os
import findspark
findspark.init(os.environ.get('SPARK_HOME', '/Users/giacomogregori/spark'))

In [2]:
from pyspark.sql import SparkSession

# Session settings: application name and the Spark master URL
appName = 'Cancelled flights percentages'
master = 'local[4]'

# Create the Spark SQL session used for DataFrames
# (getOrCreate reuses an already-running session if there is one)
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

## Load data

In [3]:
from pathlib import Path
full_data = '../dataset/*.csv.bz2'
full_data_parquet = '../dataset/RITA_1994-2008.parquet'

path = Path(full_data_parquet)
# If the Parquet copy of the dataset is not found, build it from the full
# compressed CSV files. This is going to take a lot of time. Just wait.
if not path.is_dir():
    csv_df = spark.read.csv(full_data, inferSchema=True, header=True, sep=',')
    # Replace the literal 'NA' placeholders with nulls before saving.
    # The output path must be `full_data_parquet` -- the same path that is
    # checked above and loaded below. (It previously referenced the undefined
    # name `canceled_data`, which raised NameError on a fresh run.)
    csv_df.replace('NA', None) \
        .write \
        .save(full_data_parquet, format='parquet')

# Load the Parquet dataset
df = spark.read.load(full_data_parquet, format='parquet')

In [4]:
# Inspect the data: print the column schema, then preview the first ten rows
df.printSchema()
df.show(n=10)

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- TaxiIn: string (nullable = true)
 |-- TaxiOut: string (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- Carr

In [5]:
# Summary statistics (count, mean, stddev, min, max) of the Cancelled column
df.select('Cancelled').describe().show()

+-------+--------------------+
|summary|           Cancelled|
+-------+--------------------+
|  count|            91469371|
|   mean|0.021325903727926587|
| stddev|  0.1444683695010413|
|    min|                   0|
|    max|                   1|
+-------+--------------------+



In [6]:
# Parse dates to datetime format
import datetime
import pyspark.sql.functions as F
from pyspark.sql.types import TimestampType, IntegerType


def make_date(year, month, day):
    """Build a timestamp Column from year, month and day-of-month columns.

    Uses built-in Spark column expressions instead of a Python UDF, so rows
    are not shipped to a Python worker one at a time.

    Parameters
    ----------
    year, month, day : Column or str
        Columns (or column names) holding the date components.

    Returns
    -------
    Column
        The 'year-month-day' string cast to TimestampType. Because `concat`
        propagates nulls, a null component yields a null timestamp instead of
        raising inside a Python worker.
    """
    year, month, day = (
        (F.col(part) if isinstance(part, str) else part).cast('string')
        for part in (year, month, day)
    )
    dash = F.lit('-')
    return F.concat(year, dash, month, dash, day).cast(TimestampType())


df = df.select(make_date(df['Year'], df['Month'], df['DayofMonth']).alias('Date'), 'Cancelled')
#df.show(10)

In [7]:
# Number of flights per day, over every flight in the dataset
all_flights_count = df.groupBy('Date').count()

# Cancelled flights are the rows flagged with Cancelled == 1;
# count those per day as well
cancelled_flights = df.where(df['Cancelled'] == 1)
cancelled_flights_count = cancelled_flights.groupBy('Date').count()

# Preview the first ten rows of each daily tally
for daily_counts in (all_flights_count, cancelled_flights_count):
    daily_counts.show(10)

KeyboardInterrupt: 