In [1]:
#Data is mounted from Amazon S3.
#Credentials are read from the environment so that no secret is stored in the notebook;
#set AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY on the cluster before (re)mounting.
import os

ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
#"/" inside the secret has to be escaped before it is embedded in the s3a URI below
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = "otp-data"
MOUNT_NAME = "otp-data"

#The mount only has to be executed once per workspace, hence it is left commented out
#dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType

#To check whether all the necessary data is mounted, "mnt" stands for the mounted folder
FOLDER = "1_year_data"
PATH = "/mnt/%s/%s" % (MOUNT_NAME,FOLDER)

display(dbutils.fs.ls(PATH))

In [2]:
#Basic loading and validity check of the dataset, should test with hashes if time permits

DFTEMP = spark.read.csv("/mnt/otp-data/1_year_data/On_Time_On_Time_Performance_2017_1.csv", header=True, inferSchema=True)
assert (DFTEMP.count()==450017), "There is a problem with your dataset, counts don't match"

#Import all csv files into a single data-frame (takes about 10 minutes, do not run frequently)
#DF starts as None (not 0) so the "first file" check is an identity test rather than a DataFrame == int comparison
DF = None
for fileInfo in dbutils.fs.ls(PATH):
  monthlyDF = spark.read.csv("%s/%s" % (PATH,fileInfo.name), header=True, inferSchema=True)
  #union() is the non-deprecated spelling of unionAll(); both keep duplicates and match columns by position
  DF = monthlyDF if DF is None else DF.union(monthlyDF)

#Fail with a clear message instead of an AttributeError when the folder is empty
if DF is None:
  raise ValueError("No files found under %s" % PATH)

assert (DF.count()==5660970), "There is a problem with your dataset, counts don't match"

#Show the first rows of the dataframe
display(DF.limit(10))

In [3]:
#printSchema() writes the schema itself and returns None, so it must not be wrapped in print
DF.printSchema()

In [4]:
#Create smaller dataframes for better testing: one random sample of roughly a month's data (0.083 ~= 1/12),
#the other using the rows with Month == 1. For the sample the row count will not match the fraction exactly.

#Fixed seed so the sample (and everything derived from it) is the same on every run
SAMPLE_SEED = 42

smallDF = DF.sample(False, 0.083, SAMPLE_SEED)
monthDF = DF.filter(DF.Month == 1)
smallDF.cache()
monthDF.cache()

print(smallDF.count())
print(monthDF.count())

In [5]:
#Build RDDs (similar to the ones used in apache log) from the dataframes; the sampled and monthly RDDs are cached.
#Do not use fullRDD outside of production.
fullRDD = DF.rdd
monthRDD = monthDF.rdd.cache()
smallRDD = smallDF.rdd.cache()

#Toggle for dataset used: a truthy FULL makes smallRDD refer to the full dataset
FULL = 1
smallRDD = fullRDD if FULL else smallRDD

In [6]:
#Simple helper functions
def reduceAdd (a,b):
  """Return a + b; used as the combining function for reduce / reduceByKey."""
  return a + b

#For a certain key in the RDD, count the entries per distinct value of that key
def numberOf(key,RDD):
  """Return an RDD of (value, count) pairs for record[key], ordered by descending count.

  key -- index or field name used to look up the value in each record
  RDD -- RDD whose records support record[key]
  """
  tagged = RDD.map(lambda record: (record[key], 1))
  totals = tagged.reduceByKey(reduceAdd)
  return totals.sortBy(lambda pair: -pair[1])

#For a certain key in the RDD, compute the portion of its entries compared to all
def shareOf(key,RDD):
  """Return an RDD of (value, fraction) pairs for record[key], ordered by descending fraction.

  The fraction is the count of a value divided by the sum of all counts.
  """
  countsByValue = numberOf(key,RDD)
  total = countsByValue.map(lambda pair: pair[1]).reduce(reduceAdd)

  def toShare(pair):
    return (pair[0], float(pair[1])/total)

  return countsByValue.map(toShare).sortBy(lambda pair: -pair[1])

In [7]:
#My part includes: 1. Route/delay relation. 2. Delay type investigation (origin? age of plane?)
#First, only the related fields are selected. Positions in each resulting record:
# 0 date "DayofMonth/Month/Year"   1 DayOfWeek         2 route "Origin-Dest"
# 3 DepDelayMinutes                4 ArrDelayMinutes   5 TailNum
# 6 CarrierDelay   7 WeatherDelay   8 NASDelay   9 SecurityDelay   10 LateAircraftDelay
smallRoute = smallRDD.map(lambda x: (
  str(x.DayofMonth)+"/"+str(x.Month)+"/"+str(x.Year),
  x.DayOfWeek,
  x.Origin+"-"+x.Dest,
  x.DepDelayMinutes,
  x.ArrDelayMinutes,
  x.TailNum,
  x.CarrierDelay,
  x.WeatherDelay,
  x.NASDelay,
  x.SecurityDelay,
  x.LateAircraftDelay))
#print() with a single argument behaves the same on Python 2 and Python 3
print(smallRoute.take(10))

In [8]:
#Number of flight records selected; print() form works on Python 2 and Python 3
print(smallRoute.count())

In [9]:
#Observe the data of routes: (route, number of flights), most frequent route first.
#Field 2 of each smallRoute record is the "Origin-Dest" route string; numberOf() performs the same
#map / reduceByKey / descending sort that was previously spelled out here with a Python-2-only lambda.
routeOccurNum = numberOf(2, smallRoute)
topTwentyFrequentRoute = routeOccurNum.take(20)
display(topTwentyFrequentRoute, "Route", "Frequency")

In [10]:
#Number of distinct routes; print() form works on Python 2 and Python 3
print(routeOccurNum.count())

In [11]:
#Number of flights for each individual route, routes ordered as in routeOccurNum
#routeFreq = routeOccurNum.map(lambda x: (x[1], 1)).reduceByKey(lambda x,y:x+y).sortBy(lambda (x,y):x).sortBy(lambda (x,y):-x)
#Collect once and split locally; two separate collect() calls would recompute the whole pipeline twice
routeCounts = routeOccurNum.collect()
aircraft = [pair[0] for pair in routeCounts]
flights = [pair[1] for pair in routeCounts]
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,4.5), facecolor='white', edgecolor='white')
plt.axis([0, len(aircraft), 0, max(flights)])
#The on/off flag is passed positionally because its keyword name differs between matplotlib versions
plt.grid(True, which='major', axis='y')
plt.ylabel('Number of Flight')
plt.xlabel('Individual Route')
plt.plot(flights)

display(plt.show())

In [12]:
#Distribution of routes over flight counts.
#routeFreq holds (number of flights, number of routes with exactly that many flights), highest flight count first.
#It is defined here because this cell and the following one need it.
routeFreq = routeOccurNum.map(lambda x: (x[1], 1)).reduceByKey(lambda x, y: x+y).sortBy(lambda pair: -pair[0])
#Take the first 500 buckets once and split locally instead of running two jobs
routeFreqHead = routeFreq.take(500)
aircraft = [pair[0] for pair in routeFreqHead]
flights = [pair[1] for pair in routeFreqHead]
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,4.5), facecolor='white', edgecolor='white')
plt.axis([2000, max(aircraft), 0, max(flights)])
#The on/off flag is passed positionally because its keyword name differs between matplotlib versions
plt.grid(True, which='major', axis='y')
plt.ylabel('Number of Route')
plt.xlabel('Number of Flight in a Year')
plt.plot(aircraft, flights)

display(plt.show())

In [13]:
#Largest number of routes sharing one flight count, then the buckets shared by exactly 122 routes.
#Uses flights and routeFreq from the preceding plotting cell.
#NOTE(review): 122 is presumably the maximum printed by an earlier run - confirm, it depends on the dataset used
print(max(flights))
print(routeFreq.filter(lambda x: x[1] == 122).collect())

In [14]:
#Percentage of delayed flights and average delay time (flights without delay count as 0) per route, for departure.
#Every flight becomes (route, (value, 1)); the pairs are summed per route and then divided.
#A missing delay (None) is treated as "not delayed", made explicit so it does not rely on Python 2 ordering of None.

#100.0 forces float division: with integer counts Python 2 would truncate the percentage
percentageDepDelay = (smallRoute
  .map(lambda x: (x[2], (1 if x[3] is not None and x[3] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 100.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))
averageDepDelay = (smallRoute
  .map(lambda x: (x[2], (x[3] if x[3] is not None and x[3] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 1.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))


In [15]:
#Busiest routes with their departure delay: (route, flights, average delay, percentage delayed)
#Joined records have the shape (route, ((flights, average), percentage)); indexed access replaces
#the Python-2-only tuple-parameter lambdas.
display(routeOccurNum.join(averageDepDelay).join(percentageDepDelay)
  .sortBy(lambda r: -r[1][0][0])
  .map(lambda r: (r[0], r[1][0][0], r[1][0][1], r[1][1]))
  .take(10))

In [16]:
#Routes with the highest percentage of delayed departures vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageDepDelay).join(percentageDepDelay)
  .sortBy(lambda r: -r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))


In [17]:
#Routes with the lowest percentage of delayed departures vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageDepDelay).join(percentageDepDelay)
  .sortBy(lambda r: r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))

In [18]:
#Percentage of delayed flights and average delay time (flights without delay count as 0) per route, for arrival.
#Every flight becomes (route, (value, 1)); the pairs are summed per route and then divided.
#A missing delay (None) is treated as "not delayed", made explicit so it does not rely on Python 2 ordering of None.

#100.0 forces float division: with integer counts Python 2 would truncate the percentage
percentageArrDelay = (smallRoute
  .map(lambda x: (x[2], (1 if x[4] is not None and x[4] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 100.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))
averageArrDelay = (smallRoute
  .map(lambda x: (x[2], (x[4] if x[4] is not None and x[4] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 1.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))

In [19]:
#Busiest routes with their arrival delay: (route, flights, average delay, percentage delayed)
#Joined records have the shape (route, ((flights, average), percentage)); indexed access replaces
#the Python-2-only tuple-parameter lambdas.
display(routeOccurNum.join(averageArrDelay).join(percentageArrDelay)
  .sortBy(lambda r: -r[1][0][0])
  .map(lambda r: (r[0], r[1][0][0], r[1][0][1], r[1][1]))
  .take(10))

In [20]:
#Routes with the highest percentage of delayed arrivals vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageArrDelay).join(percentageArrDelay)
  .sortBy(lambda r: -r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))

In [21]:
#Routes with the lowest percentage of delayed arrivals vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageArrDelay).join(percentageArrDelay)
  .sortBy(lambda r: r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))

In [22]:
#Find out the delay on air: arrival delay minus departure delay when arrival is larger, otherwise 0.
#Flights where either delay is missing are dropped first.
#(A former third filter, filter(lambda x: x), was removed: a non-empty tuple is always truthy, so it never dropped anything.)
nonullSmallRoutemid = smallRoute.filter(lambda x: x[3] is not None)
nonullSmallRoute = nonullSmallRoutemid.filter(lambda x: x[4] is not None)
#Result records are (route, delay on air as int)
airSmallRoute = nonullSmallRoute.map(lambda x: (x[2], int(x[4]-x[3]) if x[4] > x[3] else 0))
display(airSmallRoute.take(10))

In [23]:
#Percentage of delayed flights and average delay time (flights without delay count as 0) per route, on air.
#airSmallRoute records are (route, delay on air); each becomes (route, (value, 1)), summed per route, then divided.

#100.0 forces float division: with integer counts Python 2 would truncate the percentage
percentageAirDelay = (airSmallRoute
  .map(lambda x: (x[0], (1 if x[1] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 100.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))
averageAirDelay = (airSmallRoute
  .map(lambda x: (x[0], (x[1] if x[1] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 1.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))

In [24]:
#Keep only routes with more than MIN_ROUTE_FLIGHTS flights.
#This rebinds routeOccurNum, so every cell executed after this one sees the filtered set.
MIN_ROUTE_FLIGHTS = 100
routeOccurNum = routeOccurNum.filter(lambda kv: kv[1] > MIN_ROUTE_FLIGHTS)

In [25]:
#Busiest routes with their percentage of flights delayed on air.
#Joined records have the shape (route, (flights, percentage)); sorted by descending number of flights.

airDelayFull = routeOccurNum.join(percentageAirDelay).sortBy(lambda r: -r[1][0])
display(airDelayFull.take(10))

In [26]:
#Routes with the highest percentage of flights delayed on air vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageAirDelay).join(percentageAirDelay)
  .sortBy(lambda r: -r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))

In [27]:
#Routes with the lowest percentage of flights delayed on air vs busy level: (percentage, route, flights, average delay)
#Joined records have the shape (route, ((flights, average), percentage)).
display(routeOccurNum.join(averageAirDelay).join(percentageAirDelay)
  .sortBy(lambda r: r[1][1])
  .map(lambda r: (r[1][1], r[0], r[1][0][0], r[1][0][1]))
  .take(10))

In [28]:
#Reduce airDelayFull to (flights, percentage delayed on air) pairs for plotting.
#airDelayFull records are (route, (flights, percentage)), i.e. 2-tuples: both values live inside x[1]
#(indexing x[3] raised "tuple index out of range").
airDelaySize = airDelayFull.map(lambda x: (x[1][0], x[1][1]))
display(airDelaySize.take(10))

In [29]:
#Number of routes in airDelaySize; print() form works on Python 2 and Python 3
print(airDelaySize.map(lambda x: x[0]).count())

In [30]:
#Scatter plot: percentage of flights delayed on air against the number of flights of the route
#airDelaySize records are (flights, percentage); take the first 500 once and split locally instead of running two jobs
airDelayPoints = airDelaySize.take(500)
aircraft = [point[0] for point in airDelayPoints]
flights = [point[1] for point in airDelayPoints]
fig = plt.figure(figsize=(8,4.5), facecolor='white', edgecolor='white')
plt.axis([0, max(aircraft), 0, 100])
#The on/off flag is passed positionally because its keyword name differs between matplotlib versions
plt.grid(True, which='major', axis='y')
plt.ylabel('Percentage Delay')
plt.xlabel('Number of Flight in a Month')
plt.scatter(aircraft, flights)

display(plt.show())

In [31]:
top = 100

# Scientific libraries
from numpy import arange,array,ones,asarray
from scipy import stats

#Scatter plot of average delay vs number of flights per airport, with a linear fit over the first `top` airports
#NOTE(review): averageDelayByAirport, go and py are not defined in the visible cells of this notebook;
#records are presumably (airport, (average delay, number of flights)) - confirm where they come from.

# Take top x airports <Adjust>

share=averageDelayByAirport.map(lambda a:a[1][0]).cache()
flights=averageDelayByAirport.map(lambda a:float(a[1][1])/1000).cache()

#delayFlightExclude=delayedFlightShareExclude.map(lambda a:a[1][0]*100).cache()
#flightsExclude=delayedFlightShareExclude.map(lambda a:float(a[1][1])/1000).cache()

# Collect every series once and slice the local lists afterwards
flightsAll = flights.collect()
shareAll = share.collect()
labelsAll = averageDelayByAirport.map(lambda a:"Airport: " + a[0]).collect()

x = asarray(flightsAll[:top])
y = asarray(shareAll[:top])

# The remaining airports start at index `top`; slicing from top+1 silently dropped one airport
xFull = asarray(flightsAll[top:])
yFull = asarray(shareAll[top:])

# Generated linear fit
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
line = slope*x+intercept

#Generate R

R = stats.pearsonr(x,y)

# Creating the dataset, and generating the plot
# NOTE(review): go.Marker / go.Annotation / go.Font / go.XAxis / go.YAxis look like legacy plotly classes - check the installed plotly version
trace1 = go.Scatter(
    x=x,
    y=y,
    mode='markers',
    marker=go.Marker(color='rgb(255, 127, 14)'),
    # label follows `top` instead of repeating the literal 100
    name='Top %d Airports' % top,
    text = labelsAll[:top],
)

trace2 = go.Scatter(
    x=x,
    y=line,
    mode='lines',
    marker=go.Marker(color='rgb(31, 119, 180)'),
    name='Best Fit Line (Top %d)' % top,
)

trace3 = go.Scatter(
    x=xFull,
    y=yFull,
    mode='markers',
    marker=go.Marker(color='lightgrey'),
    name='Other Airports',
    text = labelsAll[top:],
)

annotation = go.Annotation(
    x=3.5,
    y=1,
    text='',
    showarrow=False,
    font=go.Font(size=16)
)
layout = go.Layout(
    title='Average Delay vs Flights Per Year',
    plot_bgcolor='rgb(229, 229, 229)',
    xaxis=go.XAxis(zerolinecolor='rgb(255,255,255)', gridcolor='rgb(255,255,255)', title = "Flights per year (Thousands)"),
    yaxis=go.YAxis(zerolinecolor='rgb(255,255,255)', gridcolor='rgb(255,255,255)', title = "Average delay"),
    annotations=[annotation]
)

# Grey "other" points first so the highlighted points and the fit line are drawn on top
data = [trace3, trace1, trace2]
fig = go.Figure(data=data, layout=layout)

py.offline.iplot(fig)

print ("Slope is: %s" % slope)
print ("R is: %f" % R[0])

In [32]:
#NAS Delay Investigation
#Percentage of flights with a NAS delay and average NAS delay (flights without one count as 0) per route.
#Field 8 of each smallRoute record is NASDelay. Every flight becomes (route, (value, 1)), summed per route, then divided.
#A missing value (None) is treated as "no NAS delay", made explicit so it does not rely on Python 2 ordering of None.
#100.0 forces float division: with integer counts Python 2 would truncate the percentage
percentageNasDelay = (smallRoute
  .map(lambda x: (x[2], (1 if x[8] is not None and x[8] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 100.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))
averageNasDelay = (smallRoute
  .map(lambda x: (x[2], (x[8] if x[8] is not None and x[8] > 0 else 0, 1)))
  .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
  .map(lambda kv: (kv[0], 1.0 * kv[1][0] / kv[1][1]))
  .sortBy(lambda kv: -kv[1]))

In [33]:
#Busiest routes with their percentage of flights having a NAS delay.
#Joins percentageNasDelay (the NAS figures computed above), not the on-air percentages.
#Joined records have the shape (route, (flights, percentage)); sorted by descending number of flights.
NASDelaySize = routeOccurNum.join(percentageNasDelay).sortBy(lambda r: -r[1][0])
display(NASDelaySize.take(10))

In [34]:
#Scatter plot: percentage delay against the number of flights of the route
#NASDelaySize records are (route, (flights, percentage)); the numbers to plot are inside r[1]
#(r[0] is the route string and r[1] the whole tuple, neither of which can be plotted).
nasDelayPoints = NASDelaySize.take(500)
aircraft = [r[1][0] for r in nasDelayPoints]
flights = [r[1][1] for r in nasDelayPoints]
fig = plt.figure(figsize=(8,4.5), facecolor='white', edgecolor='white')
plt.axis([0, max(aircraft), 0, 100])
#The on/off flag is passed positionally because its keyword name differs between matplotlib versions
plt.grid(True, which='major', axis='y')
plt.ylabel('Percentage Delay')
plt.xlabel('Number of Flight in a Month')
plt.scatter(aircraft, flights)

display(plt.show())

In [35]:
import pandas as pd

In [36]:
#Load the file relating age of plane to tail number; the first line is the header and column types are inferred
ageData = spark.read.csv(
  '/FileStore/tables/DEREG.txt',
  header=True,
  inferSchema=True,
)

In [37]:
#Preview the first three rows of ageData
ageDataPreview = ageData.take(3)
display(ageDataPreview)

In [38]:

#Keep only tail number, year of manufacture and cancel date of each record.
#A hyphenated column cannot be read as an attribute (x.N-NUMBER parses as "x.N minus NUMBER"), so the Row is
#indexed by column name; map() lives on the underlying RDD, not on the DataFrame.
#NOTE(review): the column names are taken from the former attribute expressions - confirm against the file header
ageData2 = ageData.rdd.map(lambda x: (x["N-NUMBER"], x["YEAR-MFR"], x["CANCEL-DATE"]))
display(ageData2.take(10))


In [39]:
#Inspect column 16 of ageData: show its text and whether that text is whitespace only.
#map() lives on the underlying RDD; the value goes through str() before isspace() so that
#non-string values (e.g. None or inferred numbers) do not raise.
print(ageData.take(10))
print(ageData.rdd.map(lambda x:(str(x[16]),str(x[16]).isspace())).take(10))

In [40]:
#Records whose column 16 is whitespace only.
#NOTE(review): column 16 is presumably the cancel date, i.e. these planes are still registered - confirm
#DataFrame.filter() accepts only a Column or SQL string, so the Python predicate is applied on the underlying RDD
notYetDied = ageData.rdd.filter(lambda x:str(x[16]).isspace())
#print ageData.take(10)
display(notYetDied.take(10))

In [41]:
#Leftover smoke test that the notebook is attached and running; print() form works on Python 2 and Python 3
print("test")
