In [1]:
from pyspark.ml import feature
from pyspark.ml import clustering
from pyspark.ml import Pipeline
from pyspark.sql import functions as fn
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql import functions as sf
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import RegexTokenizer
import requests
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import concat, col, lit, lower
from pyspark.sql.functions import isnan, when, count, col, isnull
from pyspark.sql.functions import concat_ws
from  pyspark.sql.functions import abs
# Setting master("local[*]") runs Spark locally and enables multicore
# processing on all available logical cores of this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Do not delete or change this cell

import os

# Report whether this notebook is executing inside a Databricks runtime.
#
# Returns True when the DATABRICKS_RUNTIME_VERSION environment variable is set
# (to any value, including an empty string), False when it is absent.
def is_databricks():
    return os.getenv("DATABRICKS_RUNTIME_VERSION") is not None

# Resolve the location of a data file for the current runtime environment.
#
# Params
#   data_file_name: The base name of the data file to load
#
# Returns
#   "/FileStore/tables/<data_file_name>" when is_databricks() reports a
#   Databricks runtime; otherwise data_file_name unchanged, i.e. the file is
#   expected to sit in the same directory as this notebook.
#
def get_training_filename(data_file_name):
    # Local run: the bare file name is used as-is.
    if not is_databricks():
        return data_file_name

    # Databricks run: files are read from the FileStore tables directory.
    return "/FileStore/tables/%s" % data_file_name

In [3]:
# Read the three CSV inputs (header row present, column types inferred),
# in the order airlines, airports, flights.
airlines_df, airports_df, flights_df = (
    spark.read.csv(get_training_filename(csv_name), header=True, inferSchema=True)
    for csv_name in ('airlines.csv', 'airports.csv', 'flights.csv')
)

In [4]:
shape = ((flights_df.count(), len(flights_df.columns)))
print('The shape of flights_df:', shape)

In [5]:
flights_df.select([count(when(isnull(c), c)).alias(c) for c in flights_df.columns]).show()

In [6]:
# Feature engineering for the delay classifier: keep the modelling columns,
# drop cancelled flights, derive the binary label, bucket the distance and
# index / one-hot encode the categorical columns.
from pyspark.ml.feature import Bucketizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import udf
from pyspark.sql.types import *

# ARRIVAL_DELAY is intentionally not selected.  It is not used as a feature, and
# selecting it only to drop it again made this cell fail when re-run (the
# column no longer exists in flights_df after the first run).  Because this
# select narrows the frame back to the base columns, the cell can now be
# re-executed safely.
flights_df = flights_df.select(
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'DISTANCE', 'SCHEDULED_ARRIVAL', 'CANCELLED')

# Keep only rows whose CANCELLED flag is 0.
flights_df = flights_df.filter(fn.col('CANCELLED') == 0)

# Binary label: 0 when DEPARTURE_DELAY is below 10, 1 otherwise.
# NOTE(review): a null DEPARTURE_DELAY does not satisfy `< 10` and is therefore
# labelled 1 - confirm that this is the intended treatment.
flights_df = flights_df.withColumn(
    "Flight_Delayed", fn.when(fn.col("DEPARTURE_DELAY") < 10, 0).otherwise(1))

# Bucket DISTANCE into [0, 100), [100, 1000) and [1000, inf).
# handleInvalid="keep" places invalid (NaN) entries in an additional bucket
# instead of failing the transform.
bucketizer = Bucketizer(splits=[0, 100, 1000, float('Inf')], inputCol="DISTANCE", outputCol="Distance_Bucket")
flights_df = bucketizer.setHandleInvalid("keep").transform(flights_df)

# Readable name for each bucket index.  dict.get is used so that a bucket with
# no entry in `t` (the extra "keep" bucket, or a null) produces a null label
# instead of raising KeyError inside the UDF and failing the whole Spark job.
t = {0.0: "Short", 1.0: "Medium", 2.0: "Long"}
udf_foo = udf(lambda x: t.get(x), StringType())
flights_df = flights_df.withColumn("Flight_Distance", udf_foo("Distance_Bucket"))

# (source column, numeric index column, one-hot column) for each categorical.
categorical_columns = [
    ("AIRLINE", "Airline_Numeric", "Airline_OHE"),
    ("ORIGIN_AIRPORT", "OA_Numeric", "Origin_Airport_OHE"),
    ("DESTINATION_AIRPORT", "DA_Numeric", "Destination_Airport_OHE"),
]
for source_col, index_col, ohe_col in categorical_columns:
    # Map each distinct string value to a numeric index.
    indexer = StringIndexer(inputCol=source_col, outputCol=index_col).fit(flights_df)
    flights_df = indexer.transform(flights_df)

    # One-hot encode the index.  OneHotEncoder is a plain transformer in
    # Spark 2.x but an estimator that must be fitted first in Spark 3.x;
    # fitting only when `fit` exists supports both.
    encoder = OneHotEncoder(inputCol=index_col, outputCol=ohe_col)
    encoder_model = encoder.fit(flights_df) if hasattr(encoder, "fit") else encoder
    flights_df = encoder_model.transform(flights_df)




In [7]:
from pyspark.sql import functions as fn

# Row count for each distinct CANCELLED value in the prepared frame.
cancelled_counts = flights_df.groupBy('CANCELLED').agg(fn.count('*'))
cancelled_counts.show()

In [8]:
flights_df.select("DISTANCE").rdd.max()[0]

In [9]:
flights_df.select("DISTANCE").rdd.min()[0]

In [10]:
from pyspark.sql import functions as fn
flights_df.groupBy('Airline_Numeric').agg(fn.count('*')).show()

In [11]:
flights_df.select('Airline_OHE').take(5)


In [12]:
flights_df.limit(10).toPandas()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_DELAY,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,CANCELLED,Flight_Delayed,Distance_Bucket,Flight_Distance,Airline_Numeric,Airline_OHE,OA_Numeric,Origin_Airport_OHE,DA_Numeric,Destination_Airport_OHE
0,1,1,4,AS,ANC,SEA,5,-11,1448,430,-22,0,0,2.0,Long,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",65.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,1,4,AA,LAX,PBI,10,-8,2330,750,-9,0,0,2.0,Long,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",52.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1,1,4,US,SFO,CLT,20,-2,2296,806,5,0,0,2.0,Long,8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",14.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1,1,4,AA,LAX,MIA,20,-5,2342,805,-9,0,0,2.0,Long,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",24.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1,1,4,AS,SEA,ANC,25,-1,1448,320,-21,0,0,2.0,Long,9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",66.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,1,1,4,DL,SFO,MSP,25,-5,1589,602,8,0,0,2.0,Long,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,1,1,4,NK,LAS,MSP,25,-6,1299,526,-17,0,0,2.0,Long,10.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",9.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,1,1,4,US,LAX,CLT,30,14,2125,803,-10,0,1,2.0,Long,8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",4.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",14.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,1,1,4,AA,SFO,DFW,30,-11,1464,545,-13,0,0,2.0,Long,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1,1,4,DL,LAS,ATL,30,3,1747,711,-15,0,0,2.0,Long,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [13]:
flights_df.printSchema()

In [14]:
training_df, testing_df = flights_df.randomSplit([0.8, 0.2])

In [15]:
testing_df.select([count(when(isnull(c), c)).alias(c) for c in training_df.columns]).show()

In [16]:

training_df.groupBy('Flight_Delayed').agg(fn.count('*')).show()

In [17]:
from pyspark.ml.feature import VectorAssembler

# Input columns, in order, that are concatenated into the "features" vector
# used by the model pipelines below.
feature_columns = [
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "Airline_OHE",
    "Origin_Airport_OHE",
    "Destination_Airport_OHE",
    "SCHEDULED_DEPARTURE",
    "Distance_Bucket",
    "SCHEDULED_ARRIVAL",
    "CANCELLED",
]
va = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [18]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml import evaluation

# Area under the ROC curve against the Flight_Delayed label.
bce = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='Flight_Delayed',
)

# Two separate F1 evaluators, one used by the logistic-regression cells and
# one by the random-forest cells.
lr_evaluator_f1, rf_evaluator_f1 = (
    evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="f1")
    for _ in range(2)
)

In [19]:
lr = LogisticRegression(featuresCol='features', labelCol='Flight_Delayed', regParam=0.1)
lr_pipeline = Pipeline(stages=[va, lr]).fit(training_df)
#bce.evaluate(lr_pipeline.transform(testing_df))
lr_evaluator_f1.evaluate(lr_pipeline.transform(testing_df))

In [20]:
lr_evaluator_recall = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="weightedRecall")
lr_evaluator_recall.evaluate(lr_pipeline.transform(testing_df))

In [21]:
lr_evaluator_precision = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="weightedPrecision")
lr_evaluator_precision.evaluate(lr_pipeline.transform(testing_df))

In [22]:
lr_evaluator_accuracy = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="accuracy")
lr_evaluator_accuracy.evaluate(lr_pipeline.transform(testing_df))

In [23]:
rf = RandomForestClassifier(featuresCol='features', labelCol='Flight_Delayed', numTrees=3, maxDepth=4, impurity="gini")
rf_pipeline = Pipeline(stages=[va, rf]).fit(training_df)
#bce.evaluate(rf_pipeline.transform(testing_df))
rf_evaluator_f1.evaluate(rf_pipeline.transform(testing_df))

In [24]:
rf_evaluator_recall = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="weightedRecall")
rf_evaluator_recall.evaluate(rf_pipeline.transform(testing_df))

In [25]:
rf_evaluator_Precision = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="weightedPrecision")
rf_evaluator_Precision.evaluate(rf_pipeline.transform(testing_df))

In [26]:
rf_evaluator_accuracy = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="accuracy")
rf_evaluator_accuracy.evaluate(rf_pipeline.transform(testing_df))

In [27]:
# Gradient-boosted trees with default hyper-parameters, fitted on the training
# split.
gbt = GBTClassifier(featuresCol='features', labelCol='Flight_Delayed')
gbt_pipeline = Pipeline(stages=[va, gbt]).fit(training_df)

# Score with F1 on the test split, the same metric used for the logistic
# regression and random forest.  The evaluator is defined here because no
# variable named `evaluator` exists anywhere in this notebook.
gbt_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="f1")
gbt_evaluator_f1.evaluate(gbt_pipeline.transform(testing_df))

In [28]:
rf_pipe = Pipeline(stages=[rf])

In [29]:
paramGrid = ParamGridBuilder()\
  .addGrid(rf.maxBins, [25, 28, 31])\
  .addGrid(rf.maxDepth, [4, 6, 8])\
  .addGrid(rf.numTrees, [10, 20, 30])\
  .addGrid(rf.impurity, ["entropy", "gini"])\
  .build()

In [30]:
cv = CrossValidator()\
  .setEstimator(rf_pipe)\
  .setEvaluator(evaluator)\
  .setEstimatorParamMaps(paramGrid)\
  .setNumFolds(3)

In [31]:
cv_pipeline = Pipeline(stages=[va,cv]).fit(training_df)

In [32]:
evaluator.evaluate(cv_pipeline.transform(testing_df))

In [33]:
from pyspark.ml import classification

# The first entry of `layers` must equal the length of the assembled feature
# vector.  That length depends on the one-hot encoded columns, so it is read
# from one assembled row instead of being hard-coded.
num_features = len(va.transform(training_df).select('features').first()[0])

# Two hidden layers (5 and 4 units) and 2 output units, one per label value.
mlp = classification.MultilayerPerceptronClassifier(
    featuresCol='features',
    labelCol='Flight_Delayed',
    layers=[num_features, 5, 4, 2],
)
mlp_pipeline = Pipeline(stages=[va, mlp]).fit(training_df)

# F1 on the test split; defined locally because no `evaluator` variable exists
# in this notebook.
mlp_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="f1")
mlp_evaluator_f1.evaluate(mlp_pipeline.transform(testing_df))

In [34]:
from pyspark.ml.classification import LinearSVC

# Linear SVM (10 iterations, regParam=0.1) fitted on the training split.
svm = LinearSVC(featuresCol='features', labelCol='Flight_Delayed', maxIter=10, regParam=0.1)
svm_pipeline = Pipeline(stages=[va, svm]).fit(training_df)

# F1 on the test split; defined locally because no `evaluator` variable exists
# in this notebook.
svm_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(labelCol="Flight_Delayed", metricName="f1")
svm_evaluator_f1.evaluate(svm_pipeline.transform(testing_df))


In [35]:
from pyspark.ml import feature

# PCA needs a frame that already has the "features" vector column, so the
# training split is assembled first.
pca_input_df = va.transform(training_df)

# Project the feature vector onto its first two principal components.
# PCA is reached through the imported `feature` module.
pca = feature.PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(pca_input_df)

# Show a handful of projected rows; collecting the full result would pull
# every row of the training split into driver memory.
model.transform(pca_input_df).select("pca_features").show(5, truncate=False)

In [36]:
flights_df_sample = flights_df.sample(True, 0.5, 42)

In [37]:
shape = ((flights_df_sample.count(), len(flights_df_sample.columns)))
print('The shape of flights_df_sample:', shape)

In [38]:
flights_df_sample.printSchema()

In [39]:
flights_df_sample_pandas = flights_df_sample.toPandas()

In [40]:
flights_df_sample.groupBy('Flight_Delayed').agg(fn.count('*')).show()

In [41]:
flights_df.groupBy('Flight_Delayed').agg(fn.count('*')).show()

In [42]:
from pyspark.ml.stat import Correlation

# convert to vector column first
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=["MONTH", "DAY", "DAY_OF_WEEK", "Airline_OHE", "Origin_Airport_OHE", "Destination_Airport_OHE", "SCHEDULED_DEPARTURE", "Distance_Bucket", "SCHEDULED_ARRIVAL", "CANCELLED", "Flight_Delayed"], outputCol=vector_col)
df_vector = assembler.transform(flights_df).select(vector_col)

# get correlation matrix
matrix = Correlation.corr(df_vector, vector_col)


In [43]:
matrix.show()

In [44]:
# Distribution of MONTH in the pandas sample.
sns.distplot(flights_df_sample_pandas['MONTH'])
# NOTE(review): display() is called with no arguments throughout; presumably
# this relies on the Databricks display() to render the current figure - confirm.
display()

In [45]:
# Distribution of DAY in the pandas sample.
sns.distplot(flights_df_sample_pandas['DAY'])
display()

In [46]:
# Distribution of DAY_OF_WEEK in the pandas sample.
sns.distplot(flights_df_sample_pandas['DAY_OF_WEEK'])
display()

In [47]:
# Bar chart of the number of sampled flights per AIRLINE value.
flights_df_sample_pandas['AIRLINE'].value_counts().plot(kind='bar')
display()

In [48]:
airline_count  = flights_df_sample_pandas['AIRLINE'].value_counts()
airline_count = airline_count[:10,]
plt.figure(figsize=(10,5))
sns.barplot(airline_count.index, airline_count.values, alpha=0.8)
display()

In [49]:
oa_count  = flights_df_sample_pandas['ORIGIN_AIRPORT'].value_counts()
oa_count = oa_count[:10,]
plt.figure(figsize=(10,5))
sns.barplot(oa_count.index, oa_count.values, alpha=0.8)
display()

In [50]:
da_count  = flights_df_sample_pandas['DESTINATION_AIRPORT'].value_counts()
da_count = da_count[:10,]
plt.figure(figsize=(10,5))
sns.barplot(da_count.index, da_count.values, alpha=0.8)
display()

In [51]:
# Distribution of SCHEDULED_DEPARTURE in the pandas sample.
sns.distplot(flights_df_sample_pandas['SCHEDULED_DEPARTURE'])
display()

In [52]:
# Distribution of DEPARTURE_DELAY in the pandas sample.
sns.distplot(flights_df_sample_pandas['DEPARTURE_DELAY'])
display()

In [53]:
# Distribution of DISTANCE in the pandas sample.
sns.distplot(flights_df_sample_pandas['DISTANCE'])
display()

In [54]:
# Distribution of the Distance_Bucket index (the bucketed DISTANCE).
sns.distplot(flights_df_sample_pandas['Distance_Bucket'])
display()

In [55]:
# Distribution of SCHEDULED_ARRIVAL in the pandas sample.
sns.distplot(flights_df_sample_pandas['SCHEDULED_ARRIVAL'])
display()

In [56]:
# CANCELLED is constant here: the preprocessing cell keeps only rows where it
# equals 0.  A kernel density estimate is undefined for zero-variance data, so
# only the histogram is drawn.
sns.distplot(flights_df_sample_pandas['CANCELLED'], kde=False)
display()

In [57]:
# Distribution of the binary Flight_Delayed label in the pandas sample.
sns.distplot(flights_df_sample_pandas['Flight_Delayed'])
display()

In [58]:
# DEPARTURE_DELAY aggregated per MONTH (seaborn's barplot shows the mean by default).
sns.barplot(x="MONTH", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [59]:
# Same as above, with one bar per Flight_Distance category within each month.
sns.barplot(x="MONTH", y="DEPARTURE_DELAY", hue="Flight_Distance", data=flights_df_sample_pandas)
display()

In [60]:
# DEPARTURE_DELAY aggregated per DAY.
sns.barplot(x="DAY", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [61]:
# DEPARTURE_DELAY per DAY, split by Flight_Distance category.
sns.barplot(x="DAY", y="DEPARTURE_DELAY", hue="Flight_Distance", data=flights_df_sample_pandas)
display()

In [62]:
# DEPARTURE_DELAY aggregated per DAY_OF_WEEK.
sns.barplot(x="DAY_OF_WEEK", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [63]:
# DEPARTURE_DELAY per DAY_OF_WEEK, split by Flight_Distance category.
sns.barplot(x="DAY_OF_WEEK", y="DEPARTURE_DELAY", hue="Flight_Distance", data=flights_df_sample_pandas)
display()

In [64]:
# DEPARTURE_DELAY aggregated per AIRLINE.
sns.barplot(x="AIRLINE", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [65]:
# DEPARTURE_DELAY aggregated per ORIGIN_AIRPORT (one bar per distinct airport value).
sns.barplot(x="ORIGIN_AIRPORT", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [66]:
# DEPARTURE_DELAY aggregated per SCHEDULED_DEPARTURE (one bar per distinct value).
sns.barplot(x="SCHEDULED_DEPARTURE", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [67]:
# DEPARTURE_DELAY aggregated per Distance_Bucket index.
sns.barplot(x="Distance_Bucket", y="DEPARTURE_DELAY", data=flights_df_sample_pandas)
display()

In [68]:
# DEPARTURE_DELAY per AIRLINE, split by Flight_Distance category.
sns.barplot(x="AIRLINE", y="DEPARTURE_DELAY", hue="Flight_Distance", data=flights_df_sample_pandas)
display()