In [1]:
#Import Statements
from pyspark.ml import feature
from pyspark.ml import clustering
from pyspark.ml import Pipeline
from pyspark.sql import functions as fn
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyspark.sql import functions as sf
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import RegexTokenizer
import requests
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import concat, col, lit, lower
from pyspark.sql.functions import isnan, when, count, col, isnull
from pyspark.sql.functions import concat_ws
from  pyspark.sql.functions import abs
# Setting master("local[*]") enables multicore processing on all available logical cores of this machine
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Do not delete or change this cell

import os

# Define a function to determine if we are running on Databricks.
# Returns True if running in the Databricks environment, False otherwise.
def is_databricks():
    """Report whether the DATABRICKS_RUNTIME_VERSION environment variable is set.

    The variable is treated as the marker of a Databricks runtime; its value
    is not inspected, only its presence.

    Returns:
        bool: True when the variable is present in the environment, else False.
    """
    return os.getenv("DATABRICKS_RUNTIME_VERSION") is not None

# Resolve where a data file lives for the current runtime environment.
# Nothing is read here: the function only builds the path that the caller
# then hands to the Spark reader.
def get_training_filename(data_file_name):
    """Return the full path from which `data_file_name` should be loaded.

    Params:
        data_file_name: The base name of the data file to load.

    Returns:
        "/FileStore/tables/<data_file_name>" when is_databricks() is true;
        otherwise `data_file_name` unchanged, i.e. the file is assumed to sit
        in the same directory as this notebook.
    """
    # Personal computer: the bare file name is already the right path
    if not is_databricks():
        return data_file_name

    # Databricks: uploaded files are looked up under /FileStore/tables
    return "/FileStore/tables/%s" % data_file_name

In [3]:
# Load the three source tables; each CSV has a header row and its column types are inferred
airlines_df, airports_df, flights_df = (
    spark.read.csv(get_training_filename(csv_name), header=True, inferSchema=True)
    for csv_name in ('airlines.csv', 'airports.csv', 'flights.csv')
)

In [4]:
# Transforming data: keep only the columns this notebook works with
flights_df = flights_df.select(
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'DISTANCE', 'SCHEDULED_ARRIVAL',
    'ARRIVAL_DELAY', 'CANCELLED')

# Removing NAs: a missing departure delay is replaced by 0
flights_df = flights_df.fillna({'DEPARTURE_DELAY': 0})

# Bucketizing distance into short, medium and long
from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[0, 100, 1000, float('Inf')], inputCol="DISTANCE", outputCol="Distance_Bucket")
flights_df = bucketizer.setHandleInvalid("keep").transform(flights_df)

from pyspark.sql.functions import udf
from pyspark.sql.types import *

# Human-readable label for each regular bucket index
t = {0.0: "Short", 1.0: "Medium", 2.0: "Long"}
# handleInvalid="keep" routes invalid distances to an additional bucket that has
# no entry in `t` (and a bucket value can also be null). `t.get` maps those rows
# to a null label instead of raising KeyError inside the UDF and failing the job.
udf_foo = udf(lambda x: t.get(x), StringType())
flights_df = flights_df.withColumn("Flight_Distance", udf_foo("Distance_Bucket"))

# Index and one-hot encode each categorical column:
# (source column, string-index column, one-hot column)
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

for source_col, index_col, ohe_col in [
    ("AIRLINE", "Airline_Numeric", "Airline_OHE"),
    ("ORIGIN_AIRPORT", "OA_Numeric", "Origin_Airport_OHE"),
    ("DESTINATION_AIRPORT", "DA_Numeric", "Destination_Airport_OHE"),
]:
    indexer = StringIndexer(inputCol=source_col, outputCol=index_col).fit(flights_df)
    flights_df = indexer.transform(flights_df)

    encoder = OneHotEncoder(inputCol=index_col, outputCol=ohe_col)
    # Spark 2.x ships OneHotEncoder as a plain Transformer; from Spark 3.0 it is
    # an Estimator that has to be fitted before it can transform.
    if hasattr(encoder, "fit"):
        encoder = encoder.fit(flights_df)
    flights_df = encoder.transform(flights_df)

# ARRIVAL_DELAY is not used by any later step
flights_df = flights_df.drop('ARRIVAL_DELAY')


In [5]:
# Print the schema (column names and types) of the flights dataframe
flights_df.printSchema()

In [6]:
# Count the null entries in every column of the dataframe
flights_df.select(
    [
        fn.count(fn.when(fn.isnull(column_name), column_name)).alias(column_name)
        for column_name in flights_df.columns
    ]
).show()

In [7]:
# Split the flights 90% / 10% into training and testing sets, using a fixed seed
training_df, testing_df = flights_df.randomSplit(weights=[0.9, 0.1], seed=5)

In [8]:
# Undersampling the majority class to handle the unbalanced label.
# Target: roughly this many non-cancelled flights per cancelled flight.
MAJORITY_PER_MINORITY = 3

major_df = training_df.filter(col("CANCELLED") == 0)
minor_df = training_df.filter(col("CANCELLED") == 1)

# Count each class once and fail with a clear message if either is empty
# (otherwise the ratio below would hit a bare ZeroDivisionError).
major_count = major_df.count()
minor_count = minor_df.count()
if major_count == 0 or minor_count == 0:
    raise ValueError(
        "Cannot undersample: training_df needs rows for both CANCELLED == 0 and CANCELLED == 1"
    )

ratio = major_count / minor_count
# Sampling without replacement needs a fraction in [0, 1]; clamp it for the
# case where the majority is already smaller than the requested multiple.
majority_fraction = min(1.0, MAJORITY_PER_MINORITY / ratio)
sampled_majority_df = major_df.sample(False, majority_fraction, seed=5)

# union() is the non-deprecated spelling of unionAll(); both keep duplicates
combined_df_2 = sampled_majority_df.union(minor_df)

In [9]:
# Row count per CANCELLED value after undersampling
(
    combined_df_2
    .groupBy('CANCELLED')
    .agg(fn.count('*'))
    .show()
)

In [10]:
#create a vector assembler
from pyspark.ml.feature import VectorAssembler

# Single definition of the model inputs: the assembler and the feature
# importance cells must agree on both the columns and their order.
inputColumns = ["MONTH", "DAY", "DAY_OF_WEEK", "Airline_OHE", "Origin_Airport_OHE", "Destination_Airport_OHE", "SCHEDULED_DEPARTURE", "Distance_Bucket", "SCHEDULED_ARRIVAL"]

va = VectorAssembler(inputCols=inputColumns, outputCol="features")

In [11]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.tuning import CrossValidatorModel
# Area-under-ROC evaluator scored against the CANCELLED label
bce = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='CANCELLED',
)
from pyspark.ml import evaluation

In [12]:
#train a logistic model
lr = LogisticRegression(featuresCol='features', labelCol='CANCELLED')
lr_pipeline = Pipeline(stages=[va, lr])
# Fit on the undersampled training data, predict on the held-out test split
lr_model = lr_pipeline.fit(combined_df_2)
lr_transform = lr_model.transform(testing_df)

#logistic model metric evaluation

#AUC score
bce = BinaryClassificationEvaluator(labelCol='CANCELLED', metricName='areaUnderROC')
score_auc=bce.evaluate(lr_transform)

#Recall
# NOTE: weightedRecall averages per-class recall weighted by class frequency,
# which is numerically the same value as accuracy; it is not the recall of
# the cancelled class alone.
lr_evaluator_recall = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="weightedRecall")
score_recall=lr_evaluator_recall.evaluate(lr_transform)

#Precision (class-frequency weighted)
lr_evaluator_precision = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="weightedPrecision")
score_precision=lr_evaluator_precision.evaluate(lr_transform)

#accuracy
lr_evaluator_accuracy = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="accuracy")
score_accuracy=lr_evaluator_accuracy.evaluate(lr_transform)

#f1
# Kept in its own variable so the accuracy evaluator above is not overwritten
lr_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="f1")
score_f1=lr_evaluator_f1.evaluate(lr_transform)

In [13]:
# ROC plot for logistic
# NOTE: the model `summary` is computed on the data the model was fitted on,
# so this curve is the training-set ROC, not the test-set one.
# Fetch the curve once as a pandas frame instead of collecting it per axis.
lr_roc_pdf = lr_model.stages[-1].summary.roc.toPandas()

plt.figure(figsize=(10,6))
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'r--')
plt.scatter(lr_roc_pdf['FPR'], lr_roc_pdf['TPR'])
plt.title('ROC Scatter Plot Flight Cancellation : TPR/FPR')
plt.xlabel('FPR')
plt.ylabel('TPR')
display()

In [14]:
# dataframe with all the metrics of logistic model
scores = [score_auc, score_recall, score_accuracy, score_f1, score_precision]
# score_accuracy comes from the plain "accuracy" metric, so it is labelled
# 'Accuracy' (it is not a balanced accuracy).
metricName = ['AUC', 'Recall', 'Accuracy', 'f1', 'Precision']
# Materialise the zip so the constructor receives concrete rows; number rows from 1
metric_df = pd.DataFrame(list(zip(metricName, scores)),
                         index=range(1, len(scores) + 1),
                         columns=['Metric Name', 'Score'])
metric_df.head(5)

Unnamed: 0,Metric Name,Score
1,AUC,0.759853
2,Recall,0.928761
3,Accuracy,0.928761
4,f1,0.949832
5,Precision,0.974162


In [15]:
# Fitting a random forest model
rf = RandomForestClassifier(featuresCol='features', labelCol='CANCELLED')
rf_pipeline = Pipeline(stages=[va, rf])
# Fit on the undersampled training data, predict on the held-out test split
rf_model = rf_pipeline.fit(combined_df_2)
rf_transform = rf_model.transform(testing_df)

#Random Forest metric evaluation

#AUC score
rf_bce = BinaryClassificationEvaluator(labelCol='CANCELLED', metricName='areaUnderROC')
score_auc=rf_bce.evaluate(rf_transform)

#Recall
# NOTE: weightedRecall averages per-class recall weighted by class frequency,
# which is numerically the same value as accuracy; it is not the recall of
# the cancelled class alone.
rf_evaluator_recall = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="weightedRecall")
score_recall=rf_evaluator_recall.evaluate(rf_transform)

#Precision (class-frequency weighted)
rf_evaluator_precision = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="weightedPrecision")
score_precision=rf_evaluator_precision.evaluate(rf_transform)

#accuracy
rf_evaluator_accuracy = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="accuracy")
score_accuracy=rf_evaluator_accuracy.evaluate(rf_transform)

#f1
# Kept in its own variable so the accuracy evaluator above is not overwritten
rf_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(labelCol="CANCELLED", metricName="f1")
score_f1=rf_evaluator_f1.evaluate(rf_transform)

In [16]:
# dataframe with all the metrics of random forest model
scores = [score_auc, score_recall, score_accuracy, score_f1, score_precision]
# score_accuracy comes from the plain "accuracy" metric, so it is labelled
# 'Accuracy' (it is not a balanced accuracy).
metricName = ['AUC', 'Recall', 'Accuracy', 'f1', 'Precision']
# Materialise the zip so the constructor receives concrete rows; number rows from 1
metric_df = pd.DataFrame(list(zip(metricName, scores)),
                         index=range(1, len(scores) + 1),
                         columns=['Metric Name', 'Score'])
metric_df.head(5)

Unnamed: 0,Metric Name,Score
1,AUC,0.691314
2,Recall,0.984729
3,Accuracy,0.984729
4,f1,0.977153
5,Precision,0.969692


In [17]:
# Finding feature importance from random forest model
randomForest_model = rf_model.stages[-1]

# featureImportances holds one weight per slot of the assembled "features"
# vector. One-hot columns occupy many slots each, so pairing the weights
# directly with the column names would mislabel everything after the first
# vector-valued column. Instead, measure how many slots each input column
# contributes and sum the weights over that column's slots.
rf_slot_weights = randomForest_model.featureImportances.toArray()
rf_sample_row = rf_transform.select(inputColumns).first()
# Vector-valued columns report their length; plain numeric columns take one slot
rf_slot_counts = [len(value) if hasattr(value, '__len__') else 1 for value in rf_sample_row]
assert sum(rf_slot_counts) == len(rf_slot_weights), \
    "assembled feature width does not match the length of featureImportances"

rf_column_weights = []
rf_offset = 0
for rf_width in rf_slot_counts:
    rf_column_weights.append(float(rf_slot_weights[rf_offset:rf_offset + rf_width].sum()))
    rf_offset += rf_width

featureImportance_RF = pd.DataFrame(list(zip(inputColumns, rf_column_weights)),
             columns = ['column', 'weight']).sort_values('weight', ascending=False)

In [18]:
# Horizontal bar chart of the random forest feature importances
ax1 = sns.barplot(data=featureImportance_RF, y='column', x='weight')
ax1.set_xlabel('Importance')
ax1.set_ylabel('Features')
display()

In [19]:
# Gradient-boosted trees: fit on the undersampled training data, predict on the test split
gbt = GBTClassifier(labelCol='CANCELLED', featuresCol='features')
gbt_pipeline = Pipeline(stages=[va, gbt])
gbt_model = gbt_pipeline.fit(combined_df_2)
gbt_transform = gbt_model.transform(testing_df)

# GBT metric evaluation: one evaluator per reported metric
gbt_bce = BinaryClassificationEvaluator(
    metricName='areaUnderROC', labelCol='CANCELLED')
gbt_evaluator_recall = evaluation.MulticlassClassificationEvaluator(
    metricName="weightedRecall", labelCol="CANCELLED")
gbt_evaluator_precision = evaluation.MulticlassClassificationEvaluator(
    metricName="weightedPrecision", labelCol="CANCELLED")
gbt_evaluator_accuracy = evaluation.MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="CANCELLED")
gbt_evaluator_f1 = evaluation.MulticlassClassificationEvaluator(
    metricName="f1", labelCol="CANCELLED")

# Score the test-set predictions with each evaluator
score_auc = gbt_bce.evaluate(gbt_transform)
score_recall = gbt_evaluator_recall.evaluate(gbt_transform)
score_precision = gbt_evaluator_precision.evaluate(gbt_transform)
score_accuracy = gbt_evaluator_accuracy.evaluate(gbt_transform)
score_f1 = gbt_evaluator_f1.evaluate(gbt_transform)

In [20]:
# Finding feature importance from GBT
boost_model = gbt_model.stages[-1]

# featureImportances holds one weight per slot of the assembled "features"
# vector. One-hot columns occupy many slots each, so pairing the weights
# directly with the column names would mislabel everything after the first
# vector-valued column. Instead, measure how many slots each input column
# contributes and sum the weights over that column's slots.
gbt_slot_weights = boost_model.featureImportances.toArray()
gbt_sample_row = gbt_transform.select(inputColumns).first()
# Vector-valued columns report their length; plain numeric columns take one slot
gbt_slot_counts = [len(value) if hasattr(value, '__len__') else 1 for value in gbt_sample_row]
assert sum(gbt_slot_counts) == len(gbt_slot_weights), \
    "assembled feature width does not match the length of featureImportances"

gbt_column_weights = []
gbt_offset = 0
for gbt_width in gbt_slot_counts:
    gbt_column_weights.append(float(gbt_slot_weights[gbt_offset:gbt_offset + gbt_width].sum()))
    gbt_offset += gbt_width

gbt_df = pd.DataFrame(list(zip(inputColumns, gbt_column_weights)),
             columns = ['column', 'weight']).sort_values('weight', ascending=False)

In [21]:
# Horizontal bar chart of the GBT feature importances
ax1 = sns.barplot(data=gbt_df, y='column', x='weight')
ax1.set_xlabel('Importance')
ax1.set_ylabel('Features')
display()

In [22]:
# dataframe with all the metrics of GBT model
scores = [score_auc, score_recall, score_accuracy, score_f1, score_precision]
# score_accuracy comes from the plain "accuracy" metric, so it is labelled
# 'Accuracy' (it is not a balanced accuracy).
metricName = ['AUC', 'Recall', 'Accuracy', 'f1', 'Precision']
# Materialise the zip so the constructor receives concrete rows; number rows from 1
metric_df = pd.DataFrame(list(zip(metricName, scores)),
                         index=range(1, len(scores) + 1),
                         columns=['Metric Name', 'Score'])
metric_df.head(5)

Unnamed: 0,Metric Name,Score
1,AUC,0.805125
2,Recall,0.946531
3,Balanced Accuracy,0.946531
4,f1,0.959969
5,Precision,0.975763
