In [1]:
# Run Spark locally; "local[2]" means two worker threads (ideally one per core)
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *

# Build (or reuse) the Spark session. Constructing SparkContext(...) directly
# raises "Cannot run multiple SparkContexts at once" when this cell is re-run
# in a live kernel; getOrCreate() returns the active session instead.
# "local[2]" runs Spark in-process with two worker threads.
spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

# Call sc.stop() to shut the context down once the notebook is finished.
import pandas as pd 
import numpy as np 

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler

In [5]:
# Load the flights file: first row supplies column names, column types are inferred.
csv = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/flights.csv")
)

# Classification target: 1.0 when ArrDelay is greater than 15, otherwise 0.0.
late_arrival = (col("ArrDelay") > 15).cast("Double").alias("label")

# Keep only the columns used as model inputs, plus the derived label.
data = csv.select(
    "DayofMonth",
    "DayOfWeek",
    "Carrier",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    late_arrival,
)
data.show(2)

+----------+---------+-------+---------------+-------------+--------+-----+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|label|
+----------+---------+-------+---------------+-------------+--------+-----+
|        19|        5|     DL|          11433|        13303|      -3|  0.0|
|        19|        5|     DL|          14869|        12478|       0|  0.0|
+----------+---------+-------+---------------+-------------+--------+-----+
only showing top 2 rows



In [8]:
# Fixed seed so the 70/30 split, the row counts printed below, and the model
# trained on `train` can be reproduced across runs.
SPLIT_SEED = 42
splits = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train = splits[0]
# The held-out frame carries its known outcome as "trueLabel" instead of "label".
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 1893265  Testing Rows: 808953


In [6]:
# --- Categorical branch -------------------------------------------------
# Index the Carrier string column, then gather it with the other
# categorical-style columns into one vector and index that vector.
strIdx = StringIndexer(inputCol="Carrier", outputCol="CarrierIdx")
catVect = VectorAssembler(
    inputCols=[
        "CarrierIdx",
        "DayofMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
    ],
    outputCol="catFeatures",
)
catIdx = VectorIndexer(inputCol="catFeatures", outputCol="idxCatFeatures")

# --- Numeric branch -----------------------------------------------------
# DepDelay is wrapped in a one-element vector so MinMaxScaler can rescale it.
numVect = VectorAssembler(inputCols=["DepDelay"], outputCol="numFeatures")
minMax = MinMaxScaler(inputCol="numFeatures", outputCol="normFeatures")

# --- Combine both branches and attach the classifier --------------------
featVect = VectorAssembler(
    inputCols=["idxCatFeatures", "normFeatures"],
    outputCol="features",
)
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# Stages execute in list order: each stage reads columns produced earlier.
pipeline = Pipeline(
    stages=[
        strIdx,
        catVect,
        catIdx,
        numVect,
        minMax,
        featVect,
        dt,
    ]
)

In [9]:
# Fit all pipeline stages on the training split; the fitted model is what
# later transforms the test rows.
piplineModel = pipeline.fit(dataset=train)
print("Pipeline complete!")

Pipeline complete!


In [10]:
# Run the fitted pipeline over the held-out rows.
prediction = piplineModel.transform(dataset=test)

# Keep just the assembled features, the model's output, and the known outcome
# so predictions can be compared against truth side by side.
comparison_cols = ["features", "prediction", "trueLabel"]
predicted = prediction.select(comparison_cols)
predicted.show(n=100, truncate=False)

+---------------------------------------------------+----------+---------+
|features                                           |prediction|trueLabel|
+---------------------------------------------------+----------+---------+
|[10.0,1.0,0.0,10397.0,12264.0,0.03115264797507788] |0.0       |0.0      |
|[10.0,1.0,0.0,10397.0,13851.0,0.03167185877466251] |0.0       |0.0      |
|[10.0,1.0,0.0,10423.0,13487.0,0.029075804776739357]|0.0       |0.0      |
|[10.0,1.0,0.0,10423.0,13487.0,0.029595015576323987]|0.0       |0.0      |
|[10.0,1.0,0.0,10423.0,14869.0,0.029595015576323987]|0.0       |0.0      |
|[10.0,1.0,0.0,10529.0,11193.0,0.030114226375908618]|0.0       |0.0      |
|[10.0,1.0,0.0,10529.0,11193.0,0.03063343717549325] |0.0       |0.0      |
|[10.0,1.0,0.0,10693.0,11433.0,0.029595015576323987]|0.0       |0.0      |
|[10.0,1.0,0.0,10693.0,12478.0,0.03686396677050883] |0.0       |0.0      |
|[10.0,1.0,0.0,10721.0,11066.0,0.032191069574247146]|0.0       |0.0      |
|[10.0,1.0,0.0,10721.0,12