## Building a Machine Learning Pipeline with PySpark

model to predict flight lateness based on various features.

In [11]:
from pyspark.sql import SparkSession

In [12]:
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

In [13]:
# Creating SparkSession
spark = SparkSession.builder.getOrCreate()

# Print spark
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001CD24812220>


In [15]:
# Checking for existing tables in the catalog
print(spark.catalog.listTables())

[]


Data Gathering

In [24]:
# Reading in the flight data
flight_df = spark.read.csv('flights_small.csv', header=True)

In [19]:
# Creating table called flights to support SQL queries
flight_df.createOrReplaceTempView('flights')

# Checking for existing tables in the catalog
print(spark.catalog.listTables())

[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]


Data Assessment

In [21]:
# query
query = "SELECT * FROM flights LIMIT 10"

# Get the first 10 rows of flights
flights10 = spark.sql(query)

# Show the results
flights10.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
|2014|    1| 15|    1037|        7|    1

In [46]:
def dataframe_shape(df):
  """
  This function takes a Spark DataFrame and returns its shape as a tuple
  containing the number of rows and columns.

  Args:
      df: The DataFrame to get the shape of.

  Returns:
      str: the number of rows and columns.
  """
  rows = df.count()
  columns = len(df.columns)
  shape = print("Shape of DataFrame: ({}, {})".format(rows, cols))
  return shape

dataframe_shape(flight_df)

Shape of DataFrame: (10000, 16)


In [31]:
# Getting number of rows and columns
f_rows = flight_df.count()
f_columns = len(flight_df.columns)

# Printing shape of DataFrame
print("Shape of DataFrame: ({}, {})".format(f_rows, f_columns))

Shape of DataFrame: (10000, 16)


In [58]:
def count_duplicates(df):
  """
  This function counts the number of duplicate rows in a PySpark DataFrame.

  Args:
      df (pyspark.sql.DataFrame): The DataFrame to check for duplicates.

  Returns:
      str: The number of duplicate rows found.
  """
  num_duplicates = df.count() - df.dropDuplicates().count()
  dups = print("Number of duplicate rows:", num_duplicates)
  return dups

# Checking the number of duplicate rows
count_duplicates(flight_df)

Number of duplicate rows: 0


In [27]:
# Reading in the airports data
airports_df = spark.read.csv('airports.csv', header=True)

# Show the data
airports_df.show(10)

+---+--------------------+----------+------------+----+---+---+
|faa|                name|       lat|         lon| alt| tz|dst|
+---+--------------------+----------+------------+----+---+---+
|04G|   Lansdowne Airport|41.1304722| -80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|32.4605722| -85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|41.9893408| -88.1012428| 801| -6|  A|
|06N|     Randall Airport| 41.431912| -74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|31.0744722| -81.4277778|  11| -4|  A|
|0A9|Elizabethton Muni...|36.3712222| -82.1734167|1593| -4|  A|
|0G6|Williams County A...|41.4673056| -84.5067778| 730| -5|  A|
|0G7|Finger Lakes Regi...|42.8835647| -76.7812318| 492| -5|  A|
|0P2|Shoestring Aviati...|39.7948244| -76.6471914|1000| -5|  U|
|0S9|Jefferson County ...|48.0538086|-122.8106436| 108| -8|  A|
+---+--------------------+----------+------------+----+---+---+
only showing top 10 rows



In [47]:
# Printing shape of DataFrame
dataframe_shape(airports_df)

Shape of DataFrame: (1397, 16)


In [59]:
# Checking the number of duplicate rows
count_duplicates(airports_df)

Number of duplicate rows: 0


In [28]:
# Reading in the plane data
planes_df = spark.read.csv('planes.csv', header=True)

# Show the data
planes_df.show(10)

+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
|tailnum|year|                type|    manufacturer|   model|engines|seats|speed|   engine|
+-------+----+--------------------+----------------+--------+-------+-----+-----+---------+
| N102UW|1998|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N103US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N104UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N105UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N107US|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N108UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N109UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA|Turbo-fan|
| N110UW|1999|Fixed wing multi ...|AIRBUS INDUSTRIE|A320-214|      2|  182|   NA

In [61]:
# Printing shape of DataFrame
dataframe_shape(planes_df)

Shape of DataFrame: (2628, 16)


In [62]:
# Checking the number of duplicate rows
count_duplicates(planes_df)

Number of duplicate rows: 0


#### ML PIPELINE for model to predict lateness

Data Preparation

In [65]:
# Rename year column for ease of joining
planes_df = planes_df.withColumnRenamed("year", "plane_year")

In [66]:
# Join the DataFrames
model_data = flight_df.join(planes_df, on="tailnum", how="leftouter")

In [67]:
# Cast the columns to integers
model_data = model_data.withColumn("arr_delay", model_data.arr_delay.cast("integer"))
model_data = model_data.withColumn("air_time", model_data.air_time.cast("integer"))
model_data = model_data.withColumn("month", model_data.month.cast("integer"))
model_data = model_data.withColumn("plane_year", model_data.plane_year.cast("integer"))

In [68]:
# Create the column plane_age
model_data = model_data.withColumn("plane_age", model_data.year - model_data.plane_year)

In [69]:
# Create is_late
model_data = model_data.withColumn("is_late", model_data.arr_delay > 0)

In [70]:
# Convert to an integer
model_data = model_data.withColumn("label", model_data.is_late.cast("integer"))

In [71]:
# Remove missing values
model_data = model_data.filter("arr_delay is not NULL and dep_delay is not NULL and air_time is not NULL and plane_year is not NULL")

Feature Engineering for Categorical Data

In [76]:
# 
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

# Create a StringIndexer
carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")

# Create a OneHotEncoder
carr_encoder = OneHotEncoder(inputCol="carrier_index", outputCol="carrier_fact")

In [74]:
# Create a StringIndexer
dest_indexer = StringIndexer(inputCol= "dest", outputCol= "dest_index")

# Create a OneHotEncoder
dest_encoder = OneHotEncoder(inputCol= "dest_index", outputCol = "dest_fact")

Building the Machine Learning Pipeline

In [77]:
# Make a VectorAssembler
vec_assembler = VectorAssembler(inputCols= ["month", "air_time", "carrier_fact", "dest_fact", "plane_age"], outputCol= "features")

In [78]:
# Import Pipeline
from pyspark.ml import Pipeline

# Make the pipeline
flights_pipe = Pipeline(stages= [dest_indexer, dest_encoder, carr_indexer, carr_encoder, vec_assembler])


In [79]:
# Fit and transform the data
piped_data = flights_pipe.fit(model_data).transform(model_data)

Training and Evaluation

In [80]:
# Split the data into training and test sets
training, test = piped_data.randomSplit([.6, .4])

In [81]:
# Import LogisticRegression
from pyspark.ml.classification import LogisticRegression

# Create a LogisticRegression Estimator
lr = LogisticRegression()

In [82]:
# Import the evaluation submodule
import pyspark.ml.evaluation as evals

# Create a BinaryClassificationEvaluator
evaluator = evals.BinaryClassificationEvaluator(metricName="areaUnderROC")

In [84]:
# Import the tuning submodule
import numpy as np
import pyspark.ml.tuning as tune

# Create the parameter grid
grid = tune.ParamGridBuilder()

# Add the hyperparameter
grid = grid.addGrid(lr.regParam, np.arange(0, .1, .01))
grid = grid.addGrid(lr.elasticNetParam, [0, 1])

# Build the grid
grid = grid.build()

In [85]:
# Create the CrossValidator
cv = tune.CrossValidator(estimator=lr,
               estimatorParamMaps=grid,
               evaluator=evaluator
               )

In [86]:
# Fit cross validation models
models = cv.fit(training)

# Extract the best model
best_lr = models.bestModel

In [87]:
# Call lr.fit()
best_lr = lr.fit(training)

# Print best_lr
print(best_lr)

LogisticRegressionModel: uid=LogisticRegression_304443256e82, numClasses=2, numFeatures=81


In [88]:
# Use the model to predict the test set
test_results = best_lr.transform(test)

# Evaluate the predictions
print(evaluator.evaluate(test_results))

0.6845499411127688


model's AUC is 0.685