In [0]:
import numpy as np
import pandas as pd
import plotly as px
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from datetime import datetime
import time

import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.functions import isnan, when, count, col

##### Goal:
Compare the scheduled flight duration to the average duration for that route.

##### Hypothesis:
The number of air travel passengers has been reaching all-time highs for the past few years, and in an effort to maximize potential revenue, airlines have been known to try to fit more flights into their schedule.  This reduces the amount of 'buffer time' between flights that is available to absorb small delays, or to allow the schedule to recover from a previous delay.  To capture this information we measured the average scheduled duration for each origin-destination pair and then created a feature that is the difference between that route average and the current flight's scheduled duration (average minus scheduled). If the airlines are giving the current flight less time than is typical, we will see a positive number, and if it's longer than average we'll get a negative number.

In [0]:
# Load the training split from its Parquet location on DBFS.
# NOTE(review): "header" is normally a CSV reader option; it looks unnecessary
# for a Parquet read — confirm before removing.
train = (
    spark.read
    .option("header", "true")
    .parquet("dbfs:/mnt/mids-w261/team20SSDK/strategy/model_datasets/train")
)

In [0]:
# Mean scheduled (CRS) elapsed time per ORIGIN-DEST pair: one row per route,
# with the mean exposed as AVG_CRS_ELAPSED_TIME.
OD_avg_elapsed_time = (
    train
    .select('ORIGIN', 'DEST', 'CRS_ELAPSED_TIME')
    .groupBy('ORIGIN', 'DEST')
    .agg(f.avg('CRS_ELAPSED_TIME').alias('AVG_CRS_ELAPSED_TIME'))
)

In [0]:
# Attach each route's mean scheduled duration to every training row by
# inner-joining on the route keys (ORIGIN, DEST).
train_with_avg_crs_elapsed = train.join(
    OD_avg_elapsed_time,
    on=['ORIGIN', 'DEST'],
    how='inner',
)

In [0]:
# Feature: route-average scheduled duration minus this flight's scheduled duration.
# Sign convention as coded: positive when the flight is scheduled SHORTER than
# its route average, negative when it is scheduled longer.
# The helper average column is dropped once the difference has been computed.
train_with_avg_crs_elapsed_diff = (
    train_with_avg_crs_elapsed
    .withColumn(
        'AVG_CRS_ELAPSED_TIME_DIFF',
        col('AVG_CRS_ELAPSED_TIME') - col('CRS_ELAPSED_TIME'),
    )
    .drop('AVG_CRS_ELAPSED_TIME')
)

In [0]:
# Visualize the duration-difference feature against departure delays.
# The DataFrame is exposed to Spark SQL as the temp view `data`.
# createOrReplaceTempView is used because DataFrame.registerTempTable has been
# deprecated since Spark 2.0; the replacement has the same semantics (the view is
# created, or replaced if it already exists, so re-running this cell is safe).
train_with_avg_crs_elapsed_diff.createOrReplaceTempView('data')

# Bin the feature by ROUND(diff / 10) and, per bin, report the number of non-null
# DEP_DEL15 values and their sum, ordered by bin.
# NOTE(review): DEP_DEL15 is presumably a 0/1 delay indicator, which would make
# sum/count the delay rate per bin — confirm against the dataset schema.
spark.sql('SELECT count(DEP_DEL15), sum(DEP_DEL15), ROUND(AVG_CRS_ELAPSED_TIME_DIFF / 10,0) AS AVG_CRS_ELAPSED_TIME_DIFF_BIN FROM data GROUP BY ROUND(AVG_CRS_ELAPSED_TIME_DIFF / 10,0) ORDER BY AVG_CRS_ELAPSED_TIME_DIFF_BIN').display()

count(DEP_DEL15),sum(DEP_DEL15),AVG_CRS_ELAPSED_TIME_DIFF_BIN
1,1.0,-24.0
1,1.0,-20.0
2,2.0,-19.0
1,1.0,-15.0
1,1.0,-14.0
16,6.0,-12.0
2,2.0,-11.0
3,1.0,-9.0
3,0.0,-8.0
37,10.0,-7.0
