### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from scipy.stats import chi2_contingency

### Importing dataset and selecting necessary variables

In [2]:
df = pd.read_csv("Dataset/cleaned_dataset.csv")

In [4]:
q4 = df.loc[:, ["Year", "Month", "DayofMonth", "DayOfWeek","DepTime", "TailNum", "ArrDelay",
            "Origin", "Dest","LateAircraftDelay"]]

### Can you detect cascading delays as delays in one airport create delays in others

*Converting to datetime*

In [5]:
#extracting the minutes
mins = q4["DepTime"].apply(lambda x: x%10**2)
mins =mins.astype(int)

In [6]:
#extracting the hours
hours = q4["DepTime"].apply(lambda x: np.floor(x/100))
hours = hours.astype(int)

In [7]:
#extracting the days
day = q4["DayofMonth"].astype(str) 
day = day.apply(lambda x: "{0:0>2}".format(x))

In [8]:
#converting the month
month = q4["Month"].astype(str)
month = q4["Month"].apply(lambda x:"{0:0>2}".format(x))

In [9]:
#converting to datetime format

q4["Datetime"] = pd.to_datetime(
                  day + "/" + month + "/" + q4["Year"].astype(str) + ","  +
                  hours.astype(str) + ":" + mins.astype(str), 
                  format = "%d/%m/%Y,%H:%M"
                  )

In [9]:
q4.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,TailNum,ArrDelay,Origin,Dest,LateAircraftDelay,Datetime
0,2006,1,11,3,743.0,N657AW,6.0,ATL,PHX,0,2006-01-11 07:43:00
1,2006,1,11,3,1053.0,N834AW,-5.0,ATL,PHX,0,2006-01-11 10:53:00
2,2006,1,11,3,1915.0,N605AW,-23.0,ATL,PHX,0,2006-01-11 19:15:00
3,2006,1,11,3,1753.0,N312AW,-8.0,AUS,PHX,0,2006-01-11 17:53:00
4,2006,1,11,3,824.0,N309AW,0.0,AUS,PHX,0,2006-01-11 08:24:00


*Grouping and sorting the dataframe*

In [11]:
#grouping by tailnumber and sorting by datetime
q4 = q4.sort_values(["TailNum", "Datetime"]).set_index("Datetime")

In [12]:
#comparing the current delay with the next delay
q4["Next_delay"] = q4["ArrDelay"].shift(-1)

*Plotting cascading delays*

In [None]:
sns.lmplot(data = q4, x = "ArrDelay", y = "Next_delay",
          line_kws={'color': 'yellow'})

plt.title("Cascading delays")
plt.xlabel("Current Delay") 
plt.ylabel("Next Delay")
plt.savefig("Figures/Cascading delays.png")
plt.show()

*Checking the significance of the proportion of flights that have a cascading delay*

In [13]:
#encoding the arrival delay and next delay columns

q4["HasCurrentDelay"] = q4["ArrDelay"].apply(lambda x: 1 if x>0 else 0)
q4["HasNextDelay"] = q4["Next_delay"].apply(lambda x: 1 if x>0 else 0)

In [14]:
#contingency table 
crosstab = pd.crosstab(q4["HasNextDelay"], q4["HasCurrentDelay"], rownames=["Current Delay"], colnames=["Next Delay"])
crosstab 

Next Delay,0,1
Current Delay,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5096958,2529746
1,2529745,4112479


In [15]:
pd.crosstab(q4["HasCurrentDelay"],q4["HasNextDelay"]).apply(lambda x:x/x.sum(),axis=0)

HasNextDelay,0,1
HasCurrentDelay,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.668304,0.380858
1,0.331696,0.619142


In [None]:
# it can be judged that there is a significant difference in the proportions of flights that have a cascaded effect.
# if there was a delay in the current airport it is more likely that there will be a delay in the next airport as well
# this is formally tested through the significance test below.


*Significance Test*

In [None]:
#performing a chi squared test to check the significance of the relationship between current delays and next delays

In [37]:
print("Defining the hypothesis")

print("H0: There is no association between the Current delays and the Next delays") 
print("H1: There is a association between the Current delays and the Next delays")


Defining the hypothesis
H0: There is no association between the Current delays and the Next delays
H1: There is a association between the Current delays and the Next delays


In [21]:
stat, p, dof, expected = chi2_contingency(crosstab)
print("Degree of freedom: ", dof)
print(expected)

Degree of freedom:  1
[[4076452.43405195 3550251.56594805]
 [3550250.56594805 3091973.43405195]]


In [28]:
#interpret the results
#testing at a 5% significance level
prob = 0.95
critical = stats.chi2.ppf(prob, dof)
if abs(stat) >= critical:
    print("Reject H0 - There is a significant relationship")
else:
    print("Fail to reject H0 - There is no significant relationship")

Reject H0 - There is a significant relationship


Thus it can be formally concluded that delays at the current airport have a significant relationship with the delays in the
next airport indicating cascading delayed effects. 