### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Importing the datasets and additional cleaning

In [4]:
# Read the pre-cleaned flight records and the aircraft reference table from disk.
df_cleaned, df_planes = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/cleaned_dataset.csv", "Dataset/plane-data.csv")
)

In [5]:
# Trim both frames down to the columns used by the rest of the notebook.

# Aircraft table: rename the join key and the year column, then keep the
# columns at positions 0 and 8 (presumably TailNum and Manu_year -- TODO confirm
# against the CSV header).
df_planes = (
    df_planes
    .rename(columns={'tailnum': 'TailNum', 'year': 'Manu_year'})
    .iloc[:, [0, 8]]
)

# Flight table: keep the seven columns at these positions.
# NOTE(review): positional selection depends on the CSV column order; selecting
# by name would be more robust once the intended column names are confirmed.
df_cleaned = df_cleaned.iloc[:, [1, 2, 3, 4, 5, 11, 15]]

In [6]:
# Inner-join flights to aircraft on TailNum; rows without a match on either
# side are dropped.
q2 = df_cleaned.merge(df_planes, on="TailNum", how="inner")

# Show (rows, columns) of the merged frame.
q2.shape

(12611605, 8)

In [None]:
# Discard rows containing any missing value, then remove rows whose
# "Manu_year" is one of the placeholder strings "0000" or "None", since those
# cannot be used to compute an aircraft age.
q2 = q2.dropna()
year_is_usable = q2["Manu_year"].ne("0000") & q2["Manu_year"].ne("None")
q2 = q2[year_is_usable]

# Per-column null counts, displayed to confirm nothing is missing any more.
q2.isnull().sum()

In [None]:
# Keep only flights that actually arrived late (strictly positive ArrDelay).
q2 = q2.loc[q2["ArrDelay"].gt(0)]

### Do older planes suffer more delays?

In [None]:
# Aircraft age at the time of the flight = Year - Manu_year.
# Manu_year is cast with astype(int) before subtracting.
# assign() returns a new frame instead of writing a column into q2 directly;
# q2 is the result of boolean filtering, and setting a column on such a frame
# can raise SettingWithCopyWarning.
q2 = q2.assign(age=q2["Year"] - q2["Manu_year"].astype(int))

# Distribution of ages, displayed to spot implausible values.
q2["age"].value_counts()

In [None]:
# A negative age means the recorded manufacture year is later than the flight
# year, which is not a usable value. The value observed in the data is -1;
# filtering on age >= 0 removes it and also guards against any other negative
# age rather than special-casing a single value.
q2 = q2[q2["age"] >= 0]

In [None]:
# Plot the relationship between aircraft age and average arrival delay.

# Mean ArrDelay for each aircraft age. The plot below reads this frame, but no
# cell defined it, so the notebook failed with NameError on a fresh kernel.
# as_index=False keeps "age" as a regular column so lmplot can use it as x.
grouped_mean_age = q2.groupby("age", as_index=False)["ArrDelay"].mean()

# Scatter of the per-age means with a fitted linear regression line.
sns.lmplot(x="age", y="ArrDelay", data=grouped_mean_age,
          aspect = 2, height = 4, palette = "YlGnBu")


plt.title("Flight Delays according to age",
                size = 18)
plt.xticks(fontsize =13)
plt.yticks(fontsize =13)

plt.xlabel("Age", fontsize = 15)
plt.ylabel("ArrDelay", fontsize = 15)


# NOTE(review): the file name says "Day" although this figure is about aircraft
# age -- looks like a copy-paste leftover that may overwrite another figure.
# Path left unchanged here in case other material references it; confirm and rename.
# Save before show() so the figure is written while it is still the current figure.
plt.savefig("Figures/Delayed Flights according to Day.png")
plt.show()


In [None]:
# Compare the mean arrival delay of aircraft older than 30 years with that of
# aircraft aged 30 years or less.
age_cutoff = 30
older_than_cutoff = q2["age"] > age_cutoff
at_most_cutoff = q2["age"] <= age_cutoff

print("Avg. Delay of old planes:", q2.loc[older_than_cutoff, "ArrDelay"].mean())
print("Avg. Delay of new planes:", q2.loc[at_most_cutoff, "ArrDelay"].mean())