In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

## Load the data and sample a part to have as 'train' data

In [6]:
# Read the 1990 flight records and drop a fixed set of columns in one chain.
# NOTE(review): the dropped columns look like values only known after landing
# (or delay-cause fields) — confirm the intent.
airports_full = (
    pd.read_csv("../../1990.csv")
    .drop(
        columns=[
            "ArrTime", "ActualElapsedTime", "AirTime", "TaxiIn", "Diverted",
            "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay",
            "LateAircraftDelay",
        ]
    )
)

# Reproducible 90% sample without replacement, used as the 'train' data.
airports = airports_full.sample(frac=0.9, replace=False, random_state=42)

In [None]:
# Number of non-null entries per column; differences between columns reveal
# where values are missing.
airports.count()

In [None]:
# Cardinality of every column (a missing value counts as one distinct value).
for col, values in airports.items():
    print(col, len(values.unique()))

In [None]:
# Rows of the sample whose 'Cancelled' flag equals 1.
airports[airports['Cancelled'] == 1]

In [None]:
# Percentage of missing ArrDelay values that belong to cancelled flights.
# The numerator only counts rows that are BOTH cancelled and missing ArrDelay,
# so the result is a true share of the nulls, and it is scaled by 100 so that
# the printed "%" label is correct.
missing_arr_delay = airports['ArrDelay'].isnull()
n_missing = int(missing_arr_delay.sum())
n_missing_cancelled = int((missing_arr_delay & (airports['Cancelled'] == 1)).sum())

# Guard against a frame without any missing ArrDelay (would be 0 / 0).
pct_cancelled = 100 * n_missing_cancelled / n_missing if n_missing else float('nan')
print("{:.2f}% of nulls in arrdelay is because of cancellation.".format(pct_cancelled))


In [None]:
# TODO: decide how to handle these rows — flights that were NOT cancelled but
# still have no ArrDelay.
# NOTE(review): these may be diverted flights; the 'Diverted' column was
# dropped when loading, so verify against the raw CSV.
airports[(airports['Cancelled'] == 0) & (airports['ArrDelay'].isnull())]

In [None]:
# Flight numbers are used by different carriers to identify their trips.
# Example: every record in the full data set with flight number 1423,
# ordered by month.
is_flight_1423 = airports_full['FlightNum'] == 1423
airports_full.loc[is_flight_1423].sort_values(by='Month')

## Select factors based on correlations

In [None]:
# Pairwise correlations between the numeric columns only.
# Since pandas 2.0, DataFrame.corr() no longer silently drops non-numeric
# columns and raises instead, so the numeric subset is selected explicitly
# (this form also works on older pandas versions).
corr_matrix = airports.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Correlation of every column with ArrDelay, sorted ascending (strongest
# negative first, strongest positive last).
corr_matrix['ArrDelay'].sort_values()

In [None]:
# Check the scatterplot of arrival delay against departure delay.
sns.scatterplot(data=airports, x= 'DepDelay', y = 'ArrDelay')
plt.show()

In [None]:
# Arrival delay against flight distance.
sns.scatterplot(data=airports, x= 'Distance', y = 'ArrDelay')
plt.show()

In [None]:
# Residual delay: arrival delay minus departure delay.
# Note: this adds the 'ResDelay' column to `airports` in place; later cells
# rely on it.
airports['ResDelay'] = airports['ArrDelay'] - airports['DepDelay']
sns.scatterplot(data=airports, x= 'Distance', y = 'ResDelay')
plt.show()

## Variables based on categories
For example, the day of the week can be treated as a category: weekends can be busier. The data shows this as well, in the second plot with the mean delay per day.
The errors in the plots are the standard error of the mean, defined as:
$$ \sigma_{\bar{x}} = \frac{\sigma}{\sqrt{n}}$$
In each of the categories there seems to be a difference. 
I have created a new variable called "ResDelay": the residual delay that remains after subtracting the departure delay from the arrival delay. It could be, for example, that one company always departs before the planned time and therefore has low arrival delays. Plots are made for several different variables.

In [None]:
def plot_with_error_bars(x, y, data=None):
    """Bar-plot the mean of ``y`` per group of ``x`` with standard-error bars.

    Parameters
    ----------
    x : str or list of str
        Column name, or list of column names, to group by.
    y : str
        Column whose per-group mean is plotted.
    data : pandas.DataFrame, optional
        Frame containing the columns. When omitted, the notebook-level
        ``airports`` frame is looked up at call time, so re-running the
        loading cell never leaves this function pointing at a stale frame.

    The error bars show the standard error of the mean, std / sqrt(n), where
    n is the number of non-null ``y`` values in the group.
    """
    if data is None:
        data = airports

    # Aggregate only the requested column: aggregating the whole frame first
    # is wasteful and fails on non-numeric columns in recent pandas versions.
    group_stats = data.groupby(x)[y].agg(['mean', 'std', 'count'])
    # Standard error of the mean is defined as std / sqrt(n)
    group_stats['error'] = group_stats['std'] / np.sqrt(group_stats['count'])

    if isinstance(x, str):
        title = y + " by " + x
    else:
        title = y + " by " + " and ".join(x)
    group_stats.plot(kind="bar", y="mean", legend=False,
                     yerr="error", title=title)
    plt.show()

In [None]:
# Distribution of arrival delay per day of the week, followed by the mean
# arrival delay and mean residual delay per day with standard-error bars.
sns.boxplot(data=airports, x= 'DayOfWeek', y = 'ArrDelay')
plt.show()
plot_with_error_bars(x='DayOfWeek', y='ArrDelay')
plot_with_error_bars(x='DayOfWeek', y='ResDelay')

In [None]:
# Mean arrival delay and mean residual delay per month.
for delay_column in ('ArrDelay', 'ResDelay'):
    plot_with_error_bars(x='Month', y=delay_column)

In [None]:
# Mean arrival delay and mean residual delay per carrier.
for delay_column in ('ArrDelay', 'ResDelay'):
    plot_with_error_bars(x='UniqueCarrier', y=delay_column)

In [None]:
# Mean arrival delay and mean residual delay per day of the month.
for delay_column in ("ArrDelay", "ResDelay"):
    plot_with_error_bars(x='DayofMonth', y=delay_column)


In [None]:
# Derive the scheduled departure hour by integer-dividing CRSDepTime by 100.
# NOTE(review): assumes CRSDepTime is encoded as hhmm (e.g. 1435) — confirm.
# This adds the 'DepHour' column to `airports` in place.
airports['DepHour'] = airports["CRSDepTime"]//100

plot_with_error_bars(x='DepHour', y="ArrDelay")
plot_with_error_bars(x='DepHour', y="ResDelay")


In [None]:
# Derive the scheduled arrival hour by integer-dividing CRSArrTime by 100.
# NOTE(review): assumes CRSArrTime is encoded as hhmm (e.g. 1435) — confirm.
# This adds the 'ArrHour' column to `airports` in place.
airports['ArrHour'] = airports["CRSArrTime"]//100

plot_with_error_bars(x='ArrHour', y="ArrDelay")
plot_with_error_bars(x='ArrHour', y="ResDelay")


Below are some plots that are not very clear, but there seem to be some differences between origins and between destinations.

In [None]:
# Mean arrival delay and mean residual delay per origin airport.
for delay_column in ("ArrDelay", "ResDelay"):
    plot_with_error_bars(x='Origin', y=delay_column)

In [None]:
# Mean arrival delay and mean residual delay per destination airport.
for delay_column in ("ArrDelay", "ResDelay"):
    plot_with_error_bars(x='Dest', y=delay_column)

In [None]:
# Mean arrival delay and mean residual delay per (carrier, day of week) pair.
carrier_and_weekday = ['UniqueCarrier', 'DayOfWeek']
for delay_column in ('ArrDelay', 'ResDelay'):
    plot_with_error_bars(x=carrier_and_weekday, y=delay_column)

In [None]:
# Arrival delay against taxi-out time.
# Uses `airports`: the previously referenced `airports_df` is never defined
# in this notebook and raised a NameError on a fresh kernel.
sns.scatterplot(data=airports, x='TaxiOut', y='ArrDelay')
plt.show()

In [None]:
# Residual delay (arrival delay minus departure delay) against taxi-out time.
# Uses `airports`: the previously referenced `airports_df` is never defined
# in this notebook and raised a NameError on a fresh kernel.
# 'ResDelay' is recomputed here so this cell does not depend on an earlier one.
airports['ResDelay'] = airports['ArrDelay'] - airports['DepDelay']
sns.scatterplot(data=airports, x='TaxiOut', y='ResDelay')
plt.show()