In [None]:
from IPython.display import HTML
# Emit an HTML snippet whose script hides every `div.input` element once the
# document is ready and renders a link that toggles those elements on/off.
# The script uses `$`, so it relies on jQuery being present on the rendered page.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
from matplotlib.artist import setp
import pandas.core.common as com
from pandas.compat import range, lrange, lmap, map, zip
from mpl_toolkits.mplot3d import Axes3D
from statsmodels.nonparametric.smoothers_lowess import lowess
import seaborn as sns
sns.set_style("whitegrid")
import pandas as pd
import numpy as np

In [None]:
"""
This module provides helper methods to carry out data distribution
analysis on flight data found on https://www.kaggle.com/usdot/flight-delays.

These methods are specific to the flight dataset and are not meant to be
generic functions for other datasets.
"""

def scatter_matrix_all(frame, alpha=0.5, figsize=None, grid=False, diagonal='hist', marker='.', density_kwds=None, hist_kwds=None, range_padding=0.05, **kwds):
    """Draw an n-by-n matrix of pairwise plots covering every column of ``frame``.

    A column is treated as numeric when ``frame._get_numeric_data()`` keeps
    it, and as categorical otherwise.  The plot drawn in a cell depends on the
    types of its row column ``a`` and its column column ``b``:

    * diagonal, numeric: histogram or Gaussian KDE (see ``diagonal``)
    * diagonal, categorical: bar chart of the value counts
    * numeric vs numeric: scatter plot plus a red LOWESS curve below the
      diagonal, the Pearson correlation as text above it
    * categorical vs categorical: mosaic plot (below the diagonal only)
    * numeric vs categorical: one box per category (below the diagonal only)

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot; one row/column of sub-plots is created per column.
    alpha : float
        Opacity of the scatter points.
    figsize : tuple or None
        Forwarded to ``plt.subplots``.
    grid : bool
        Accepted for signature compatibility; not used by this function.
    diagonal : {'hist', 'kde', 'density'}
        Plot type for numeric columns on the diagonal.
    marker : str
        Scatter marker; normalised through ``_get_marker_compat``.
    density_kwds, hist_kwds : dict or None
        Extra keyword arguments for the KDE line / the histogram.
    range_padding : float
        Fraction of each numeric column's range added around the axis limits.
    **kwds
        Forwarded to ``ax.scatter``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the matrix.
    """
    df = frame
    columns = list(df.columns)
    num_cols = set(frame._get_numeric_data().columns)
    n = len(columns)
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    # Public DataFrame API instead of the private pandas.core.common helper.
    mask = df.notnull()
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) axis limits per numeric column.  Limits are only ever
    # applied to numeric axes, so categorical columns get a None placeholder.
    boundaries_list = []
    for a in columns:
        if a not in num_cols:
            boundaries_list.append(None)
            continue
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(columns):
        for j, b in enumerate(columns):
            ax = axes[i, j]

            if i == j:
                if a in num_cols:    # numerical variable
                    values = df[a].values[mask[a].values]
                    # Deal with the diagonal by drawing a histogram there.
                    if diagonal == 'hist':
                        ax.hist(values, **hist_kwds)
                    elif diagonal in ('kde', 'density'):
                        from scipy.stats import gaussian_kde
                        y = values
                        gkde = gaussian_kde(y)
                        ind = np.linspace(y.min(), y.max(), 1000)
                        ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                    ax.set_xlim(boundaries_list[i])
                else:                # categorical variable
                    values = df[a].value_counts()
                    ax.bar(list(range(df[a].nunique())), values)
            else:
                # rows where both columns are present
                common = (mask[a] & mask[b]).values
                # two numerical variables
                if a in num_cols and b in num_cols:
                    if i > j:
                        ax.scatter(df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds)
                        # The following 2 lines add the lowess smoothing
                        ys = lowess(df[a][common], df[b][common])
                        ax.plot(ys[:, 0], ys[:, 1], 'red')
                    else:
                        pearR = df[[a, b]].corr()
                        # Single positional lookup; chained iloc[0][1] relies on
                        # integer keys being treated as positions on a labelled Series.
                        ax.text(df[b].min(), df[a].min(), 'r = %.4f' % (pearR.iloc[0, 1]))
                    ax.set_xlim(boundaries_list[j])
                    ax.set_ylim(boundaries_list[i])
                # two categorical variables
                elif a not in num_cols and b not in num_cols:
                    if i > j:
                        from statsmodels.graphics import mosaicplot
                        mosaicplot.mosaic(df, [b, a], ax, labelizer=lambda k: '')
                # one numerical variable and one categorical variable
                else:
                    if i > j:
                        if a in num_cols:
                            cat_col, value_col = b, a
                        else:
                            cat_col, value_col = a, b
                        # Group once; drop NaNs because boxplot cannot compute
                        # quartiles for a sample that contains them.
                        groups = [(key, grp[value_col].dropna().tolist())
                                  for key, grp in df[[a, b]].groupby(cat_col)]
                        label = [key for key, _ in groups]
                        values = [vals for _, vals in groups]
                        if a in num_cols:
                            ax.boxplot(values, labels=label)
                        else:
                            # numeric variable is on the x axis here
                            ax.boxplot(values, labels=label, vert=False)

            ax.set_xlabel('')
            ax.set_ylabel('')

            _label_axis(ax, kind='x', label=b, position='bottom', rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            # Only the left-most column and the bottom row keep their axes.
            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)
    return fig
    

def _label_axis(ax, kind='x', label='', position='top', ticks=True, rotate=False):
    """Make one axis of ``ax`` visible and label it.

    ``kind`` selects the axis ('x' or 'y'); any other value leaves ``ax``
    untouched.  ``position`` is applied to both the ticks and the label.
    ``rotate`` turns the x tick labels by 90 degrees (x axis only).
    ``ticks`` is accepted but not used.
    """
    from matplotlib.artist import setp

    if kind == 'y':
        y_axis = ax.yaxis
        y_axis.set_visible(True)
        ax.set_ylabel(label, visible=True)
        y_axis.set_ticks_position(position)
        y_axis.set_label_position(position)
        return None

    if kind != 'x':
        return None

    ax.set_xlabel(label, visible=True)
    x_axis = ax.xaxis
    x_axis.set_visible(True)
    x_axis.set_ticks_position(position)
    x_axis.set_label_position(position)
    if rotate:
        setp(ax.get_xticklabels(), rotation=90)
    return None

def _get_marker_compat(marker):
    """Return ``marker`` if it can be used as-is, otherwise the fallback 'o'.

    The fallback is chosen when ``marker`` is '.' and the installed
    matplotlib reports a version string that compares below '1.1.0', or when
    ``marker`` is not a key of ``matplotlib.lines.lineMarkers``.
    """
    import matplotlib as mpl
    import matplotlib.lines as mlines

    fallback = 'o'
    # Note: this is a plain string comparison of the version, as before.
    legacy_dot = mpl.__version__ < '1.1.0' and marker == '.'
    if legacy_dot or marker not in mlines.lineMarkers:
        return fallback
    return marker

def plotBarPercentage(data, groupAttr, dependencyAttr, axAttr, condition, filter=0):
    """Bar-plot, per ``groupAttr`` group, the percentage of rows matching ``condition``.

    For every group the non-null count of ``dependencyAttr`` among the rows
    selected by ``condition`` is divided by the count over all rows and scaled
    to a percentage.  Only groups whose percentage is strictly greater than
    ``filter`` are drawn on ``axAttr``; nothing is drawn when no group remains.
    """
    grouping = [groupAttr]
    counts_all = data.groupby(grouping)[dependencyAttr].count()
    counts_matching = data.loc[condition].groupby(grouping)[dependencyAttr].count()

    percentages = counts_matching / counts_all * 100
    percentages = percentages[percentages > filter]
    if len(percentages) <= 0:
        return

    bar_ax = percentages.plot.bar(figsize=(14, 6), ax=axAttr)
    bar_ax.set_title(dependencyAttr)
    bar_ax.set_ylabel('Percentage')

def plotBar(data, groupAttr, dependencyAttr, axAttr, condition):
    """Bar-plot the per-group count of ``dependencyAttr`` for rows matching ``condition``.

    Rows are selected with ``data.loc[condition]``, grouped by ``groupAttr``
    and counted; the bars are drawn on ``axAttr`` and the y axis is labelled
    with ``dependencyAttr``.
    """
    selected = data.loc[condition]
    counts = selected.groupby([groupAttr])[dependencyAttr].count()
    counts.plot.bar(figsize=(14, 6), ax=axAttr).set_ylabel(dependencyAttr)

def plotBars(data, groupAttr, dependencyAttrs, rows, cols, conditions):
    """Draw one ``plotBar`` chart per attribute on a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data, passed through to ``plotBar``.
    groupAttr : str
        Column to group by.
    dependencyAttrs : sequence of str
        Columns to count; chart ``i`` shows ``dependencyAttrs[i]``.
    rows, cols : int
        Grid shape.  The grid is filled row by row.
    conditions : sequence
        ``conditions[i]`` is the row selector used for chart ``i``.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, so axes[r, c] is valid for every grid shape.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, squeeze=False)
    for i, attr in enumerate(dependencyAttrs):
        r, c = divmod(i, cols)
        plotBar(data, groupAttr, attr, axes[r, c], conditions[i])
        
def plotBarsPercentage(data, groupAttr, dependencyAttrs, rows, cols, conditions, filter = 0):
    """Draw one ``plotBarPercentage`` chart per attribute on a ``rows`` x ``cols`` grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Source data, passed through to ``plotBarPercentage``.
    groupAttr : str
        Column to group by.
    dependencyAttrs : sequence of str
        Columns to count; chart ``i`` shows ``dependencyAttrs[i]``.
    rows, cols : int
        Grid shape.  The grid is filled row by row.
    conditions : sequence
        ``conditions[i]`` is the row selector used for chart ``i``.
    filter : number
        Passed through to ``plotBarPercentage`` as its ``filter`` threshold.
    """
    # squeeze=False keeps `axes` two-dimensional for every grid shape
    # (single row, single column, or a single chart), so one indexing form
    # covers all cases.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, squeeze=False)
    for i, attr in enumerate(dependencyAttrs):
        r, c = divmod(i, cols)
        plotBarPercentage(data, groupAttr, attr, axes[r, c], conditions[i], filter)

def plotMapData(df, longAttr, latAttr, valAttr, figw=8, figh=8, initmarksize= 0.5):
    """Plot one blue dot per row of ``df`` on a Lambert Conformal basemap.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to plot.
    longAttr, latAttr : str
        Names of the longitude / latitude columns.
    valAttr : str
        Column whose value scales the marker of each row.
    figw, figh : number
        Figure width and height passed to ``plt.figure``.
    initmarksize : number
        Marker size per unit of ``valAttr`` (marker size = value * initmarksize).
    """
    # setup Lambert Conformal basemap.
    plt.figure(figsize=(figw,figh))
    m = Basemap(width=12000000,height=9000000,projection='lcc',
                resolution='c',lat_1=45.,lat_2=55,lat_0=50,lon_0=-107.)
    # draw a boundary around the map, fill the background.
    # this background will end up being the ocean color, since
    # the continents will be drawn on top.
    m.drawmapboundary(fill_color='aqua')
    # fill continents, set lake color same as ocean color.
    m.fillcontinents(color='coral',lake_color='aqua')
    # draw parallels and meridians.
    # label parallels on right and top
    # meridians on bottom and left
    parallels = np.arange(0.,81,10.)
    # labels = [left,right,top,bottom]
    m.drawparallels(parallels,labels=[False,True,True,False])
    meridians = np.arange(10.,351.,20.)
    m.drawmeridians(meridians,labels=[True,False,False,True])

    # One blue dot per row, sized by the row's value.
    for lon, lat, mag in zip(df[longAttr].values, df[latAttr].values, df[valAttr].values):
        # lon/lat -> map projection coordinates
        xpt, ypt = m(lon, lat)
        m.plot(xpt, ypt, 'bo', markersize=mag * initmarksize)

    plt.show()

def plotJointPlotSplice0_10_240_By(x, y, delayAttr, data):
    """Draw three KDE joint plots of ``x`` vs ``y``, one per delay range.

    The rows of ``data`` are split on ``delayAttr`` into the ranges
    (0, 10], (10, 240] and (240, inf); each subset gets its own
    ``sns.jointplot``, drawn in that order.
    """
    splice_masks = (
        (data[delayAttr] > 0) & (data[delayAttr] <= 10),
        (data[delayAttr] > 10) & (data[delayAttr] <= 240),
        (data[delayAttr] > 240),
    )
    # Build every subset first, then plot them one after another.
    subsets = [data.loc[selector] for selector in splice_masks]
    for subset in subsets:
        sns.jointplot(x=x, y=y, kind="kde", data=subset, size=4)

def plotJointPlot(x, y, delayAttr, data, title):
    """Draw a KDE joint plot of ``x`` vs ``y`` for all of ``data``.

    The plot title is ``title`` followed by the number of rows in ``data``.
    ``delayAttr`` is accepted but not used.
    """
    row_count = len(data)
    sns.jointplot(x=x, y=y, kind="kde", data=data, size=4)
    caption = title + ",\n Dataset Size = " + str(row_count)
    plt.title(caption, fontsize = 24, y = 0.5, x = 6)
    
def plotJointPlotSplice(x, y, delayAttr, data, cond, title):
    """Draw a KDE joint plot of ``x`` vs ``y`` for the rows of ``data`` selected by ``cond``.

    The plot title is ``title`` followed by the number of selected rows.
    ``delayAttr`` is accepted but not used.
    """
    subset = data.loc[cond]
    row_count = len(subset)
    sns.jointplot(x=x, y=y, kind="kde", data=subset, size=4)
    caption = title + ",\n Dataset Size = " + str(row_count)
    plt.title(caption, fontsize = 24, y = 0.5, x = 6)
    
def generateDistributionDF(data, timeAttr, monthAttr, delayAttr, aggfunc= np.sum):
    """Aggregate ``delayAttr`` per (``monthAttr``, ``timeAttr``) pair into a flat frame.

    ``aggfunc`` is applied to ``delayAttr`` within each pair.  The result has
    a default index and the columns ``timeAttr``, ``monthAttr``, ``delayAttr``
    (in that order), one row per pair.
    """
    aggregated = pd.pivot_table(
        data,
        values=[delayAttr],
        index=[monthAttr, timeAttr],
        aggfunc=aggfunc,
    )
    # Move the index levels out one at a time: the month level first, then the
    # remaining time level, which therefore ends up as the first column.
    month_as_column = aggregated.reset_index(level=0)
    return month_as_column.reset_index(level=0)

def plot3D(data, x, y, z):
    """Show two side-by-side 3D surfaces of ``z`` over (``x``, ``y``).

    The left surface uses the default aggregation of
    ``generateDistributionDF`` and the right one uses ``np.mean``; each
    surface gets its own colorbar.
    """
    surfaces = (
        generateDistributionDF(data, y, x, z),
        generateDistributionDF(data, y, x, z, np.mean),
    )

    fig = plt.figure(figsize=(16, 6), dpi=80)

    # One subplot per aggregated frame, left to right.
    for position, dist in enumerate(surfaces, start=1):
        ax = fig.add_subplot(1, 2, position, projection='3d')
        surf = ax.plot_trisurf(dist[x], dist[y], dist[z], cmap=plt.cm.jet, linewidth=0.03)
        fig.colorbar(surf)

    plt.show()


In [None]:
import Geohash
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import importlib
import warnings
import seaborn as sns
sns.set_style("whitegrid")
warnings.filterwarnings('ignore')

## 1.0 Introduction
This document analyzes departure delay data provided by the U.S. Department of Transportation's (DOT) Bureau of Transportation Statistics. The dataset is published on kaggle.com and is composed of ~5 Million flights, in the year of 2015. 

The purpose of this notebook is to explore the distribution and correlation of the data.  Specifically, we want to answer the following question:

<b>*What factors affect flight departure delays?*</b>

We recognize that there are many factors that may affect flight delays.  Some of these factors most likely are highly non-linear; factors such as:

* Weather - weather greatly affects flight delays.  Extreme weather can cause a flight to be delayed, diverted or cancelled.
* Airport Congestion - flights can be delayed due to a high number of flights being queued during certain time of year or during a certain time of the day.  Airports may not be able to handle the queue size.
* Carrier Maintenance - the time to clean the aircraft, aircraft fueling, engineering inspections, etc. will affect flight delays.
* In-Bound Delay - delays incurred from upstream airports that add to the delay of the departure time
* En-route Weather - during the flight the aircraft may encounter severe weather that causes delays in the arrival time.

In order to simplify our analysis, we will focus on departure delays for flights from the Atlanta airport.  We picked the Atlanta airport because it has the largest number of flights, with a total of 346,836 flights.  Thus we narrow our initial question to the more focused question:

<b>*What factors affect flight departure delays from the Atlanta airport?*</b>

We can ignore any features that are associated with incoming flight delays and only focus on features that contribute to the departure delay.  Note that we will only consider the take off part of the flight and we are not concerned with the airtime or landing phase of the flight.  Therefore, we will only look at the following features in the dataset:

<table>
    <tr><th>Attribute</th><th>Description</th><th>Type</th></tr>
    <tr><td>YEAR</td><td>Year of the Flight Trip</td><td>Numeric</td></tr>
    <tr><td>MONTH</td><td>Month of the Flight Trip</td><td>Numeric</td></tr>
    <tr><td>DAY</td><td>Day of the Flight Trip</td><td>Numeric</td></tr>
    <tr><td>DAY_OF_WEEK</td><td>Day of week of the Flight Trip</td><td>Numeric</td></tr>
    <tr><td>AIRLINE</td><td>Airline Identifier</td><td>String</td></tr>
    <tr><td>FLIGHT_NUMBER</td><td>Flight Identifier</td><td>Numeric</td></tr>
    <tr><td>DEPARTURE_TIME</td><td>WHEELS_OFF - TAXI_OUT</td><td>Numeric</td></tr>
    <tr><td>TAXI_IN</td><td>The time duration elapsed between wheels-on and gate arrival at the destination airport</td><td>Numeric</td></tr>
    <tr><td>DIVERTED</td><td>Aircraft landed at an airport other than the scheduled destination</td><td>Numeric</td></tr>
    <tr><td>CANCELLED</td><td>Flight Cancelled (1 = cancelled)</td><td>Numeric</td></tr>
    <tr><td>WHEELS_OFF</td><td>The time point that the aircraft's wheels leave the ground</td><td>Numeric</td></tr>
    <tr><td>DEPARTURE_DELAY</td><td>Total Delay on Departure</td><td>Numeric</td></tr>
    <tr><td>AIR_SYSTEM_DELAY</td><td>Delay caused by air system</td><td>Numeric</td></tr>
    <tr><td>SECURITY_DELAY</td><td>Delay caused by security</td><td>Numeric</td></tr>
    <tr><td>AIRLINE_DELAY</td><td>Delay caused by the airline</td><td>Numeric</td></tr>
    <tr><td>WEATHER_DELAY</td><td>Delay caused by weather</td><td>Numeric</td></tr>
</table>

Since we're looking at the departure delay we are not interested in the scheduled departure.  We are more interested in the actual departure time.
<table>
    <tr><th>Attribute</th><th>Description</th><th>Type</th></tr>
    <tr><td>SCHEDULED_DEPARTURE</td><td>SCHEDULED_DEPARTURE</td><td>Numeric</td></tr>
</table>

We are not interested in features that describe any incoming flight or flight specific information such as air time, flight number, etc.  These features include:
<table>
    <tr><th>Attribute</th><th>Description</th><th>Type</th></tr>
    <tr><td>ORIGIN_AIRPORT</td><td>Starting Airport</td><td>String</td></tr>
    <tr><td>TAIL_NUMBER</td><td>Aircraft Identifier</td><td>Numeric</td></tr>
    <tr><td>DESTINATION_AIRPORT</td><td>Destination Airport</td><td>String</td></tr>
    <tr><td>SCHEDULED_TIME</td><td>Planned time amount needed for the flight trip</td><td>Numeric</td></tr>
    <tr><td>ELAPSED_TIME</td><td>AIR_TIME+TAXI_IN+TAXI_OUT</td><td>Numeric</td></tr>
    <tr><td>TAXI_OUT</td><td>The time duration elapsed between departure from the origin airport gate and wheels off</td><td>Numeric</td></tr>
    <tr><td>AIR_TIME</td><td>The time duration between wheels_off and wheels_on time</td><td>Numeric</td></tr>
    <tr><td>DISTANCE</td><td>Distance between two airports</td><td>Numeric</td></tr>
    <tr><td>WHEELS_ON</td><td>The time point that the aircraft's wheels touch on the ground</td><td>Numeric</td></tr>
    <tr><td>SCHEDULED_ARRIVAL</td><td>Planned arrival time</td><td>Numeric</td></tr>
    <tr><td>ARRIVAL_TIME</td><td>WHEELS_ON+TAXI_IN</td><td>Numeric</td></tr>
    <tr><td>SCHEDULED_ARRIVAL</td><td>Planned arrival time</td><td>Numeric</td></tr>
    <tr><td>ARRIVAL_DELAY</td><td>ARRIVAL_TIME-SCHEDULED_ARRIVAL</td><td>Numeric</td></tr>
    <tr><td>LATE_AIRCRAFT_DELAY</td><td>Delay caused by aircraft</td><td>Numeric</td></tr>
    <tr><td>CANCELLATION_REASON</td><td>Reason for Cancellation of flight: A - Airline/Carrier; B - Weather; C - National Air System; D - Security</td><td>String</td></tr>
</table>

Since we are analyzing data associated with the Atlanta airport we gathered Atlanta weather information from the National Centers for Environmental Information (NCEI) website. https://www.ncdc.noaa.gov.  The following Atlanta weather data was gathered for the year of 2015:
* WSF2 - Fastest 2-minute wind speed
* WSF5 - Fastest 5-second wind speed
* SNOW - Snowfall
* WESF - Water equivalent of snowfall
* PRCP - Precipitation
* SNWD - Snow depth
* WDF2 - Direction of fastest 2-minute wind
* AWND - Average wind speed
* WDF5 - Direction of fastest 5-second wind
* PGTM - Peak gust time
* TMAX - Maximum temperature
* WESD - Water equivalent of snow on the ground
* PSUN - Daily percent of possible sunshine for the period
* TAVG - Average Temperature.
* TMIN - Minimum temperature
* TSUN - Total sunshine for the period

This document will go through the following Data Analysis.

* [2.0 Transforming the Data](#section20)
* [3.0 Exploring the Data Distribution](#section30)
* [3.1 Exploring the Data Distribution by Departure Time and Scheduled Departure Time](#section31)
* [3.1.1 Look at the distribution of departure delay based on departure time and month](#section311)
* [3.1.2 Look at the distribution of weather delay based on departure time and month](#section312)
* [3.1.3 Look at the distribution of departure delay based on day of the week and month](#section313)
* [3.1.4 Look at the distribution of weather delay based on day of the week and month](#section314)
* [4.0 Find Correlation Between Features](#section40)
* [5.0 Load Weather Data](#section50)


In [None]:
#Read data
# NOTE(review): the file name 'ATL_' has no extension and looks truncated —
# confirm it matches the actual file in the input dataset.
flights = pd.read_csv('../input/atlantaairportdata/ATL_')

<a id='section20'></a>
## 2.0 Transforming the Data
The dataset spans a whole year and contains time data such as Year, Month, Day, Day of the Week, Departure Time and Scheduled Departure.  In order to better analyze the data, we will transform the data as follows:

1. Bin the Departure time into hours (i.e. 0, 100, 200, 300... 2400).  
2. Replace NA with 0.  We will assume 0 delay in cases where information is not entered

In [None]:
# Clean up data.  Fill NA with 0.  Assume 0 delay in cases where information is not entered
flights = flights.fillna(0)

# One bin per clock hour.  Times are HHMM numbers, so the edges 0, 100, ..., 2400
# give 24 left-closed intervals [0, 100), [100, 200), ..., [2300, 2400) labelled
# 0..23.  right=False puts an on-the-hour time such as 100 (01:00) into hour 1
# and keeps the value 0 inside the first bin instead of turning it into NaN.
hour_edges = np.arange(0, 2500, 100)
hour_labels = np.arange(24)

# A value of 2400, if present, is folded onto 0 so it lands in the hour-0 bin
# rather than outside the last interval.
# Let's create a category for Departure time based on hour (0 to 2400 hr clock)
flights['DEPARTURE_TIME_BIN'] = pd.cut(flights['DEPARTURE_TIME'] % 2400, bins=hour_edges, labels=hour_labels, right=False)
# Let's create a category for Schedule Departure time based on hour (0 to 2400 hr clock)
flights['SCHEDULED_DEPARTURE_BIN'] = pd.cut(flights['SCHEDULED_DEPARTURE'] % 2400, bins=hour_edges, labels=hour_labels, right=False)

<a id='section30'></a>
## 3.0 Exploring the Data Distribution
We will first plot all the features in the dataset to get an idea of the distribution at a high level.

In [None]:
# Box plot of every numeric column, to get a first feel for the value ranges.
axbox = flights.plot(kind='box', figsize=(16, 6), rot=90)

In [None]:
# Note we are only interested in flights that have a delay greater than 0
fig, axes = plt.subplots(nrows=1, ncols=4,figsize=(10, 4))
for ax, column in zip(axes, ['SECURITY_DELAY', 'WEATHER_DELAY', 'TAXI_OUT', 'DEPARTURE_DELAY']):
    # Draw each box from the filtered rows (df), not from the full `flights`
    # frame, so that the zero / negative values do not dominate the plot.
    df = flights.loc[flights[column] > 0]
    axbox = df[[column]].plot.box(ax=ax, rot=90)

In [None]:
# Note we are only interested in flights that have a delay greater than 0
overalDF = pd.DataFrame()
fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(16, 8))
# axes.flat walks the 2x2 grid row by row.
for ax, column in zip(axes.flat, ['SECURITY_DELAY', 'WEATHER_DELAY', 'TAXI_OUT', 'DEPARTURE_DELAY']):
    # Both the summary statistics and the histogram use the filtered rows.
    df = flights.loc[flights[column] > 0]
    overalDF[column] = df[column].describe()
    axbox = df[[column]].plot.hist(ax=ax, rot=90)

From the above box plots and histograms, one can make the following observations about the distribution of the data:
* There is a small dataset for security delay greater than 0
* Most of the departure delays are less than 240 minutes.  We can see outliers up to +240 minutes.  
* Most of the weather delays are less than 200 minutes.  We can see outliers up to +200 minutes.  

Let us filter out the extreme delays as they may be due to extreme conditions.  Let's replot the box plots and histograms and only consider departure delays less than 240 minutes and weather delays less than 200 minutes.

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=4,figsize=(16, 8))
# (column, exclusive upper bound in minutes or None for "no upper bound")
box_specs = [
    ('SECURITY_DELAY', None),
    ('WEATHER_DELAY', 200),
    ('TAXI_OUT', None),
    ('DEPARTURE_DELAY', 240),
]
for ax, (column, upper) in zip(axes, box_specs):
    keep = flights[column] > 0
    if upper is not None:
        keep = keep & (flights[column] < upper)
    # Every box is drawn from its filtered rows (TAXI_OUT included).
    df = flights.loc[keep]
    axbox = df[[column]].plot.box(ax=ax, rot=90)

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(16, 8))
overallDF = pd.DataFrame()
# (column, exclusive upper bound in minutes or None for "no upper bound")
hist_specs = [
    ('SECURITY_DELAY', None),
    ('WEATHER_DELAY', 200),
    ('TAXI_OUT', None),
    ('DEPARTURE_DELAY', 240),
]
# axes.flat walks the 2x2 grid row by row.
for ax, (column, upper) in zip(axes.flat, hist_specs):
    keep = flights[column] > 0
    if upper is not None:
        keep = keep & (flights[column] < upper)
    df = flights.loc[keep]
    # The summary column is keyed by the real column name (the security
    # column label was previously misspelled).
    overallDF[column] = df[column].describe()
    axbox = df[[column]].plot.hist(ax=ax, rot=90)

After filtering out some of the extreme outlier data, we can visualize the distribution better in the boxplots.  We can see that most of the delays range from 0 to 20 minutes.  The frequency of flights that have delays greater than 20 minutes drops significantly.  From a business perspective this is reasonable, as we expect minor delays due to the various factors explained in the introduction section.  The following table shows the distribution statistics (i.e. mean, standard deviation, etc.) for all 4 features.

In [None]:
# Display the describe() summary collected per delay column in the previous cell.
overallDF

<a id="section31"></a>
### 3.1 Explore Distribution by Departure Time
Let's look at the distribution of the features by departure time.  We will use seaborn joint plots so that we can plot the distribution by both Month and Departure time.

In [None]:
# Joint distribution of departure time and month over every flight.
plotJointPlot(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    title='All Flights',
)

The following observation can be made based on the above distribution:
* Number of flights is 345,836 flying into and out of the Atlanta airport
* We don't see information for the month of October.  The dataset does not contain data for the month of October.  The Kaggle.com distribution does not have flights for October for any airports!  The individual creating the dataset did not collect data in the month of October.
* Most of the flights start around 6 AM and end after midnight.  This observation makes sense since we don't expect people leaving or arriving in the airport after midnight.  People are sleeping!

<a id='section311'></a>
#### 3.1.1 Look at the distribution of departure delay based on departure time and month
In order to gain more insight into the data let's look at the data set from different splices. 
* look at departure delays between 0 to 10 minutes
* look at departure delays between 10 to 240 minutes
* look at departure delays greater than 240 minutes

In [None]:
# Departure delays in (0, 10] minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    cond=flights['DEPARTURE_DELAY'].gt(0) & flights['DEPARTURE_DELAY'].le(10),
    title='Depature Delay > 0 \n and Depature Delay <= 10',
)

In [None]:
# Departure delays in (10, 240] minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    cond=flights['DEPARTURE_DELAY'].gt(10) & flights['DEPARTURE_DELAY'].le(240),
    title='Depature Delay > 10 \n and Departure Delay <= 240',
)

In [None]:
# Departure delays above 240 minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    cond=flights['DEPARTURE_DELAY'].gt(240),
    title='Depature Delay > 240',
)

We can see some insight in the data when we consider departure delays greater than 10 minutes.  We can observe the following.
* More delays happen between May and August and around 8 PM to 1 AM.
* We can also see more delays happen in December.
* We can also see more delays happen in the Summer months of May to August.
* A majority of delays are between 10 and 240 minutes.
* There is a small set of flight departure delays greater than 240 minutes.

This makes sense as there are more people traveling during the summer months and during the winter holidays.  The additional delays could be due to increased congestion during those time periods.

#### 3.1.2 Look at the distribution of Weather delay based on departure time and month
Let's do the same analysis on the weather delay.

In [None]:
# Weather delays in (0, 10] minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(0) & flights['WEATHER_DELAY'].le(10),
    title='Weather Delay > 0 \n and Weather Delay <= 10',
)

In [None]:
# Weather delays in (10, 240] minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(10) & flights['WEATHER_DELAY'].le(240),
    title='Weather Delay > 10 \n and Weather Delay <= 240',
)

In [None]:
# Weather delays above 240 minutes, by departure time and month.
plotJointPlotSplice(
    x='DEPARTURE_TIME',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(240),
    title='Weather Delay > 240',
)

We can see some insight in the data when we consider weather delays greater than 10 minutes.  We can observe the following.
* More weather delays happen between May and August and around 8 PM to midnight.
* More delays happen in December.
* In the month of Feb to September we see weather delays greater than 240 minutes during the beginning and end of the day.
* There is small set of flights that have weather delays when compared to the number of flights coming into and out of the airport.

There may be some correlation between departure delay and weather delays.

####  3.1.3 Look at the distribution of departure delay based on day of the week and month
Our dataset also contains the day of the week.  Let's perform the same analysis using the day of the week and month.

In [None]:
# Departure delays in (0, 10] minutes, by day of week and month.
plotJointPlotSplice(
    x='DAY_OF_WEEK',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    cond=flights['DEPARTURE_DELAY'].gt(0) & flights['DEPARTURE_DELAY'].le(10),
    title='Departure Delay > 0 \n and Departure Delay <= 10',
)

In [None]:
# Middle splice: departure delays in (10, 240] minutes, by day of week and
# month.  The lower bound is 10 (not 0) so that the three splices of this
# section do not overlap and match the 0-10 / 10-240 / >240 ranges used in
# the other sections.
plotJointPlotSplice('DAY_OF_WEEK','MONTH', 'DEPARTURE_DELAY', 
                             flights, (flights['DEPARTURE_DELAY'] > 10) & (flights['DEPARTURE_DELAY'] <= 240),
                             'Departure Delay > 10 \n and Departure Delay <= 240')

In [None]:
# Departure delays above 240 minutes, by day of week and month.
plotJointPlotSplice(
    x='DAY_OF_WEEK',
    y='MONTH',
    delayAttr='DEPARTURE_DELAY',
    data=flights,
    cond=flights['DEPARTURE_DELAY'].gt(240),
    title='Departure Delay > 240',
)

We can see some insight in the data when we consider departure delays greater than 10 minutes.  We can observe the following.
* We can see from the second joint plot that there are certain hotspots of certain months and day of the week.

For the last chart it shows departure delays greater than 240 minutes.  We can make the following observations:
* More delays happen on Tuesday and Wednesday during the summer months (May, June, July).
* More delays happen on Wednesday and Thursday during December.

Note in December, New Year's Eve and Christmas Eve are on a Thursday.  There could be more flights departing or arriving during those time periods.

#### 3.1.4 Look at the distribution of weather delay based on day of the week and month
Let's repeat our analysis on the weather delays.

In [None]:
# Weather delays in (0, 10] minutes, by day of week and month.
plotJointPlotSplice(
    x='DAY_OF_WEEK',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(0) & flights['WEATHER_DELAY'].le(10),
    title='Weather Delay > 0 \n and Weather Delay <= 10',
)


In [None]:
# Weather delays in (10, 240] minutes, by day of week and month.
plotJointPlotSplice(
    x='DAY_OF_WEEK',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(10) & flights['WEATHER_DELAY'].le(240),
    title='Weather Delay > 10 \n and Weather Delay <= 240',
)

In [None]:
# Weather delays above 240 minutes, by day of week and month.
plotJointPlotSplice(
    x='DAY_OF_WEEK',
    y='MONTH',
    delayAttr='WEATHER_DELAY',
    data=flights,
    cond=flights['WEATHER_DELAY'].gt(240),
    title='Weather Delay > 240',
)

We can see similar observations when comparing the weather delay with the departure delay.  Maybe a combination of the weather and congestion of the airport causes greater delays.

## 4.0 Find Correlation Between Features
Next let's create pair wise plots to find some correlations between the various features.

In [None]:
# only consider delays greater than 0
df = flights.loc[flights['DEPARTURE_DELAY'] > 0]
df = df[['DEPARTURE_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 
       'WEATHER_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'AIR_SYSTEM_DELAY']]

In [None]:
# Pairwise plots of the delay-related features selected above.
fig = scatter_matrix_all(frame=df, figsize=(14, 14), alpha=0.4)

The above pairwise plot shows that most of the features are not correlated.  We can see that the features most correlated with departure delay are the airline delay (0.5880) and the weather delay (0.4498).  This corroborates our previous discovery, where we saw that weather delay has some correlation with the departure delay.  Also note that airline delay correlates with the departure delay.  This also makes sense, since airline delays like engineering checks, airplane maintenance, etc. contribute to the departure delay.


## 5.0 Load Atlanta AirPort Weather Data
Now let's load Atlanta weather data and find any correlation.

In [None]:
# Read weather information for atlanta international airport
# Loaded from the `airport-weather` input directory, relative to the notebook.
weather = pd.read_csv('../input/airport-weather/Weather.csv')

In [None]:
# Merge in weather data
# Attach the weather columns to every flight of the same month and day.
mergeddf = flights.merge(weather, on=['MONTH', 'DAY'])

### 5.1 Let's see if there is correlation between Weather delay and weather data.
Next we will plot the pair wise plots for the weather data with the weather delay to find any correlations.

In [None]:
# Weather delay plus the weather measurements to correlate it with; keep
# delays strictly between 0 and 240 minutes and treat missing values as 0.
x = mergeddf[['WEATHER_DELAY', 'PRCP', 'TAVG', 'WSF2', 'WSF5']]
df = x.loc[x['WEATHER_DELAY'].gt(0) & x['WEATHER_DELAY'].lt(240)].fillna(0)

In [None]:
# Pairwise plots of weather delay against the weather measurements.
fig = scatter_matrix_all(frame=df, figsize=(12, 12), alpha=0.4)

In [None]:
import pandas
import numpy
import seaborn
import matplotlib.pyplot as plt
import scipy.stats                  # for pearson correlation

# Weather delay is the response, so it goes on the y axis; the data now
# matches the axis labels set below.
seaborn.regplot(x="PRCP", y="WEATHER_DELAY", fit_reg=True, data=df, line_kws={"color": "red"});
plt.xlabel('Precipitation');
plt.ylabel('Weather Delay');

# Lay out the current (regplot) figure; `fig` still refers to the scatter
# matrix created in an earlier cell.
plt.tight_layout()
plt.show()

In [None]:
# Weather delay is the response, so it goes on the y axis; the data now
# matches the axis labels set below.
seaborn.regplot(x="TAVG", y="WEATHER_DELAY", fit_reg=True, data=df, line_kws={"color": "red"});
plt.xlabel('Average Temperature');
plt.ylabel('Weather Delay');

# Lay out the current (regplot) figure; `fig` still refers to the scatter
# matrix created in an earlier cell.
plt.tight_layout()
plt.show()

In [None]:
# Weather delay is the response, so it goes on the y axis; the data now
# matches the axis labels set below.
seaborn.regplot(x="WSF2", y="WEATHER_DELAY", fit_reg=True, data=df, line_kws={"color": "red"});
plt.xlabel('Wind Speed');
plt.ylabel('Weather Delay');

# Lay out the current (regplot) figure; `fig` still refers to the scatter
# matrix created in an earlier cell.
plt.tight_layout()
plt.show()

We only are interested in the first row to see if Weather Delay correlates with other weather features.  The top 5 features that have the highest correlation are the following:
* WSF2 - Fastest 2-minute wind speed (.221)
* WSF5 - Fastest 5-second wind speed (.2036)
* PRCP - Precipitation (0.145)
* TAVG - Average Temperature (0.106)
* TMAX - Maximum Temperature (0.108)

We can see that wind speed, precipitation and temperature have some influence on the weather delay.

## Analyze AirLine Distribution
Let's look at the distribution of flights based on airline.  The following table maps each airline's IATA code to the name of the airline.

<table>
    <tr><th>IATA_CODE</th><th>Airline Name</th></tr>
    <tr><td>UA</td><td>United Air Lines Inc.</td></tr>
    <tr><td>AA</td><td>American Airlines Inc.</td></tr>
    <tr><td>US</td><td>US Airways Inc.</td></tr>
    <tr><td>OO</td><td>Skywest Airlines Inc.</td></tr>
    <tr><td>AS</td><td>Alaska Airlines Inc.</td></tr>
    <tr><td>NK</td><td>Spirit Air Lines</td></tr>
    <tr><td>WN</td><td>Southwest Airlines Co.</td></tr>
    <tr><td>DL</td><td>Delta Air Lines Inc.</td></tr>
    <tr><td>EV</td><td>Atlantic Southeast Airlines</td></tr>
    <tr><td>MQ</td><td>American Eagle Airlines Inc.</td></tr>
</table>

In [None]:
# Number of flights per airline code, as a bar chart.
ax = flights['AIRLINE'].value_counts().plot.bar(figsize=(16, 5))

From the above bar chart we can see that most of the flights at the Atlanta airport are serviced by Delta Air Lines (DL).

In [None]:
# Number of flights per airline code (the values behind the bar chart above).
flights['AIRLINE'].value_counts()