#### Airline Arrivals
1. Data: http://stat-computing.org/dataexpo/2009/the-data.html - chose to use the 2008 packaged data
2. Objective: Predict how late flights will be - should return a number
3. Flights are only late if >30 minutes from scheduled arrival time - target: difference between scheduled and arrival -30min

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns

from sklearn import neighbors
from sklearn import linear_model
from sklearn.cross_decomposition import PLSRegression

%matplotlib inline


In [23]:
# import the data
df = pd.read_csv("2008_flight_data.csv")
print(df['ArrDelay'].isnull().sum())
#print(df.columns)
#print(df.dtypes)


154699


In [24]:
# drop irrelevant columns, categorical and target-related columns and create a feature set 
df = df[pd.notnull(df['ArrDelay'])]
#print(df['ArrDelay'].isnull().sum())
data = df.drop(['Year','FlightNum','TailNum','DayofMonth','CancellationCode','ArrTime','CRSArrTime','Month'], axis=1)

# remove records for flights that were canceled or diverted
data = data[data.Cancelled != 1]
data = data[data.Diverted != 1]

# delete canceled and diverted fields
data = data.drop(['Cancelled','Diverted'],axis=1)

df_corr = df.drop(['Year','FlightNum','TailNum','DayofMonth','CancellationCode','ArrTime', 'CRSArrTime','Cancelled','Diverted'],axis=1)
corr_mat = df_corr.corr()
#print(corr_mat)


#create target - ArrDelay - 30. Then, if value is less than 0 - replace with 0 (because not late)
target = data['ArrDelay'] - 30
target[target < 0] = 0 
print(target.isnull().sum())

#drop target from feature set
data = data.drop(['ArrDelay'],axis=1)


#investigate null values for remaining data (potentially drop - especially if in arrival/predicted arrival fields or interpolate)

0


In [25]:
# try to use some of the airport dummies - may be some signal there (don't have to use all)


categorical = data.select_dtypes(include=['object'])
for i in categorical:
    column = categorical[i]
    print(i)
    print(column.nunique())
    
# Originally crashed kernel with get_dummies - try dropping origin and dest and leaving carrier
data = data.drop(['Origin','Dest'],axis=1)

UniqueCarrier
20
Origin
303
Dest
302


In [26]:
#create features - time of year dummies(combine months), dummies for carriers, dummies - origin, destination
data = pd.get_dummies(data)

print(data.columns)


Index(['DayOfWeek', 'DepTime', 'CRSDepTime', 'ActualElapsedTime',
       'CRSElapsedTime', 'AirTime', 'DepDelay', 'Distance', 'TaxiIn',
       'TaxiOut', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
       'LateAircraftDelay', 'UniqueCarrier_9E', 'UniqueCarrier_AA',
       'UniqueCarrier_AQ', 'UniqueCarrier_AS', 'UniqueCarrier_B6',
       'UniqueCarrier_CO', 'UniqueCarrier_DL', 'UniqueCarrier_EV',
       'UniqueCarrier_F9', 'UniqueCarrier_FL', 'UniqueCarrier_HA',
       'UniqueCarrier_MQ', 'UniqueCarrier_NW', 'UniqueCarrier_OH',
       'UniqueCarrier_OO', 'UniqueCarrier_UA', 'UniqueCarrier_US',
       'UniqueCarrier_WN', 'UniqueCarrier_XE', 'UniqueCarrier_YV'],
      dtype='object')


In [27]:
# replace null values in delay fields with 0 

data = data.fillna(0,axis=1)
print(target.isnull().sum())
#data.describe()

0


In [28]:
# perform ols regression
regr = linear_model.LinearRegression()

regr.fit(data, target)
Y_pred = regr.predict(data)
print('R-squared regression:', regr.score(data, target))

R-squared regression: 0.933879162292


In [34]:
# test model on dataset from different year
df_test = pd.read_csv("2007_flight_data.csv")
#df_test.shape
df_test = df[pd.notnull(df_test['ArrDelay'])]
df_test.shape
df_test.columns


    




Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'],
      dtype='object')

In [35]:

# cleanup/create features as with the 2008 dataset
data_test = df_test.drop(['Year','FlightNum','TailNum','DayofMonth','CancellationCode','ArrTime', 'CRSArrTime','Month'], axis=1)

data_test = data_test[data_test.Cancelled != 1]
data_test = data_test[data_test.Diverted != 1]
data_test = data_test.drop(['Origin','Dest'],axis=1)
data_test = data_test.drop(['Cancelled','Diverted'],axis=1)
data_test = pd.get_dummies(data_test)
data_test = data_test.fillna(0,axis=1)

#create target - ArrDelay - 30. Then, if value is less than 0 - replace with 0 (because not late)
target_test = data_test['ArrDelay'] - 30
target_test[target_test < 0] = 0 
print(target.isnull().sum())
data_test = data_test.drop(['ArrDelay'],axis=1)
Y_pred_test = regr.predict(data_test)
print('R-squared regression:', regr.score(data_test, target_test))

0
R-squared regression: 0.93376404955
