In [1]:
import datetime, warnings, scipy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

In [2]:
merged_flights = pd.read_csv('Data/merged_flights.csv', index_col=0)
merged_flights.head()

Unnamed: 0,SCHEDULED_DATE,DAY_OF_WEEK,AIRLINE,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,...,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE,Count Flights,Enplanements,area
0,2015-01-01,4,AS,N407AS,ANC,SEA,00:05:00,23:54:00,21.0,00:15:00,...,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,7888,2713843,west
1,2015-01-01,4,AS,N309AS,ANC,SEA,00:45:00,00:41:00,17.0,00:58:00,...,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,7888,2713843,west
2,2015-01-01,4,DL,N3743H,ANC,SEA,00:45:00,00:31:00,25.0,00:56:00,...,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,7888,2713843,west
3,2015-01-01,4,AS,N413AS,ANC,PDX,00:50:00,00:46:00,11.0,00:57:00,...,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,7888,2713843,west
4,2015-01-01,4,US,N804AW,ANC,PHX,01:52:00,01:43:00,21.0,02:04:00,...,ANC,Ted Stevens Anchorage International Airport,Anchorage,AK,USA,61.17432,-149.99619,7888,2713843,west


In [3]:
merged_flights.shape

(2818553, 26)

In [4]:
flight_times = pd.read_csv('Data/flight_times.csv', index_col=0)
flight_times.head()

Unnamed: 0_level_0,SCHEDULED_DATE,DAY_OF_WEEK,AIRLINE,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DEPARTURE_DELAY,SPEED
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,2015-01-01,4,AS,N407AS,ANC,SEA,00:05:00,23:54:00,21.0,00:15:00,205.0,1448,04:30:00,-22.0,-11.0,423.805
1,2015-01-01,4,AA,N3KUAA,LAX,PBI,00:10:00,00:02:00,12.0,00:14:00,280.0,2330,07:50:00,-9.0,-8.0,499.286
2,2015-01-01,4,US,N171US,SFO,CLT,00:20:00,00:18:00,16.0,00:34:00,286.0,2296,08:06:00,5.0,-2.0,481.678
3,2015-01-01,4,AA,N3HYAA,LAX,MIA,00:20:00,00:15:00,15.0,00:30:00,285.0,2342,08:05:00,-9.0,-5.0,493.053
4,2015-01-01,4,AS,N527AS,SEA,ANC,00:25:00,00:24:00,11.0,00:35:00,235.0,1448,03:20:00,-21.0,-1.0,369.702


#### Drop unnecessary (?) columns, repeated info

In [6]:
merged_flights=merged_flights.drop(columns=['TAIL_NUMBER','SCHEDULED_DEPARTURE', 'DEPARTURE_TIME','AIRPORT','COUNTRY','IATA_CODE'])
merged_flights

Unnamed: 0,SCHEDULED_DATE,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_DELAY,DEPARTURE_DELAY,SPEED,CITY,STATE,LATITUDE,LONGITUDE,Count Flights,Enplanements,area
0,2015-01-01,4,AS,ANC,SEA,21.0,00:15:00,205.0,1448,04:30:00,-22.0,-11.0,423.805,Anchorage,AK,61.17432,-149.99619,7888,2713843,west
1,2015-01-01,4,AS,ANC,SEA,17.0,00:58:00,204.0,1448,05:09:00,-14.0,-4.0,425.882,Anchorage,AK,61.17432,-149.99619,7888,2713843,west
2,2015-01-01,4,DL,ANC,SEA,25.0,00:56:00,210.0,1448,05:15:00,-24.0,-14.0,413.714,Anchorage,AK,61.17432,-149.99619,7888,2713843,west
3,2015-01-01,4,AS,ANC,PDX,11.0,00:57:00,215.0,1542,05:25:00,-18.0,-4.0,430.326,Anchorage,AK,61.17432,-149.99619,7888,2713843,west
4,2015-01-01,4,US,ANC,PHX,21.0,02:04:00,323.0,2552,09:15:00,-10.0,-9.0,474.056,Anchorage,AK,61.17432,-149.99619,7888,2713843,west
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,2015-06-25,4,B6,HYA,JFK,9.0,13:24:00,72.0,196,14:21:00,-12.0,6.0,163.333,Hyannis,MA,41.66934,-70.28036,12,31027,south
2818549,2015-06-26,5,B6,HYA,JFK,5.0,13:20:00,72.0,196,14:21:00,-18.0,6.0,163.333,Hyannis,MA,41.66934,-70.28036,12,31027,south
2818550,2015-06-27,6,B6,HYA,JFK,16.0,13:24:00,72.0,196,14:21:00,-7.0,-1.0,163.333,Hyannis,MA,41.66934,-70.28036,12,31027,south
2818551,2015-06-29,1,B6,HYA,JFK,6.0,13:11:00,72.0,196,14:21:00,-22.0,-4.0,163.333,Hyannis,MA,41.66934,-70.28036,12,31027,south


### Linear Regression - testing only on numerical features

In [7]:
LR_train =merged_flights.drop(columns=['SCHEDULED_DATE','DAY_OF_WEEK','WHEELS_OFF','SCHEDULED_ARRIVAL','LONGITUDE','LATITUDE','Enplanements'])
LR_train

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,TAXI_OUT,SCHEDULED_TIME,DISTANCE,ARRIVAL_DELAY,DEPARTURE_DELAY,SPEED,CITY,STATE,Count Flights,area
0,AS,ANC,SEA,21.0,205.0,1448,-22.0,-11.0,423.805,Anchorage,AK,7888,west
1,AS,ANC,SEA,17.0,204.0,1448,-14.0,-4.0,425.882,Anchorage,AK,7888,west
2,DL,ANC,SEA,25.0,210.0,1448,-24.0,-14.0,413.714,Anchorage,AK,7888,west
3,AS,ANC,PDX,11.0,215.0,1542,-18.0,-4.0,430.326,Anchorage,AK,7888,west
4,US,ANC,PHX,21.0,323.0,2552,-10.0,-9.0,474.056,Anchorage,AK,7888,west
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,B6,HYA,JFK,9.0,72.0,196,-12.0,6.0,163.333,Hyannis,MA,12,south
2818549,B6,HYA,JFK,5.0,72.0,196,-18.0,6.0,163.333,Hyannis,MA,12,south
2818550,B6,HYA,JFK,16.0,72.0,196,-7.0,-1.0,163.333,Hyannis,MA,12,south
2818551,B6,HYA,JFK,6.0,72.0,196,-22.0,-4.0,163.333,Hyannis,MA,12,south


In [8]:
LR_model = LinearRegression(fit_intercept=True)

LR_X = LR_train[["TAXI_OUT",'SCHEDULED_TIME','DISTANCE','DEPARTURE_DELAY','SPEED']]
LR_Y = LR_train["ARRIVAL_DELAY"]

LR_model.fit(LR_X, LR_Y)

#LR_xfit = np.linspace(6, 12, 2) # testing on train set
LR_yfit = LR_model.predict(LR_X)

print(LR_yfit)

[ -4.00482084  -2.16294871  -3.81605119 ...  -3.01797038 -14.97889068
  -7.52703667]


In [9]:
# Calculate MSE and R2

# The mean squared error 
print("Mean squared error - Train: %.2f" % mean_squared_error(LR_Y, LR_yfit))
# Explained variance score: 1 is perfect prediction
print('Variance score - Train: %.2f' % r2_score(LR_Y, LR_yfit))

Mean squared error - Train: 648.12
Variance score - Train: 0.61


#### Polynomial

In [10]:
PR_model = make_pipeline(PolynomialFeatures(3), LinearRegression(fit_intercept=True))

PR_model.fit(LR_X, LR_Y)

PR_yfit = PR_model.predict(LR_X)

print(PR_yfit)
print("Mean squared error - Train: %.2f" % mean_squared_error(LR_Y, PR_yfit))
print('Variance score - Train: %.2f' % r2_score(LR_Y, PR_yfit))

[ -7.47860307  -5.60486048  -7.10626978 ...  -6.3753303  -16.10264441
 -12.6529171 ]
Mean squared error - Train: 275.93
Variance score - Train: 0.83


### Linear Regression - One-Hot encoding

In [11]:
LR_model = LinearRegression(fit_intercept=True)

LR_X = LR_train.drop(columns=['ARRIVAL_DELAY'])
LR_Y = LR_train["ARRIVAL_DELAY"]

In [12]:
LR_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2818553 entries, 0 to 2818552
Data columns (total 12 columns):
 #   Column               Dtype  
---  ------               -----  
 0   AIRLINE              object 
 1   ORIGIN_AIRPORT       object 
 2   DESTINATION_AIRPORT  object 
 3   TAXI_OUT             float64
 4   SCHEDULED_TIME       float64
 5   DISTANCE             int64  
 6   DEPARTURE_DELAY      float64
 7   SPEED                float64
 8   CITY                 object 
 9   STATE                object 
 10  Count Flights        int64  
 11  area                 object 
dtypes: float64(4), int64(2), object(6)
memory usage: 279.6+ MB


In [13]:
# Find object columns for one-hot encoding
df_categories = LR_X.select_dtypes(include=['object']).copy()
df_categories.head()

Unnamed: 0,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,CITY,STATE,area
0,AS,ANC,SEA,Anchorage,AK,west
1,AS,ANC,SEA,Anchorage,AK,west
2,DL,ANC,SEA,Anchorage,AK,west
3,AS,ANC,PDX,Anchorage,AK,west
4,US,ANC,PHX,Anchorage,AK,west


In [14]:
#df_categories = pd.get_dummies(df_categories, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
#print(df_categories.head())

Probably need to drop some of the categorical features, (CITY, STATE, area ?)

If we encode all of those features, we will end up with over a thousand columns

In [15]:
X = LR_train.drop(columns=['ARRIVAL_DELAY'])
Y = LR_train["ARRIVAL_DELAY"]
X = pd.get_dummies(X, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Count Flights,Airline_AA,Airline_AS,Airline_B6,Airline_DL,...,state_VT,state_WA,state_WI,state_WV,state_WY,area_islands,area_midwest,area_northeast,area_south,area_west
0,21.0,205.0,1448,-11.0,423.805,7888,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,17.0,204.0,1448,-4.0,425.882,7888,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,25.0,210.0,1448,-14.0,413.714,7888,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,11.0,215.0,1542,-4.0,430.326,7888,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,21.0,323.0,2552,-9.0,474.056,7888,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2818548,9.0,72.0,196,6.0,163.333,12,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2818549,5.0,72.0,196,6.0,163.333,12,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2818550,16.0,72.0,196,-1.0,163.333,12,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2818551,6.0,72.0,196,-4.0,163.333,12,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


### Training model on encoded features

In [16]:
LR_X = LR_train.drop(columns=['ARRIVAL_DELAY']).loc[:1000,:]
LR_Y = LR_train.loc[:1000,"ARRIVAL_DELAY"]

In [17]:
LR_X = pd.get_dummies(LR_X, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
LR_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Count Flights,Airline_AS,Airline_DL,Airline_UA,Airline_US,...,Dest_ORD,Dest_OTZ,Dest_PDX,Dest_PHX,Dest_SCC,Dest_SEA,Dest_SFO,city_Anchorage,state_AK,area_west
0,21.0,205.0,1448,-11.0,423.805,7888,1,0,0,0,...,0,0,0,0,0,1,0,1,1,1
1,17.0,204.0,1448,-4.0,425.882,7888,1,0,0,0,...,0,0,0,0,0,1,0,1,1,1
2,25.0,210.0,1448,-14.0,413.714,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,1,1
3,11.0,215.0,1542,-4.0,430.326,7888,1,0,0,0,...,0,0,1,0,0,0,0,1,1,1
4,21.0,323.0,2552,-9.0,474.056,7888,0,0,0,1,...,0,0,0,1,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,21.0,212.0,1448,-9.0,409.811,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,1,1
997,18.0,79.0,399,-7.0,303.038,7888,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1
998,12.0,95.0,539,-1.0,340.421,7888,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1
999,18.0,205.0,1448,-11.0,423.805,7888,1,0,0,0,...,0,0,0,0,0,1,0,1,1,1


In [18]:
LR_model.fit(LR_X, LR_Y)
LR_yfit = LR_model.predict(LR_X)

print(LR_yfit)
print("Mean squared error - Train: %.2f" % mean_squared_error(LR_Y, LR_yfit))
print('Variance score - Train: %.2f' % r2_score(LR_Y, LR_yfit))

[ -8.73325341  -5.06342831 -19.28953609 ...  -0.65499167 -11.56619179
  -2.29013176]
Mean squared error - Train: 78.96
Variance score - Train: 0.88


### Polynomial Regression with one-hot encoding

In [19]:
PR_model = make_pipeline(PolynomialFeatures(2), LinearRegression(fit_intercept=True))
PR_X = LR_train.drop(columns=['ARRIVAL_DELAY']).loc[:10000,:]
PR_Y = LR_train.loc[:10000,"ARRIVAL_DELAY"]

PR_X = pd.get_dummies(PR_X, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
PR_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Count Flights,Airline_AA,Airline_AS,Airline_B6,Airline_DL,...,Dest_SMF,Dest_SMX,Dest_STL,Dest_TPA,Dest_TUS,city_Anchorage,city_Los Angeles,state_AK,state_CA,area_west
0,21.0,205.0,1448,-11.0,423.805,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
1,17.0,204.0,1448,-4.0,425.882,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
2,25.0,210.0,1448,-14.0,413.714,7888,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
3,11.0,215.0,1542,-4.0,430.326,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
4,21.0,323.0,2552,-9.0,474.056,7888,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,18.0,245.0,1744,51.0,427.102,103511,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9997,19.0,77.0,308,13.0,240.000,103511,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9998,15.0,250.0,1947,4.0,467.280,103511,0,0,0,1,...,0,0,0,0,0,0,1,0,1,1
9999,15.0,60.0,209,38.0,209.000,103511,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [20]:
PR_model.fit(PR_X, PR_Y)
PR_yfit = PR_model.predict(PR_X)
print(PR_yfit)
print("Mean squared error - Train: %.2f" % mean_squared_error(PR_Y, PR_yfit))
print('Variance score - Train: %.2f' % r2_score(PR_Y, PR_yfit))

[ -5.57757046  -2.98643581 -10.04295818 ...  -1.55354687  34.37711155
 204.06038735]
Mean squared error - Train: 119.09
Variance score - Train: 0.89


### Regression Tree

In [21]:
Tree_model = DecisionTreeRegressor(max_depth=2)

Tree_X = LR_train.drop(columns=['ARRIVAL_DELAY']).loc[:10000,:]
Tree_Y = LR_train.loc[:10000,"ARRIVAL_DELAY"]
#Tree_X = LR_train.drop(columns=['ARRIVAL_DELAY'])
#Tree_Y = LR_train["ARRIVAL_DELAY"]

Tree_X = pd.get_dummies(Tree_X, columns=['AIRLINE','ORIGIN_AIRPORT','DESTINATION_AIRPORT','CITY','STATE','area'], prefix = ['Airline','Origin','Dest','city','state','area'])
Tree_X

Unnamed: 0,TAXI_OUT,SCHEDULED_TIME,DISTANCE,DEPARTURE_DELAY,SPEED,Count Flights,Airline_AA,Airline_AS,Airline_B6,Airline_DL,...,Dest_SMF,Dest_SMX,Dest_STL,Dest_TPA,Dest_TUS,city_Anchorage,city_Los Angeles,state_AK,state_CA,area_west
0,21.0,205.0,1448,-11.0,423.805,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
1,17.0,204.0,1448,-4.0,425.882,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
2,25.0,210.0,1448,-14.0,413.714,7888,0,0,0,1,...,0,0,0,0,0,1,0,1,0,1
3,11.0,215.0,1542,-4.0,430.326,7888,0,1,0,0,...,0,0,0,0,0,1,0,1,0,1
4,21.0,323.0,2552,-9.0,474.056,7888,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,18.0,245.0,1744,51.0,427.102,103511,1,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9997,19.0,77.0,308,13.0,240.000,103511,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1
9998,15.0,250.0,1947,4.0,467.280,103511,0,0,0,1,...,0,0,0,0,0,0,1,0,1,1
9999,15.0,60.0,209,38.0,209.000,103511,0,0,0,0,...,0,0,0,0,0,0,1,0,1,1


In [22]:
Tree_model.fit(Tree_X, Tree_Y)
Tree_yfit = Tree_model.predict(Tree_X)

print(Tree_yfit)
print("Mean squared error - Train: %.2f" % mean_squared_error(Tree_Y, Tree_yfit))
print('Variance score - Train: %.2f' % r2_score(Tree_Y, Tree_yfit))

[ -2.22553549  -2.22553549  -2.22553549 ...  -2.22553549  -2.22553549
 212.69512195]
Mean squared error - Train: 321.64
Variance score - Train: 0.70
