# Regression models

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
# load data
df = pd.read_csv('data/model_df_full.csv', index_col=0)

In [4]:
df.head(1)

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,tail_num,op_carrier_fl_num,origin_airport_id,origin,dest_airport_id,dest,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,arr_time,arr_delay,diverted,crs_elapsed_time,actual_elapsed_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,dep_hour,arr_hour,arr_hour_rank,month,month_rank,fl_num_speek_rank,carrier_rank,origin_precip,origin_snow,origin_windgust,origin_cloudcover,dest_precip,dest_snow,dest_windgust,dest_cloudcover,origin_precip_cat,origin_snow_cat,origin_windgust_cat,origin_cloud_cat,dest_precip_cat,dest_snow_cat,dest_windgust_cat,dest_cloud_cat,delay_flag,day,weekday,airline_delay,haul_length,dep_timeday,arr_timeday,delay_dep_h,delay_arr_h,busy_origin,busy_dest,origin_delay
0,2018-01-01,B6,3,6078,3,12478,JFK,14843,SJU,1138.0,28.0,28.0,1206.0,1626.0,4.0,1630.0,32.0,0.0,228.0,232.0,1598.0,22.0,0.0,4.0,0.0,6.0,11,15,1,1,0,1.0,0,0.0,0.0,55.4,24.9,3.909477,3.909477,3.909477,3.909477,0.0,0.0,3.0,0.0,3.909477,3.909477,3.909477,3.909477,1,1,0,3,1,1,2,1,1,3,3.0,3


In [4]:
all_use_cols = ['arr_delay','origin', 'dest', 'diverted', 'crs_elapsed_time',
       'actual_elapsed_time', 'distance', 'carrier_delay', 'weather_delay',
       'nas_delay', 'security_delay', 'late_aircraft_delay', 'dep_hour',
       'arr_hour', 'arr_hour_rank', 'month', 'month_rank', 'fl_num_speek_rank', 'carrier_rank',
       'origin_precip', 'origin_snow', 'origin_windgust', 'origin_cloudcover',
       'dest_precip', 'dest_snow', 'dest_windgust', 'dest_cloudcover',
       'origin_precip_cat', 'origin_snow_cat', 'origin_windgust_cat',
       'origin_cloud_cat', 'dest_precip_cat', 'dest_snow_cat',
       'dest_windgust_cat', 'dest_cloud_cat', 'delay_flag', 'day', 'weekday',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest',
       'origin_delay']

In [5]:
# all columns that might conceivably be used for training
df = df[all_use_cols]

# Linear regression

### 01 - default params, without weather

In [8]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank', 'weekday',
        'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.040294051146506416

Result: 0.04

### 02 - default params, weather added

In [10]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank', 'weekday',
        'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
lr2 = LinearRegression()
lr2.fit(X_train, y_train)
lr2.score(X_test, y_test)

0.056038517946014554

Score: 0.055
Weather features improve predictive power. Let's try adding polynomials.

### 03 - default params, weather added, polynomials - 2nd order

In [12]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank', 'weekday',
        'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [13]:
model_03 = LinearRegression()
model_03.fit(X_train, y_train)
model_03.score(X_test, y_test)

0.07539503827411442

In [14]:
# filename = 'models/model_03.sav'
# pickle.dump(model_03, open(filename, 'wb'))

Score = 0.076 - better again. How does this compare to the training set?

In [15]:
model_03.score(X_train, y_train)

0.07680762786486939

0.078. About the same, so we're not overfitting yet. Let's try 3rd-order polynomials

### 04 - default params, weather added, polynomials - 3rd order

In [4]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank', 'weekday',
        'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [5]:
model_04 = LinearRegression()
model_04.fit(X_train, y_train)
model_04.score(X_test, y_test)

0.08389678808424572

In [6]:
y_pred = model_04.predict(X_test)
mean_squared_error(y_test, y_pred)

707.1344100740351

In [7]:
mean_squared_error(y_test, y_pred, squared=False)

26.591998986049077

In [8]:
# filename = 'models/model_04.sav'
# pickle.dump(model_04, open(filename, 'wb'))

0.083 - even better. How does it compare to the test set?

In [11]:
model_04.score(X_train, y_train)

0.08888570183677313

0.089 - It fits the training set slightly better than the test set. 
But that took quite a long time, so let's see if we can trim down the features and get a similar result. That might also deal with the overfitting.

### 05 - Detect feature importances using RandomForestRegression on a sample of the data

In [16]:
sample_mask = np.random.rand(len(X_train)) < 0.1
X_train_sample = X_train[sample_mask,:]
y_train_sample = y_train[sample_mask]

In [19]:
model_05 = RandomForestRegressor(n_estimators=50)
model_05.fit(X_train_sample, y_train_sample)

RandomForestRegressor(n_estimators=50)

In [20]:
# filename = 'models/model_05.sav'
# pickle.dump(model_05, open(filename, 'wb'))

In [21]:
y_pred = model_05.predict(X_test)
print(r2_score(y_test, y_pred))

-0.06853453861644376


No.

### 06 - default params, weather added, polynomials - 3rd order; some features removed

In [29]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank',
        'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [30]:
model_06 = LinearRegression()
model_06.fit(X_train, y_train)
model_06.score(X_test, y_test)

0.07899706462415756

0.079

In [31]:
# filename = 'models/model_06.sav'
# pickle.dump(model_06, open(filename, 'wb'))

### 07 - Repeat 06 but Scale Features First

In [None]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank',
        'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

ss = StandardScaler()
X = ss.fit_transform(df[cols])
y = df['arr_delay'].to_numpy()

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [35]:
model_07 = LinearRegression()
model_07.fit(X_train, y_train)
model_07.score(X_test, y_test)

0.07852707824026739

0.079 on the test data vs. 0.08 on the training data

In [36]:
# filename = 'models/model_07.sav'
# pickle.dump(model_07, open(filename, 'wb'))

In [37]:
model_07.score(X_train, y_train)

0.08082897931570343

### 08 - Hail Mary Random Forest

In [38]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank',
        'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

X = df[cols].to_numpy()
y = df.arr_delay.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model_08 = RandomForestRegressor()
model_08.fit(X_train, y_train)

RandomForestRegressor()

In [39]:
y_pred = model_08.predict(X_test)
r2_score(y_test, y_pred)

0.028683238835234715

0.029 - bad

In [None]:
# filename = 'models/model_07.sav'
# pickle.dump(model_07, open(filename, 'wb'))

# 09 - Elastic Net with 2nd order polynomials

In [41]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank',
        'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

ss = StandardScaler()
X = ss.fit_transform(df[cols])
y = df['arr_delay'].to_numpy()

poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [44]:
model_09 = ElasticNet()
model_09.fit(X_train, y_train)

ElasticNet()

In [45]:
model_09.score(X_test, y_test)

0.04668579077470347

0.045. More evidence that overfitting isn't the problem - regularisation makes it worse.

### 10 - Elastic Net with 3rd order polynomials

In [46]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank',
        'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

ss = StandardScaler()
X = ss.fit_transform(df[cols])
y = df['arr_delay'].to_numpy()

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [48]:
model_10 = ElasticNet()
model_10.fit(X_train, y_train)

ElasticNet()

In [49]:
model_10.score(X_test, y_test)

0.05783878467534331

In [50]:
# filename = 'models/model_10.sav'
# pickle.dump(model_10, open(filename, 'wb'))

## 11 - All features; Elastic Net with 3rd order polynomials; GridSearch

In [None]:
cols = ['arr_hour_rank', 'month_rank', 'haul_length', 'fl_num_speek_rank', 'carrier_rank', 'weekday',
        'delay_dep_h', 'delay_arr_h', 'busy_origin', 'busy_dest', 'origin_delay',
       'origin_precip_cat', 'origin_snow_cat',
       'origin_windgust_cat', 'origin_cloud_cat', 'dest_precip_cat',
       'dest_snow_cat', 'dest_windgust_cat', 'dest_cloud_cat']

ss = StandardScaler()
X = ss.fit_transform(df[cols])
y = df['arr_delay'].to_numpy()

poly = PolynomialFeatures(3)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

In [None]:
params = {'alpha': [0.001, 0.01, 0.1],
         'l1_ratio': [0.9, 0.5, 0.1]}
model_11 = GridSearchCV(estimator=ElasticNet(), param_grid=params, verbose=3)
model_11.fit(X_train, y_train)

In [None]:
filename = 'models/model_11.sav'
pickle.dump(model_11, open(filename, 'wb'))