# Models on Integrated Features

In [24]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV

In [35]:
# load data
df = pd.read_csv('data/model_df_weather.csv', index_col=0)

  mask |= (ar1 == a)


In [36]:
df.head(1)

Unnamed: 0,arr_delay,precip,snow,windgust,cloudcover,carrier_speed_rank,flight_num_speed_rank,month,month_rank,dep_hour,...,snow_cat,windgust_cat,cloud_cat,fl_month,airline_delay,haul_length,dep_timeday,arr_timeday,busy_origin,busy_dest
0,-2.0,0.0,0.0,40.7,74.3,0,1,1,0,19,...,0,2,2,1,1,0,3,3,4,3.0


## Model 4: Simple LinReg

In [7]:
df.columns

Index(['arr_delay', 'precip', 'snow', 'windgust', 'cloudcover',
       'carrier_speed_rank', 'flight_num_speed_rank', 'month', 'month_rank',
       'dep_hour', 'arr_hour', 'hour_rank', 'dep_hour_rank', 'arr_hour_rank',
       'precip_cat', 'snow_cat', 'windgust_cat', 'cloud_cat', 'fl_month',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'busy_origin', 'busy_dest'],
      dtype='object')

In [37]:
x_cols = ['precip_cat', 'snow_cat', 'windgust_cat', 'cloud_cat', 'fl_month',
       'airline_delay', 'haul_length', 'dep_timeday', 'arr_timeday',
       'busy_origin', 'busy_dest']

X = df[x_cols].to_numpy()
y= df.arr_delay.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [15]:
# filename = 'models/lr4.sav'
# pickle.dump(lr, open(filename, 'wb'))

In [13]:
lr.score(X_test, y_test)

0.0353581725167299

# Model5: Hail Mary Random Forest - defaults

In [19]:
rf1 = RandomForestRegressor()
rf1.fit(X_train, y_train)

filename = 'models/rf1.sav'
pickle.dump(rf1, open(filename, 'wb'))

In [21]:
y_pred = rf1.predict(X_test)
print(r2_score(y_test, y_pred))

0.05976359190644176


# Model 6: Random Forest with 500 estimators

In [23]:
rf2 = RandomForestRegressor(n_estimators=500)
rf2.fit(X_train, y_train)

filename = 'models/rf2.sav'
pickle.dump(rf2, open(filename, 'wb'))

In [30]:
y_pred = rf2.predict(X_test)
print(r2_score(y_test, y_pred))

0.060161045485841225


In [31]:
y_trainpred = rf2.predict(X_train)
print(r2_score(y_train, y_trainpred))

0.12521874844809677


# Model 7: Polynomial regression

In [26]:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)

X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(X_poly, y, test_size=0.2)

In [32]:
plr = LinearRegression()
plr.fit(X_poly_train, y_poly_train)

filename = 'models/plr01.sav'
pickle.dump(plr, open(filename, 'wb'))

In [33]:
plr.score(X_poly_test, y_poly_test)

0.04628511122630863

In [34]:
plr.score(X_poly_train, y_poly_train)

0.045765259868254304

This doesn't seem to be overfitting. Maybe try higher-order polynomial features

# Model 8: Polynomial regression, third-order polynomials

In [38]:
poly3 = PolynomialFeatures(3)
X_poly3 = poly.fit_transform(X)

X_poly3_train, X_poly3_test, y_poly3_train, y_poly3_test = train_test_split(X_poly3, y, test_size=0.2)

In [39]:
plr2 = LinearRegression()
plr2.fit(X_poly_train, y_poly_train)

filename = 'models/plr02.sav'
pickle.dump(plr2, open(filename, 'wb'))

In [40]:
plr.score(X_poly3_test, y_poly3_test)

0.04510721284466834

In [41]:
plr.score(X_poly3_train, y_poly3_train)

0.04605904851084508